In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import pickle
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
import string
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\UMEGS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
base_url = "https://yts.mx/api"
details_url = base_url + "/v2/movie_details.json"

def get_movie_details(movie_id):
    params = {"movie_id": movie_id, "with_images": "true", "with_cast": "true"}
    try:
        response = requests.get(details_url, params=params)
        response_json = response.json()
        movie_details = response_json["data"]["movie"]
        if movie_details.get("id") == movie_id:
            movie = {
                "id": movie_details.get("id"),
                "imdb_code": movie_details.get("imdb_code"),
                "title": movie_details.get("title"),
                "title_english": movie_details.get("title_english"),
                "title_long": movie_details.get("title_long"),
                "year": movie_details.get("year"),
                "rating": movie_details.get("rating"),
                "runtime": movie_details.get("runtime"),
                "genres": movie_details.get("genres", []),
                "download_count": movie_details.get("download_count"),
                "like_count": movie_details.get("like_count"),
                "description_full": movie_details.get("description_full"),
                "language": movie_details.get("language"),
                "cast": [{"name": cast.get("name"), "character_name": cast.get("character_name")} for cast in
                         movie_details.get("cast", [])],
            }
            return movie
        else:
            return {
                "id": movie_id,
                "imdb_code": 'None',
                "title": 'None',
                "title_english": 'None',
                "title_long": '',
                "year": '',
                "rating": '',
                "runtime": '',
                "genres": 'None',
                "download_count": 'None',
                "like_count": 'None',
                "description_full": 'None',
                "language": 'None',
                "cast": [],
            }
    except Exception as e:
        return {
                "id": movie_id,
                "imdb_code": 'Error',
                "title": 'Error',
                "title_english": 'Error',
                "title_long": '',
                "year": '',
                "rating": '',
                "runtime": '',
                "genres": 'Error',
                "download_count": 'Error',
                "like_count": 'Error',
                "description_full": 'Error',
                "language": 'Error',
                "cast": [],
            }


movies = pd.read_excel('dataset/movies.xlsx')

error_id = movies[movies.imdb_code == 'Error'].id

movies.drop(index=movies[movies.imdb_code == 'None'].index,inplace=True)
movies.drop(index=movies[movies.imdb_code == 'Error'].index,inplace=True)

m = []
for i in error_id:
    m.append(get_movie_details(i))

mo = pd.concat([pd.DataFrame(m),movies],)
mo.to_excel('dataset/movies_v2.xlsx')

In [64]:
movies = pd.read_excel('dataset/movies_v2.xlsx')

In [65]:
movies.head()

Unnamed: 0.1,Unnamed: 0,id,imdb_code,title,title_english,title_long,year,rating,runtime,genres,download_count,like_count,description_full,language,cast
0,0,3830,tt0450345,The Wicker Man,The Wicker Man,The Wicker Man (2006),2006.0,3.7,102.0,"['Action', 'Horror', 'Mystery', 'Thriller']",26395,62,A sheriff investigating the disappearance of a...,en,"[{'name': 'Nicolas Cage', 'character_name': 'E..."
1,1,3831,tt0073906,The Wind and the Lion,The Wind and the Lion,The Wind and the Lion (1975),1975.0,6.8,119.0,"['Action', 'Adventure', 'Drama']",43729,102,At the beginning of the 20th century an Americ...,en,"[{'name': 'Sean Connery', 'character_name': 'R..."
2,2,15867,tt8430676,Shepard,Shepard,Shepard (2020),2020.0,4.3,71.0,"['Action', 'Thriller']",12221,5,A troubled teen crosses paths with a charismat...,en,"[{'name': 'Kareem J. Grimes', 'character_name'..."
3,3,15868,tt3321300,MI-5,MI-5,MI-5 (2015),2015.0,6.2,104.0,"['Action', 'Drama', 'Thriller']",43329,16,MI5 personnel are caught up in a traffic jam i...,en,"[{'name': 'Kit Harington', 'character_name': '..."
4,4,19850,tt10833270,Upside-Down Magic,Upside-Down Magic,Upside-Down Magic (2020),2020.0,5.3,96.0,"['Action', 'Family', 'Fantasy']",119887,33,"In ""Upside-Down Magic, a Disney Channel Origin...",en,"[{'name': 'Vicki Lewis', 'character_name': 'He..."


In [66]:
movies.columns

Index(['Unnamed: 0', 'id', 'imdb_code', 'title', 'title_english', 'title_long',
       'year', 'rating', 'runtime', 'genres', 'download_count', 'like_count',
       'description_full', 'language', 'cast'],
      dtype='object')

In [68]:
movies[movies.title_english.isnull()]

Unnamed: 0,id,imdb_code,title,title_english,title_long,year,rating,runtime,genres,download_count,like_count,description_full,language,cast
19334,19651,tt1849087,,,(2012),2012.0,0.0,0.0,"['Action', 'Drama']",3232,2,Spring Eddy is set in a small Texas town follo...,en,"[{'name': 'Gabriel Luna', 'character_name': 'E..."
44003,44635,tt10260672,,,(2019),2019.0,0.0,90.0,['Documentary'],4040,0,,,[]


In [150]:
movies.drop(columns=['Unnamed: 0'],inplace=True)
required_columns = ['id', 'title_english', 'genres', 'description_full', 'cast']
movie_marged_required_col = movies[required_columns]

In [151]:
movie_marged_required_col.head()

Unnamed: 0,id,title_english,genres,description_full,cast
0,3830,The Wicker Man,"['Action', 'Horror', 'Mystery', 'Thriller']",A sheriff investigating the disappearance of a...,"[{'name': 'Nicolas Cage', 'character_name': 'E..."
1,3831,The Wind and the Lion,"['Action', 'Adventure', 'Drama']",At the beginning of the 20th century an Americ...,"[{'name': 'Sean Connery', 'character_name': 'R..."
2,15867,Shepard,"['Action', 'Thriller']",A troubled teen crosses paths with a charismat...,"[{'name': 'Kareem J. Grimes', 'character_name'..."
3,15868,MI-5,"['Action', 'Drama', 'Thriller']",MI5 personnel are caught up in a traffic jam i...,"[{'name': 'Kit Harington', 'character_name': '..."
4,19850,Upside-Down Magic,"['Action', 'Family', 'Fantasy']","In ""Upside-Down Magic, a Disney Channel Origin...","[{'name': 'Vicki Lewis', 'character_name': 'He..."


In [134]:
movie_marged_required_col.isnull().sum()

id                    0
title_english         2
genres                0
description_full    407
cast                  0
dtype: int64

In [135]:
movie_marged_required_col[movie_marged_required_col.title_english.isnull()].index

Int64Index([19334, 44003], dtype='int64')

In [136]:
movie_marged_required_col.drop(index=movie_marged_required_col[movie_marged_required_col.title_english.isnull()].index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_marged_required_col.drop(index=movie_marged_required_col[movie_marged_required_col.title_english.isnull()].index,inplace=True)


In [137]:
movie_marged_required_col[movie_marged_required_col.title_english.isnull()]

Unnamed: 0,id,title_english,genres,description_full,cast


In [138]:
movie_marged_required_col.duplicated().sum()

0

In [139]:
eval(movie_marged_required_col.genres[0])

['Action', 'Horror', 'Mystery', 'Thriller']

In [140]:
movie_marged_required_col['genres'] = movie_marged_required_col['genres'].apply(eval)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_marged_required_col['genres'] = movie_marged_required_col['genres'].apply(eval)


In [141]:
movie_marged_required_col.head()

Unnamed: 0,id,title_english,genres,description_full,cast
0,3830,The Wicker Man,"[Action, Horror, Mystery, Thriller]",A sheriff investigating the disappearance of a...,"[{'name': 'Nicolas Cage', 'character_name': 'E..."
1,3831,The Wind and the Lion,"[Action, Adventure, Drama]",At the beginning of the 20th century an Americ...,"[{'name': 'Sean Connery', 'character_name': 'R..."
2,15867,Shepard,"[Action, Thriller]",A troubled teen crosses paths with a charismat...,"[{'name': 'Kareem J. Grimes', 'character_name'..."
3,15868,MI-5,"[Action, Drama, Thriller]",MI5 personnel are caught up in a traffic jam i...,"[{'name': 'Kit Harington', 'character_name': '..."
4,19850,Upside-Down Magic,"[Action, Family, Fantasy]","In ""Upside-Down Magic, a Disney Channel Origin...","[{'name': 'Vicki Lewis', 'character_name': 'He..."


In [142]:
movie_marged_required_col.cast[0]

"[{'name': 'Nicolas Cage', 'character_name': 'Edward Malus'}, {'name': 'James Franco', 'character_name': 'Bar Guy #1'}, {'name': 'Leelee Sobieski', 'character_name': 'Sister Honey'}, {'name': 'Ellen Burstyn', 'character_name': 'Sister SummersIsle'}]"

In [143]:
eval(movie_marged_required_col.cast[0])

[{'name': 'Nicolas Cage', 'character_name': 'Edward Malus'},
 {'name': 'James Franco', 'character_name': 'Bar Guy #1'},
 {'name': 'Leelee Sobieski', 'character_name': 'Sister Honey'},
 {'name': 'Ellen Burstyn', 'character_name': 'Sister SummersIsle'}]

In [148]:
def get_top_3_cast(x):
    return [value for cast in eval(x)[:3] for value in (cast['name'], cast['character_name'])]

In [145]:
movie_marged_required_col['cast'] = movie_marged_required_col['cast'].apply(get_top_3_cast)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_marged_required_col['cast'] = movie_marged_required_col['cast'].apply(get_top_3_cast)


In [146]:
movie_marged_required_col.head()

Unnamed: 0,id,title_english,genres,description_full,cast
0,3830,The Wicker Man,"[Action, Horror, Mystery, Thriller]",A sheriff investigating the disappearance of a...,"[Nicolas Cage, Edward Malus, James Franco, Bar..."
1,3831,The Wind and the Lion,"[Action, Adventure, Drama]",At the beginning of the 20th century an Americ...,"[Sean Connery, Raisuli, Candice Bergen, Eden P..."
2,15867,Shepard,"[Action, Thriller]",A troubled teen crosses paths with a charismat...,"[Kareem J. Grimes, Dwyer, Ashley Nicole Willia..."
3,15868,MI-5,"[Action, Drama, Thriller]",MI5 personnel are caught up in a traffic jam i...,"[Kit Harington, Will Holloway, Tuppence Middle..."
4,19850,Upside-Down Magic,"[Action, Family, Fantasy]","In ""Upside-Down Magic, a Disney Channel Origin...","[Vicki Lewis, Headmaster Knightslinger, Yasmee..."


In [147]:
movie_marged_required_col.description_full.fillna('',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_marged_required_col.description_full.fillna('',inplace=True)


In [110]:
movie_marged_required_col.description_full

0        A sheriff investigating the disappearance of a...
1        At the beginning of the 20th century an Americ...
2        A troubled teen crosses paths with a charismat...
3        MI5 personnel are caught up in a traffic jam i...
4        In "Upside-Down Magic, a Disney Channel Origin...
                               ...                        
45314    Three criminals disguise themselves as Buddhis...
45315    The spirit of a deceased mother takes over her...
45316    Mama's Boy: A Story from Our Americas is the t...
45317    In the village of Gavaldon, two misfits and be...
45318    A friendship forms between two strangers. For ...
Name: description_full, Length: 45317, dtype: object

In [111]:
def description_preprossing(text):
    stopwords = nltk.corpus.stopwords.words('english')
    text = text.replace('-', ' ')
    text = text.translate(str.maketrans('', '', string.punctuation))
    ' '.join([word for word in text.split() if word not in stopwords])

    return text

In [112]:
movie_marged_required_col.description_full = movie_marged_required_col.description_full.apply(description_preprossing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_marged_required_col.description_full = movie_marged_required_col.description_full.apply(description_preprossing)


In [113]:
movie_marged_required_col['description_full'] = movie_marged_required_col['description_full'].apply(lambda x: x.split())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_marged_required_col['description_full'] = movie_marged_required_col['description_full'].apply(lambda x: x.split())


In [114]:
movie_marged_required_col.head()

Unnamed: 0,id,title_english,genres,description_full,cast
0,3830,The Wicker Man,"[Action, Horror, Mystery, Thriller]","[A, sheriff, investigating, the, disappearance...","[{'name': 'Nicolas Cage', 'character_name': 'E..."
1,3831,The Wind and the Lion,"[Action, Adventure, Drama]","[At, the, beginning, of, the, 20th, century, a...","[{'name': 'Sean Connery', 'character_name': 'R..."
2,15867,Shepard,"[Action, Thriller]","[A, troubled, teen, crosses, paths, with, a, c...","[{'name': 'Kareem J. Grimes', 'character_name'..."
3,15868,MI-5,"[Action, Drama, Thriller]","[MI5, personnel, are, caught, up, in, a, traff...","[{'name': 'Kit Harington', 'character_name': '..."
4,19850,Upside-Down Magic,"[Action, Family, Fantasy]","[In, Upside, Down, Magic, a, Disney, Channel, ...","[{'name': 'Vicki Lewis', 'character_name': 'He..."


In [60]:
movie_marged_required_col.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_marged_required_col.dropna(inplace=True)


In [61]:
movie_marged_required_col['genres'] = movie_marged_required_col['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movie_marged_required_col['description_full'] = movie_marged_required_col['description_full'].apply(lambda x: [i.replace(" ", "") for i in x])
movie_marged_required_col['cast'] = movie_marged_required_col['cast'].apply(lambda x: [i.replace(" ", "") for i in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_marged_required_col['genres'] = movie_marged_required_col['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_marged_required_col['description_full'] = movie_marged_required_col['description_full'].apply(lambda x: [i.replace(" ", "") for i in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stab

In [149]:
movie_marged_required_col.head()

Unnamed: 0,id,title_english,genres,description_full,cast
0,3830,The Wicker Man,"[Action, Horror, Mystery, Thriller]",A sheriff investigating the disappearance of a...,"[Nicolas Cage, Edward Malus, James Franco, Bar..."
1,3831,The Wind and the Lion,"[Action, Adventure, Drama]",At the beginning of the 20th century an Americ...,"[Sean Connery, Raisuli, Candice Bergen, Eden P..."
2,15867,Shepard,"[Action, Thriller]",A troubled teen crosses paths with a charismat...,"[Kareem J. Grimes, Dwyer, Ashley Nicole Willia..."
3,15868,MI-5,"[Action, Drama, Thriller]",MI5 personnel are caught up in a traffic jam i...,"[Kit Harington, Will Holloway, Tuppence Middle..."
4,19850,Upside-Down Magic,"[Action, Family, Fantasy]","In ""Upside-Down Magic, a Disney Channel Origin...","[Vicki Lewis, Headmaster Knightslinger, Yasmee..."


In [116]:
movie = movie_marged_required_col.copy(deep=True)

In [117]:
movie.columns

Index(['id', 'title_english', 'genres', 'description_full', 'cast'], dtype='object')

In [118]:
movie['tag'] = movie['genres'] + movie['description_full'] + movie['cast']

TypeError: can only concatenate list (not "str") to list

In [None]:
movie.head()

In [None]:
movie = movie[['id', 'title_english', 'tag']]

In [70]:
movie.head()

Unnamed: 0,id,title_english,tag
0,3830,The Wicker Man,"[Action, Horror, Mystery, Thriller, A, sheriff..."
1,3831,The Wind and the Lion,"[Action, Adventure, Drama, At, the, beginning,..."
2,15867,Shepard,"[Action, Thriller, A, troubled, teen, crosses,..."
3,15868,MI-5,"[Action, Drama, Thriller, MI5, personnel, are,..."
4,19850,Upside-Down Magic,"[Action, Family, Fantasy, In, ""Upside-Down, Ma..."


In [71]:
movie['tag'] = movie['tag'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['tag'] = movie['tag'].apply(lambda x: " ".join(x))


In [72]:
movie.head()

Unnamed: 0,id,title_english,tag
0,3830,The Wicker Man,Action Horror Mystery Thriller A sheriff inves...
1,3831,The Wind and the Lion,Action Adventure Drama At the beginning of the...
2,15867,Shepard,Action Thriller A troubled teen crosses paths ...
3,15868,MI-5,Action Drama Thriller MI5 personnel are caught...
4,19850,Upside-Down Magic,"Action Family Fantasy In ""Upside-Down Magic, a..."


In [73]:
movie['tag'] = movie['tag'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie['tag'] = movie['tag'].apply(lambda x: x.lower())


In [74]:
movie.head()

Unnamed: 0,id,title_english,tag
0,3830,The Wicker Man,action horror mystery thriller a sheriff inves...
1,3831,The Wind and the Lion,action adventure drama at the beginning of the...
2,15867,Shepard,action thriller a troubled teen crosses paths ...
3,15868,MI-5,action drama thriller mi5 personnel are caught...
4,19850,Upside-Down Magic,"action family fantasy in ""upside-down magic, a..."


In [244]:
def movie_prerossing(df):
    df = df.copy(deep=True)
    stopwords = nltk.corpus.stopwords.words('english')
    ps = PorterStemmer()

    def description_preprossing(text):
        text = text.replace('-', ' ').translate(str.maketrans('', '', string.punctuation))
        text = ' '.join([word for word in text.split() if word not in stopwords])
        return text

    def get_top_3_cast(x):
        return [value for cast in eval(x)[:3] for value in (cast['name'], cast['character_name'])]

    def staming(text):
        return " ".join([ps.stem(word) for word in text.split()])

    print('drop null title')
    df.drop(index=df[(df.title_english.isnull()) | (df.title_english == 'None')].index, inplace=True)

    print('genres eval')
    df.genres = df.genres.apply(eval)

    print('top 3 cast')
    df.cast = df.cast.apply(get_top_3_cast)

    print('description fill na')
    df.description_full.fillna('', inplace=True)

    print('description preprossing')
    df.description_full = df.description_full.apply(description_preprossing)

    print('description split')
    df.description_full = df.description_full.apply(lambda x: x.split())

    print('genres replace')
    df.genres = df.genres.apply(lambda x: [i.replace(" ", "") for i in x])

    print('description replace')
    df.description_full = df.description_full.apply(lambda x: [i.replace(" ", "") for i in x])

    print('cast split')
    df.cast = df.cast.apply(lambda x: [i.replace(" ", "") for i in x])

    print('making tag')
    df['tag'] = df.genres + df.description_full + df.cast

    print('join and to lower')
    df.tag = df.tag.apply(lambda x: " ".join(x).lower())

    print('staming')
    df.tag = df['tag'].apply(staming)

    return df[['id', 'title_english', 'tag']]

## Stemming

In [237]:
# ['love', 'Loved', 'loving'] = love

In [238]:
ps = PorterStemmer()

In [239]:
def staming(text):
    stemed_text = [ps.stem(word) for word in text.split()]
    return " ".join(stemed_text)

In [240]:
movie['tag'] = movie['tag'].apply(staming)

In [241]:
movie.head()

Unnamed: 0,id,title_english,tag
0,3830,The Wicker Man,action horror mysteri thriller a sheriff inves...
1,3831,The Wind and the Lion,action adventur drama at begin 20th centuri am...
2,15867,Shepard,action thriller a troubl teen cross path chari...
3,15868,MI-5,action drama thriller mi5 personnel caught tra...
4,19850,Upside-Down Magic,action famili fantasi in upsid down magic disn...


In [245]:
# movies.drop(columns=['Unnamed: 0'],inplace=True)
required_columns = ['id', 'title_english', 'genres', 'description_full', 'cast']
movie_marged_required_col = movies[required_columns]

movie = movie_prerossing(movie_marged_required_col)

drop null title
genres eval
top 3 cast
description fill na
description preprossing
description split
genres replace
description replace
cast split
making tag
join and to lower
staming


In [246]:
movie.head()

Unnamed: 0,id,title_english,tag
0,3830,The Wicker Man,action horror mysteri thriller a sheriff inves...
1,3831,The Wind and the Lion,action adventur drama at begin 20th centuri am...
2,15867,Shepard,action thriller a troubl teen cross path chari...
3,15868,MI-5,action drama thriller mi5 personnel caught tra...
4,19850,Upside-Down Magic,action famili fantasi in upsid down magic disn...


In [247]:
movie.to_csv('dataset/movies_data_id_title_tag_stemming.csv',index=False)

In [248]:
movie = pd.read_csv('dataset/movies_data_id_title_tag_stemming.csv')

In [249]:
movie.dropna(inplace=True)

In [255]:
movie.tag[4]

'action famili fantasi in upsid down magic disney channel origin movi 13 year old nori boxwood horac discov flux anim best friend reina carvaj manipul flame togeth enter sage academi magic studi reina expert abil har power fire land top class flare nori wonki magic procliv turn dritten half kitten half dragon land class upsid down magic otherwis known udm while headmast knightsling believ udm unconvent power leav vulner danger evil shadow magic nori fellow udm classmat set prove upsid down magic beat right side vickilewi headmasterknightsling yasmeenfletch chandra sienaagudong reinacarvaj'

In [266]:
# cv = CountVectorizer(max_features=5000, stop_words='english')
cv = TfidfVectorizer(ngram_range=(1,2), stop_words='english',analyzer='word', min_df=0,)

In [267]:
# limited_data = movie['tag'][:20000]
# vectorized_tag = cv.fit_transform(limited_data)
vectorized_tag = cv.fit_transform(movie['tag'])

In [268]:
vectorized_tag.shape

(45313, 1541876)

In [269]:
'dragon land' in cv.get_feature_names_out()

True

In [270]:
df_vect = pd.DataFrame(vectorized_tag.toarray(), columns=cv.get_feature_names_out())

MemoryError: Unable to allocate 521. GiB for an array with shape (45313, 1541876) and data type float64

In [265]:
df_vect

Unnamed: 0,10,10 year,100,11,11 year,12,12 year,13,13 year,14,...,youtub,yu,yun,zach,zealand,zero,zoe,zombi,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.340048,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.120807,0.133697,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
45309,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
45310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
45311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [47]:
import string

text = "".join([ch for ch in t if ch not in string.punctuation])
text = staming(text)

In [22]:
movies_cosine_similarity = cosine_similarity(vectorized_tag)

MemoryError: Unable to allocate 2.98 GiB for an array with shape (20000, 20000) and data type float64

In [None]:
movies_cosine_similarity.shape

In [None]:

def recommend(movie_guess):
    movie_index = movie[movie['title_english'] == movie_guess].index[0]
    distances = movies_cosine_similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    print(movie_list)
    for i in movie_list:
        print(movie.iloc[i[0]].title_english,i[0], sep=" : ")


In [122]:
recommend('The Dark Knight Rises')

[(1462, 0.35355339059327384), (3069, 0.3466876226407682), (3443, 0.3310610959858654), (3854, 0.3240906080438344), (2704, 0.3217979514674191)]
Hellraiser: Inferno : 1462
The Believers : 3069
The Last Boy Scout : 3443
The Young Savages : 3854
Shanghai : 2704


In [12]:
movie[['title_english', 'id']].to_pickle('movie_list.pkl')

In [None]:
pickle.dump(movies_cosine_similarity, open('movies_cosine_similarity.pkl', 'wb'))

In [139]:
movie_ = pickle.load(open('movie_list.pkl', 'rb'))

movie_

Unnamed: 0,title,movie_id
0,Avatar,19995
1,Pirates of the Caribbean: At World's End,285
2,Spectre,206647
3,The Dark Knight Rises,49026
4,John Carter,49529
...,...,...
4804,El Mariachi,9367
4805,Newlyweds,72766
4806,"Signed, Sealed, Delivered",231617
4807,Shanghai Calling,126186


In [163]:
pd.DataFrame(movies_cosine_similarity).to_pickle('../model/movies_cosine_similarity.pkl', compression='zip')

In [None]:
['infer', None, 'bz2', 'gzip', 'xz', 'zip', 'zstd']

In [7]:
pd.read_pickle('../model/movies_cosine_similarity.pkl', compression='zip')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4796,4797,4798,4799,4800,4801,4802,4803,4804,4805
0,1.000000,0.083462,0.086031,0.073472,0.189299,0.108389,0.040242,0.146735,0.059235,0.096730,...,0.000000,0.000000,0.042239,0.052632,0.000000,0.019252,0.046829,0.044992,0.000000,0.000000
1,0.083462,1.000000,0.060634,0.038837,0.075047,0.114587,0.021272,0.129272,0.062622,0.102262,...,0.000000,0.000000,0.022327,0.027821,0.000000,0.040706,0.000000,0.023783,0.000000,0.026153
2,0.086031,0.060634,1.000000,0.060048,0.077357,0.070868,0.021926,0.133250,0.064550,0.105409,...,0.085749,0.000000,0.000000,0.000000,0.017590,0.041959,0.000000,0.024515,0.000000,0.000000
3,0.073472,0.038837,0.060048,1.000000,0.033032,0.060523,0.056177,0.068279,0.041345,0.202548,...,0.027462,0.027462,0.058964,0.055104,0.022533,0.067188,0.000000,0.031404,0.048526,0.086335
4,0.189299,0.075047,0.077357,0.033032,1.000000,0.097460,0.054277,0.197910,0.079894,0.108721,...,0.035377,0.000000,0.075960,0.023662,0.145141,0.155799,0.000000,0.020228,0.083351,0.044488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,0.019252,0.040706,0.041959,0.067188,0.155799,0.079295,0.029440,0.143131,0.130005,0.035383,...,0.000000,0.057567,0.139055,0.057756,0.259796,1.000000,0.000000,0.000000,0.152586,0.126688
4802,0.046829,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.070014,0.000000,0.000000,0.028724,0.000000,1.000000,0.120096,0.000000,0.000000
4803,0.044992,0.023783,0.024515,0.031404,0.020228,0.018531,0.000000,0.000000,0.050637,0.020672,...,0.067267,0.033634,0.018054,0.044992,0.013799,0.000000,0.120096,1.000000,0.039621,0.042295
4804,0.000000,0.000000,0.000000,0.048526,0.083351,0.057270,0.035438,0.043073,0.104328,0.021296,...,0.000000,0.034648,0.092992,0.000000,0.142148,0.152586,0.000000,0.039621,1.000000,0.087142
