In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the movie table from final_data.csv (path is relative to the working directory).
df = pd.read_csv('final_data.csv')

In [3]:
# Preview the freshly loaded frame; the notebook renders a truncated view.
df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,combined
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,CCH Pounder Joel David Moore Wes Studi James C...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,Johnny Depp Orlando Bloom Jack Davenport Gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,Christoph Waltz Rory Kinnear Stephanie Sigman ...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,Tom Hardy Christian Bale Joseph Gordon-Levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,Doug Walker Rob Walker unknown Doug Walker Doc...
...,...,...,...,...,...,...,...
5629,Etan Cohen,Will Ferrell,John C. Reilly,Rebecca Hall,Mystery Adventure Comedy Crime,holmes & watson,Will Ferrell John C. Reilly Rebecca Hall Etan ...
5630,Adam McKay,Christian Bale,Amy Adams,Steve Carell,Thriller Science Fiction Action Adventure,vice,Christian Bale Amy Adams Steve Carell Adam McK...
5631,Mimi Leder,Felicity Jones,Armie Hammer,Justin Theroux,Drama History,on the basis of sex,Felicity Jones Armie Hammer Justin Theroux Mim...
5632,Karyn Kusama,Nicole Kidman,Sebastian Stan,Toby Kebbell,Thriller Crime Drama Action,destroyer,Nicole Kidman Sebastian Stan Toby Kebbell Kary...


In [4]:
# Lower-case the combined text with the vectorised string accessor. Unlike a
# per-row `lambda x: x.lower()`, this leaves missing values as NaN instead of
# raising AttributeError on them.
df['combined'] = df['combined'].str.lower()

In [5]:
# Spot-check the lower-cased text for the row labelled 0 (single .loc lookup).
df.loc[0, 'combined']

'cch pounder joel david moore wes studi james cameron action adventure fantasy sci-fi'

In [6]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [7]:
import nltk

In [8]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance; stem() reads it as a global.
ps = PorterStemmer()

In [9]:
def stem(text):
    """Return ``text`` with each whitespace-separated token stemmed.

    The string is tokenised with ``str.split()`` (so runs of whitespace
    collapse), every token is passed through the global ``ps`` stemmer, and the
    stemmed tokens are joined back together with single spaces.
    """
    return " ".join(ps.stem(token) for token in text.split())

In [10]:
# Replace each row's combined text with its stemmed form (overwrites the column in place).
df['combined'] = df['combined'].apply(stem)

In [11]:
# Spot-check the stemmed text for the row labelled 0.
df.loc[0, 'combined']

'cch pounder joel david moor we studi jame cameron action adventur fantasi sci-fi'

In [12]:
# Spot-check the stemmed text for the row labelled 1.
df.loc[1, 'combined']

'johnni depp orlando bloom jack davenport gore verbinski action adventur fantasi'

In [13]:
%pip install scikit-learn




In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectoriser: vocabulary capped at 5000 terms, with scikit-learn's
# built-in English stop-word list removed.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [15]:
# Check the frame's (rows, columns) before vectorising.
df.shape

(5634, 7)

In [16]:
# Fit the vocabulary on the stemmed text and convert the resulting count
# matrix to a dense NumPy array (one row per row of df).
vectors = cv.fit_transform(df['combined']).toarray()

In [17]:
# Inspect the dense count matrix (display is truncated by NumPy).
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
# Count vector of the first row.
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [19]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement. It returns an ndarray, so convert
# it to a plain list to keep the same list-of-strings display.
cv.get_feature_names_out().tolist()



['50',
 'aaliyah',
 'aarniokoski',
 'aaron',
 'aasif',
 'abbi',
 'abbott',
 'abdalla',
 'abel',
 'abhishek',
 'abigail',
 'abl',
 'abo',
 'abraham',
 'abrahamson',
 'abram',
 'ackerman',
 'ackl',
 'action',
 'adam',
 'adamson',
 'addi',
 'adel',
 'adelaid',
 'adelstein',
 'aden',
 'adhir',
 'aditya',
 'adjani',
 'adkin',
 'adler',
 'adrian',
 'adriana',
 'adriano',
 'adrienn',
 'adsit',
 'adventur',
 'adèl',
 'afabl',
 'affleck',
 'agata',
 'agn',
 'agnieszka',
 'aguilar',
 'agutt',
 'ahm',
 'ahmad',
 'ahn',
 'aida',
 'aidan',
 'aiello',
 'aiken',
 'aime',
 'aisha',
 'aj',
 'aja',
 'ajay',
 'aki',
 'akil',
 'akin',
 'akinshina',
 'akira',
 'akiva',
 'akon',
 'aksel',
 'akshay',
 'al',
 'alain',
 'alan',
 'alanna',
 'alastair',
 'albert',
 'albertini',
 'alcázar',
 'alden',
 'aldi',
 'alec',
 'alejandro',
 'aleksandr',
 'aleksey',
 'alessandra',
 'alessandro',
 'alex',
 'alexa',
 'alexand',
 'alexandr',
 'alexandra',
 'alexi',
 'alexia',
 'alfi',
 'alfonso',
 'alfr',
 'algar',
 'ali',
 

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Pairwise cosine similarity between every pair of rows in `vectors`.
similarity = cosine_similarity(vectors)

In [22]:
# Show a truncated view of the similarity matrix.
print(similarity)

[[1.         0.2508726  0.1754116  ... 0.         0.08006408 0.08006408]
 [0.2508726  1.         0.19069252 ... 0.         0.08703883 0.        ]
 [0.1754116  0.19069252 1.         ... 0.         0.18257419 0.09128709]
 ...
 [0.         0.         0.         ... 1.         0.09128709 0.09128709]
 [0.08006408 0.08703883 0.18257419 ... 0.09128709 1.         0.16666667]
 [0.08006408 0.         0.09128709 ... 0.09128709 0.16666667 1.        ]]


In [23]:
# Check the dimensions of the similarity matrix.
similarity.shape

(5634, 5634)

In [24]:
def recommend(movie, top_n=10):
    """Print the titles of the movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Title to look up. It is lower-cased before being compared with
        ``df['movie_title']``; if several rows share the title, the first
        matching row is used.
    top_n : int, default 10
        Maximum number of recommendations to print.

    Returns
    -------
    None
        Titles are printed one per line, most similar first.

    Raises
    ------
    IndexError
        If no row of ``df`` has the requested title.
    """
    # Work with row *positions*: `similarity` and `.iloc` are positional, so the
    # lookup must not depend on the labels of df's index.
    matches = (df['movie_title'] == movie.lower()).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie {movie!r} not found in df['movie_title']")
    movie_pos = int(matches[0])

    distances = similarity[movie_pos]
    # Exclude the queried movie explicitly instead of slicing off the first
    # sorted entry: an earlier row with an identical feature vector also scores
    # 1.0 and sorts ahead of it, which would make the movie recommend itself.
    candidates = [
        (pos, score) for pos, score in enumerate(distances) if pos != movie_pos
    ]
    # sorted() is stable, so ties keep their original row order.
    movies_list = sorted(candidates, reverse=True, key=lambda x: x[1])[:top_n]

    for pos, _score in movies_list:
        print(df['movie_title'].iloc[pos])

In [33]:
# Example query: print the recommendations for "iron man".
recommend("iron man")

iron man 2
iron man 3
deep impact
zathura: a space adventure
made
avengers: age of ultron
captain america: civil war
the avengers
tron: legacy
g.i. joe: retaliation


In [26]:
# Title stored at row position 1216 (select the column first, then the position).
df['movie_title'].iloc[1216]

'bless the child'

In [27]:
import pickle

In [28]:
# All movie titles as an array.
df['movie_title'].values

array(['avatar', "pirates of the caribbean: at world's end", 'spectre',
       ..., 'on the basis of sex', 'destroyer',
       'black mirror: bandersnatch'], dtype=object)

In [29]:
# Persist the frame as a plain dict. The context manager guarantees the file is
# flushed and closed, even if pickling raises part-way through.
with open('movies_dict.pkl', 'wb') as movies_file:
    pickle.dump(df.to_dict(), movies_file)

In [30]:
# Persist the similarity matrix. The context manager guarantees the file is
# flushed and closed, even if pickling raises part-way through.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

In [31]:
# Preview the frame again; 'combined' now holds the lower-cased, stemmed text.
df

Unnamed: 0,director_name,actor_1_name,actor_2_name,actor_3_name,genres,movie_title,combined
0,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi,avatar,cch pounder joel david moor we studi jame came...
1,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy,pirates of the caribbean: at world's end,johnni depp orlando bloom jack davenport gore ...
2,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller,spectre,christoph waltz rori kinnear stephani sigman s...
3,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller,the dark knight rises,tom hardi christian bale joseph gordon-levitt ...
4,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary,star wars: episode vii - the force awakens ...,doug walker rob walker unknown doug walker doc...
...,...,...,...,...,...,...,...
5629,Etan Cohen,Will Ferrell,John C. Reilly,Rebecca Hall,Mystery Adventure Comedy Crime,holmes & watson,will ferrel john c. reilli rebecca hall etan c...
5630,Adam McKay,Christian Bale,Amy Adams,Steve Carell,Thriller Science Fiction Action Adventure,vice,christian bale ami adam steve carel adam mckay...
5631,Mimi Leder,Felicity Jones,Armie Hammer,Justin Theroux,Drama History,on the basis of sex,felic jone armi hammer justin theroux mimi led...
5632,Karyn Kusama,Nicole Kidman,Sebastian Stan,Toby Kebbell,Thriller Crime Drama Action,destroyer,nicol kidman sebastian stan tobi kebbel karyn ...


In [32]:
# Write the processed frame to main_data.csv without the index column.
df.to_csv('main_data.csv',index=False)