In [19]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
movie_data=pd.read_csv('/content/movies.csv')

In [21]:
movie_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [22]:
movie_data.tail()

Unnamed: 0,movieId,title,genres
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)
62422,209171,Women of Devil's Island (1962),Action|Adventure|Drama


In [23]:
movie_data.shape

(62423, 3)

In [24]:
movie_data.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [59]:
from itertools import dropwhile
movie_data=movie_data.sample(10000).reset_index(drop=True)

In [60]:
movie_data.head()

Unnamed: 0,movieId,title,genres
0,153464,sundown: the vampire in retreat (1989),Action|Comedy|Horror|Western
1,175627,july rain (1966),Drama|Romance
2,197715,don't come back from the moon (2019),Drama
3,144222,bros before hos (2013),Comedy
4,189819,escape plan 2: hades (2018),Action|Crime|Thriller


In [61]:
movie_data['title']

0       sundown: the vampire in retreat (1989)
1                             july rain (1966)
2         don't come back from the moon (2019)
3                       bros before hos (2013)
4                  escape plan 2: hades (2018)
                         ...                  
9995     warrior and the sorceress, the (1984)
9996                      unforgettable (1996)
9997                      kiss and tell (1945)
9998    it's gradiva who is calling you (2006)
9999      wild wild west revisited, the (1979)
Name: title, Length: 10000, dtype: object

In [63]:
movie_data.shape

(10000, 3)

In [64]:
movie_data['title']=movie_data['title'].str.lower().replace(r'^\w\s','',regex=True)

In [65]:
movie_data.head(10000)

Unnamed: 0,movieId,title,genres
0,153464,sundown: the vampire in retreat (1989),Action|Comedy|Horror|Western
1,175627,july rain (1966),Drama|Romance
2,197715,don't come back from the moon (2019),Drama
3,144222,bros before hos (2013),Comedy
4,189819,escape plan 2: hades (2018),Action|Crime|Thriller
...,...,...,...
9995,99739,"warrior and the sorceress, the (1984)",Action|Adventure|Fantasy|Sci-Fi
9996,103,unforgettable (1996),Mystery|Sci-Fi|Thriller
9997,98643,kiss and tell (1945),Comedy
9998,154644,it's gradiva who is calling you (2006),Drama


In [66]:
stemmer=PorterStemmer()

In [67]:
def token(title):
  token=nltk.word_tokenize(title)
  a=[stemmer.stem(w) for w in token]
  return" ".join(a)

In [68]:
token("kiss and tell (1945)	")

'kiss and tell ( 1945 )'

In [69]:
movie_data['title'].apply(lambda X:token(X))

0       sundown : the vampir in retreat ( 1989 )
1                             juli rain ( 1966 )
2        do n't come back from the moon ( 2019 )
3                          bro befor ho ( 2013 )
4                   escap plan 2 : hade ( 2018 )
                          ...                   
9995    warrior and the sorceress , the ( 1984 )
9996                          unforgett ( 1996 )
9997                      kiss and tell ( 1945 )
9998      it 's gradiva who is call you ( 2006 )
9999       wild wild west revisit , the ( 1979 )
Name: title, Length: 10000, dtype: object

In [70]:
tfid=TfidfVectorizer(analyzer='word',stop_words='english')

In [71]:
matrix=tfid.fit_transform(movie_data['title'])

In [72]:
viewer=cosine_similarity(matrix)

In [73]:
viewer[0]

array([1., 0., 0., ..., 0., 0., 0.])

In [78]:
movie_data[movie_data['title']=='kiss and tell (1945)	']

Unnamed: 0,movieId,title,genres


In [79]:
def recommender(movie_name):
  index=movie_data[movie_data['title']==movie_name].index[0]
  dist=sorted(list(enumerate(viewer[index])),reverse=True,key=lambda X:X[1])
  title=[]
  for title_id in dist[1:60000]:
    title.append(movie_data.iloc[title_id[0]].genres)
  return title

In [81]:
recommender("kiss and tell (1945)")

['Horror|Thriller',
 'Action|Crime|Drama',
 'Comedy|Romance',
 'Drama|Romance',
 'Horror',
 'Drama|Romance',
 'Crime|Romance|Thriller',
 'Action|Thriller',
 'Documentary',
 'Crime|Drama|Horror|Mystery|Romance',
 'Drama',
 'Action|Comedy|Crime|Romance',
 'Action|Crime|Romance',
 'Drama|Romance',
 'Drama',
 'Comedy',
 'Drama|Thriller',
 'Horror',
 'Horror|Thriller',
 'Drama',
 'Comedy|Romance',
 'Drama',
 'Drama|Thriller',
 '(no genres listed)',
 'Comedy|Fantasy',
 'Horror|Mystery',
 'Comedy|Drama|Romance',
 'Documentary',
 'Animation',
 'Drama|War',
 'Animation|Children|Musical',
 'Drama|Film-Noir',
 'Crime|Film-Noir',
 'Drama|War',
 '(no genres listed)',
 'Comedy|Crime|Film-Noir|Mystery|Romance|Thriller',
 'Animation',
 'Drama|War',
 'Comedy|Crime|Mystery',
 'Drama|Romance',
 'Animation',
 'Comedy|Romance|Western',
 'Animation',
 'Drama',
 'Mystery|Thriller',
 'Comedy|Drama|War',
 'Comedy',
 'Children|Fantasy',
 'Animation|Comedy',
 'Drama|War',
 'Western',
 'Documentary',
 'Comedy|Cri

In [82]:
pickle.dump(viewer, open("similarity", "wb"))