In [121]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import string
from gensim.parsing.preprocessing import remove_stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [122]:
raw_df=pd.read_csv('netflix_titles.csv')
raw_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [123]:
df=raw_df.copy()
df=df.drop(['show_id','type','country','date_added','rating','release_year','duration'], axis=1)
df.head()

Unnamed: 0,title,director,cast,listed_in,description
0,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,Jandino: Whatever it Takes,,Jandino Asporaat,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",Kids' TV,"With the help of three human allies, the Autob..."
3,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",Kids' TV,When a prison ship crash unleashes hundreds of...
4,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",Comedies,When nerdy high schooler Dani finally attracts...


In [124]:
print('Shape of the Dataframe is ',df.shape)

Shape of the Dataframe is  (6234, 5)


In [125]:
# Checking the Description to check its content and how to deal with it.
df['description'][0]

'Before planning an awesome wedding for his grandfather, a polar bear king must take back a stolen artifact from an evil archaeologist first.'

In [126]:
#Defining the Function  to transform the text in the description Column
def transform(text):
  text=text.lower()
  text = text.translate(str.maketrans('', '', string.punctuation))
  text = remove_stopwords(text)
  text=word_tokenize(text)
  ps = PorterStemmer()
  new=[]
  for ele in range(len(text)):
    new.append(ps.stem(text[ele]))
  text=new
  text=" ".join(text)
  return text

#Applying the Function on the description Column
df['description']=df['description'].apply(lambda x: transform(x))
df['description'].head()

0    plan awesom wed grandfath polar bear king stol...
1    jandino asporaat riff challeng rais kid serena...
2    help human alli autobot protect earth onslaugh...
3    prison ship crash unleash hundr decepticon ear...
4    nerdi high schooler dani final attract longtim...
Name: description, dtype: object

In [127]:
#Checking for the missing values in the Dataframe
df.isna().sum()

title             0
director       1969
cast            570
listed_in         0
description       0
dtype: int64

In [128]:
#Replacing the NaN values with the empty text.
df = df.fillna('')
df.isnull().sum()

title          0
director       0
cast           0
listed_in      0
description    0
dtype: int64

In [129]:
#Lets take a look at the new transformed DataFrame
df.head()

Unnamed: 0,title,director,cast,listed_in,description
0,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","Children & Family Movies, Comedies",plan awesom wed grandfath polar bear king stol...
1,Jandino: Whatever it Takes,,Jandino Asporaat,Stand-Up Comedy,jandino asporaat riff challeng rais kid serena...
2,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",Kids' TV,help human alli autobot protect earth onslaugh...
3,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",Kids' TV,prison ship crash unleash hundr decepticon ear...
4,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",Comedies,nerdi high schooler dani final attract longtim...


In [130]:
#Add all the important features and store it into new column
features=df['title']+ ' ' + df['director'] + ' ' + df['cast'] + ' ' +df['listed_in']+' '+ df['description']
features=features.apply(lambda x: x.lower())
features

0       norm of the north: king sized adventure richar...
1       jandino: whatever it takes  jandino asporaat s...
2       transformers prime  peter cullen, sumalee mont...
3       transformers: robots in disguise  will friedle...
4       #realityhigh fernando lebrija nesta cooper, ka...
                              ...                        
6229    red vs. blue  burnie burns, jason saldaña, gus...
6230    maron  marc maron, judd hirsch, josh brener, n...
6231    little baby bum: nursery rhyme friends   movie...
6232    a young doctor's notebook and other stories  d...
6233    friends  jennifer aniston, courteney cox, lisa...
Length: 6234, dtype: object

In [None]:
Tfidf_vect = TfidfVectorizer()
vector_matrix = Tfidf_vect.fit_transform(features)
vector_matrix

In [None]:
tokens = Tfidf_vect.get_feature_names()
df_vec=pd.DataFrame(vector_matrix.toarray(),columns=tokens)
df_vec.head()

In [132]:
cos_sim = cosine_similarity(vector_matrix, vector_matrix)
print(cos_sim.shape)
cos_sim

(6234, 6234)


array([[1.        , 0.        , 0.00673934, ..., 0.0120289 , 0.01744265,
        0.04176334],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00673934, 0.        , 1.        , ..., 0.        , 0.00767924,
        0.00636265],
       ...,
       [0.0120289 , 0.        , 0.        , ..., 1.        , 0.        ,
        0.04210261],
       [0.01744265, 0.        , 0.00767924, ..., 0.        , 1.        ,
        0.03742769],
       [0.04176334, 0.        , 0.00636265, ..., 0.04210261, 0.03742769,
        1.        ]])

In [147]:
df['lower']=df['title'].apply(lambda x: x.lower())
titles_list =df['lower'].tolist()
print(titles_list)



In [150]:
inp= input(' Enter your favourite movie name : ')
movie=inp.lower()
close_matches = difflib.get_close_matches(movie, titles_list)
print('\nAvailable Movies Closest to : ',inp)
print('\n0) {0} \n1) {1} \n2) {2} \n3) Exit the Operation '.format(close_matches[0],close_matches[1],close_matches[2]))
inp2=int(input('\nInput the number to get your recommendation: '))
if inp2 >3 :
  print('\nInvalid Input')
elif inp2==3:
  print('\n____Thank you For Using Our Service!____')
else:
  print('\nSelected Movie: ',close_matches[inp2])
Slct=close_matches[inp2]

 Enter your favourite movie name : transformer

Available Movies Closest to :  transformer

0) transformer 
1) transfers 
2) transformers prime 
3) Exit the Operation 

Input the number to get your recommendation: 2

Selected Movie:  transformers prime


In [151]:
#_______RUN THIS CELL TO GET THE RECOMMENDED MOVIES_________


#Lets find the index of selected movie in the DataFrame
Slct=close_matches[inp2]
index_of_the_movie = df.index[df.lower == Slct].tolist()
idx=index_of_the_movie[0]

sim_mov= list(enumerate(cos_sim[idx]))

#Sort the movies in descending order
similar_movies = sorted(sim_mov, key = lambda x:x[1], reverse = True) 

#Displaying the Similar Movies

print(" Top Ten Recommended movies \n")
i = 1
for mov in similar_movies:
  index = mov[0]
  title = df[df.index==index]['title'].values[0]
  if (i<10):
    print(i, '.',title)
    i+=1

 Top Ten Recommended movies 

1 . Transformers Prime
2 . Transformers: Robots in Disguise
3 . All Hail King Julien: Exiled
4 . Kulipari: Dream Walker
5 . Transformers: Rescue Bots
6 . 3Below: Tales of Arcadia
7 . A Fairly Odd Summer
8 . Kulipari: An Army of Frogs
9 . Transformers: Cyberverse
