In [69]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [70]:
# Load the Netflix catalogue into a DataFrame (path is the Colab upload location).
movies = pd.read_csv(filepath_or_buffer='/content/netflix_dataset.csv')

In [71]:
# Preview the first five rows of the raw catalogue.
movies.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [72]:
# Preview the last five rows of the raw catalogue.
movies.tail(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...


In [73]:
# Count missing values per column (sum down the rows).
movies.isnull().sum(axis=0)

Unnamed: 0,0
show_id,0
type,0
title,0
director,2634
cast,825
country,831
date_added,10
release_year,0
rating,4
duration,3


In [74]:
# Missing director / cast entries become empty strings so the columns can be
# concatenated as text further down.
movies[['director', 'cast']] = movies[['director', 'cast']].fillna('')

In [75]:
# Re-check missing values per column after filling director and cast.
movies.isnull().sum(axis=0)

Unnamed: 0,0
show_id,0
type,0
title,0
director,0
cast,0
country,831
date_added,10
release_year,0
rating,4
duration,3


No NaN values remain in the important feature columns (`director` and `cast`).

In [76]:
# Store each row's index label in an explicit 'index' column (first position).
# DataFrame.insert raises ValueError when the column already exists, so the
# guard keeps this cell safe to re-run without restarting the kernel.
if 'index' not in movies.columns:
  movies.insert(0, 'index', movies.index)

In [77]:
# Confirm the new 'index' column is present at the front of the frame.
movies.head(n=5)

Unnamed: 0,index,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [78]:
# Join director, cast, genres and description into one space-separated text
# field per title; a missing value in any part leaves that row's result missing.
movies['combined_features'] = movies['director'].str.cat(
    [movies['cast'], movies['listed_in'], movies['description']],
    sep=' ',
)

In [79]:
# Work on an independent copy of the three columns the recommender needs.
# Without .copy(), `data` is a slice of `movies`, and adding a column to it
# later triggers pandas' SettingWithCopyWarning.
data = movies[['index', 'title', 'combined_features']].copy()

Refining the combined_features column

In [80]:
# The stop-word corpus and the tokenizer models used by word_tokenize are not
# bundled with NLTK; on a fresh kernel they must be fetched once, otherwise
# the calls below raise LookupError. Newer NLTK releases look for 'punkt_tab',
# older ones for 'punkt', so both are requested.
for resource_path, package in (
    ('corpora/stopwords', 'stopwords'),
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
):
  try:
    nltk.data.find(resource_path)
  except LookupError:
    nltk.download(package, quiet=True)

# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))

In [81]:
def refined_text(text):
  """Normalise a text string for vectorisation.

  Drops every character that is not an ASCII letter or whitespace, lower-cases
  the result, tokenizes it with ``word_tokenize`` and removes tokens found in
  the module-level ``stop_words`` set.

  Args:
    text: The raw string to clean.

  Returns:
    The remaining tokens joined by single spaces.
  """
  # Digits and punctuation are deleted outright (not replaced by a space).
  letters_only = re.sub(r"[^a-zA-Z\s]", "", text).lower()

  kept_words = []
  for token in word_tokenize(letters_only):
    # Tokens are already lower-case here, so they compare directly.
    if token in stop_words:
      continue
    kept_words.append(token)
  return " ".join(kept_words)

In [None]:
# Clean every combined feature string element-wise with refined_text.
data['redefined_text'] = data['combined_features'].map(refined_text)

In [83]:
# Compare the raw and cleaned text for the first five titles.
data.head(n=5)

Unnamed: 0,index,title,combined_features,redefined_text
0,0,Dick Johnson Is Dead,Kirsten Johnson Documentaries As her father n...,kirsten johnson documentaries father nears end...
1,1,Blood & Water,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaba...",ama qamata khosi ngema gail mabalane thabang m...
2,2,Ganglands,"Julien Leclercq Sami Bouajila, Tracy Gotoas, S...",julien leclercq sami bouajila tracy gotoas sam...
3,3,Jailbirds New Orleans,"Docuseries, Reality TV Feuds, flirtations an...",docuseries reality tv feuds flirtations toilet...
4,4,Kota Factory,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam ...",mayur jitendra kumar ranjan raj alam khan ahsa...


Converting from textual to numerical data

In [84]:
# Learn the TF-IDF vocabulary from the cleaned text and encode every title
# as one row of the resulting matrix.
tfidf = TfidfVectorizer()
feature_vectors = tfidf.fit_transform(raw_documents=data['redefined_text'])

In [85]:
# Pairwise cosine similarity of every title against every other title
# (Y=None compares the rows of X with each other).
similarity = cosine_similarity(X=feature_vectors, Y=None)

In [86]:
# Sanity check: expect a square (n_titles, n_titles) matrix.
np.shape(similarity)

(8807, 8807)

In [87]:
#recommendation function
def recommend_movies(movie_name, similarity=similarity, df=data, top_n=10):
  """Return the titles most similar to the catalogue title closest to ``movie_name``.

  The query is fuzzy-matched against ``df['title']`` with
  ``difflib.get_close_matches``; the best match's value in the ``index``
  column selects a row of ``similarity``, and the highest-scoring other rows
  are returned.

  Args:
    movie_name: Free-text title to look up.
    similarity: Square matrix of pairwise similarity scores. Assumed to be
      ordered like the rows of ``df`` -- TODO confirm when passing a custom one.
    df: DataFrame with at least ``title`` and ``index`` columns. The ``index``
      value is used as a row position in ``similarity``.
    top_n: Maximum number of recommendations; values below 0 are treated as 0.

  Returns:
    A pandas Series of titles, ordered from most to least similar. The
    matched title's own row is never included.

  Raises:
    IndexError: If no title in ``df`` is close enough to ``movie_name``.
  """
  list_of_titles = df['title'].tolist()
  close_match = difflib.get_close_matches(movie_name, list_of_titles)
  if not close_match:
    # Same exception type the bare close_match[0] would raise, but with a
    # message that tells the caller what went wrong.
    raise IndexError(f"No title similar to {movie_name!r} was found")
  closest_match = close_match[0]
  # When several rows share the matched title, the first one is used.
  index_of_movie = df[df.title == closest_match]['index'].values[0]

  # Rank all rows by descending score; the stable sort keeps ties in their
  # original row order.
  scores = np.asarray(similarity[index_of_movie])
  ranked = np.argsort(-scores, kind='stable')

  # Drop the query row explicitly rather than assuming it sorts first: a tied
  # row at a lower position would otherwise push the query title into its own
  # recommendations.
  ranked = ranked[ranked != index_of_movie]
  movie_indices = ranked[:max(top_n, 0)]

  return df['title'].iloc[movie_indices]

Testing that the model works

In [88]:
# Title used to query the recommender in the next cell.
movie_name = "Zombieland"
print(movie_name)

Zombieland


In [89]:
# Announce the query, then fetch and show the default ten recommendations.
print(f"Recommendation for the movie: {movie_name}")
recommendations = recommend_movies(movie_name=movie_name)
print(recommendations)

Recommendation for the movie: Zombieland
6012         30 Minutes or Less
7210                    Kingpin
8494              THE RUM DIARY
7584                Night Moves
1485                      Rango
8288        The End of the Tour
7592            No Reservations
3343                 Santa Girl
5881    A Very Murray Christmas
1841     The Last Kids on Earth
Name: title, dtype: object
