# Movie Recommendation System

# Import Dependencies

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from pandarallel import pandarallel
import pickle

# Import Dataset

In [2]:
df=pd.read_csv("netflix_movies_list.csv")

# EDA

In [3]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,07:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [4]:
#checking for Null values
df.isna().sum()

show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

In [5]:
#droping unwanted columns
df.drop(columns=['date_added','rating','duration','release_year','director'],inplace=True)

In [6]:
df.head()

Unnamed: 0,show_id,type,title,cast,country,listed_in,description
0,s1,TV Show,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,07:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,Dramas,A brilliant group of students become card-coun...


In [7]:
df.duplicated().sum()

0

In [8]:
pandarallel.initialize()

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [9]:
 df['description']=df['description'].parallel_apply(lambda x:x.split())

In [10]:
df['cast']=df['cast'].astype(str)
cast=[]
for i in df['cast']:
    cast.append(i.replace(" ",""))
df['cast']=cast

In [11]:
listed_in=[]
for i in df['listed_in']:
    listed_in.append(i.replace(" ",""))
df['listed_in']=listed_in

In [12]:
df['description']=df['description'].parallel_apply(lambda x:" ".join(x))

In [13]:
df=df[df['cast']!="nan"]

In [14]:
df['tags']=df['cast']+" "+df['listed_in']+" "+df['description']+" "+df['type']+" "+df['country']

In [15]:
df.head()

Unnamed: 0,show_id,type,title,cast,country,listed_in,description,tags
0,s1,TV Show,3%,"JoãoMiguel,BiancaComparato,MichelGomes,Rodolfo...",Brazil,"InternationalTVShows,TVDramas,TVSci-Fi&Fantasy",In a future where the elite inhabit an island ...,"JoãoMiguel,BiancaComparato,MichelGomes,Rodolfo..."
1,s2,Movie,07:19,"DemiánBichir,HéctorBonilla,OscarSerrano,Azalia...",Mexico,"Dramas,InternationalMovies",After a devastating earthquake hits Mexico Cit...,"DemiánBichir,HéctorBonilla,OscarSerrano,Azalia..."
2,s3,Movie,23:59,"TeddChan,StellaChung,HenleyHii,LawrenceKoh,Tom...",Singapore,"HorrorMovies,InternationalMovies","When an army recruit is found dead, his fellow...","TeddChan,StellaChung,HenleyHii,LawrenceKoh,Tom..."
3,s4,Movie,9,"ElijahWood,JohnC.Reilly,JenniferConnelly,Chris...",United States,"Action&Adventure,IndependentMovies,Sci-Fi&Fantasy","In a postapocalyptic world, rag-doll robots hi...","ElijahWood,JohnC.Reilly,JenniferConnelly,Chris..."
4,s5,Movie,21,"JimSturgess,KevinSpacey,KateBosworth,AaronYoo,...",United States,Dramas,A brilliant group of students become card-coun...,"JimSturgess,KevinSpacey,KateBosworth,AaronYoo,..."


In [16]:
#creating a new dataframe
new_df=df[['show_id',"title","tags"]]

In [17]:
new_df

Unnamed: 0,show_id,title,tags
0,s1,3%,"JoãoMiguel,BiancaComparato,MichelGomes,Rodolfo..."
1,s2,07:19,"DemiánBichir,HéctorBonilla,OscarSerrano,Azalia..."
2,s3,23:59,"TeddChan,StellaChung,HenleyHii,LawrenceKoh,Tom..."
3,s4,9,"ElijahWood,JohnC.Reilly,JenniferConnelly,Chris..."
4,s5,21,"JimSturgess,KevinSpacey,KateBosworth,AaronYoo,..."
...,...,...,...
7781,s7782,Zoom,"TimAllen,CourteneyCox,ChevyChase,KateMara,Ryan..."
7782,s7783,Zozo,"ImadCreidi,AntoinetteTurk,EliasGergi,CarmenLeb..."
7783,s7784,Zubaan,"VickyKaushal,Sarah-JaneDias,RaaghavChanana,Man..."
7784,s7785,Zulu Man in Japan,


In [18]:
new_df.shape

(7069, 3)

In [19]:
new_df.isna().sum()

show_id      0
title        0
tags       411
dtype: int64

In [20]:
#droping null values
new_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.dropna(inplace=True)


In [21]:
new_df.reset_index(drop=True,inplace=True)

In [22]:
new_df['tags'][0]

'JoãoMiguel,BiancaComparato,MichelGomes,RodolfoValente,VanezaOliveira,RafaelLozano,VivianePorto,MelFronckowiak,SergioMamberti,ZezéMotta,CelsoFrateschi InternationalTVShows,TVDramas,TVSci-Fi&Fantasy In a future where the elite inhabit an island paradise far from the crowded slums, you get one chance to join the 3% saved from squalor. TV Show Brazil'

In [23]:
new_df['tags']=new_df['tags'].parallel_apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].parallel_apply(lambda x:x.lower())


In [24]:
new_df.shape

(6658, 3)

# Creating a CountVectorizer Variable

In [25]:
cv=CountVectorizer(max_features=6658,stop_words='english')

In [26]:
vector=cv.fit_transform(new_df['tags']).toarray()

# Creating a PorterStemmer Variable

In [28]:
ps=PorterStemmer()

In [29]:
ps

<PorterStemmer>

# Buliding a method to apply PortStemmer

In [30]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [31]:
new_df['tags']=new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


In [32]:
new_df

Unnamed: 0,show_id,title,tags
0,s1,3%,"joãomiguel,biancacomparato,michelgomes,rodolfo..."
1,s2,07:19,"demiánbichir,héctorbonilla,oscarserrano,azalia..."
2,s3,23:59,"teddchan,stellachung,henleyhii,lawrencekoh,tom..."
3,s4,9,"elijahwood,johnc.reilly,jenniferconnelly,chris..."
4,s5,21,"jimsturgess,kevinspacey,katebosworth,aaronyoo,..."
...,...,...,...
6653,s7781,Zoo,"shashankarora,shwetatripathi,rahulkumar,gopalk..."
6654,s7782,Zoom,"timallen,courteneycox,chevychase,katemara,ryan..."
6655,s7783,Zozo,"imadcreidi,antoinetteturk,eliasgergi,carmenleb..."
6656,s7784,Zubaan,"vickykaushal,sarah-janedias,raaghavchanana,man..."


In [33]:
similarity=cosine_similarity(vector)

In [34]:
similarity.shape

(6658, 6658)

In [35]:
list(enumerate(similarity[0]))[1:5]

[(1, 0.0), (2, 0.05564148840746572), (3, 0.09901475429766744), (4, 0.0)]

In [36]:
d=sorted(list(enumerate(similarity[3754])),reverse=True,key=lambda x:x[1])
for i in d[0:5]:
    print(new_df.iloc[i[0]].title)

Nailed It! France
Nailed It! Germany
The Circle Brazil
Zumbo's Just Desserts
Crazy Delicious


In [37]:
new_df[new_df['title']=="Nailed It! France"].index[0]

3754

# Building a method to recommend movies

In [38]:
def recommend(movie):
    movie_index=new_df[new_df['title']==movie].index[0]#gives the index of the movie name
    distance=similarity[movie_index]#gives the simililar movie index
    movies_list=sorted(list(enumerate(distance)),reverse=True,key=lambda vector:vector[1])[1:6]#give the top 5 matched movie index
    for i in movies_list:
        print(new_df.iloc[i[0]].title)#c

In [39]:
recommend("Nailed It! Germany")

Nailed It! France
Nailed It! Spain
Sing On! Germany
The Chefs' Line
The Circle Brazil


In [40]:
recommend("The Chefs' Line")

Crazy Delicious
Nailed It! Germany
Zumbo's Just Desserts
Nailed It! France
The Circle Brazil


In [41]:
recommend("Crazy Delicious")

The Big Family Cooking Showdown
My Hotter Half
Baby Ballroom
Cabins in the Wild with Dick Strawbridge
Botched Up Bodies


# Saving the new dataset and cosine similarity

In [42]:
import pickle 
pickle.dump(new_df,open("netflix_dataset.pkl","wb"))
pickle.dump(similarity,open("similarity.pkl","wb"))