# Netflix Movie Recommendation

## Overview data

In [1]:
import pandas as pd
import numpy as np
import warnings
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from collections import Counter
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('netflix_titles.csv')

# Data Analysis and Data cleaning

In [3]:
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [4]:
#The given number of features and data points
#no. of data points = 7787
#no. of features = 12
data.shape

(7787, 12)

In [5]:
#finding out all feature names
data.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [6]:
#Keeping Relevant features to used for recommendation system building using nlp
data = data[['title','director','cast','listed_in','description']]

#### Missing data

In [7]:
#removing all Nan title null(if movie name not present means movie does not exist in list) 
data = data[~data['title'].isnull()] 
data.shape

(7787, 5)

In [8]:
#description required for words based vector formation for finding similarity between two movies
data[~data['description'].isnull()]
data.shape

(7787, 5)

#### Duplicate data

In [10]:
#checking for title duplicates = none exists as the sum is 0
#Therefore there are no duplicate title in movie/TV series data to remove
sum(data.duplicated('title'))

0

In [11]:
#does not make sense to business sense to remove duplicate cast,listed_in(basically genre) 
#as they might be the same for two different movies

In [12]:
# Trying filling for all NAN values with an empty string
data.fillna(" ", inplace = True)
data.shape

(7787, 5)

In [13]:
#Saving data we are going to work with
data.to_pickle('Pickle/7k_data_points')

In [65]:
data = pd.read_pickle('Pickle/7k_data_points')

#### Basic stats for data

In [66]:
#There area a total of 6832 unique actor casts in our movies
#718 movies whos cast are not known in our data
data['cast'].describe()

count     7787
unique    6832
top           
freq       718
Name: cast, dtype: object

In [67]:
#There area a total of 4050 unique directors
#2389 movies whos directors are not known in our data
data['director'].describe()

count     7787
unique    4050
top           
freq      2389
Name: director, dtype: object

In [68]:
director_count = Counter(data['director'])
director_count.most_common(10)

[(' ', 2389),
 ('Raúl Campos, Jan Suter', 18),
 ('Marcus Raboy', 16),
 ('Jay Karas', 14),
 ('Cathy Garcia-Molina', 13),
 ('Youssef Chahine', 12),
 ('Martin Scorsese', 12),
 ('Jay Chapman', 12),
 ('Steven Spielberg', 10),
 ('David Dhawan', 9)]

# Text Preprocessing

#### Taking care of Stopwords

In [69]:
stop_words = stopwords.words('english')
print('These are all the stop words', list(stop_words))

def nlp_processing(text,index,column):
    string = " "
    for word in text.split():
        words = ("".join(e for e in word if e.isalnum()))
        words.lower()
        if words not in stop_words:
            string += ' ' + words 
    data[column][index] = string

These are all the stop words ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', '

In [70]:
for index,row in data.iterrows():
    nlp_processing(row['description'],index,'description')

In [71]:
data.head()

Unnamed: 0,title,director,cast,listed_in,description
0,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...","International TV Shows, TV Dramas, TV Sci-Fi &...",In future elite inhabit island paradise far ...
1,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...","Dramas, International Movies",After devastating earthquake hits Mexico Cit...
2,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...","Horror Movies, International Movies",When army recruit found dead fellow soldiers...
3,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...","Action & Adventure, Independent Movies, Sci-Fi...",In postapocalyptic world ragdoll robots hide...
4,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",Dramas,A brilliant group students become cardcounti...


In [72]:
# keeping just the top four cast names
data['cast'] = data['cast'].map(lambda x : x.split(',')[:3])

#seprating the listed_in category
data['listed_in'] = data['listed_in'].map(lambda x : x.split(','))
 
#joining the director surnames and first names
data['director'] = data['director'].map(lambda x: x.replace(' ','').lower())

In [73]:
#joining the actor surnames and names 
for index,rows in data.iterrows():
    
    rows['cast'] = [x.lower().replace(' ','') for x in rows['cast']] 

In [74]:
#setting the new index as movie 'title'
data.set_index('title',inplace=True)
data.head()

Unnamed: 0_level_0,director,cast,listed_in,description
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3%,,"[joãomiguel, biancacomparato, michelgomes]","[International TV Shows, TV Dramas, TV Sci-F...",In future elite inhabit island paradise far ...
7:19,jorgemichelgrau,"[demiánbichir, héctorbonilla, oscarserrano]","[Dramas, International Movies]",After devastating earthquake hits Mexico Cit...
23:59,gilbertchan,"[teddchan, stellachung, henleyhii]","[Horror Movies, International Movies]",When army recruit found dead fellow soldiers...
9,shaneacker,"[elijahwood, johnc.reilly, jenniferconnelly]","[Action & Adventure, Independent Movies, Sci...",In postapocalyptic world ragdoll robots hide...
21,robertluketic,"[jimsturgess, kevinspacey, katebosworth]",[Dramas],A brilliant group students become cardcounti...


# Word corpus

### Making a new column containg all the features in a single string  

In [None]:
#Form a column text such that it contains all the columns merged in string format