In [4]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text 
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
# Load the raw Netflix catalogue into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute path on one specific machine; anyone else
# running the notebook has to point it at their own copy of netflixData.csv.
netflix_csv_path = "C:/Users/sushm/OneDrive/Desktop/Data Analyst/Projects/Netflix recommendation DA Project/netflixData.csv"
data = pd.read_csv(netflix_csv_path)
print(data.head())

                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [6]:
#checking for null values 

In [7]:
# Count the missing values in every column of the raw catalogue.
missing_per_column = data.isna().sum()
print(missing_per_column)

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [8]:
#there are null values in the dataset; before removing them,
#we will select only the columns required to build the Netflix recommendation system

In [9]:
# Keep only the four columns the recommender works with, then preview them.
selected_columns = ["Title", "Description", "Content Type", "Genres"]
data = data[selected_columns]
print(data.head())

                           Title  \
0                       (Un)Well   
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [10]:
##As the name suggests:

##The title column contains the titles of movies and TV shows on Netflix
##Description column describes the plot of the TV shows and movies
##The Content Type column tells us if it’s a movie or a TV show
##The Genre column contains all the genres of the TV show or the movie

In [11]:
#dropping null values

In [13]:
# Drop rows with missing values, then renumber the rows 0..n-1.
# The index must be reset because later cells store `data.index` values and
# use them as *positions* (rows of the similarity matrix, `.iloc` lookups);
# any gap left by a dropped row would make those lookups point at the wrong
# title or run past the end of the matrix.
data = data.dropna().reset_index(drop=True)

In [14]:
#data preparation for the Title column

In [15]:
# Prerequisites for the title-cleaning step: regex/punctuation helpers from
# the standard library plus NLTK's stemmer and English stopword list.
import re
import string

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus so that stopwords.words() below can load it.
nltk.download('stopwords')

# Shared by clean(): the stemmer reduces words to their stem, and the set
# gives fast membership tests when filtering stopwords.
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words('english'))

def clean(text):
    """Normalise a piece of text into space-separated, stemmed keywords.

    Steps, in order: lower-case the text; strip ``[bracketed]`` spans, URLs,
    ``<tags>``, ASCII punctuation, newlines and every token that contains a
    digit; drop English stopwords; stem the remaining words with the
    module-level ``stemmer``.

    Parameters
    ----------
    text : object
        Value to clean. It is converted with ``str()`` first, so non-string
        input is cleaned as its string form. (Inside this function the name
        shadows the ``text`` module imported from scikit-learn.)

    Returns
    -------
    str
        The cleaned words joined by single spaces; may be an empty string.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # invalid Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() without an argument discards the empty tokens left behind where
    # punctuation or digit-bearing words were removed, so the result never
    # contains doubled, leading or trailing spaces.
    words = [word for word in text.split() if word not in stopword]
    return " ".join(stemmer.stem(word) for word in words)
# Replace every title with its cleaned, stemmed form.
# NOTE: this overwrites the original titles in place, so re-running the cell
# cleans the already-cleaned text again.
cleaned_titles = data["Title"].apply(clean)
data["Title"] = cleaned_titles


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sushm\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [16]:
# Spot-check ten randomly chosen cleaned titles.
print(data["Title"].sample(n=10))

2554    kingsglaiv final fantasi xv
4705               cat hat know lot
93                   christma catch
1586           fateextra last encor
4891                         hollow
5310                           wind
876                  can't complain
2687                            len
165                   way back home
2170                        michael
Name: Title, dtype: object


In [17]:
#Use the Genres column as the feature for recommending similar content to the user: each title's genres are turned into a TF-IDF vector and titles are compared by cosine similarity

In [18]:
# One genre string per title, in the same row order as `data`.
feature = data["Genres"].tolist()

# `input` is deliberately left at its default ("content"): that parameter only
# selects how documents are supplied ('filename', 'file' or 'content') and is
# not where the corpus goes, so passing the list there is rejected by current
# scikit-learn. The corpus is handed to fit_transform() instead.
tfidf = text.TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)

# Square matrix: entry [i, j] is the cosine similarity between the genre
# vectors of rows i and j of `data`.
similarity = cosine_similarity(tfidf_matrix)

In [19]:
# Lookup table: cleaned title -> positional row of that title in `data`
# (and therefore in the similarity matrix, which is built in the same order).
# Positions are used rather than `data.index` labels because the values are
# later consumed positionally.
indices = pd.Series(np.arange(len(data)), index=data['Title'])

# Keep only the first row for each title. Series.drop_duplicates() would
# compare the *values* (row numbers, already unique) and so never removes a
# repeated title; de-duplicating on the index guarantees that
# `indices[title]` is a single integer.
indices = indices[~indices.index.duplicated(keep='first')]

In [20]:
#writing a function to recommend Movies and TV shows on Netflix:

In [21]:
def netFlix_recommendation(title, similarity = similarity):
    """Return the ten titles whose genres are most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices``. It is tried as given first (the
        catalogue stores cleaned titles such as "girlfriend"); if that fails,
        it is passed through ``clean`` and tried again, so a raw title can be
        supplied as well.
    similarity : array-like, optional
        Square similarity matrix whose rows follow the row order of ``data``.
        Defaults to the matrix computed earlier in the notebook.

    Returns
    -------
    pandas.Series
        Up to ten entries of ``data['Title']``, most similar first. The
        queried title itself is never included.

    Raises
    ------
    KeyError
        If the title is not found either as given or after cleaning.
    """
    key = title
    if key not in indices.index:
        key = clean(title)
        if key not in indices.index:
            raise KeyError(f"No title matching {title!r} was found in the catalogue")

    index = indices[key]
    # A title that occurs more than once yields a Series of rows; use the first.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    # sorted() is stable, so equally similar titles keep their catalogue order.
    ranked = sorted(enumerate(similarity[index]), key=lambda pair: pair[1], reverse=True)
    # Skip the queried row so a title is never recommended for itself.
    movieindices = [row for row, _ in ranked if row != index][:10]
    return data['Title'].iloc[movieindices]

print(netFlix_recommendation("girlfriend"))

3                          blackaf
285                     washington
417                 arrest develop
434     astronomi club sketch show
451    aunti donna big ol hous fun
656                      big mouth
752                bojack horseman
805                   brew brother
935                       champion
937                  chappell show
Name: Title, dtype: object


# Summary
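Netflix's recommendation system predicts a personalised catalogue for you based on factors such as your viewing history,
the viewing history of other users with similar tastes and preferences,
and the genres, categories, descriptions, and other information about the content you have watched.
This notebook implements only the last of these ideas: it recommends titles whose genres are most similar, by cosine similarity, to those of a given title.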