### Libraries

In [1]:
# Standard library
import re
import string

# Third-party
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")

# Shared resources for the title-cleaning step: an English stemmer and the
# English stop-word list as a set for fast membership tests.
stemmer = nltk.SnowballStemmer("english")
stopword = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Prerana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Data

In [2]:
# Netflix catalogue export: per the original note, it covers the movies and
# TV shows available on Netflix as of 2021.
data = pd.read_csv("netflixData.csv")
print(data.head(5))

                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [3]:
# Summarise column dtypes, non-null counts and memory usage of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5967 entries, 0 to 5966
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Show Id             5967 non-null   object 
 1   Title               5967 non-null   object 
 2   Description         5967 non-null   object 
 3   Director            3903 non-null   object 
 4   Genres              5967 non-null   object 
 5   Cast                5437 non-null   object 
 6   Production Country  5408 non-null   object 
 7   Release Date        5964 non-null   float64
 8   Rating              5963 non-null   object 
 9   Duration            5964 non-null   object 
 10  Imdb Score          5359 non-null   object 
 11  Content Type        5967 non-null   object 
 12  Date Added          4632 non-null   object 
dtypes: float64(1), object(12)
memory usage: 606.1+ KB


In [4]:
# List the column labels available in the dataset.
data.columns

Index(['Show Id', 'Title', 'Description', 'Director', 'Genres', 'Cast',
       'Production Country', 'Release Date', 'Rating', 'Duration',
       'Imdb Score', 'Content Type', 'Date Added'],
      dtype='object')

### Exploratory Data Analysis (EDA)

In [5]:
# Count the missing values in every column.
print(data.isna().sum())

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [24]:
# Peek at five random titles.
# NOTE(review): the execution counter (24) is out of sequence and the saved
# output shows lower-cased, stemmed titles, so this cell was presumably run
# after the title-cleaning cell further down; on a fresh top-to-bottom run it
# would show the raw titles instead. No random_state is set, so the sample
# differs on every run.
data['Title'].sample(5)

1327                                  disjoint
3541                              onepunch man
2092                                 holi man 
2677    lego marvel super hero guardian galaxi
546                       barbi life dreamhous
Name: Title, dtype: object

In [7]:
# Inspect the free-text descriptions.
data['Description']

0       This docuseries takes a deep dive into the luc...
1       As a grisly virus rampages a city, a lone man ...
2       Through her diary, Anne Frank's story is retol...
3       Kenya Barris and his family navigate relations...
4       This pawesome documentary explores how our fel...
                              ...                        
5962    On his wedding day, an arrogant, greedy accoun...
5963    A group of women leaves Kuwait to attend unive...
5964    Two brothers start a new life in Singapore, wh...
5965    After people in his town start turning up dead...
5966    Miniforce, a special task force of elite range...
Name: Description, Length: 5967, dtype: object

In [8]:
# Inspect the genre labels; one entry can list several genres, comma-separated.
data['Genres']

0                                           Reality TV
1       Horror Movies, International Movies, Thrillers
2                  Documentaries, International Movies
3                                          TV Comedies
4                  Documentaries, International Movies
                             ...                      
5962            Comedies, Dramas, International Movies
5963                 International TV Shows, TV Dramas
5964                 International TV Shows, TV Dramas
5965           Dramas, International Movies, Thrillers
5966                          Children & Family Movies
Name: Genres, Length: 5967, dtype: object

In [9]:
# Inspect the Rating column (values such as TV-MA, TV-14, TV-Y7).
data['Rating']

0       TV-MA
1       TV-MA
2       TV-14
3       TV-MA
4       TV-14
        ...  
5962    TV-14
5963    TV-14
5964    TV-14
5965    TV-MA
5966    TV-Y7
Name: Rating, Length: 5967, dtype: object

In [10]:
# Inspect the Content Type column (the output shows "TV Show" and "Movie").
data['Content Type']

0       TV Show
1         Movie
2         Movie
3       TV Show
4         Movie
         ...   
5962      Movie
5963    TV Show
5964    TV Show
5965      Movie
5966      Movie
Name: Content Type, Length: 5967, dtype: object

In [11]:
# Keep only the columns the recommender needs.
# .copy() makes this an independent frame, so the later in-place rewrite of
# the "Title" column does not trigger pandas' SettingWithCopyWarning and does
# not depend on whether the selection shares memory with the full dataset.
data = data[["Title", "Description", "Content Type", "Genres"]].copy()
print(data.head())

                           Title  \
0                       (Un)Well   
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [12]:
# Confirm the dimensions (rows, columns) of the reduced frame.
data.shape

(5967, 4)

In [13]:
# Verify that none of the selected columns contain missing values.
data.isnull().sum()

Title           0
Description     0
Content Type    0
Genres          0
dtype: int64

In [14]:
#cleaning title data

def clean(text):
    """Normalise a title into lower-case, stemmed keywords.

    Steps, in order: lower-case; drop bracketed spans, URLs and HTML-like
    tags; strip ASCII punctuation and newlines; drop any token containing a
    digit; remove English stop words; stem the remaining words.

    Args:
        text: Value to clean; coerced with ``str()`` so non-strings are
            accepted. (Inside this function the name shadows the imported
            scikit-learn ``text`` module.)

    Returns:
        The surviving stems joined by single spaces, or an empty string if
        nothing survives.
    """
    text = str(text).lower()
    # Raw strings keep regex escapes such as \[ and \S from being parsed as
    # (invalid) Python string escapes, which newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # split() without an argument collapses runs of whitespace, so removed
    # punctuation or tokens no longer leave empty words behind (which showed
    # up as double or trailing spaces in the cleaned titles).
    words = [word for word in text.split() if word not in stopword]
    return " ".join(stemmer.stem(word) for word in words)
# Overwrite the titles with their cleaned form.
# NOTE(review): this mutates `data` in place, so re-running the cell feeds
# already-cleaned titles through clean() again.
data["Title"] = data["Title"].apply(clean)

In [15]:
# Spot-check ten random cleaned titles (no random_state, so the sample varies per run).
print(data.Title.sample(10))

2462                 kaali khuhi
242                         ajji
1829             girl und panzer
4044                     sabotag
3987                 rodney king
804      brené brown call courag
4859    goop lab gwyneth paltrow
1047                   club crow
197                 act vengeanc
5069                movi made us
Name: Title, dtype: object


In [17]:
# Use the Genres column as the feature for recommending similar content:
# TF-IDF-encode each entry's genre string, then compare entries by cosine similarity.
feature = data["Genres"].tolist()
# `input` only accepts "filename", "file" or "content" (the default); the
# documents themselves are supplied to fit_transform, not to the constructor.
tfidf = text.TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)
# Pairwise similarity between all rows: entry [i, j] compares rows i and j.
similarity = cosine_similarity(tfidf_matrix)

In [18]:
# Map each (cleaned) title to its row number so a title can be looked up.
# Series.drop_duplicates() compares the values (the row numbers, which are
# already unique) and therefore never removed repeated titles. Filter on the
# index instead so every title maps to exactly one row: its first occurrence.
indices = pd.Series(data.index, index=data['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [22]:
# Preview the first five title -> row-number entries of the lookup.
indices.head(5)

Title
unwel                        0
aliv                         1
annefrank  parallel stori    2
blackaf                      3
catsthemewvi                 4
dtype: int64

In [26]:
def netFlix_recommendation(title, similarity = similarity, top_n=5):
    """Return the titles whose genres are most similar to ``title``.

    Args:
        title: Title to look up in ``indices``. Titles are stored in cleaned
            form; if the string is not found as given, it is passed through
            ``clean`` and looked up again, so raw titles work as well.
        similarity: Square pairwise-similarity matrix aligned with the rows
            of ``data``. Defaults to the matrix that exists when this cell
            is run.
        top_n: Maximum number of recommendations to return.

    Returns:
        pandas Series with up to ``top_n`` entries of ``data['Title']``, most
        similar first; the queried title itself is excluded.

    Raises:
        KeyError: If the title cannot be found, even after cleaning.
    """
    key = title
    if key not in indices.index:
        cleaned = clean(title)
        # An empty result means nothing searchable survived cleaning.
        if not cleaned or cleaned not in indices.index:
            raise KeyError(f"Title not found in dataset: {title!r}")
        key = cleaned

    index = indices[key]
    # Repeated titles make the lookup return several rows; use the first one.
    if isinstance(index, pd.Series):
        index = index.iloc[0]

    scores = np.asarray(similarity[index])
    # Stable sort on negated scores: highest first, ties kept in row order.
    ranked = np.argsort(-scores, kind="stable")
    # A title always matches itself perfectly, so leave it out of the results.
    ranked = ranked[ranked != index][:top_n]
    return data['Title'].iloc[ranked]

In [32]:
# Example query using a title in its cleaned (lower-cased, stemmed) form,
# which is how the lookup index is keyed.
netFlix_recommendation("barbi life dreamhous")

262                 alexa  kati
278                    alien tv
290            hail king julien
372                  angri bird
402    archibald next big thing
Name: Title, dtype: object

In [None]:
#end