In [None]:
# Explicit %pip magic: installs into the running kernel's environment and does
# not depend on IPython automagic (which only applies to single-line cells).
%pip install scikit-learn


In [None]:
# Explicit %pip magic: installs into the running kernel's environment and does
# not depend on IPython automagic (which only applies to single-line cells).
%pip install nltk

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the dataset from netflixData.csv. The path is relative, so the file is
# looked up in the notebook's current working directory.
netflix_df = pd.read_csv("netflixData.csv")
# Preview the first five rows to sanity-check that the file parsed as expected.
print(netflix_df.head())

                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [3]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
netflix_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5967 entries, 0 to 5966
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Show Id             5967 non-null   object 
 1   Title               5967 non-null   object 
 2   Description         5967 non-null   object 
 3   Director            3903 non-null   object 
 4   Genres              5967 non-null   object 
 5   Cast                5437 non-null   object 
 6   Production Country  5408 non-null   object 
 7   Release Date        5964 non-null   float64
 8   Rating              5963 non-null   object 
 9   Duration            5964 non-null   object 
 10  Imdb Score          5359 non-null   object 
 11  Content Type        5967 non-null   object 
 12  Date Added          4632 non-null   object 
dtypes: float64(1), object(12)
memory usage: 606.2+ KB


In [4]:
# Element-wise null mask for the whole frame (True where a value is missing).
# NOTE(review): this prints a boolean table for every row and column; the
# per-column totals computed in the next cell are far easier to read.
print(netflix_df.isnull())

      Show Id  Title  Description  Director  Genres   Cast  \
0       False  False        False      True   False   True   
1       False  False        False     False   False  False   
2       False  False        False     False   False  False   
3       False  False        False      True   False  False   
4       False  False        False     False   False   True   
...       ...    ...          ...       ...     ...    ...   
5962    False  False        False     False   False  False   
5963    False  False        False      True   False  False   
5964    False  False        False      True   False  False   
5965    False  False        False     False   False  False   
5966    False  False        False     False   False  False   

      Production Country  Release Date  Rating  Duration  Imdb Score  \
0                  False         False   False     False       False   
1                  False         False   False     False       False   
2                  False         False 

In [5]:
# Count the missing values in each column.
print(netflix_df.isnull().sum())

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [6]:
# Select the four columns of interest. The explicit .copy() makes this an
# independent DataFrame rather than a slice of netflix_df, so assigning to one
# of its columns later does not raise SettingWithCopyWarning.
required_nf_df = netflix_df[["Title", "Description", "Content Type", "Genres"]].copy()
required_nf_df.head()

Unnamed: 0,Title,Description,Content Type,Genres
0,(Un)Well,This docuseries takes a deep dive into the luc...,TV Show,Reality TV
1,#Alive,"As a grisly virus rampages a city, a lone man ...",Movie,"Horror Movies, International Movies, Thrillers"
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...",Movie,"Documentaries, International Movies"
3,#blackAF,Kenya Barris and his family navigate relations...,TV Show,TV Comedies
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,Movie,"Documentaries, International Movies"


In [7]:
# Non-null count per selected column, taken before any cleaning.
required_nf_df.count()

Title           5967
Description     5967
Content Type    5967
Genres          5967
dtype: int64

In [8]:
# Data cleaning: drop the rows that contain null values. dropna() returns a new
# DataFrame, so the result has to be assigned back for the rows to actually be
# removed. The index is rebuilt as 0..n-1 because later cells use these index
# labels as positions (into the similarity matrix and with .iloc).
required_nf_df = required_nf_df.dropna().reset_index(drop=True)
required_nf_df

Unnamed: 0,Title,Description,Content Type,Genres
0,(Un)Well,This docuseries takes a deep dive into the luc...,TV Show,Reality TV
1,#Alive,"As a grisly virus rampages a city, a lone man ...",Movie,"Horror Movies, International Movies, Thrillers"
2,#AnneFrank - Parallel Stories,"Through her diary, Anne Frank's story is retol...",Movie,"Documentaries, International Movies"
3,#blackAF,Kenya Barris and his family navigate relations...,TV Show,TV Comedies
4,#cats_the_mewvie,This pawesome documentary explores how our fel...,Movie,"Documentaries, International Movies"
...,...,...,...,...
5962,الف مبروك,"On his wedding day, an arrogant, greedy accoun...",Movie,"Comedies, Dramas, International Movies"
5963,دفعة القاهرة,A group of women leaves Kuwait to attend unive...,TV Show,"International TV Shows, TV Dramas"
5964,海的儿子,"Two brothers start a new life in Singapore, wh...",TV Show,"International TV Shows, TV Dramas"
5965,반드시 잡는다,After people in his town start turning up dead...,Movie,"Dramas, International Movies, Thrillers"


In [9]:
# Re-check the non-null counts after the dropna step.
required_nf_df.count()

Title           5967
Description     5967
Content Type    5967
Genres          5967
dtype: int64

In [10]:
#Cleaning the title column
import nltk
import re

In [11]:
# Fetch NLTK's stop-word corpus; it is read below via stopwords.words().
nltk.download('stopwords')
# English Snowball stemmer; clean() calls stemmer.stem() on every kept word.
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
# English stop words stored as a set, which clean() uses for membership tests.
stopword=set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def clean(text):
    """Normalise a piece of text so it can be used as a lookup key.

    Steps, in order: lower-case; remove ``[...]`` spans, URLs and ``<...>``
    tags; strip ASCII punctuation and newlines; remove every token that
    contains a digit; drop English stop words; stem the remaining words.

    Relies on the module-level ``stopword`` set and ``stemmer`` object.

    Args:
        text: Value to clean. It is passed through ``str()`` first, so a
            non-string value is cleaned as its string representation.

    Returns:
        str: The stemmed words joined by single spaces. Words are obtained by
        splitting on a single space, so consecutive spaces left behind by the
        removals produce empty words, i.e. doubled or trailing spaces in the
        result.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape processing, which otherwise reports them as invalid
    # escape sequences.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # One pass: filter out stop words, then stem what is left.
    words = [stemmer.stem(word) for word in text.split(' ') if word not in stopword]
    return " ".join(words)

  text = re.sub('\[.*?\]', '', text)
  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [13]:
# Replace every title with its cleaned form. assign() returns a new DataFrame
# instead of writing into what may be a slice of netflix_df, which is what
# triggers SettingWithCopyWarning on a plain column assignment.
# From here on the Title column holds cleaned titles only.
required_nf_df = required_nf_df.assign(Title=required_nf_df["Title"].apply(clean))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_nf_df["Title"] = required_nf_df["Title"].apply(clean)


In [14]:
# Inspect the Title column after cleaning.
print(required_nf_df["Title"])

0                           unwel
1                            aliv
2       annefrank  parallel stori
3                         blackaf
4                    catsthemewvi
                  ...            
5962                    الف مبروك
5963                 دفعة القاهرة
5964                         海的儿子
5965                      반드시 잡는다
5966           최강전사 미니특공대  영웅의 탄생
Name: Title, Length: 5967, dtype: object


In [15]:
# Spot-check ten randomly chosen cleaned titles.
# NOTE(review): no random_state is passed, so the rows shown will differ from
# run to run unless a seed is set elsewhere.
print(required_nf_df.Title.sample(10))

2637       last tango halifax
2430             juli phantom
4239                  shtisel
5131            price success
3726    pokémon movi power us
1111             cradl  grave
1412     dügün dernek  sünnet
662            bigfoot famili
5019                    lover
4454              stuck apart
Name: Title, dtype: object


In [16]:
# One genre string per row, in row order.
genreList = required_nf_df["Genres"].tolist()
# TF-IDF vectoriser over the genre strings, ignoring English stop words.
tfidf = text.TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(genreList)
# Pairwise cosine similarity between all rows of the TF-IDF matrix;
# similarity[i] holds the scores of row i against every row.
similarity = cosine_similarity(tfidf_matrix)

In [17]:
# Map each cleaned title to its row label in required_nf_df.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removes a repeated title. Filtering on the index
# keeps the first row of every title and guarantees that indices[title] is a
# single label rather than a Series.
indices = pd.Series(required_nf_df.index, index=required_nf_df['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [18]:
def netFlix_recommendation(title, similarity = similarity, top_n = 10):
    """Return the titles whose similarity scores to ``title`` are highest.

    Uses the module-level ``indices`` Series to find the row of ``title`` and
    the module-level ``required_nf_df`` to turn the best rows back into titles.

    Args:
        title: Title to look up. It must match an entry of ``indices`` exactly,
            i.e. it has to be in the same form as the values stored in the
            ``Title`` column.
        similarity: Square matrix of pairwise scores where row ``i`` holds the
            scores of row ``i`` against every row. Defaults to the
            module-level ``similarity`` matrix.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        pandas.Series: ``Title`` values of the best-scoring rows, highest
        score first; rows with equal scores keep their original order. The
        queried row itself is never part of the result.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    if title not in indices.index:
        raise KeyError(
            f"Title {title!r} not found; pass it in the same form as the "
            "values stored in the Title column."
        )
    index = indices[title]
    # A title that occurs more than once in the lookup yields a Series of
    # rows; fall back to the first one so a single row is always used.
    if isinstance(index, pd.Series):
        index = index.iloc[0]
    scores = np.asarray(similarity[index], dtype=float).ravel()
    # A stable sort on the negated scores gives descending order while ties
    # stay in ascending row order.
    ranked = np.argsort(-scores, kind="stable")
    # Every row has the maximum score against itself, so drop the queried row
    # before taking the top entries.
    ranked = ranked[ranked != index][:top_n]
    return required_nf_df['Title'].iloc[ranked]

In [19]:
# Example lookup. The argument has to be the cleaned form of the title,
# because the lookup index is built from the cleaned Title column.
print(netFlix_recommendation("fitoor"))

5       friendbutmarri
6      friendbutmarri 
33           week lago
50                 day
88         beauti life
127         love stori
143       second chanc
267          ali  alia
284               love
306        alonetogeth
Name: Title, dtype: object
