# Importing Necessary Libraries

In [7]:
import numpy as np
import pandas as pd
import copy
import re
import math
from scipy import spatial
from sklearn.neighbors import NearestNeighbors

# Loading Dataset

A total of 8807 Movies/TV Shows

In [8]:
netflix_df = pd.read_csv("./netflix_titles.csv")
netflix_df.shape

(8807, 12)

In [9]:
netflix_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [10]:
# Replacing all NaN values with "missing"
netflix_df.fillna('missing', inplace = True)
netflix_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,missing,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,missing,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",missing,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,missing,missing,missing,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,missing,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [11]:
netflix_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8807 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [13]:
# Checking if there are any NaN still left
netflix_df.isnull().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

# Data Preprocessing and Cleaning

In [20]:
# Changing the Column "Listed" to "Genre"
netflix_df.rename(columns= {"listed_in": "genre"}, inplace=True)
netflix_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genre,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,missing,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,missing,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",missing,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,missing,missing,missing,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,missing,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


Columns Considered for Predictions are as Follows:

1. Country
2. Release_Year
3. Rating
4. Duration
5. Genre 

In [22]:
recommendation_cols = ["country", "release_year", "rating", "duration", "genre"]
new_df = copy.deepcopy(netflix_df[recommendation_cols])
new_df.head()

Unnamed: 0,country,release_year,rating,duration,genre
0,United States,2020,PG-13,90 min,Documentaries
1,South Africa,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries"
2,missing,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
3,missing,2021,TV-MA,1 Season,"Docuseries, Reality TV"
4,India,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ..."


In [23]:
country = []
release_year = [] 
rating = []
duration = [] 
genres = []

In [24]:
def split_by_delimeters(target_list):
    """
    this method splits a target list by some delimeters
    """
    result = []
    for i in target_list:
        delimiters = ",", "&"
        regex_pattern = '|'.join(map(re.escape, delimiters))
        result.extend(re.split(regex_pattern, i))
    result = [i.strip() if i not in ['', 'missing'] else i for i in result]
    return result

In [27]:
# preparing all columns for the dataset
country = list(set(split_by_delimeters(new_df['country'])))
release_year = list(set(new_df['release_year']))
release_year = [str(i) for i in release_year]
ratings = list(set(new_df['rating']))
seasons_durations = ['1_season', '2_season', '3_season', '4_season','5+_season']
movies_durations = ['0_25_min', '26_50_min', '51_75_min', '76_100_min', 
                    '101_125_min', '126_150_min', '151+_min' ]
durations = seasons_durations + movies_durations
genres = list(set(split_by_delimeters(new_df['genre'])))

In [28]:
# combining all columns for the one hot encoded vector form
all_columns = country + release_year + ratings + durations + genres
all_columns.remove('missing')

In [29]:
# initializes a df with '0' values for the one-hot-encoded vector
ohe_df = pd.DataFrame(0, index = np.arange(len(new_df)), columns = all_columns)

In [30]:
def duration_adjustment(duration: str) -> str:
    try:
        dur_list = []
        if 'Season' in duration:
            temp_res = duration.split()
            no_of_seasons = int(temp_res[0])
            if no_of_seasons <5:
                return seasons_durations[no_of_seasons - 1]
            return seasons_durations[-1]

        else:
            temp_res = duration.split()
            runtime_mins = int(temp_res[0])
            if runtime_mins <= 150:
                index = math.ceil((runtime_mins/25) - 1.0)
                return movies_durations[index]
            return movies_durations[-1]
    except:
        return 'missing'

In [33]:
def return_columns(row):
    """
    recieves a df row and returns the respective columns/features
    that the item i.e. movie falls in
    """
    result_cols = []
    result_cols.extend(split_by_delimeters([row['country']]))
    result_cols.extend(split_by_delimeters([row['genre']]))
    result_cols.append(str(row['release_year']))
    result_cols.append(row['rating'])
    result_cols.append(duration_adjustment(str(row['duration'])))
    if 'missing' in result_cols:
        result_cols.remove('missing')
    return result_cols
    

In [34]:
# preparing the one hot encoded df of all items i.e. movies as vectors
for ind,row in new_df.iterrows():
    ohe_df.loc[ind, return_columns(row)] = 1

In [35]:
ohe_df.head()

Unnamed: 0,Unnamed: 1,Somalia,Puerto Rico,Argentina,United Kingdom,Iceland,Samoa,Afghanistan,Egypt,Malaysia,...,Science,Classic Movies,Independent Movies,Horror Movies,Dramas,Spirituality,International TV Shows,Children,Sci-Fi,Anime Series
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [36]:
def recommend_by_cosine(movie, top_items):
    """
    recommends top_similar movies based on cosine similarity
    """
    movie_index = netflix_df[netflix_df['title'] == movie].index[0]
    vector = ohe_df.iloc[movie_index]
    distance = []
    for ind, row in ohe_df.iterrows():
        distance.append(spatial.distance.cosine(vector, row))
    
    indexes = sorted(range(len(distance)), key=lambda i: distance[i])[:top_items + 1]
    
    return list(netflix_df.iloc[indexes]['title'])[1:]
    

In [37]:
def recommend_by_knn(movie, top_items):
    """
    recommends top_similar movies based on knn algorithm
    """
    movie_index = netflix_df[netflix_df['title'] == movie].index[0]
    vector = ohe_df.iloc[movie_index]
    knn = NearestNeighbors(n_neighbors= top_items + 1, algorithm='auto')
    knn.fit(ohe_df.values)
    indexes = list(knn.kneighbors([vector], top_items + 1, return_distance=False)[0])
    return list(netflix_df.iloc[indexes]['title'])[1:]

In [48]:
# the first index is the movie itself and the rest are recommendations
recommend_by_cosine('Om Shanti Om',15)

['Dil',
 'Soldier',
 'Duplicate',
 'Kal Ho Naa Ho',
 'Kuch Kuch Hota Hai',
 'Kannum Kannum Kollaiyadithaal',
 'Dil Chahta Hai',
 'Honeymoon Travels Pvt. Ltd.',
 'Zindagi Na Milegi Dobara',
 'Chup Chup Ke',
 'Hattrick',
 'Namastey London',
 'Phir Bhi Dil Hai Hindustani',
 'Kabhi Haan Kabhi Naa',
 'English Babu Desi Mem']

In [47]:
# the first row is the movie itself and the rest are recommendations
recommend_by_knn('Om Shanti Om', 10)

['Phir Bhi Dil Hai Hindustani',
 'Ramji Londonwaley',
 'English Babu Desi Mem',
 'Kal Ho Naa Ho',
 'Chup Chup Ke',
 'Hattrick',
 'Honeymoon Travels Pvt. Ltd.',
 'Kabhi Haan Kabhi Naa',
 'Kuch Kuch Hota Hai',
 'Dil Chahta Hai']