In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import seaborn as sns
%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [30]:
#Load the dataset with pandas
film = pd.read_csv("/content/netflix_film.csv")

In [54]:
#Displays the contents of the csv file data
film.head()

Unnamed: 0,Title,Genre,Tags,Languages,Runtime,IMDb Score,Release Date,Netflix Release Date,IMDb Votes
0,Lets Fight Ghost,Unknown,"Comedy Programmes,Romantic TV Comedies,Horror ...","Swedish, Spanish",< 30 minutes,0,12 Dec 2008,2021-03-04,0
1,HOW TO BUILD A GIRL,Unknown,"Dramas,Comedies,Films Based on Books,British",English,1-2 hour,0,08 May 2020,2021-03-04,0
2,Centigrade,Unknown,Thrillers,English,1-2 hour,0,28 Aug 2020,2021-03-04,0
3,ANNE+,Unknown,"TV Dramas,Romantic TV Dramas,Dutch TV Shows",Turkish,< 30 minutes,0,01 Oct 2016,2021-03-04,0
4,Moxie,Unknown,"Social Issue Dramas,Teen Movies,Dramas,Comedie...",English,1-2 hour,0,22 Sep 2011,2021-03-04,0


In [51]:
#Check the amount of NaN data for each column
film.isna().sum()

Title                   0
Genre                   0
Tags                    0
Languages               0
Runtime                 0
IMDb Score              0
Release Date            0
Netflix Release Date    0
IMDb Votes              0
dtype: int64

In [53]:
#Check the percentage of NaN data for each column
film.isna().sum()/len(film)*100

Title                   0.0
Genre                   0.0
Tags                    0.0
Languages               0.0
Runtime                 0.0
IMDb Score              0.0
Release Date            0.0
Netflix Release Date    0.0
IMDb Votes              0.0
dtype: float64

In [33]:
#Delete unnecessary columns (optional)
film.drop(film.columns[[4, 5, 6, 8, 9, 10, 11, 13, 14, 15, 16, 17, 20, 21, 22, 23, 25, 26, 27, 28]], axis = 1, inplace = True)

In [50]:
#Replacing Data that has NaN values
film['IMDb Score'] = film['IMDb Score'].replace(np.nan, 0, regex = True)
film['IMDb Votes'] = film['IMDb Votes'].replace(np.nan, 0, regex = True)
film['Genre'] = film['Genre'].replace(np.nan, 'Unknown', regex = True)
film['Tags'] = film['Tags'].replace(np.nan, 'Unknown', regex = True)
film['Release Date'] = film['Release Date'].replace(np.nan, 'Unknown', regex = True)
film['Languages'] = film['Languages'].replace(np.nan, 'Unknown', regex = True)
film['Runtime'] = film['Runtime'].replace(np.nan, 'Unknown', regex = True)

In [55]:
#Check the number of film index
len(film.index)

15480

In [56]:
#Change the data type of 'IMDb Score' & 'IMDb Votes' column as float
film['IMDb Score'] = film["IMDb Score"].astype(float)
film["IMDb Votes"] = film["IMDb Votes"].astype(float)

In [57]:
#Check the data type of each column
film.dtypes

Title                    object
Genre                    object
Tags                     object
Languages                object
Runtime                  object
IMDb Score              float64
Release Date             object
Netflix Release Date     object
IMDb Votes              float64
dtype: object

In [58]:
# Scaling

film_features = pd.concat([film["Genre"].str.get_dummies(sep=","),
                            pd.get_dummies(film[["Languages"]]),
                            film[["IMDb Score"]],film["IMDb Votes"]],axis=1)
film["Title"] = film["Title"].map(lambda Title:re.sub('[^A-Za-z0-9]+', " ", Title))
film_features.head()

Unnamed: 0,Unknown,Languages_Afrikaans,"Languages_Afrikaans, English","Languages_Afrikaans, German, Swiss German",Languages_Akan,"Languages_Akan, English","Languages_Albanian, French","Languages_American Sign Language, English","Languages_American Sign Language, English, Spanish",Languages_Amharic,Languages_Arabic,"Languages_Arabic, English","Languages_Arabic, English, Dutch","Languages_Arabic, English, French","Languages_Arabic, French","Languages_Arabic, French, English","Languages_Arabic, German, English","Languages_Arabic, Hebrew","Languages_Arabic, Hebrew, Persian, English","Languages_Arabic, Italian","Languages_Arabic, Nyanja, English","Languages_Arabic, Persian, English","Languages_Arabic, Swedish",Languages_Aromanian,Languages_Assamese,Languages_Basque,"Languages_Basque, English, Spanish","Languages_Basque, Spanish, Catalan","Languages_Belarusian, Polish, Russian, Ukrainian",Languages_Bengali,"Languages_Bengali, English","Languages_Bengali, English, Mandarin","Languages_Bengali, Hindi, English","Languages_Bhojpuri, Punjabi, Hindi",Languages_Brazilian Sign Language,Languages_Bulgarian,Languages_Cantonese,"Languages_Cantonese, Chinese, English","Languages_Cantonese, English","Languages_Cantonese, English, Filipino, Afrikaans",...,Languages_Turkish,"Languages_Turkish, Arabic","Languages_Turkish, Arabic, English, Swedish","Languages_Turkish, Azerbaijani, Russian, English, Spanish, Japanese, French, Arabic","Languages_Turkish, English","Languages_Turkish, English, German","Languages_Turkish, English, Latin, Arabic, Greek, Italian","Languages_Turkish, Flemish","Languages_Turkish, French","Languages_Turkish, Georgian","Languages_Turkish, German, English","Languages_Turkish, Italian","Languages_Turkish, Italian, French","Languages_Ukrainian, Russian, English",Languages_Unknown,Languages_Urdu,"Languages_Urdu, English, Arabic","Languages_Urdu, English, Punjabi, Bhojpuri","Languages_Urdu, Hindi","Languages_Urdu, Hindi, Spanish, French, Korean, English","Languages_Urdu, Norwegian","Languages_Urdu, Pushto","Languages_Urdu, Tajik, Russian",Languages_Uzbek,Languages_Vietnamese,"Languages_Wayuu, Spanish, English","Languages_Welsh, English","Languages_Wolof, French, English, Arabic","Languages_Xhosa, English, Afrikaans","Languages_Xhosa, Southern Sotho, English, Afrikaans","Languages_Yiddish, English, Spanish","Languages_Yoruba, English","Languages_Yoruba, English, Ibo, Hausa","Languages_Yoruba, Ibo, English",Languages_Zulu,"Languages_Zulu, Afrikaans, Xhosa, English","Languages_Zulu, English, Southern Sotho, Tswana","Languages_Zulu, Xhosa, Afrikaans, English",IMDb Score,IMDb Votes
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0


In [59]:
film_features.columns

Index(['Unknown', 'Languages_Afrikaans', 'Languages_Afrikaans, English',
       'Languages_Afrikaans, German, Swiss German', 'Languages_Akan',
       'Languages_Akan, English', 'Languages_Albanian, French',
       'Languages_American Sign Language, English',
       'Languages_American Sign Language, English, Spanish',
       'Languages_Amharic',
       ...
       'Languages_Yiddish, English, Spanish', 'Languages_Yoruba, English',
       'Languages_Yoruba, English, Ibo, Hausa',
       'Languages_Yoruba, Ibo, English', 'Languages_Zulu',
       'Languages_Zulu, Afrikaans, Xhosa, English',
       'Languages_Zulu, English, Southern Sotho, Tswana',
       'Languages_Zulu, Xhosa, Afrikaans, English', 'IMDb Score',
       'IMDb Votes'],
      dtype='object', length=1442)

In [60]:
from sklearn.preprocessing import MinMaxScaler

In [61]:
min_max_scaler = MinMaxScaler()
film_features = min_max_scaler.fit_transform(film_features)

In [62]:
np.round(film_features,2)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [63]:
from sklearn.neighbors import NearestNeighbors

In [64]:
nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(film_features)

In [65]:
distances, indices = nbrs.kneighbors(film_features)

In [66]:
#Get index of film title
def get_index_from_title(title):
    return film[film["Title"]==title].index.tolist()[0]

In [67]:
all_film_titles = list(film.Title.values)

In [68]:
#Get id & name from partial title
def get_id_from_partial_title(partial):
    for title in all_film_titles:
        if partial in title:
            print(title,all_film_titles.index(title))

In [69]:
#Check for similiary film
def print_similar_films(query=None):
      found_id = get_index_from_title(query)
      for id in indices[found_id][1:]:
          print(film.loc[id]["Title"])

In [71]:
get_id_from_partial_title("Batman")

Batman Vs Teenage Mutant Ninja Turtles 2643
Lego DC Batman Family Matters 3212
Batman Ninja 6931
The Lego Batman Movie 9181
Batman The Killing Joke 10359
Batman v Superman Dawn of Justice 10411
Lego DC Comics Batman Be Leaguered 11907
Batman Returns 13513
Batman Begins 14480
Batman Forever 15276
Batman Mask of the Phantasm 15277
Batman 15282


In [73]:
get_index_from_title("Batman Vs Teenage Mutant Ninja Turtles")

2643

In [72]:
print_similar_films(query="Lego DC Comics Batman Be Leaguered")

The Coroner
Joker
Centigrade
The Invisible
Harrys Daughters
