## Part 0: Import Dependencies and Set-Up

In [58]:
# Import Dependencies
import hvplot.pandas
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import random
from scipy.spatial import distance
#from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer, StandardScaler


In [3]:
# Pandas Settings

# Display All Columns
pd.set_option('display.max_columns', None)


In [4]:
# Path to file directory and variables for input files.
file_dir = os.path.join("Data")

# imdb Titles metadata (Extracted from title.basics.tsv)
titles_metadata_file = f'{file_dir}/title_basics_non-adult_movies.tsv'

# imdb US Titles only ids (Extracted from title.akas.tsv)
titles_us_ids_only_file = f'{file_dir}/US_title_ids.csv'

# imdb Ratings data (Derived from title.ratings.tsv)
ratings_data_file = f'{file_dir}/title_ratings.csv'


In [5]:
# Set Viewer Title for Testing
#viewerTitle = "Apocalypse Now"
viewerTitle = "The Maltese Falcon (1941)"
#viewerTitle = "Toy Story"
#viewerTitle = "Witness (1985)"


## Part 1: Import Data, Clean and Transform Data

In [6]:
# Import imdb Titles metadata, imdb US Title IDs, imdb Ratings data

titles_metadata = pd.read_csv(titles_metadata_file, sep='\t')
titles_us_ids_only = pd.read_csv(titles_us_ids_only_file)
ratings_data = pd.read_csv(ratings_data_file)


In [7]:
# Check titles_metadata DataFrame
print(titles_metadata.shape)
titles_metadata.count()
titles_metadata.head()


(584642, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
4,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


In [8]:
# Drop all Titles where primaryTitle differs from originalTitle
# (Since language of titles is not often available, this is an attempt
# to filter out obscure non-English language films)

titles_metadata = titles_metadata.loc[titles_metadata['primaryTitle'] == titles_metadata['originalTitle']]


In [9]:
# Look for Films with the same primaryTitle
# and set primaryTitle to primaryTitle + (startYear)

duplicate_titles_df = pd.concat(g for _, g in titles_metadata.groupby('primaryTitle') if len(g) > 1)

duplicate_titles_df['primaryTitle'] = duplicate_titles_df.apply(lambda row: "".join([row['primaryTitle'], " (", str(row['startYear']), ")"]), axis=1)
duplicate_titles_df['originalTitle'] = duplicate_titles_df['primaryTitle']

duplicate_titles_df


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
417445,tt3120962,movie,#5 (2013),#5 (2013),0,2013,\N,68,"Biography,Comedy,Fantasy"
553039,tt8219776,movie,#5 (2018),#5 (2018),0,2018,\N,\N,Documentary
262785,tt11803670,movie,#Love (\N),#Love (\N),0,\N,\N,\N,Drama
342883,tt15521960,movie,#Love (\N),#Love (\N),0,\N,\N,\N,"Comedy,Romance"
446725,tt4004608,movie,$elfie Shootout (2016),$elfie Shootout (2016),0,2016,\N,86,Comedy
...,...,...,...,...,...,...,...,...,...
580668,tt9686590,movie,Ûmi no kyodai (1935),Ûmi no kyodai (1935),0,1935,\N,\N,Drama
450813,tt4149802,movie,Ümmü Sibyan: Zifir (2014),Ümmü Sibyan: Zifir (2014),0,2014,\N,\N,Horror
513928,tt6448010,movie,Ümmü Sibyan: Zifir (2015),Ümmü Sibyan: Zifir (2015),0,2015,\N,\N,\N
195195,tt0431498,movie,Üvey ana (1967),Üvey ana (1967),0,1967,\N,\N,"Drama,Romance"


In [10]:
# Merge duplicate_titles_df back with titles_metadata

cols = list(titles_metadata.columns)
titles_metadata.loc[titles_metadata['tconst'].isin(duplicate_titles_df['tconst']), cols] = duplicate_titles_df[cols]


In [11]:
# Check titles_us_ids_only DataFrame
print(titles_us_ids_only.shape)
titles_us_ids_only.count
titles_us_ids_only.head()


(1308380, 1)


Unnamed: 0,tconst
0,tt0000001
1,tt0000002
2,tt0000005
3,tt0000005
4,tt0000005


In [12]:
# Drop all Titles from titles_metadata that are not in titles_us_ids_only

titles_metadata = pd.merge(titles_metadata, titles_us_ids_only, on='tconst', how='inner')
titles_metadata = titles_metadata.drop_duplicates()


In [13]:
# Check results
print(titles_metadata.shape)
titles_metadata.head()


(207524, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
1,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,\N,120,"Adventure,Fantasy"
2,tt0001101,movie,Abraham Lincoln's Clemency,Abraham Lincoln's Clemency,0,1910,\N,\N,\N
3,tt0001159,movie,The Connecticut Yankee,The Connecticut Yankee,0,1910,\N,\N,\N
4,tt0001230,movie,Gentleman Joe,Gentleman Joe,0,1910,\N,\N,\N


In [14]:
# Drop titles_metadata Rows with "\N" for genres and startYear
# Drop titleType isAdult and endYear Columns

#titles_metadata['genres'].value_counts()

titles_metadata = titles_metadata.loc[~(titles_metadata['genres'] == "\\N") & ~(titles_metadata['startYear'] == "\\N")]
titles_metadata.drop(['titleType'], axis=1, inplace=True)
titles_metadata.drop(['isAdult'], axis=1, inplace=True)
titles_metadata.drop(['endYear'], axis=1, inplace=True)


In [15]:
# Check results
print(titles_metadata.shape)
#titles_metadata.dtypes
titles_metadata.head()


(153255, 6)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography"
1,tt0000679,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,1908,120,"Adventure,Fantasy"
5,tt0001285,The Life of Moses,The Life of Moses,1909,50,"Biography,Drama,Family"
11,tt0001498,The Battle of Trafalgar,The Battle of Trafalgar,1911,51,War
17,tt0001892,Den sorte drøm,Den sorte drøm,1911,53,Drama


In [16]:
# Convert startYear Column to int

titles_metadata['startYear'] = pd.to_numeric(titles_metadata['startYear'])


In [17]:
# Check results
titles_metadata.dtypes


tconst            object
primaryTitle      object
originalTitle     object
startYear          int64
runtimeMinutes    object
genres            object
dtype: object

In [18]:
# Drop titles_metadata Rows with 'startYear' less than 1920

titles_metadata = titles_metadata.loc[titles_metadata['startYear'] >= 1920]


In [19]:
# Check results
print(titles_metadata.shape)
titles_metadata.head()


(148483, 6)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
199,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western"
597,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy
2517,tt0008422,Perils of the West,Perils of the West,1922,\N,Western
2746,tt0008736,The Victim (1920),The Victim (1920),1920,\N,Drama
3089,tt0009187,His Temporary Wife,His Temporary Wife,1920,\N,Comedy


In [20]:
# Check ratings_metadata DataFrame
print(ratings_data.shape)
ratings_data.count()
ratings_data.head()


(1201036, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1834
1,tt0000002,6.0,236
2,tt0000003,6.5,1594
3,tt0000004,6.0,153
4,tt0000005,6.2,2410


In [65]:
# Merge titles_metadata and ratings_data on tconst

movies_df = pd.merge(titles_metadata, ratings_data, on="tconst")

# Extract only viewerTitle + 100 Titles for Testing Purposes
movies_df = pd.merge(movies_df.loc[movies_df['primaryTitle'] == viewerTitle], movies_df.sample(100), how='outer')
movies_df.drop_duplicates()

print(movies_df.shape)
movies_df.head()


(101078, 8)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16


In [66]:
# Add url column to movies_df
movies_df['url'] = movies_df.apply(lambda row: "".join(["https://www.imdb.com/title/", row['tconst'], "/"]), axis=1)

print(movies_df.shape)
movies_df.head()


(101078, 9)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28,https://www.imdb.com/title/tt0003854/
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70,https://www.imdb.com/title/tt0005076/
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25,https://www.imdb.com/title/tt0010058/
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15,https://www.imdb.com/title/tt0010495/
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16,https://www.imdb.com/title/tt0010502/


In [67]:
# Check DataFrame

movies_df.dtypes


tconst             object
primaryTitle       object
originalTitle      object
startYear           int64
runtimeMinutes     object
genres             object
averageRating     float64
numVotes            int64
url                object
dtype: object

In [68]:
# Convert 'genres' entries into lists

movies_df['genres_list'] = movies_df.apply(lambda row: row['genres'].split(","), axis=1)

print(movies_df.shape)
movies_df.head()


(101078, 10)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28,https://www.imdb.com/title/tt0003854/,"[Adventure, Music, Western]"
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70,https://www.imdb.com/title/tt0005076/,[Comedy]
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25,https://www.imdb.com/title/tt0010058/,"[Comedy, Drama]"
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15,https://www.imdb.com/title/tt0010495/,[Drama]
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16,https://www.imdb.com/title/tt0010502/,[Horror]


In [69]:
# Transform (get_dummies via Multi Label Bin Encoding) movies_df by 'genres'

genres = movies_df['genres_list']

mlb = MultiLabelBinarizer()

X = pd.DataFrame(mlb.fit_transform(genres), columns=mlb.classes_, index=movies_df.index)

print(X.shape)
X.head()


(101078, 27)


Unnamed: 0,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [70]:
# Merge X back with movies_df

movies_df = pd.merge(movies_df, X, how='inner', left_index=True, right_index=True)

print(movies_df.shape)
movies_df.head()


(101078, 37)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28,https://www.imdb.com/title/tt0003854/,"[Adventure, Music, Western]",0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70,https://www.imdb.com/title/tt0005076/,[Comedy],0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25,https://www.imdb.com/title/tt0010058/,"[Comedy, Drama]",0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15,https://www.imdb.com/title/tt0010495/,[Drama],0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16,https://www.imdb.com/title/tt0010502/,[Horror],0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [71]:
# Find viewerTitle Data for Testing Purposes

movies_df.loc[(movies_df['primaryTitle'] == viewerTitle)]


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
7887,tt0033870,The Maltese Falcon (1941),The Maltese Falcon (1941),1941,100,"Crime,Film-Noir,Mystery",8.0,154114,https://www.imdb.com/title/tt0033870/,"[Crime, Film-Noir, Mystery]",0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [72]:
# Integrate 'averageRating' into X DataFrame with 'primaryTitle' as new Index
Z = pd.merge(movies_df[['primaryTitle', 'averageRating']], X, how='outer', left_index=True, right_index=True)

Z.set_index('primaryTitle', inplace=True)

print(Z.shape)
Z.head()


(101078, 28)


Unnamed: 0_level_0,averageRating,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,Game-Show,History,Horror,Music,Musical,Mystery,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
primaryTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
Dodge City Trail,3.7,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
Charley's Aunt (1925),6.6,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Deadlier Sex,6.2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
My Husband's Other Wife,4.2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Nachtgestalten (1920),6.4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [73]:
# Standardize the data with StandardScaler()

Z = StandardScaler().fit_transform(Z)
Z[:5]


array([[-1.71653767e+00, -3.77062220e-01,  3.37491431e+00,
        -1.28065202e-01, -2.02193760e-01, -5.90199873e-01,
        -3.45768806e-01, -4.57870903e-01, -8.52523275e-01,
        -1.94734823e-01, -1.82354760e-01, -8.62282712e-02,
        -3.14538512e-03, -1.71714170e-01, -3.60494818e-01,
         5.34368513e+00, -1.44379836e-01, -2.35657951e-01,
        -6.27944836e-02, -1.44153917e-02, -3.67865697e-01,
        -2.05039599e-01, -1.21828679e-02, -1.35221147e-01,
        -8.89680067e-03, -3.62890797e-01, -1.41169264e-01,
         5.23990879e+00],
       [ 3.62269440e-01, -3.77062220e-01, -2.96303820e-01,
        -1.28065202e-01, -2.02193760e-01,  1.69434127e+00,
        -3.45768806e-01, -4.57870903e-01, -8.52523275e-01,
        -1.94734823e-01, -1.82354760e-01, -8.62282712e-02,
        -3.14538512e-03, -1.71714170e-01, -3.60494818e-01,
        -1.87136775e-01, -1.44379836e-01, -2.35657951e-01,
        -6.27944836e-02, -1.44153917e-02, -3.67865697e-01,
        -2.05039599e-01, -1.21

## Part 2: Principal Component Analysis

In [74]:
# Use PCA to reduce dimensions to three principal components
pca = PCA(n_components=3)

movies_pca = pca.fit_transform(Z)
movies_pca


array([[-0.11767739, -0.06042216,  2.55565026],
       [ 0.19668691, -0.73976151,  0.5176012 ],
       [ 0.00342955, -1.35492424, -0.44852581],
       ...,
       [ 2.14313528,  1.00855902,  0.32639631],
       [ 1.87541937,  1.04597207,  0.3915608 ],
       [-2.62386273,  1.15082146,  1.23491255]])

In [75]:
# Create a DataFrame with the three principal components
col_names = ["PC 1", "PC 2", "PC 3"]
#movies_pca_df = pd.DataFrame(movies_pca, columns=col_names, index=movies_df['primaryTitle'])
movies_pca_df = pd.DataFrame(movies_pca, columns=col_names, index=movies_df.index)

print(movies_pca_df.shape)
movies_pca_df.head()


(101078, 3)


Unnamed: 0,PC 1,PC 2,PC 3
0,-0.117677,-0.060422,2.55565
1,0.196687,-0.739762,0.517601
2,0.00343,-1.354924,-0.448526
3,-0.546932,-0.317495,-0.658034
4,-0.751162,1.327581,0.581499


## Part 3: Clustering Using Hierarchical Clustering

In [76]:
# Create the dendrogram

#fig = ff.create_dendrogram(movies_pca_df, color_threshold = 6)
#fig.update_layout(width=1000, height=800)
#fig.show()


MemoryError: Unable to allocate 38.1 GiB for an array with shape (5108330503,) and data type float64

In [77]:
# Run the Hierarchical Algorithm

agg = AgglomerativeClustering(n_clusters = 5)
model = agg.fit(movies_pca_df)


MemoryError: Unable to allocate 38.1 GiB for an array with shape (5108330503,) and data type float64

In [46]:
# Add a new Class Column to movies_pca_df

movies_pca_df['Class'] = model.labels_
movies_pca_df.head()

# Create overall clustered_df

clustered_df = pd.concat([movies_df, movies_pca_df], axis=1, sort=False)

print(clustered_df.shape)
clustered_df.head()


(101, 37)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western,PC 1,PC 2,PC 3,Class
0,tt0033870,The Maltese Falcon (1941),The Maltese Falcon (1941),1941,100,"Crime,Film-Noir,Mystery",8.0,154114,https://www.imdb.com/title/tt0033870/,"[Crime, Film-Noir, Mystery]",0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,-1.342194,4.449498,-3.64101,1
1,tt0082639,"Ladies and Gentlemen, the Fabulous Stains","Ladies and Gentlemen, the Fabulous Stains",1982,87,"Comedy,Drama,Music",6.8,2633,https://www.imdb.com/title/tt0082639/,"[Comedy, Drama, Music]",0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0.145914,1.170659,0.014516,0
2,tt6341286,Stoneland (2016),Stoneland (2016),2016,46,Documentary,9.2,21,https://www.imdb.com/title/tt6341286/,[Documentary],0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-2.182739,-0.618947,0.591701,4
3,tt1263676,East L.A.,East L.A.,2008,\N,Drama,8.8,24,https://www.imdb.com/title/tt1263676/,[Drama],0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.796872,0.771669,0.040491,0
4,tt0027243,Édes mostoha,Édes mostoha,1935,90,Drama,6.7,27,https://www.imdb.com/title/tt0027243/,[Drama],0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.04671,0.663807,0.101798,0


## Part 5: Generate Recommendations for User

In [47]:
# Find viewerTitle Data for Testing Purposes

clustered_df.loc[(clustered_df['primaryTitle'] == viewerTitle)]


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,Film-Noir,History,Horror,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western,PC 1,PC 2,PC 3,Class
0,tt0033870,The Maltese Falcon (1941),The Maltese Falcon (1941),1941,100,"Crime,Film-Noir,Mystery",8.0,154114,https://www.imdb.com/title/tt0033870/,"[Crime, Film-Noir, Mystery]",0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,-1.342194,4.449498,-3.64101,1


In [48]:
# Find tconst for viewerTitle

viewer_tconst = clustered_df.loc[(clustered_df['primaryTitle'] == viewerTitle)]['tconst']
viewer_tconst


0    tt0033870
Name: tconst, dtype: object

#### Take viewerTitle and find Closest Neighbors

In [49]:
# Find Class of viewerTitle

#clustered_df.head()
#clustered_df.loc[clustered_df['primaryTitle'] == viewerTitle]
viewerTitleClass = clustered_df.loc[clustered_df['primaryTitle'] == viewerTitle]['Class'].values[0]
viewerTitleClass


1

In [54]:
# Create a Distance Matrix by 'tconst'

# First, create a DataFrame of only the three Principal Components
# of Titles in the same Class as viewerTitle

clustered_df = clustered_df.loc[clustered_df['Class'] == viewerTitleClass]

distance_inputs_df = clustered_df[['tconst', 'PC 1', 'PC 2', 'PC 3']]
distance_inputs_df.set_index('tconst', inplace=True)

print(distance_inputs_df.shape)
distance_inputs_df.head()


(11, 3)


Unnamed: 0_level_0,PC 1,PC 2,PC 3
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0033870,-1.342194,4.449498,-3.64101
tt2170744,-0.67065,1.303777,-1.441684
tt0115907,-0.029394,3.348105,-1.99353
tt0035199,0.751377,1.251212,-1.999624
tt4442758,0.072659,1.348802,-2.055092


In [55]:
# Find Principal Component Coordinates
# for viewer_tconst

viewer_input_df = distance_inputs_df.loc[viewer_tconst]
viewer_input_df


Unnamed: 0_level_0,PC 1,PC 2,PC 3
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
tt0033870,-1.342194,4.449498,-3.64101


In [56]:
# Convert distance_inputs_df to Numpy Array

distance_inputs = distance_inputs_df.to_numpy()
distance_inputs

viewer_input = viewer_input_df.to_numpy()
viewer_input


array([[-1.34219383,  4.44949813, -3.64101036]])

In [59]:
# Calculate Euclidean Distances

distance_results = distance.cdist(viewer_input, distance_inputs, 'euclidean')
distance_results[0]


array([0.        , 3.89660975, 2.37712022, 4.16007499, 3.75915737,
       2.55304002, 3.42052428, 2.07400541, 4.53530539, 3.23880685,
       1.28302866])

In [60]:
# For each distance in distance_results, add a small random number
# to help guarantee uniqueness of distances
# (If distance is 0, leave it unchanged)

distance_results_rand = []

for distance in distance_results[0]:
    if distance == 0:
        continue
    else:
        distance = distance + random.randrange(1, 9, 1)/10e15
        
    distance_results_rand.append(distance)

distance_results_rand = np.asarray(distance_results_rand)
len(distance_results_rand)


10

In [61]:
# Find the Smallest Non-Zero Distance and its Position

min_non_zero = np.min(distance_results_rand[np.nonzero(distance_results_rand)])
min_non_zero

recommendation_index = list(distance_results_rand).index(min_non_zero)
recommendation_index


9

In [62]:
# Find Corresponding tconst for this index

print(clustered_df.iloc[recommendation_index]['tconst'])
print(clustered_df.iloc[recommendation_index]['primaryTitle'])
print(clustered_df.iloc[recommendation_index]['url'])


tt0045911
Inferno (1953)
https://www.imdb.com/title/tt0045911/


#### Output 5 Recommendations

In [63]:
# Find the 5 Smallest Non-Zero Distance and their Positions

k = 5
five_min_non_zero = np.partition(distance_results_rand[np.nonzero(distance_results_rand)], k)[:k]
five_min_non_zero


array([2.37712022, 1.28302866, 2.07400541, 2.55304002, 3.23880685])

In [64]:
# Loop through five_min_non_zero

for entry in five_min_non_zero:
    #print(entry)
    recommendation_index = list(distance_results_rand).index(entry)
    print(clustered_df.iloc[recommendation_index]['tconst'])
    print(clustered_df.iloc[recommendation_index]['Class'])
    print(clustered_df.iloc[recommendation_index]['primaryTitle'])
    print(clustered_df.iloc[recommendation_index]['url'])


tt2170744
1
Who Killed Soul Glow?
https://www.imdb.com/title/tt2170744/
tt0045911
1
Inferno (1953)
https://www.imdb.com/title/tt0045911/
tt2466510
1
The Backpack
https://www.imdb.com/title/tt2466510/
tt4442758
1
Temper (2015)
https://www.imdb.com/title/tt4442758/
tt0100786
1
Till There Was You (1991)
https://www.imdb.com/title/tt0100786/
