## Part 0: Import Dependencies and Set-Up

In [1]:
# Import Dependencies
#import hvplot.pandas
from imblearn.ensemble import BalancedRandomForestClassifier
import numpy as np
import os
import pandas as pd
#import plotly.express as px
#import random
#from scipy.spatial import distance
#from sklearn.cluster import KMeans
#from sklearn.decomposition import PCA
#from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer, StandardScaler


In [2]:
# Pandas display configuration

# Render every column of a DataFrame (None disables the column limit)
pd.options.display.max_columns = None


In [3]:
# Directory that holds the input files, and the path of each input file.
# Paths are assembled with os.path.join so the separator matches the host OS.
file_dir = "Data"

# imdb Titles metadata (Extracted from title.basics.tsv)
titles_metadata_file = os.path.join(file_dir, 'title_basics_non-adult_movies.tsv')

# imdb US Titles only ids (Extracted from title.akas.tsv)
titles_us_ids_only_file = os.path.join(file_dir, 'US_title_ids.csv')

# imdb Ratings data (Derived from title.ratings.tsv)
ratings_data_file = os.path.join(file_dir, 'title_ratings.csv')


In [4]:
# Set Viewer Title for Testing
# Exactly one assignment is active; the commented-out lines are alternative
# test titles that can be swapped in.
#viewerTitle = "Apocalypse Now"
viewerTitle = "The Maltese Falcon (1941)"
#viewerTitle = "Toy Story"
#viewerTitle = "Witness (1985)"


## Part 1: Import, Clean, and Transform Data

In [5]:
# Load the three imdb inputs: Titles metadata (tab-separated),
# US Title IDs and Ratings data (both comma-separated)

titles_metadata = pd.read_csv(titles_metadata_file, delimiter='\t')
titles_us_ids_only = pd.read_csv(titles_us_ids_only_file, delimiter=',')
ratings_data = pd.read_csv(ratings_data_file, delimiter=',')


In [6]:
# Check titles_metadata DataFrame: print its (rows, columns) shape, then
# display the first rows as the cell output
print(titles_metadata.shape)
titles_metadata.head()


(584642, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,\N,90,Drama
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1907,\N,\N,Drama
4,tt0000630,movie,Hamlet,Amleto,0,1908,\N,\N,Drama


In [7]:
# Drop all Titles where primaryTitle differs from originalTitle
# (Since language of titles is not often available, this is an attempt
# to filter out obscure non-English language films)

# .copy() makes the filtered result an independent DataFrame, so the later
# in-place .loc assignment on titles_metadata does not operate on a slice of
# the loaded frame (which raises SettingWithCopyWarning)
titles_metadata = titles_metadata.loc[
    titles_metadata['primaryTitle'] == titles_metadata['originalTitle']
].copy()


In [8]:
# Look for Films with the same primaryTitle
# and set primaryTitle (and originalTitle) to primaryTitle + (startYear)

# duplicated(keep=False) flags every member of each repeated title in a single
# vectorized pass. Rows with a missing primaryTitle are excluded, matching
# groupby's default of dropping missing keys. If no title repeats, the result
# is an empty DataFrame rather than the ValueError pd.concat raises on an
# empty sequence.
has_shared_title = (
    titles_metadata['primaryTitle'].notna()
    & titles_metadata.duplicated(subset='primaryTitle', keep=False)
)

# The stable sort groups rows by title while keeping their original relative
# order within each title; .copy() makes the frame safe to modify
duplicate_titles_df = (
    titles_metadata.loc[has_shared_title]
    .sort_values('primaryTitle', kind='stable')
    .copy()
)

# astype(str) mirrors str(row['startYear']): whatever is stored in startYear
# (including the "\N" placeholder) is appended verbatim
duplicate_titles_df['primaryTitle'] = (
    duplicate_titles_df['primaryTitle']
    + " ("
    + duplicate_titles_df['startYear'].astype(str)
    + ")"
)
duplicate_titles_df['originalTitle'] = duplicate_titles_df['primaryTitle']

duplicate_titles_df


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
417445,tt3120962,movie,#5 (2013),#5 (2013),0,2013,\N,68,"Biography,Comedy,Fantasy"
553039,tt8219776,movie,#5 (2018),#5 (2018),0,2018,\N,\N,Documentary
262785,tt11803670,movie,#Love (\N),#Love (\N),0,\N,\N,\N,Drama
342883,tt15521960,movie,#Love (\N),#Love (\N),0,\N,\N,\N,"Comedy,Romance"
446725,tt4004608,movie,$elfie Shootout (2016),$elfie Shootout (2016),0,2016,\N,86,Comedy
...,...,...,...,...,...,...,...,...,...
580668,tt9686590,movie,Ûmi no kyodai (1935),Ûmi no kyodai (1935),0,1935,\N,\N,Drama
450813,tt4149802,movie,Ümmü Sibyan: Zifir (2014),Ümmü Sibyan: Zifir (2014),0,2014,\N,\N,Horror
513928,tt6448010,movie,Ümmü Sibyan: Zifir (2015),Ümmü Sibyan: Zifir (2015),0,2015,\N,\N,\N
195195,tt0431498,movie,Üvey ana (1967),Üvey ana (1967),0,1967,\N,\N,"Drama,Romance"


In [9]:
# Write the renamed rows of duplicate_titles_df back into titles_metadata

cols = titles_metadata.columns.tolist()
renamed_rows = titles_metadata['tconst'].isin(duplicate_titles_df['tconst'])

# pandas aligns the assigned frame on index and column labels, so each renamed
# row is written onto the row carrying the same index label
titles_metadata.loc[renamed_rows, cols] = duplicate_titles_df[cols]


In [10]:
# Check titles_us_ids_only DataFrame: print its (rows, columns) shape, then
# display the first rows as the cell output
print(titles_us_ids_only.shape)
titles_us_ids_only.head()


(1308380, 1)


Unnamed: 0,tconst
0,tt0000001
1,tt0000002
2,tt0000005
3,tt0000005
4,tt0000005


In [11]:
# Drop all Titles from titles_metadata that are not in titles_us_ids_only

# titles_us_ids_only lists the same tconst more than once, so the ids are
# de-duplicated before the join; otherwise every repeated id multiplies the
# matching title row and the copies have to be removed again afterwards.
# The inner join keeps the order of the left (titles_metadata) keys.
titles_metadata = pd.merge(
    titles_metadata,
    titles_us_ids_only.drop_duplicates(subset='tconst'),
    on='tconst',
    how='inner',
)
# Still drop any fully identical rows that were already present in titles_metadata
titles_metadata = titles_metadata.drop_duplicates()


In [12]:
# Check results of the US-only filter: shape first, then a preview of the first rows
print(titles_metadata.shape)
titles_metadata.head(5)


(207524, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Action,Adventure,Biography"
1,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,\N,120,"Adventure,Fantasy"
2,tt0001101,movie,Abraham Lincoln's Clemency,Abraham Lincoln's Clemency,0,1910,\N,\N,\N
3,tt0001159,movie,The Connecticut Yankee,The Connecticut Yankee,0,1910,\N,\N,\N
4,tt0001230,movie,Gentleman Joe,Gentleman Joe,0,1910,\N,\N,\N


In [13]:
# Drop titles_metadata Rows with "\N" for genres or startYear
# Drop titleType, isAdult and endYear Columns

# One non-inplace chain: the filter and the column drop together produce a new
# DataFrame, so nothing is mutated in place on a .loc slice (which is what
# triggers SettingWithCopyWarning)
titles_metadata = (
    titles_metadata
    .loc[(titles_metadata['genres'] != "\\N") & (titles_metadata['startYear'] != "\\N")]
    .drop(columns=['titleType', 'isAdult', 'endYear'])
)


In [14]:
# Check results after dropping rows and columns: shape first, then a preview
print(titles_metadata.shape)
titles_metadata.head(5)


(153255, 6)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Action,Adventure,Biography"
1,tt0000679,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,1908,120,"Adventure,Fantasy"
5,tt0001285,The Life of Moses,The Life of Moses,1909,50,"Biography,Drama,Family"
11,tt0001498,The Battle of Trafalgar,The Battle of Trafalgar,1911,51,War
17,tt0001892,Den sorte drøm,Den sorte drøm,1911,53,Drama


In [15]:
# Convert startYear Column to a numeric dtype

# assign() returns a new DataFrame with the converted column (column order is
# unchanged), instead of setting a column on a frame that was produced by
# filtering, which can raise SettingWithCopyWarning
titles_metadata = titles_metadata.assign(
    startYear=pd.to_numeric(titles_metadata['startYear'])
)


In [16]:
# Check results: list the dtype of every column to confirm that
# startYear is now numeric
titles_metadata.dtypes


tconst            object
primaryTitle      object
originalTitle     object
startYear          int64
runtimeMinutes    object
genres            object
dtype: object

In [17]:
# Keep only titles_metadata Rows whose 'startYear' is 1920 or later

titles_metadata = titles_metadata[titles_metadata['startYear'].ge(1920)]


In [18]:
# Check results of the startYear filter: shape first, then a preview
print(titles_metadata.shape)
titles_metadata.head(5)


(148483, 6)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
199,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western"
597,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy
2517,tt0008422,Perils of the West,Perils of the West,1922,\N,Western
2746,tt0008736,The Victim (1920),The Victim (1920),1920,\N,Drama
3089,tt0009187,His Temporary Wife,His Temporary Wife,1920,\N,Comedy


In [19]:
# Check ratings_data DataFrame: print its (rows, columns) shape, then
# display the first rows as the cell output
print(ratings_data.shape)
ratings_data.head()


(1201036, 3)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1834
1,tt0000002,6.0,236
2,tt0000003,6.5,1594
3,tt0000004,6.0,153
4,tt0000005,6.2,2410


In [20]:
# Inner-join titles_metadata with ratings_data on tconst
# (titles without a ratings row are dropped)

movies_df = titles_metadata.merge(ratings_data, how="inner", on="tconst")

print(movies_df.shape)
movies_df.head(5)


(101078, 8)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16


In [21]:
# Add url column to movies_df: the imdb title page for each tconst
# (vectorized string concatenation instead of a row-by-row apply)
movies_df['url'] = "https://www.imdb.com/title/" + movies_df['tconst'] + "/"

print(movies_df.shape)
movies_df.head()


(101078, 9)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28,https://www.imdb.com/title/tt0003854/
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70,https://www.imdb.com/title/tt0005076/
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25,https://www.imdb.com/title/tt0010058/
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15,https://www.imdb.com/title/tt0010495/
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16,https://www.imdb.com/title/tt0010502/


In [22]:
# Check DataFrame: list the dtype of every movies_df column

movies_df.dtypes


tconst             object
primaryTitle       object
originalTitle      object
startYear           int64
runtimeMinutes     object
genres             object
averageRating     float64
numVotes            int64
url                object
dtype: object

In [23]:
# Convert 'genres' entries into lists: split each comma-separated genres
# string into a list of genre names, stored in a new 'genres_list' column
# (vectorized .str.split instead of a row-by-row apply)

movies_df['genres_list'] = movies_df['genres'].str.split(",")

print(movies_df.shape)
movies_df.head()


(101078, 10)


Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,url,genres_list
0,tt0003854,Dodge City Trail,Dodge City Trail,1936,56,"Adventure,Music,Western",3.7,28,https://www.imdb.com/title/tt0003854/,"[Adventure, Music, Western]"
1,tt0005076,Charley's Aunt (1925),Charley's Aunt (1925),1925,80,Comedy,6.6,70,https://www.imdb.com/title/tt0005076/,[Comedy]
2,tt0010058,The Deadlier Sex,The Deadlier Sex,1920,60,"Comedy,Drama",6.2,25,https://www.imdb.com/title/tt0010058/,"[Comedy, Drama]"
3,tt0010495,My Husband's Other Wife,My Husband's Other Wife,1920,\N,Drama,4.2,15,https://www.imdb.com/title/tt0010495/,[Drama]
4,tt0010502,Nachtgestalten (1920),Nachtgestalten (1920),1920,106,Horror,6.4,16,https://www.imdb.com/title/tt0010502/,[Horror]
