# IMDB Project Part 1

## Imports

In [1]:
# Confirm folder was created and files added successfully
import os
os.listdir("Data/")


['Filtered_Title_Basics',
 'Filtered_Title_Ratings',
 'title-akas-us-only.csv',
 'title.basics.tsv.gz',
 'title.ratings.tsv.gz']

In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns',50)

## Load Data

In [3]:
#load title akas us only
akas=pd.read_csv('Data/title-akas-us-only.csv', low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [4]:
#load title basics
basics=pd.read_csv('Data/title.basics.tsv.gz', sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Pre-processing for title-basics

In [5]:
# Filter the basics table down to only include the US by using the filter akas dataframe
filter_us_titles = basics['tconst'].isin(akas['titleId'])
basics = basics[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10016872,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10016901,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10016939,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10016962,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [6]:
#convert placeholder "\N" values back to true null values to identify missing genres and runtimes
basics = basics.replace({'\\N':np.nan})

In [7]:
#confirm what columns have null values
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1365643 entries, 0 to 10016966
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   tconst          1365643 non-null  object
 1   titleType       1365643 non-null  object
 2   primaryTitle    1365643 non-null  object
 3   originalTitle   1365643 non-null  object
 4   isAdult         1365643 non-null  object
 5   startYear       1266978 non-null  object
 6   endYear         37130 non-null    object
 7   runtimeMinutes  862524 non-null   object
 8   genres          1337027 non-null  object
dtypes: object(9)
memory usage: 104.2+ MB


In [8]:
#drop rows with null values in the runtimeminutes or genres columns ONLY
basics = basics.dropna(subset=["runtimeMinutes", "genres"])

In [9]:
#confirm there are no more nulls for runtimeMinutes
basics['runtimeMinutes'].isna().sum()

0

In [10]:
#confirm there are no more nulls for genres
basics['genres'].isna().sum()

0

In [11]:
#filter to keep only full-length movies
basics=basics.loc[basics['titleType']=='movie']

In [12]:
#convert startYear to a float type
basics['startYear']=basics['startYear'].astype(float)

In [13]:
#filter to keep movies with startYears that are >=2000 and <=2022
greaterthan =basics['startYear'] >=2000 
lessthan =basics['startYear'] <=2022
basics['startYear']= greaterthan & lessthan

In [14]:
#create filter for documentaries
filter_documentaries = basics['genres'].str.contains('Documentary')

In [15]:
#eliminate movies that include "Documentary" in genre
basics=basics[~filter_documentaries]

### Display final preview of title basics and save to a csv

In [16]:
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162615 entries, 8 to 10016777
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          162615 non-null  object
 1   titleType       162615 non-null  object
 2   primaryTitle    162615 non-null  object
 3   originalTitle   162615 non-null  object
 4   isAdult         162615 non-null  object
 5   startYear       162615 non-null  bool  
 6   endYear         0 non-null       object
 7   runtimeMinutes  162615 non-null  object
 8   genres          162615 non-null  object
dtypes: bool(1), object(8)
memory usage: 11.3+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,False,,45,Romance
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,False,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,False,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,False,,120,"Adventure,Fantasy"
1273,tt0001285,movie,The Life of Moses,The Life of Moses,0,False,,50,"Biography,Drama,Family"


In [17]:
fpath_out ="Data/Filtered_Title_Basics"
basics.to_csv(fpath_out)

In [18]:
# Loading and inspecting saved file
loaded = pd.read_csv(fpath_out)
loaded.head()

Unnamed: 0.1,Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,8,tt0000009,movie,Miss Jerry,Miss Jerry,0,False,,45,Romance
1,570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,False,,70,"Action,Adventure,Biography"
2,587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,False,,90,Drama
3,672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,False,,120,"Adventure,Fantasy"
4,1273,tt0001285,movie,The Life of Moses,The Life of Moses,0,False,,50,"Biography,Drama,Family"


## Pre-processing for title ratings

In [19]:
# load title ratings
ratings=pd.read_csv('Data/title.ratings.tsv.gz', sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [20]:
# Filter the basics table down to only include the US by using the filter akas dataframe
filter_ratings = ratings['tconst'].isin(basics['tconst'])
ratings = ratings[filter_ratings]
ratings

Unnamed: 0,tconst,averageRating,numVotes
8,tt0000009,5.3,206
362,tt0000574,6.0,837
370,tt0000591,4.4,20
423,tt0000679,5.1,68
746,tt0001285,5.4,59
...,...,...,...
1331411,tt9914942,6.6,178
1331437,tt9915872,6.4,9
1331450,tt9916170,7.0,7
1331451,tt9916190,3.7,243


In [21]:
#check for nulls
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [22]:
#display final preview of filtered ratings
ratings.info()
ratings.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 132298 entries, 8 to 1331462
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         132298 non-null  object 
 1   averageRating  132298 non-null  float64
 2   numVotes       132298 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 4.0+ MB


Unnamed: 0,tconst,averageRating,numVotes
8,tt0000009,5.3,206
362,tt0000574,6.0,837
370,tt0000591,4.4,20
423,tt0000679,5.1,68
746,tt0001285,5.4,59


In [23]:
#save to a csv
fpath_out ="Data/Filtered_Title_Ratings"
ratings.to_csv(fpath_out)

In [24]:
# Loading and inspecting saved file
loaded = pd.read_csv(fpath_out)
loaded.head()

Unnamed: 0.1,Unnamed: 0,tconst,averageRating,numVotes
0,8,tt0000009,5.3,206
1,362,tt0000574,6.0,837
2,370,tt0000591,4.4,20
3,423,tt0000679,5.1,68
4,746,tt0001285,5.4,59
