# Downloading the Files

In [1]:
import pandas as pd
import numpy as np

In [2]:
basics_url ="https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"

# Loading TSV's with Pandas

In [4]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10234933,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2009,\N,\N,"Action,Drama,Family"
10234934,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
10234935,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
10234936,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


In [6]:
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2000
1,tt0000002,5.8,269
2,tt0000003,6.5,1890
3,tt0000004,5.5,178
4,tt0000005,6.2,2676
...,...,...,...
1359481,tt9916730,7.6,11
1359482,tt9916766,7.0,22
1359483,tt9916778,7.2,36
1359484,tt9916840,8.8,6


In [7]:
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0
...,...,...,...,...,...,...,...,...
37476204,tt9916852,5,Episódio #3.20,PT,pt,\N,\N,0
37476205,tt9916852,6,Episodio #3.20,IT,it,\N,\N,0
37476206,tt9916852,7,एपिसोड #3.20,IN,hi,\N,\N,0
37476207,tt9916856,1,The Wind,DE,\N,imdbDisplay,\N,0


# Required Preprocessing - Details

## Filtering/Cleaning Steps:

## Title Basics:

###  Replace "\N" with np.nan

In [8]:
basics.replace({'\\N':np.nan}, inplace=True)

In [10]:
for col in basics:
    print('Column {} has {} missing values'.format(col,basics[col].isnull().sum()))

Column tconst has 0 missing values
Column titleType has 0 missing values
Column primaryTitle has 17 missing values
Column originalTitle has 17 missing values
Column isAdult has 1 missing values
Column startYear has 1369569 missing values
Column endYear has 10121304 missing values
Column runtimeMinutes has 7147085 missing values
Column genres has 458214 missing values


### Eliminate movies that are null for runtimeMinutes

In [12]:
basics = basics[basics['runtimeMinutes'].notna()]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
10234888,tt9916754,movie,Chico Albuquerque - Revelações,Chico Albuquerque - Revelações,0,2013,,49,Documentary
10234894,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0,2019,,43,"Family,Game-Show,Reality-TV"
10234929,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0,2014,,11,"Adventure,Animation,Comedy"
10234936,tt9916856,short,The Wind,The Wind,0,2015,,27,Short


### Eliminate movies that are null for genre

In [13]:
basics['genres'].isnull().sum()

80725

In [14]:
basics = basics[basics['genres'].notna()]

In [15]:
basics['genres'].isnull().sum()

0

### keep only titleType==Movie

In [17]:
# basics = basics[basics['titleType'] == 'movies']
# basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres


### keep startYear 2000-2022

In [16]:
basics = basics[(basics['startYear'] > '2000') & (basics['startYear'] > '2022')]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
69150,tt0070596,movie,Socialist Realism,El realismo socialista,0,2023,,78,Drama
74674,tt0076277,movie,Kodiyettam,Kodiyettam,0,2023,,137,Drama
86944,tt0088907,movie,Chidambaram,Chidambaram,0,2023,,103,Drama
106656,tt0109128,movie,Washed Ashore,Angeschwemmt,0,2023,,86,Documentary
107282,tt0109766,short,Fado Lusitano,Fado Lusitano,0,2023,,6,"Animation,History,Short"
...,...,...,...,...,...,...,...,...,...
10217277,tt9878650,movie,Surviving the Storm,Surviving the Storm,0,2023,,113,Drama
10217457,tt9879042,movie,Saudades do futuro,Saudades do futuro,0,2023,,120,Crime
10218806,tt9881942,movie,The Bilbaos,Los Bilbao,0,2023,,73,Documentary
10219659,tt9883868,tvSeries,Noble Detective,Bezsonov,0,2023,2023,48,"Crime,Drama,History"


### Eliminate movies that include "Documentary" in genre

In [17]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [18]:
(basics['genres'] == 'documentary').sum()

0

In [19]:
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
69150,tt0070596,movie,Socialist Realism,El realismo socialista,0,2023,,78,Drama
74674,tt0076277,movie,Kodiyettam,Kodiyettam,0,2023,,137,Drama
86944,tt0088907,movie,Chidambaram,Chidambaram,0,2023,,103,Drama
107282,tt0109766,short,Fado Lusitano,Fado Lusitano,0,2023,,6,"Animation,History,Short"
119371,tt0122511,movie,The Gnomes Great Adventure,The Gnomes Great Adventure,0,2023,,74,"Adventure,Animation,Comedy"
...,...,...,...,...,...,...,...,...,...
10214472,tt9872558,movie,Takkar,Takkar,0,2023,,138,"Action,Drama,Romance"
10215113,tt9873892,movie,They Cloned Tyrone,They Cloned Tyrone,0,2023,,122,"Comedy,Mystery,Sci-Fi"
10217277,tt9878650,movie,Surviving the Storm,Surviving the Storm,0,2023,,113,Drama
10217457,tt9879042,movie,Saudades do futuro,Saudades do futuro,0,2023,,120,Crime


### Keep only US movies

In [20]:
akas = akas[akas['region'] == 'US']

In [56]:
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
37475735,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,\N,imdbDisplay,\N,0
37475805,tt9916620,1,The Copeland Case,US,\N,imdbDisplay,\N,0
37475894,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
37475937,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


### Replace "\N" with np.nan

In [21]:
akas.replace({'\\N':np.nan}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  akas.replace({'\\N':np.nan}, inplace=True)


In [22]:
keepers = basics['tconst'].isin(akas['titleId'])
keepers

69150        True
74674       False
86944       False
107282      False
119371       True
            ...  
10214472     True
10215113     True
10217277     True
10217457    False
10219659    False
Name: tconst, Length: 50025, dtype: bool

In [23]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
69150,tt0070596,movie,Socialist Realism,El realismo socialista,0,2023,,78,Drama
119371,tt0122511,movie,The Gnomes Great Adventure,The Gnomes Great Adventure,0,2023,,74,"Adventure,Animation,Comedy"
338597,tt0353564,video,It's Joe Time!,It's Joe Time!,0,2023,,51,"Animation,Comedy,Family"
339957,tt0354964,movie,Railway Guerrilla,Tie dao you ji dui,0,2023,,83,War
421838,tt0439572,movie,The Flash,The Flash,0,2023,,144,"Action,Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
10212230,tt9867628,movie,Thuramukham,Thuramukham,0,2023,,175,"Action,Drama,Thriller"
10212815,tt9868836,short,Eat Your Heart Out,Eat Your Heart Out,0,2023,,11,"Comedy,Short"
10214472,tt9872558,movie,Takkar,Takkar,0,2023,,138,"Action,Drama,Romance"
10215113,tt9873892,movie,They Cloned Tyrone,They Cloned Tyrone,0,2023,,122,"Comedy,Mystery,Sci-Fi"


## Ratings:

### Replace "\N" with np.nan (if any)

In [24]:
ratings.replace({'\\N':np.nan}, inplace=True)

In [26]:
for col in ratings:
    print('Column {} has {} missing values'.format(col,ratings[col].isnull().sum()))

Column tconst has 0 missing values
Column averageRating has 0 missing values
Column numVotes has 0 missing values


### Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)

In [27]:
keepers = ratings['tconst'].isin(akas['titleId'])
keepers

0           True
1           True
2          False
3          False
4           True
           ...  
1359481    False
1359482    False
1359483    False
1359484    False
1359485    False
Name: tconst, Length: 1359486, dtype: bool

In [28]:
ratings = ratings[keepers]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2000
1,tt0000002,5.8,269
4,tt0000005,6.2,2676
5,tt0000006,5.0,182
6,tt0000007,5.4,838
...,...,...,...
1359448,tt9916200,8.1,237
1359449,tt9916204,8.2,272
1359456,tt9916348,8.3,18
1359457,tt9916362,6.4,5569


# Creating a "Data" folder.

In [29]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['title_akas.csv.gz', 'title_basics.csv.gz', 'title_ratings.csv.gz']

# Saving Compressed .csv.gz Files

In [30]:
## Save current dataframe to file.
basics.to_csv("Data/basics.csv.gz",compression='gzip',index=False)

In [31]:
# Open saved file and preview again
basics_ = pd.read_csv("Data/basics.csv.gz", low_memory = False)
basics_.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0070596,movie,Socialist Realism,El realismo socialista,0,2023,,78,Drama
1,tt0122511,movie,The Gnomes Great Adventure,The Gnomes Great Adventure,0,2023,,74,"Adventure,Animation,Comedy"
2,tt0353564,video,It's Joe Time!,It's Joe Time!,0,2023,,51,"Animation,Comedy,Family"
3,tt0354964,movie,Railway Guerrilla,Tie dao you ji dui,0,2023,,83,War
4,tt0439572,movie,The Flash,The Flash,0,2023,,144,"Action,Adventure,Fantasy"


In [32]:
ratings.to_csv("Data/ratings.csv.gz",compression='gzip',index=False)

In [33]:
ratings_ = pd.read_csv("Data/ratings.csv.gz", low_memory = False)
ratings_.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2000
1,tt0000002,5.8,269
2,tt0000005,6.2,2676
3,tt0000006,5.0,182
4,tt0000007,5.4,838


In [34]:
akas.to_csv("Data/akas.csv.gz",compression='gzip',index=False)

In [35]:
akas_ = pd.read_csv("Data/akas.csv.gz", low_memory = False)
akas_.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
