# Loading the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the title.basics dump (a gzip-compressed, tab-separated file).
basics_url = (
    "https://datasets.imdbws.com/"
    "title.basics.tsv.gz"
)

In [3]:
# Location of the title.akas dump (a gzip-compressed, tab-separated file).
akas_url = (
    "https://datasets.imdbws.com/"
    "title.akas.tsv.gz"
)

In [4]:
# Location of the title.ratings dump (a gzip-compressed, tab-separated file).
rating_url = (
    "https://datasets.imdbws.com/"
    "title.ratings.tsv.gz"
)

In [5]:
# Read the title.basics dump straight from its URL as a tab-delimited table.
# low_memory=False turns off chunked internal parsing, which avoids
# mixed-type inference within a column.
basics = pd.read_csv(
    basics_url,
    delimiter='\t',
    low_memory=False,
)

In [6]:
# Read the title.akas dump straight from its URL as a tab-delimited table.
# low_memory=False turns off chunked internal parsing, which avoids
# mixed-type inference within a column.
akas = pd.read_csv(
    akas_url,
    delimiter='\t',
    low_memory=False,
)

In [7]:
# Read the title.ratings dump straight from its URL as a tab-delimited table.
# low_memory=False turns off chunked internal parsing, which avoids
# mixed-type inference within a column.
rating = pd.read_csv(
    rating_url,
    delimiter='\t',
    low_memory=False,
)

# Data Filtering/Cleaning

## Data: Title Basics

In [8]:
# Print a summary of the freshly loaded frame: index range, column names,
# dtypes and memory usage.
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9257995 entries, 0 to 9257994
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 635.7+ MB


In [9]:
# Replace the literal "\N" placeholder strings with np.nan so they are counted
# as missing values. The returned frame is assigned back instead of using
# inplace=True, which keeps the step an explicit "new value from old value".
basics = basics.replace({'\\N': np.nan})

In [10]:
# Number of missing entries in each column.
basics.isna().sum(axis=0)

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1235341
endYear           9161382
runtimeMinutes    6778985
genres             427921
dtype: int64

In [11]:
# endYear is missing in 9161382 of the 9257995 rows; remove the column.
basics = basics.drop('endYear', axis=1)

In [12]:
# Drop every row that has no runtimeMinutes value.
# The result is assigned back rather than using inplace=True, so the cell is an
# explicit transformation of `basics`.
basics = basics.dropna(subset=['runtimeMinutes'])

In [13]:
# Drop every row that has no genres value.
# The result is assigned back rather than using inplace=True, so the cell is an
# explicit transformation of `basics`.
basics = basics.dropna(subset=['genres'])

In [14]:
# Drop every row that has no startYear value.
# The result is assigned back rather than using inplace=True, so the cell is an
# explicit transformation of `basics`.
basics = basics.dropna(subset=['startYear'])

In [15]:
# Re-count the missing entries per column after the rows above were dropped.
basics.isna().sum(axis=0)

tconst            0
titleType         0
primaryTitle      1
originalTitle     1
isAdult           0
startYear         0
runtimeMinutes    0
genres            0
dtype: int64

In [16]:
# Cast startYear to integers so the year comparisons below are numeric.
basics['startYear'] = basics.startYear.astype(dtype=int)

In [17]:
# Row mask: the title is of type "movie".
filter1 = (basics.titleType == "movie")

In [18]:
# Row mask: the start year is 2000 or later.
filter2 = (basics.startYear >= 2000)

In [19]:
# Row mask: the start year is before 2022.
filter3 = (basics.startYear < 2022)

In [20]:
# Keep only the rows that satisfy all three masks at once.
basics = basics[filter1 & filter2 & filter3]

In [21]:
# Flag every title whose genres string mentions "documentary" (case-insensitive) ...
is_documentary = basics['genres'].str.contains('documentary', case=False)
# ... and keep only the rows that are not flagged.
basics = basics.loc[~is_documentary]



In [22]:
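## Keep only US movies
# Restrict the basics table to titles that also appear in the akas dataframe.
# Left disabled here: akas is only narrowed down to US rows further below, and the same filter is applied after that.
##keepers =basics['tconst'].isin(akas['titleId'])
##keepers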

In [23]:
# Disabled together with the `keepers` mask it depends on.
##basics = basics[keepers]
#basics

NameError: name 'keepers' is not defined

## Data: Title Akas

In [24]:
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
akas.info()
# Show the first rows as the cell's displayed value.
akas.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33318382 entries, 0 to 33318381
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB
None


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [25]:
# Keep only the akas rows whose region is "US".
akas = akas[akas.region == "US"]

In [26]:
# Replace the literal "\N" placeholder strings with np.nan.
# akas was produced by a boolean row filter, and modifying such a result in
# place can trigger pandas' SettingWithCopyWarning; assigning the returned
# frame back avoids mutating the filtered object.
akas = akas.replace({'\\N': np.nan})

## Data: Title Ratings

In [27]:
# info() writes its report to stdout itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
rating.info()
# Show the first rows as the cell's displayed value.
rating.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1261565 entries, 0 to 1261564
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1261565 non-null  object 
 1   averageRating  1261565 non-null  float64
 2   numVotes       1261565 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.9+ MB
None


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1913
1,tt0000002,5.8,258
2,tt0000003,6.5,1717
3,tt0000004,5.6,170
4,tt0000005,6.2,2533


In [28]:
# Replace any literal "\N" placeholder strings with np.nan.
# The returned frame is assigned back instead of using inplace=True, which
# keeps the step an explicit "new value from old value".
rating = rating.replace({'\\N': np.nan})

In [29]:
# Number of missing entries in each rating column.
rating.isna().sum(axis=0)

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [30]:
# Disabled: the same mask is built further below.
## keepers_rating = rating['tconst'].isin(akas['titleId'])
## keepers_rating

In [31]:
# Disabled: the same row filter is applied further below.
#rating = rating[keepers_rating]
#rating

In [32]:
# Summary of basics at this point in the notebook: entry count, per-column
# non-null counts and dtypes.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136920 entries, 34792 to 9257844
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          136920 non-null  object
 1   titleType       136920 non-null  object
 2   primaryTitle    136920 non-null  object
 3   originalTitle   136920 non-null  object
 4   isAdult         136920 non-null  object
 5   startYear       136920 non-null  int32 
 6   runtimeMinutes  136920 non-null  object
 7   genres          136920 non-null  object
dtypes: int32(1), object(7)
memory usage: 8.9+ MB


In [33]:
# Summary of akas at this point in the notebook: entry count, per-column
# non-null counts and dtypes.
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1354261 entries, 5 to 33318126
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1354261 non-null  object
 1   ordering         1354261 non-null  int64 
 2   title            1354261 non-null  object
 3   region           1354261 non-null  object
 4   language         3712 non-null     object
 5   types            964324 non-null   object
 6   attributes       45074 non-null    object
 7   isOriginalTitle  1352886 non-null  object
dtypes: int64(1), object(7)
memory usage: 93.0+ MB


In [34]:
# Summary of rating at this point in the notebook: entry count, per-column
# non-null counts and dtypes.
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1261565 entries, 0 to 1261564
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1261565 non-null  object 
 1   averageRating  1261565 non-null  float64
 2   numVotes       1261565 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.9+ MB


In [35]:
# True for each rating row whose tconst occurs among the titleId values in akas.
keepers_rating = rating.tconst.isin(akas.titleId)
keepers_rating

0           True
1           True
2          False
3          False
4           True
           ...  
1261560    False
1261561     True
1261562    False
1261563    False
1261564    False
Name: tconst, Length: 1261565, dtype: bool

In [36]:
# Keep only the rating rows selected by the keepers_rating mask.
rating = rating.loc[keepers_rating]
rating

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1913
1,tt0000002,5.8,258
4,tt0000005,6.2,2533
5,tt0000006,5.1,174
6,tt0000007,5.4,792
...,...,...,...
1261535,tt9916204,8.1,245
1261542,tt9916348,8.5,17
1261543,tt9916362,6.4,4887
1261547,tt9916428,3.8,14


In [37]:
# True for each basics row whose tconst occurs among the titleId values in akas.
keepers = basics.tconst.isin(akas.titleId)
keepers

34792       True
61094       True
67640       True
77934      False
86771       True
           ...  
9257667     True
9257676     True
9257715    False
9257760     True
9257844    False
Name: tconst, Length: 136920, dtype: bool

In [38]:
# Keep only the basics rows selected by the keepers mask.
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,118,"Comedy,Fantasy,Romance"
61094,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,70,Drama
67640,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,122,Drama
86771,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,100,"Comedy,Horror,Sci-Fi"
93907,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,126,Drama
...,...,...,...,...,...,...,...,...
9257131,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,74,Drama
9257527,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,97,"Comedy,Drama,Fantasy"
9257667,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,51,Drama
9257676,tt9916190,movie,Safeguard,Safeguard,0,2020,95,"Action,Adventure,Thriller"


# Creating a "Data" folder

In [39]:
import os

In [40]:
# Make sure the output directory exists; exist_ok=True keeps the call from
# raising when the directory is already there.
os.makedirs(name='Data/', exist_ok=True)

In [41]:
# List what is currently inside the Data/ directory.
os.listdir("Data/")


['.ipynb_checkpoints',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_rating.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

In [42]:
# Write each of the three frames to Data/ as a gzip-compressed CSV, leaving
# the index out of the file.
basics.to_csv(
    path_or_buf="Data/title_basics.csv.gz",
    index=False,
    compression='gzip',
)
akas.to_csv(
    path_or_buf="Data/title_akas.csv.gz",
    index=False,
    compression='gzip',
)
rating.to_csv(
    path_or_buf="Data/title_rating.csv.gz",
    index=False,
    compression='gzip',
)
