# Data

In [63]:
import pandas as pd
import numpy as np

In [64]:
# Load the IMDb title-basics table from its tab-separated .tsv.gz URL.
basics = pd.read_csv(
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    sep='\t',
    low_memory=False,
)

In [65]:
# Load the IMDb title-ratings table from its tab-separated .tsv.gz URL.
ratings = pd.read_csv(
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
    sep='\t',
    low_memory=False,
)

In [66]:
# Load the IMDb title-akas table from its tab-separated .tsv.gz URL.
akas = pd.read_csv(
    "https://datasets.imdbws.com/title.akas.tsv.gz",
    sep='\t',
    low_memory=False,
)

Some columns need to be converted to numeric types.

## Cleaning Data

### akas

In [None]:
# Summarize the akas frame before any cleaning is applied.
akas.info()

Changing \N to NaNs

In [None]:
# Replace every cell equal to the literal string \N with NaN, modifying akas in place.
akas.replace({'\\N': np.nan}, inplace=True)

Keep only rows for the US region

In [None]:
# Count rows per region before filtering.
akas['region'].value_counts()

In [None]:
# Keep only the rows whose region is exactly 'US' (NaN regions compare unequal and are dropped).
akas = akas.loc[akas['region'].eq('US')]

In [None]:
# Re-count regions to confirm that only 'US' remains after the filter.
akas['region'].value_counts()

In [None]:
# Summarize the akas frame again after filtering.
akas.info()

### basics

In [67]:
# Print the column/dtype summary, then display the first five rows of basics.
basics.info()
basics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9448506 entries, 0 to 9448505
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 648.8+ MB


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


Replace `\N` with `np.nan`

In [68]:
# Replace every cell equal to the literal string \N with NaN, modifying basics in place.
basics.replace({'\\N': np.nan}, inplace=True)

Remove rows with NaNs in the genres or runtimeMinutes columns

In [69]:
# Count the missing values in the genres column.
basics['genres'].isna().sum()

431729

In [70]:
# Count the missing values in the runtimeMinutes column.
basics['runtimeMinutes'].isna().sum()

6755882

In [71]:
# Drop every row that is missing a value in genres or runtimeMinutes.
required_columns = ['genres', 'runtimeMinutes']
basics = basics.dropna(subset=required_columns)

In [72]:
# Confirm that no missing values remain in genres.
basics['genres'].isna().sum()

0

In [73]:
# Confirm that no missing values remain in runtimeMinutes.
basics['runtimeMinutes'].isna().sum()

0

Filter for only movies

In [74]:
# Count rows per titleType to see which categories are present.
basics['titleType'].value_counts()

tvEpisode       1260290
short            585571
movie            373694
video            177548
tvMovie           89350
tvSeries          88259
tvSpecial         17214
tvMiniSeries      16574
tvShort            9501
videoGame           313
Name: titleType, dtype: int64

In [75]:
# Keep only rows whose titleType is exactly 'movie'.
basics = basics.loc[basics['titleType'].eq('movie')]
# Re-count to confirm that 'movie' is the only remaining category.
basics['titleType'].value_counts()

movie    373694
Name: titleType, dtype: int64

Filter for only fiction movies (remove documentaries)

In [76]:
# Flag rows whose genres string contains 'documentary' in any letter case.
is_doc = basics['genres'].str.contains('documentary', case=False)
# Keep only the rows that were not flagged.
basics = basics.loc[~is_doc]

In [77]:
# Sanity check: print any genres string that still mentions documentaries
# (capitalized or lower-case). No output means none remain.
for genre_text in basics['genres']:
    if 'Documentary' in genre_text or 'documentary' in genre_text:
        print(genre_text)

Keep only movies with a startYear from 2000 through 2021 (inclusive)

In [81]:
# Summarize basics before the year filter (startYear still has missing values here).
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 284077 entries, 8 to 9448355
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          284077 non-null  object
 1   titleType       284077 non-null  object
 2   primaryTitle    284077 non-null  object
 3   originalTitle   284077 non-null  object
 4   isAdult         284077 non-null  object
 5   startYear       279659 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  284077 non-null  object
 8   genres          284077 non-null  object
dtypes: object(9)
memory usage: 21.7+ MB


In [87]:
# Restrict releases to movies whose startYear is between 2000 and 2021,
# inclusive.
# startYear is stored as strings, so compare numerically rather than
# lexicographically (as text, a value such as '201' would sort between
# '2000' and '2021'). Values that cannot be parsed, including NaN, become
# NaN and are excluded by between().
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics[start_year.between(2000, 2021)]
# The column itself is left as strings, so min/max still return strings.
print(basics['startYear'].min())
basics['startYear'].max()

2000


'2021'

In [88]:
# Summarize basics after the year filter.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137412 entries, 34804 to 9448355
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          137412 non-null  object
 1   titleType       137412 non-null  object
 2   primaryTitle    137412 non-null  object
 3   originalTitle   137412 non-null  object
 4   isAdult         137412 non-null  object
 5   startYear       137412 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  137412 non-null  object
 8   genres          137412 non-null  object
dtypes: object(9)
memory usage: 10.5+ MB


In [90]:
# Removing Empty Column
# endYear has 0 non-null values at this point. Reassigning (instead of
# inplace=True) avoids mutating a filtered slice in place, and
# errors='ignore' lets this cell be re-run after the column is gone.
basics = basics.drop(columns='endYear', errors='ignore')
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137412 entries, 34804 to 9448355
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          137412 non-null  object
 1   titleType       137412 non-null  object
 2   primaryTitle    137412 non-null  object
 3   originalTitle   137412 non-null  object
 4   isAdult         137412 non-null  object
 5   startYear       137412 non-null  object
 6   runtimeMinutes  137412 non-null  object
 7   genres          137412 non-null  object
dtypes: object(8)
memory usage: 9.4+ MB


Keeping only US movies

In [96]:
# Keep only the titles whose tconst appears among the titleId values in akas.
basics = basics.loc[basics['tconst'].isin(akas['titleId'])]

In [97]:
# Summarize basics after restricting it to titles present in akas.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80085 entries, 34804 to 9448271
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          80085 non-null  object
 1   titleType       80085 non-null  object
 2   primaryTitle    80085 non-null  object
 3   originalTitle   80085 non-null  object
 4   isAdult         80085 non-null  object
 5   startYear       80085 non-null  object
 6   runtimeMinutes  80085 non-null  object
 7   genres          80085 non-null  object
dtypes: object(8)
memory usage: 5.5+ MB


### ratings

In [98]:
# Summarize the ratings frame before cleaning.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1257370 entries, 0 to 1257369
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1257370 non-null  object 
 1   averageRating  1257370 non-null  float64
 2   numVotes       1257370 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


Changing \N to NaNs

In [99]:
# Replace every cell equal to the literal string \N with NaN, modifying ratings in place.
# NOTE(review): the info() output above shows both numeric columns fully non-null,
# so presumably only tconst could contain \N - confirm whether this step changes anything.
ratings.replace({'\\N': np.nan}, inplace=True)

Keep only titles that have a US entry in akas

In [101]:
# Keep only the ratings whose tconst appears among the titleId values in akas.
# NOTE(review): this does not restrict ratings to the movies kept in basics -
# confirm whether non-movie titles should remain here.
ratings = ratings.loc[ratings['tconst'].isin(akas['titleId'])]

In [102]:
# Summarize ratings after restricting it to titles present in akas.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 478867 entries, 0 to 1257366
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         478867 non-null  object 
 1   averageRating  478867 non-null  float64
 2   numVotes       478867 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.6+ MB


# Saving Data on HardDrive

In [103]:
# Create the output folder with os.
import os
# exist_ok=True means no error is raised if 'Data/' already exists (e.g. on re-runs).
os.makedirs('Data/',exist_ok=True) 
# Confirm the folder exists by listing its contents.
os.listdir("Data/")

[]

In [104]:
# Write the cleaned basics frame to a gzip-compressed CSV, omitting the index.
basics.to_csv(
    path_or_buf="Data/title_basics.csv.gz",
    compression='gzip',
    index=False,
)

In [105]:
# Write the filtered akas frame to a gzip-compressed CSV, omitting the index.
akas.to_csv(
    path_or_buf="Data/title_akas.csv.gz",
    compression='gzip',
    index=False,
)

In [106]:
# Write the filtered ratings frame to a gzip-compressed CSV, omitting the index.
ratings.to_csv(
    path_or_buf="Data/title_ratings.csv.gz",
    compression='gzip',
    index=False,
)