In [33]:
#Import lIBRARIES
import pandas as pd
import numpy as np
import os as os

In [34]:
# example making new folder with os

os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['title.akas.csv.gz', 'title.ratings.csv.gz', 'title_basics.csv.gz']

In [35]:
##Load Basics set and assign to DF

basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"

basicsDF = pd.read_csv(basics_url, sep='\t', low_memory=False)

basicsDF.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [36]:
#Assign local file to DF
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [37]:
#Basics preprocessing

#Replace "\N" with np.nan
basics.replace({'\\N': np.nan}, inplace=True) 
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [38]:
#Eliminate movies that are null for runtimeMinutes
basics = basics.dropna(subset=['genres', 'runtimeMinutes'])

In [39]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear           34655
endYear           2306258
runtimeMinutes          0
genres                  0
dtype: int64

In [40]:
#keep only titleType==Movie
movie_filter = basics['titleType']=='movie'
movie_filter.head()

basics[movie_filter].head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910,,58,"Adventure,Drama"
1273,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50,"Biography,Drama,Family"


In [41]:
#keep startYear 2000-2022
start_filt = basics['startYear'] >= '2000'
start_filt.head()

0    False
1    False
2    False
3    False
4    False
Name: startYear, dtype: bool

In [42]:
end_filt = basics['startYear']<='2022'
end_filt.head()

0    True
1    True
2    True
3    True
4    True
Name: startYear, dtype: bool

In [43]:
basics = basics.loc[start_filt & end_filt, :]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,133,Documentary
33805,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
39547,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021,,6,Short
43551,tt0044326,short,Abstronic,Abstronic,0,2021,,6,Short


In [44]:
#Eliminate movies that include  "Documentary" in genre (see tip below)
#basics = basics[basics["genres"].str.contains("Documentary") == False]
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]
basics

In [45]:
pd.set_option("display.max_rows", None)
basics['genres'].value_counts()

Drama,Short                       97416
Drama                             89463
Comedy                            85274
Reality-TV                        73501
Short                             72636
Adult                             68815
Comedy,Short                      56943
Drama,Romance                     40382
Talk-Show                         36719
Action,Adventure,Animation        24519
News                              17890
Comedy,Drama                      17708
Comedy,Drama,Romance              17608
Horror,Short                      16589
Animation,Short                   16111
Crime,Drama,Mystery               14804
News,Talk-Show                    14682
Adventure,Animation,Comedy        13625
Music,Short                       13355
Game-Show                         12329
Music                             11044
Comedy,Romance                    10743
Comedy,Drama,Short                10663
Crime                             10546
Short,Thriller                    10485


In [46]:
##Load AKAs data set and save locally 
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

akasDF = pd.read_csv(akas_url, sep='\t', low_memory=False)

akasDF.to_csv("Data/title.akas.csv.gz",compression='gzip',index=False)

In [47]:
#Assign local file to DF
akas = pd.read_csv("Data/title.akas.csv.gz", low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [48]:
akas.replace({'\\N':np.nan}, inplace=True)

In [49]:
#AKA - keep only US entries. I keep breaking the main df, I'm creating a dummy
#df that I will move back once done.
dfa= akas
dfa.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [50]:
#Apply filter to retain only movies in US region
region_filt = dfa['region']=='US'
region_filt.head()

0    False
1    False
2    False
3    False
4    False
Name: region, dtype: bool

In [51]:
#Check that filtering worked...it did.  ONLY US region films are showing.
dfa[region_filt].head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [52]:
dfa = dfa.loc[region_filt, :]
dfa.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [53]:
dfa['region'].value_counts()

US    1326857
Name: region, dtype: int64

In [54]:
#Transfer dummy set back to actual df
akas = dfa
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [56]:
#Save new cleansed df to local csv file from earlier 
akas.to_csv("Data/title.akas.csv.gz",compression='gzip',index=False)

In [57]:
#Display df info as requested
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1326857 entries, 5 to 32244076
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1326857 non-null  object
 1   ordering         1326857 non-null  int64 
 2   title            1326857 non-null  object
 3   region           1326857 non-null  object
 4   language         3570 non-null     object
 5   types            1025868 non-null  object
 6   attributes       44141 non-null    object
 7   isOriginalTitle  1325482 non-null  object
dtypes: int64(1), object(7)
memory usage: 91.1+ MB


In [58]:
##Load Ratings data set and assign to DF
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

ratingsDF = pd.read_csv(ratings_url, sep='\t', low_memory=False)

ratingsDF.to_csv("Data/title.ratings.csv.gz",compression='gzip',index=False)

In [59]:
#Assign local file to DF
ratings = pd.read_csv("Data/title.ratings.csv.gz", low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1886
1,tt0000002,5.9,250
2,tt0000003,6.5,1673
3,tt0000004,5.8,163
4,tt0000005,6.2,2493


In [None]:
#RATINGS Preprocessing
ratings = ratings.replace({'\\N':np.nan}) 

In [None]:
ratings.info()

In [None]:
ratings.to_csv("Data/title.ratings.csv.gz",compression='gzip',index=False)

In [None]:
del ratings

In [None]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers =basics['tconst'].isin(akas['titleId'])
keepers

In [None]:
basics = basics[keepers]
basics

In [None]:
basics = basics.to_csv("Data/Final_basics.csv.gz", low_memory=False, compression= 'gzip',index=False)
basics.head()