In [1]:
import pandas as pd
import numpy as np

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)

In [4]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9228738 entries, 0 to 9228737
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 633.7+ MB


In [6]:
basics.isna().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            10
dtype: int64

In [7]:
basics.replace({'\\N':np.nan},inplace = True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [8]:
basics.dropna(inplace=True)
basics.isna().sum()

tconst            0
titleType         0
primaryTitle      0
originalTitle     0
isAdult           0
startYear         0
endYear           0
runtimeMinutes    0
genres            0
dtype: int64

In [9]:
basics.duplicated().sum()

0

In [10]:
basics['startYear'] = basics['startYear'].astype(int)
basics = basics.loc[(basics['startYear']>=2000) & (basics['startYear']<=2022)]

In [11]:
dataType = basics.dtypes['startYear']
print(dataType)

int64


In [12]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [13]:
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
108529,tt0111056,tvSeries,Gensomaden Saiyuki,Gensomaden Saiyuki,0,2000,2001,23,"Action,Adventure,Animation"
137803,tt0142032,tvMiniSeries,Dune,Dune,0,2000,2000,265,"Adventure,Drama,Sci-Fi"
153524,tt0158466,tvMiniSeries,Anne of Green Gables: The Continuing Story,Anne of Green Gables: The Continuing Story,0,2000,2000,185,"Drama,Family,Romance"
155717,tt0160904,tvSeries,MI-5,Spooks,0,2002,2011,60,"Action,Crime,Drama"
158583,tt0163944,tvMiniSeries,Labyrinten,Labyrinten,0,2000,2000,259,"Drama,Thriller"
...,...,...,...,...,...,...,...,...,...
9227911,tt9915022,tvSeries,Yarali Kuslar,Yarali Kuslar,0,2019,2019,90,Drama
9227944,tt9915114,tvSeries,Touche pas à mon sport,Touche pas à mon sport,0,2015,2016,65,"Sport,Talk-Show"
9228045,tt9915338,tvSeries,Aunty Donna: Camp Bush Camp!,Aunty Donna: Camp Bush Camp!,0,2018,2018,5,Comedy
9228251,tt9915822,tvSeries,Ichhapyaari Naagin,Ichhapyaari Naagin,0,2016,2017,20,Fantasy


In [14]:
keepers =basics['tconst'].isin(akas['titleId']=='US')
keepers

108529     False
137803     False
153524     False
155717     False
158583     False
           ...  
9227911    False
9227944    False
9228045    False
9228251    False
9228431    False
Name: tconst, Length: 22320, dtype: bool

In [15]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [16]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33214887 entries, 0 to 33214886
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB


In [17]:
akas.isna().sum()

titleId              0
ordering             0
title                5
region             104
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [18]:
akas.replace({'\\N':np.nan},inplace = True)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [19]:
akas.dropna(inplace=True)
akas.isna().sum()

titleId            0
ordering           0
title              0
region             0
language           0
types              0
attributes         0
isOriginalTitle    0
dtype: int64

In [20]:
akas.duplicated().sum()

0

In [22]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1714
3,tt0000004,5.6,169
4,tt0000005,6.2,2529


In [23]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259994 entries, 0 to 1259993
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1259994 non-null  object 
 1   averageRating  1259994 non-null  float64
 2   numVotes       1259994 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [24]:
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [25]:
keepers_ratings =ratings['tconst'].isin(akas['titleId']=='US')
keepers_ratings

0          False
1          False
2          False
3          False
4          False
           ...  
1259989    False
1259990    False
1259991    False
1259992    False
1259993    False
Name: tconst, Length: 1259994, dtype: bool