In [12]:
# Import Libraries
import os

import numpy as np
import pandas as pd

In [13]:
# Download locations of the three dataset dumps used below (this cell only
# defines the URLs; the actual loading happens in the following cells).
akas_url, ratings_url, basics_url = (
    f"https://datasets.imdbws.com/title.{table}.tsv.gz"
    for table in ("akas", "ratings", "basics")
)

In [14]:
# Read the tab-separated AKAs file and turn every literal '\N' cell into NaN
akas = (
    pd.read_csv(akas_url, sep='\t', low_memory=False)
    .replace({'\\N': np.nan})
)

# Keep only the AKA rows whose region is 'US'
# (rows with a missing region compare unequal and are dropped)
us_movies_akas = akas.loc[akas['region'].eq('US')]

In [15]:
# Read the tab-separated Ratings file and turn every literal '\N' cell into NaN
ratings = (
    pd.read_csv(ratings_url, sep='\t', low_memory=False)
    .replace({'\\N': np.nan})
)

In [16]:
# Read the tab-separated Basics file and turn every literal '\N' cell into NaN
basics = (
    pd.read_csv(basics_url, sep='\t', low_memory=False)
    .replace({'\\N': np.nan})
)

# Step 1: keep rows whose titleType is 'movie' and cast their startYear to
# float so it can be compared against a numeric year range.
basics = basics.loc[basics['titleType'].eq('movie')].assign(
    startYear=lambda frame: frame['startYear'].astype(float)
)

# Step 2: apply the remaining row conditions together:
#   - startYear between 2000 and 2021 (both ends included)
#   - genres does not mention 'documentary' (case-insensitive); rows with
#     missing genres are kept because na=False treats them as "no match"
#   - tconst appears among the titleId values of the US AKAs rows
basics = basics.loc[
    basics['startYear'].between(2000, 2021)
    & ~basics['genres'].str.contains('documentary', case=False, na=False)
    & basics['tconst'].isin(us_movies_akas['titleId'])
]

In [17]:
# Summarize each dataset: entry count, per-column non-null counts and dtypes.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" line after every report.

# Basics dataset after the movie / year / genre / US filters
basics.info()

# AKAs dataset restricted to rows whose region is 'US'
us_movies_akas.info()

# Ratings dataset (loaded with '\N' replaced by NaN; no row filter is applied
# to it in the loading cell)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97482 entries, 34803 to 9985716
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          97482 non-null  object 
 1   titleType       97482 non-null  object 
 2   primaryTitle    97482 non-null  object 
 3   originalTitle   97482 non-null  object 
 4   isAdult         97482 non-null  object 
 5   startYear       97482 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  83340 non-null  object 
 8   genres          94538 non-null  object 
dtypes: float64(1), object(8)
memory usage: 7.4+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1449468 entries, 5 to 36461217
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1449468 non-null  object
 1   ordering         1449468 non-null  int64 
 2   title            1449468 non

In [18]:
# Save the datasets as gzip-compressed CSV files under Data/.
# to_csv does not create missing parent folders, so make sure the output
# directory exists first; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs('Data', exist_ok=True)

# index=False: the row index is not written to the files.
basics.to_csv('Data/basics.csv.gz', compression='gzip', index=False)
us_movies_akas.to_csv('Data/akas.csv.gz', compression='gzip', index=False)
ratings.to_csv('Data/ratings.csv.gz', compression='gzip', index=False)