### Load and Unzip the Data

In [1]:
import pandas as pd

# Locations of the gzipped IMDb TSV dumps (update these if your files are in a different location)
title_basics_path = "title.basics.tsv.gz"
title_ratings_path = "title.ratings.tsv.gz"
title_crew_path = "title.crew.tsv.gz"
title_principals_path = "title.principals.tsv.gz"

# Parser settings shared by all four files: tab-separated, the two-character
# token \N read as missing, gzip decompression, and low_memory=False so each
# column's dtype is inferred from the whole file rather than chunk by chunk.
read_options = {"sep": '\t', "na_values": "\\N", "compression": 'gzip', "low_memory": False}

# Load each table into its own DataFrame
title_basics_df = pd.read_csv(title_basics_path, **read_options)
title_ratings_df = pd.read_csv(title_ratings_path, **read_options)
title_crew_df = pd.read_csv(title_crew_path, **read_options)
title_principals_df = pd.read_csv(title_principals_path, **read_options)

# Print the first rows of every table, in load order
samples = (
    ("Title Basics", title_basics_df),
    ("Title Ratings", title_ratings_df),
    ("Title Crew", title_crew_df),
    ("Title Principals", title_principals_df),
)
for table_label, table_df in samples:
    print(f"{table_label} Sample:")
    print(table_df.head())


Title Basics Sample:
      tconst titleType            primaryTitle           originalTitle  \
0  tt0000001     short              Carmencita              Carmencita   
1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
2  tt0000003     short            Poor Pierrot          Pauvre Pierrot   
3  tt0000004     short             Un bon bock             Un bon bock   
4  tt0000005     short        Blacksmith Scene        Blacksmith Scene   

   isAdult  startYear  endYear runtimeMinutes                    genres  
0      0.0     1894.0      NaN              1         Documentary,Short  
1      0.0     1892.0      NaN              5           Animation,Short  
2      0.0     1892.0      NaN              5  Animation,Comedy,Romance  
3      0.0     1892.0      NaN             12           Animation,Short  
4      0.0     1893.0      NaN              1                     Short  
Title Ratings Sample:
      tconst  averageRating  numVotes
0  tt0000001            5.7   

In [2]:
# Minimum number of votes a title needs for its rating to be kept
MIN_VOTES = 100

# Step 1: Keep only movies (ignore TV shows, shorts, etc.)
is_movie = title_basics_df["titleType"] == "movie"

# Step 2: Convert startYear and runtimeMinutes to numeric.
# This happens BEFORE the missing-year filter so that values coerced to NaN
# here are removed in Step 3 as well. `.assign` builds a new frame, which
# avoids assigning columns onto a boolean-mask slice of title_basics_df
# (the pattern that can raise SettingWithCopyWarning).
movies_df = title_basics_df.loc[is_movie].assign(
    startYear=lambda df: pd.to_numeric(df["startYear"], errors="coerce"),
    runtimeMinutes=lambda df: pd.to_numeric(df["runtimeMinutes"], errors="coerce"),
)

# Step 3: Remove movies without a (valid) release year
movies_df = movies_df.dropna(subset=["startYear"])

# Step 4: Filter ratings to keep titles with at least MIN_VOTES votes.
# The in-place conversion is idempotent, so re-running this cell is safe.
title_ratings_df["numVotes"] = pd.to_numeric(title_ratings_df["numVotes"], errors="coerce")
filtered_ratings_df = title_ratings_df[title_ratings_df["numVotes"] >= MIN_VOTES]

# Step 5: Merge movies and ratings (inner join: only movies that have a kept rating)
merged_df = pd.merge(movies_df, filtered_ratings_df, on="tconst", how="inner")

# Show sample data
print("Filtered Movies Sample:")
print(merged_df.head())
print(f"\nTotal Movies After Filtering: {len(merged_df)}")


Filtered Movies Sample:
      tconst titleType                   primaryTitle  \
0  tt0000009     movie                     Miss Jerry   
1  tt0000147     movie  The Corbett-Fitzsimmons Fight   
2  tt0000574     movie    The Story of the Kelly Gang   
3  tt0001892     movie                 Den sorte drøm   
4  tt0001964     movie                  The Traitress   

                   originalTitle  isAdult  startYear  endYear  runtimeMinutes  \
0                     Miss Jerry      0.0     1894.0      NaN            45.0   
1  The Corbett-Fitzsimmons Fight      0.0     1897.0      NaN           100.0   
2    The Story of the Kelly Gang      0.0     1906.0      NaN            70.0   
3                 Den sorte drøm      0.0     1911.0      NaN            53.0   
4                 Die Verräterin      0.0     1911.0      NaN            48.0   

                       genres  averageRating  numVotes  
0                     Romance            5.3       220  
1      Documentary,News,Sport   

In [3]:
# Step 1: Merge director & writer info
def _split_id_list(value):
    """Turn a comma-separated ID string into a list; pass anything else through.

    Non-string values (a list left by an earlier run of this cell, or NaN for
    a missing entry) are returned unchanged, so the conversion can be applied
    repeatedly without destroying data. An empty string or a literal "\\N"
    yields an empty list rather than a list holding one bogus ID.
    """
    if isinstance(value, str):
        return [] if value in ("", "\\N") else value.split(",")
    return value


for crew_column in ("directors", "writers"):
    title_crew_df[crew_column] = title_crew_df[crew_column].map(_split_id_list)

# Remove any columns this cell added on a previous run; merging onto them
# again would create directors_x / directors_y and break the steps below.
base_df = merged_df.drop(columns=["directors", "writers", "nconst", "top_actors"], errors="ignore")
merged_df = pd.merge(base_df, title_crew_df, on="tconst", how="left")

# Step 2: Extract top actors (principals)
# Keeps actor/actress rows whose `ordering` is 1-3, i.e. performers found among
# the first three listed principals (a title may therefore have fewer than three).
# to_numeric(errors="coerce") makes a missing ordering compare False instead of
# raising the way astype(int) does.
ordering = pd.to_numeric(title_principals_df["ordering"], errors="coerce")
is_top_actor = title_principals_df["category"].isin(["actor", "actress"]) & (ordering <= 3)
top_actors_df = title_principals_df[is_top_actor]

# Group actors by movie and collect their IDs into one list per title
top_actors_df = top_actors_df.groupby("tconst")["nconst"].agg(list).reset_index()

# Merge actors into the main dataset
merged_df = pd.merge(merged_df, top_actors_df, on="tconst", how="left")

# Step 3: Replace NaN (no crew / no actors matched) with empty lists
for list_column in ("directors", "writers", "nconst"):
    merged_df[list_column] = merged_df[list_column].apply(lambda x: x if isinstance(x, list) else [])

# Rename the actors column for clarity
merged_df = merged_df.rename(columns={"nconst": "top_actors"})

# Show sample data
print("Merged Data Sample:")
print(merged_df.head())
print(f"\nTotal Movies After Merging Crew & Actors: {len(merged_df)}")


Merged Data Sample:
      tconst titleType                   primaryTitle  \
0  tt0000009     movie                     Miss Jerry   
1  tt0000147     movie  The Corbett-Fitzsimmons Fight   
2  tt0000574     movie    The Story of the Kelly Gang   
3  tt0001892     movie                 Den sorte drøm   
4  tt0001964     movie                  The Traitress   

                   originalTitle  isAdult  startYear  endYear  runtimeMinutes  \
0                     Miss Jerry      0.0     1894.0      NaN            45.0   
1  The Corbett-Fitzsimmons Fight      0.0     1897.0      NaN           100.0   
2    The Story of the Kelly Gang      0.0     1906.0      NaN            70.0   
3                 Den sorte drøm      0.0     1911.0      NaN            53.0   
4                 Die Verräterin      0.0     1911.0      NaN            48.0   

                       genres  averageRating  numVotes    directors  \
0                     Romance            5.3       220  [nm0085156]   
1      D

In [8]:
from sklearn.model_selection import train_test_split
import os

# Output goes to <parent of the current working directory>/imdb-data
repo_root = os.path.abspath("..")
output_dir = os.path.join(repo_root, "imdb-data")

# Ensure only necessary columns for content-based filtering are retained
selected_columns = ["tconst", "primaryTitle", "originalTitle", "startYear", "runtimeMinutes", "genres", "averageRating", "numVotes"]

filtered_df = merged_df[selected_columns].dropna()  # Drop rows with missing values

# Perform an 80/20 train-test split (fixed random_state keeps the split reproducible)
train_df, test_df = train_test_split(filtered_df, test_size=0.2, random_state=42)

# Save the train and test sets.
# to_csv does not create missing parent directories, so create the target
# directory first; exist_ok=True makes this a no-op when it already exists.
os.makedirs(output_dir, exist_ok=True)

train_csv_path = os.path.join(output_dir, "imdb_train_data.csv")
test_csv_path = os.path.join(output_dir, "imdb_test_data.csv")

train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

print("IMDb Train Data saved!")
print("IMDb Test Data saved!")



IMDb Train Data saved!
IMDb Test Data saved!
