In [10]:
import csv
import os

import pandas as pd

# Every dataset path in the cells below is built relative to the directory
# the kernel is currently running in.
script_dir = os.getcwd()

# Show the directory so a wrong launch location is obvious before any file is read.
cwd_message = f"Current working directory: {script_dir}"
print(cwd_message)

Current working directory: c:\Users\willi\OneDrive\Documents\GitHub\Movie-Recommendations


The raw title.basics.tsv.gz file (read below in its decompressed form, title.basics.tsv) contains the following fields: tconst, titleType, primaryTitle, originalTitle, isAdult, startYear, endYear, runtimeMinutes, and genres.

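Since the primary title, original title, and end year are not relevant features, I have removed them, keeping only tconst, isAdult, startYear, runtimeMinutes, and genres. Additionally, I have filtered out any content whose titleType is not "movie" (which also excludes "short" films); titleType itself is dropped once this filter has been applied.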

In [11]:
# Load data
basics_input_file = os.path.join(script_dir, "IMDB Datasets", "title.basics.tsv")

# Only these columns are used below; skipping the two title columns and endYear
# avoids holding millions of unused strings in memory.
basics_columns = ["tconst", "titleType", "isAdult", "startYear", "runtimeMinutes", "genres"]

df = pd.read_csv(
    basics_input_file,
    sep="\t",
    dtype=str,            # keep every field as text; nothing is cast in this cell
    na_values="\\N",      # the two-character sequence \N is treated as missing
    usecols=basics_columns,
    # The file is plain tab-separated text without a quoting convention, but
    # titles may contain literal double quotes. With the default quote handling
    # an unbalanced '"' makes the parser swallow the following tabs and newlines
    # into a single field, silently dropping rows. QUOTE_NONE disables that.
    quoting=csv.QUOTE_NONE,
)

# Filter irrelevant columns
# titleType is needed only for the row filter, so it is left out of the selection.
is_movie = df["titleType"] == "movie"
title_basics_df_filtered = df.loc[is_movie, ["tconst", "isAdult", "startYear", "runtimeMinutes", "genres"]]

display(title_basics_df_filtered.head(10))

Unnamed: 0,tconst,isAdult,startYear,runtimeMinutes,genres
8,tt0000009,0,1894,45.0,Romance
144,tt0000147,0,1897,100.0,"Documentary,News,Sport"
498,tt0000502,0,1905,100.0,
570,tt0000574,0,1906,70.0,"Action,Adventure,Biography"
587,tt0000591,0,1907,90.0,Drama
610,tt0000615,0,1907,,Drama
625,tt0000630,0,1908,,Drama
668,tt0000675,0,1908,,Drama
672,tt0000679,0,1908,120.0,"Adventure,Fantasy"
828,tt0000838,0,1909,,


To ensure data quality, I have also removed titles with fewer than 1,000 votes (numVotes), eliminating lesser-known films that could introduce noise into the recommendations.

In [12]:
# Read the ratings table: every column is loaded as text and "\N" is treated as missing
ratings_input_file = os.path.join(script_dir, "IMDB Datasets", "title.ratings.tsv")
df = pd.read_csv(ratings_input_file, sep="\t", dtype=str, na_values="\\N")

# Vote counts have to be integers before they can be summarised or compared
df = df.astype({"numVotes": int})
vote_counts = df["numVotes"]

# Summary statistics of the vote counts across all rated titles
mean_votes = vote_counts.mean()
median_votes = vote_counts.median()
mode_votes = vote_counts.mode().iloc[0]  # mode() returns a Series; take its first entry

# Report the statistics
print(f"Mean: {mean_votes}")
print(f"Median: {median_votes}")
print(f"Mode: {mode_votes}")

# Keep only the titles that received at least 1000 votes
has_enough_votes = vote_counts >= 1000
ratings_df_filtered = df.loc[has_enough_votes]

display(ratings_df_filtered.head(10))

Mean: 1025.952498432897
Median: 26.0
Mode: 7


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2137
2,tt0000003,6.4,2170
4,tt0000005,6.2,2902
7,tt0000008,5.4,2281
9,tt0000010,6.8,7882
11,tt0000012,7.4,13382
12,tt0000013,5.7,2051
13,tt0000014,7.1,6109
14,tt0000015,6.1,1262
15,tt0000016,5.9,1646


To create a comprehensive dataset, I merged the filtered movie data with the ratings and crew information using the common identifier tconst. Additionally, I removed the numVotes column, as it is no longer needed once the vote threshold has been applied (titleType was already dropped when the movie data was filtered).

In [13]:
# Load data
crew_input_file = os.path.join(script_dir, "IMDB Datasets", "title.crew.tsv")
df_crew = pd.read_csv(crew_input_file, sep="\t", dtype=str, na_values="\\N")

# Merge dataframes
# Inner join: keep only the movies that also appear in the vote-filtered ratings.
df_merged = pd.merge(title_basics_df_filtered, ratings_df_filtered, on="tconst", how="inner")
# Left join: keep every one of those movies even when no crew row matches
# (directors / writers are then missing for that movie).
df_merged = pd.merge(df_merged, df_crew, on="tconst", how="left")

# Remove numVotes
df_merged = df_merged.drop(columns=["numVotes"])

# Save data
# to_csv does not create missing parent directories and raises OSError when the
# folder is absent (e.g. on a fresh clone), so make sure it exists first.
output_dir = os.path.join(script_dir, "Cleaned Datasets")
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "merged_movie_data.tsv")
df_merged.to_csv(output_file, sep="\t", index=False)

# Display
print(f"Merged dataset saved to: {output_file}")
display(df_merged.head(10))

Merged dataset saved to: c:\Users\willi\OneDrive\Documents\GitHub\Movie-Recommendations\Cleaned Datasets\merged_movie_data.tsv


Unnamed: 0,tconst,isAdult,startYear,runtimeMinutes,genres,averageRating,directors,writers
0,tt0002130,0,1911,71,"Adventure,Drama,Fantasy",7.0,"nm0078205,nm0655824,nm0209738",nm0019604
1,tt0002423,0,1919,113,"Biography,Drama,Romance",6.6,nm0523932,"nm0266183,nm0473134"
2,tt0002844,0,1913,54,"Crime,Drama",6.9,nm0275421,"nm0019855,nm0275421,nm0816232"
3,tt0003014,0,1913,96,Drama,7.0,nm0803705,"nm0472236,nm0803705"
4,tt0003037,0,1913,61,"Crime,Drama",6.9,nm0275421,"nm0019855,nm0275421,nm0816232"
5,tt0003165,0,1913,90,"Crime,Drama,Mystery",6.9,nm0275421,"nm0019855,nm0275421,nm0816232"
6,tt0003419,0,1913,85,"Drama,Fantasy,Horror",6.4,"nm0263912,nm0753233","nm0263912,nm0210503"
7,tt0003643,0,1914,78,"Crime,Drama,Horror",6.4,nm0000428,"nm0000590,nm0000428"
8,tt0003740,0,1914,148,"Adventure,Drama,History",7.1,nm0665163,"nm0195339,nm0515385,nm0665163,nm0758215"
9,tt0003772,0,1914,52,"Drama,Fantasy",6.0,nm0456804,nm0674518
