Import Necessary Dependencies

---

In [1]:
from pathlib import Path

import pandas as pd

# Output location of the merged dataset; it is passed to `to_csv` in the
# final cell of this notebook. The path is relative to the working directory.
SAVE_DATA_PATH = "../../data/final/final.csv"

Loading Our Data

---

In [2]:
# Folder holding the preprocessed inputs, and the two CSV files inside it.
BASE_DATA_PATH = "../../data/processed/"
MOVIELENS_DATA_PATH = f"{BASE_DATA_PATH}preprocessed_movielens_data.csv"
TMDB_DATA_PATH = f"{BASE_DATA_PATH}preprocessed_tmdb_data.csv"

# Read both sources, Movielens first, then TMDB.
data_movielens, data_tmdb = (
    pd.read_csv(csv_path) for csv_path in (MOVIELENS_DATA_PATH, TMDB_DATA_PATH)
)

Analysis

---

In [3]:
# Report (rows, columns) for each source before any cleaning or merging.
for label, frame in (
    ("Movielens Data Shape:", data_movielens),
    ("TMDB Data Shape:", data_tmdb),
):
    print(label, frame.shape)

Movielens Data Shape: (73859, 8)
TMDB Data Shape: (141426, 17)


In [4]:
# Preview the first five rows of each source table as plain text.
movielens_preview = data_movielens.head(n=5)
print("Movielens Dataframe : \n", movielens_preview)

tmdb_preview = data_tmdb.head(n=5)
print("TMDB Dataframe : \n", tmdb_preview)

Movielens Dataframe : 
                          title  \
0                    Toy Story   
1                      Jumanji   
2             Grumpier Old Men   
3            Waiting to Exhale   
4  Father of the Bride Part II   

                                              genres  release_year  \
0  ['adventure', 'animation', 'children', 'comedy...        1995.0   
1               ['adventure', 'children', 'fantasy']        1995.0   
2                              ['comedy', 'romance']        1995.0   
3                     ['comedy', 'drama', 'romance']        1995.0   
4                                         ['comedy']        1995.0   

                                      positive_users  positive_count  \
0  [1, 2, 7, 12, 24, 35, 42, 51, 54, 64, 72, 79, ...           50572   
1  [9, 41, 51, 73, 82, 101, 117, 177, 207, 210, 2...           10622   
2  [9, 41, 200, 314, 367, 473, 475, 540, 775, 940...            5152   
3  [260, 934, 1760, 2051, 2077, 2721, 2722, 2957,...          

In [5]:
# Distinct movie_id values found in each source.
movielens_ids = set(data_movielens["movie_id"])
tmdb_ids = set(data_tmdb["movie_id"])

# Keys present in both sources: the candidates that an inner join on
# movie_id can match.
common_ids = movielens_ids.intersection(tmdb_ids)
print("Number of common movie_ids:", len(common_ids))

Number of common movie_ids: 46650


Basic Cleaning

---

In [6]:
# Drop redundant and unnecessary columns from both dataframes before merging.
# The Movielens frame keeps its own "title" column, so the TMDB title fields
# are removed here.
movielens_redundant = ["release_year"]
tmdb_redundant = ["title", "original_title", "popularity"]

data_movielens = data_movielens.drop(columns=movielens_redundant)
data_tmdb = data_tmdb.drop(columns=tmdb_redundant)

Data Merging

---

In [7]:
# Inner-join the two sources on their shared key, so only movies whose
# movie_id appears in both frames are kept.
final_data = data_movielens.merge(data_tmdb, how="inner", on="movie_id")

# Show the first two merged records transposed (one column per movie), which
# keeps the wide table readable.
final_data.head(2).transpose()

Unnamed: 0,0,1
title,Toy Story,Jumanji
genres,"['adventure', 'animation', 'children', 'comedy...","['adventure', 'children', 'fantasy']"
positive_users,"[1, 2, 7, 12, 24, 35, 42, 51, 54, 64, 72, 79, ...","[9, 41, 51, 73, 82, 101, 117, 177, 207, 210, 2..."
positive_count,50572,10622
negative_users,"[14, 87, 180, 187, 196, 339, 468, 479, 486, 49...","[14, 39, 50, 72, 79, 141, 148, 149, 227, 265, ..."
negative_count,6299,6539
movie_id,862_tt0114709,8844_tt0113497
vote_average,7.971,7.239
vote_count,17152,9833
status,Released,Released


In [8]:
# If any cell in the merged table is missing, discard every row that contains
# a missing value and report the resulting shape; otherwise just say so.
if final_data.isna().any().any():
    print("There are missing values in the final dataset.")
    final_data = final_data.dropna()
    print("After dropping missing values, new shape:", final_data.shape)
else:
    print("No missing values in the final dataset.")

There are missing values in the final dataset.
After dropping missing values, new shape: (46018, 20)


In [9]:
# Convert movie_id back to the TMDB ID as an integer.
# movie_id is a composite string such as "862_tt0114709"; the part before the
# first underscore is the TMDB ID. The vectorised .str accessor is used
# instead of a per-row Python lambda, and assign/drop builds a new frame in a
# single expression rather than mutating final_data in place.
final_data = final_data.assign(
    tmdb_id=final_data["movie_id"].str.split("_", n=1).str[0].astype("int64")
).drop(columns=["movie_id"])

Save Our Final Data

---

In [10]:
# Make sure the destination folder exists first: to_csv does not create
# missing parent directories and raises if the folder is absent (for example
# on a fresh checkout where data/final/ has never been generated).
Path(SAVE_DATA_PATH).parent.mkdir(parents=True, exist_ok=True)

# Write the merged table without the positional index column.
final_data.to_csv(SAVE_DATA_PATH, index=False)