## 1. MovieLens Data Preprocessing

---

Import Necessary Dependencies

---

In [1]:
import pandas as pd 

SAVE_DATA_PATH_MOVIELENS = "../data/processed/preprocessed_movielens_data.csv"

Loading our data

---

In [2]:
BASE_DATA_PATH_MOVIELENS = "../data/raw/movielens-data/"
MOVIES_DATA_PATH = BASE_DATA_PATH_MOVIELENS + "movies.csv"
LINKS_DATA_PATH = BASE_DATA_PATH_MOVIELENS + "links.csv"
RATINGS_DATA_PATH = BASE_DATA_PATH_MOVIELENS + "ratings.csv"

In [3]:
data_movies = pd.read_csv(MOVIES_DATA_PATH)
data_links = pd.read_csv(LINKS_DATA_PATH)
data_ratings = pd.read_csv(RATINGS_DATA_PATH)

print("Movies data : \n", data_movies.head())
print("Links data : \n", data_links.head())
print("Ratings data : \n", data_ratings.head())

Movies data : 
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
Links data : 
    movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0
Ratings data : 
    userId  movieId  rating   timestamp
0       1        1     4.0  1225734739
1       1      110     4.0  1225865086
2       1      158     4.0  1225733503
3       1      260     4.5  12257

### Analysis and Cleaning (MovieLens)

---

In [4]:
data_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
rows, cols = data_movies.shape
print("Number of movies: ", rows)
print("Number of attributes per movie: ", cols)

Number of movies:  86537
Number of attributes per movie:  3


In [6]:
duplicate_titles = (data_movies['title']
    .value_counts()
    .reset_index()
)
duplicate_titles.columns = ['title', 'count']
duplicate_titles = duplicate_titles[duplicate_titles['count'] > 1]
number_of_duplicate_titles = duplicate_titles.shape[0]
print("Number of duplicate movie titles: ", number_of_duplicate_titles)

Number of duplicate movie titles:  202


In [7]:
data_movies = data_movies.drop_duplicates(subset="title")
data_movies.reset_index(drop=True, inplace=True)

In [8]:
# Extract title and release year
data_movies[["title", "release_year"]] = data_movies["title"].str.extract(
    r"^(.*)\s\((\d{4})\)$"
)
# Convert year to integer
data_movies["release_year"] = data_movies["release_year"].astype("Int64")

In [9]:
data_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [10]:
data_ratings.drop(columns=["timestamp"], inplace=True)
data_movies.drop(columns=["genres"], inplace=True )

In [11]:
data_negative_ratings = data_ratings[data_ratings['rating'] < 3.0]
data_positive_ratings = data_ratings[data_ratings['rating'] >= 4.0]

positive_users_data = (
    data_positive_ratings.groupby("movieId")["userId"]
    .apply(list)
    .reset_index(name="positive_users")
)

negative_users_data = (
    data_negative_ratings.groupby("movieId")["userId"]
    .apply(list)
    .reset_index(name="negative_users")
)

In [12]:
# Final data preparation for MovieLens and saving
# (Simplified for brevity, assuming standard preprocessing as seen in the original notebook)
data_movies_merged = pd.merge(data_movies, data_links, on="movieId", how="inner")
data_movies_merged = pd.merge(data_movies_merged, positive_users_data, on="movieId", how="left")
data_movies_merged = pd.merge(data_movies_merged, negative_users_data, on="movieId", how="left")

# Fill NaN with empty lists for users
data_movies_merged["positive_users"] = data_movies_merged["positive_users"].apply(lambda d: d if isinstance(d, list) else [])
data_movies_merged["negative_users"] = data_movies_merged["negative_users"].apply(lambda d: d if isinstance(d, list) else [])

data_movies_merged["positive_count"] = data_movies_merged["positive_users"].apply(len)
data_movies_merged["negative_count"] = data_movies_merged["negative_users"].apply(len)

def create_movie_id(row):
    return f"{int(row['tmdbId'])}_tt{row['imdbId']}" if pd.notnull(row['tmdbId']) else None

data_movies_merged["movie_id"] = data_movies_merged.apply(create_movie_id, axis=1)
data_movies_merged.dropna(subset=["movie_id"], inplace=True)

data_movies_merged.to_csv(SAVE_DATA_PATH_MOVIELENS, index=False)
print(f"MovieLens preprocessed data saved to {SAVE_DATA_PATH_MOVIELENS}")

MovieLens preprocessed data saved to ../data/processed/preprocessed_movielens_data.csv


## 2. TMDB Data Preprocessing

---

In [13]:
import os
import sys
import pandas as pd
# Add the repo root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
from utils.langauge_code import get_language_name

SAVE_DATA_PATH_TMDB = "../data/processed/preprocessed_tmdb_data.csv"
BASE_DATA_PATH_TMDB = "../data/raw/tmdb-data.csv"

In [14]:
data_tmdb = pd.read_csv(BASE_DATA_PATH_TMDB)
print(data_tmdb.head())

       id            title  vote_average  vote_count    status release_date  \
0   27205        Inception         8.364       34495  Released   2010-07-15   
1  157336     Interstellar         8.417       32571  Released   2014-11-05   
2     155  The Dark Knight         8.512       30619  Released   2008-07-16   
3   19995           Avatar         7.573       29815  Released   2009-12-15   
4   24428     The Avengers         7.710       29166  Released   2012-04-25   

      revenue  runtime  adult                     backdrop_path  ...  \
0   825532764      148  False  /8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg  ...   
1   701729206      169  False  /pbrkL804c8yAv3zBZR4QPEafpAR.jpg  ...   
2  1004558444      152  False  /nMKdUUepR0i5zn0y1T4CsSB5chy.jpg  ...   
3  2923706026      162  False  /vL5LR6WdxWPjLPFRLe133jXWsh5.jpg  ...   
4  1518815515      143  False  /9BBTo63ANSmhC4e6r62OJFuK2GL.jpg  ...   

    original_title                                           overview  \
0        Inception 

In [15]:
# Cleaning TMDB
drop_cols = [
    "homepage",
    "backdrop_path",
    "spoken_languages",
    "tagline",
    "production_countries",
]

data_tmdb = data_tmdb.drop(columns=drop_cols)
data_tmdb.dropna(inplace=True)
data_tmdb.drop_duplicates(inplace=True)
data_tmdb.reset_index(drop=True, inplace=True)

data_tmdb['original_language'] = data_tmdb['original_language'].apply(get_language_name)

def create_tmdb_movie_id(row):
    return f"{row['id']}_{row['imdb_id']}"

data_tmdb["movie_id"] = data_tmdb.apply(create_tmdb_movie_id, axis=1)

data_tmdb.to_csv(SAVE_DATA_PATH_TMDB, index=False)
print(f"TMDB preprocessed data saved to {SAVE_DATA_PATH_TMDB}")

TMDB preprocessed data saved to ../data/processed/preprocessed_tmdb_data.csv


## 3. Merging Datasets

---

In [16]:
data_movielens = pd.read_csv(SAVE_DATA_PATH_MOVIELENS)
data_tmdb = pd.read_csv(SAVE_DATA_PATH_TMDB)

print("Movielens Data Shape:", data_movielens.shape)
print("TMDB Data Shape:", data_tmdb.shape)

Movielens Data Shape: (86204, 10)
TMDB Data Shape: (130199, 20)


In [17]:
# Sets of movie_ids
movielens_ids = set(data_movielens["movie_id"])
tmdb_ids = set(data_tmdb["movie_id"])
# Intersection
common_ids = movielens_ids & tmdb_ids
print("Number of common movie_ids:", len(common_ids))

Number of common movie_ids: 21471


In [18]:
# Drop redundant and unnecessary columns before merging
cols_to_drop_ml = ["release_year"] if "release_year" in data_movielens.columns else []
data_movielens = data_movielens.drop(columns=cols_to_drop_ml)

cols_to_drop_tmdb = ["title", "original_title", "popularity"] 
data_tmdb = data_tmdb.drop(columns=[col for col in cols_to_drop_tmdb if col in data_tmdb.columns])

final_data = pd.merge(
    data_movielens,
    data_tmdb,
    on="movie_id",
    how="inner",
)

final_data = final_data.dropna()
final_data["tmdb_id"] = final_data["movie_id"].apply(lambda x: int(x.split("_")[0]))
final_data = final_data.drop(columns=["movie_id"])

## 4. Final Data Sampling

---

In [21]:
N = 15000
SAVE_DATA_PATH_FINAL = "../data/final.csv"

In [22]:
data = final_data.copy()

profitable_movies = data[data["revenue"] > data["budget"]]
profitable_movies_sample = profitable_movies.sample(n=min(N//3, len(profitable_movies)), random_state=42)

unprofitable_movies = data[data["budget"] > data["revenue"]]
unprofitable_movies_sample = unprofitable_movies.sample(n=min(N//5, len(unprofitable_movies)), random_state=42)

excluded_movie_ids = set(profitable_movies_sample["tmdb_id"]) | set(unprofitable_movies_sample["tmdb_id"])
remaining_movies = data[
    (~data["tmdb_id"].isin(excluded_movie_ids))
    & (data["vote_average"] > 6.5)
]
remaining_sample = remaining_movies.sample(n=min(N//3, len(remaining_movies)), random_state=42)

all_sampled_ids = excluded_movie_ids | set(remaining_sample["tmdb_id"])
low_rating_movies = data[
    (~data["tmdb_id"].isin(all_sampled_ids))
    & (data["vote_average"] >= 3)
    & (data["vote_average"] < 6)
]
low_rating_sample = low_rating_movies.sample(n=min(2000, len(low_rating_movies)), random_state=42)

final_sampled_data = pd.concat(
    [profitable_movies_sample, unprofitable_movies_sample, remaining_sample, low_rating_sample],
    ignore_index=True,
)
final_sampled_data = final_sampled_data.sample(frac=1, random_state=42).reset_index(drop=True)

final_sampled_data.to_csv(SAVE_DATA_PATH_FINAL, index=False)
print(f"Final sampled data saved to {SAVE_DATA_PATH_FINAL}")

Final sampled data saved to ../data/final.csv
