In [10]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

In [11]:
# Processing Data
df = pd.read_csv("Datasets/Movies_Merged.csv")
df = df.drop(columns=["Directed by","No of Persons Voted","Written by","Duration","id","title","vote_count","vote_average","status","release_date","revenue","runtime","backdrop_path","budget",
                      "homepage","imdb_id","original_language","original_title","overview","poster_path","tagline","production_companies","spoken_languages",
                      "Release Year"])
df = df.rename(columns={"Release Date":"Release_Date","adult":"Adult","popularity":"Popularity","production_countries":"Production_Countries","keywords":"Keywords"})

# Changing Release Date to Release Year
df['Release_Year'] = pd.to_datetime(df['Release_Date'], errors='coerce').dt.year

# Making the Rating column numeric and filling the missing values with the mean to later normalize it
scaler = MinMaxScaler()
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df.loc[:, 'Rating'] = df['Rating'].fillna(df['Rating'].mean())
df['Normalized_Rating'] = scaler.fit_transform(df[['Rating']])

# Convert "Adult" to binary
df["Adult"] = df["Adult"].astype(int)  # Convert True/False to 1/0

# Making the Popularity column numeric and filling the missing values with the mean to later normalize it
scaler = MinMaxScaler()
df['Popularity'] = pd.to_numeric(df['Popularity'], errors='coerce')
df.loc[:, 'Popularity'] = df['Popularity'].fillna(df['Popularity'].mean())
df['Normalized_Popularity'] = scaler.fit_transform(df[['Popularity']])

# Normalize "Release_Year"
scaler = MinMaxScaler()
df["Normalized_Release_Year"] = scaler.fit_transform(df[["Release_Year"]])

# Adding Movie ID Column as a first column
movie_id = pd.Series(range(len(df)), name='ID')
df = pd.concat([movie_id, df.reset_index(drop=True)], axis=1)

# Displaying the first 5 rows of the dataset
df = df.drop(columns=["Release_Date","Rating","Popularity","Release_Year"])
df.head(5)

Unnamed: 0,ID,Title,Description,Adult,Genres,Production_Countries,Keywords,Normalized_Rating,Normalized_Popularity,Normalized_Release_Year
0,0,Three Colors: Red,Krzysztof Kieslowski closes his Three Colors t...,0,"Drama, Mystery, Romance","France, Poland, Switzerland","infidelity, judge, isolation, shadowing, engli...",0.824742,0.006831,0.444444
1,1,The Conformist,"Set in Rome in the 1930s, this re-release of B...",0,Drama,"Germany, France, Italy","paris, france, hitman, italy, fascism, childho...",0.721649,0.00492,0.0
2,2,Tokyo Story,Yasujiro Ozu’s Tokyo Story follows an aging co...,0,Drama,Japan,"fish, beach, dream, baby, coma, peace, boat, c...",0.804124,0.006621,0.037037
3,3,The Godfather,Francis Ford Coppola's epic features Marlon Br...,0,"Drama, Crime",United States of America,"based on novel or book, loss of loved one, lov...",0.927835,0.052916,0.037037
4,4,Boyhood,"Filmed over 12 years with the same cast, Richa...",0,Drama,United States of America,"high school, family's daily life, college, urb...",0.742268,0.006224,0.814815


In [12]:
# One-hot encode genres and production countries
genres = df["Genres"].str.get_dummies(sep=", ")
production_countries = df["Production_Countries"].str.get_dummies(sep=", ")

In [13]:
# Encode textual features into embeddings
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def batch_encode_texts_gpu(texts, batch_size=32, max_length=128):
    embeddings = []
    model.eval()
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=max_length).to(device)
            outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1)
            embeddings.append(batch_embeddings.cpu().numpy())  # Move to CPU for saving
    return np.vstack(embeddings)

description_embeddings = batch_encode_texts_gpu(df['Description'].astype(str).tolist())
keyword_embeddings  = batch_encode_texts_gpu(df['Keywords'].astype(str).tolist())

In [14]:
# Assign each row its respective embedding
df['Description_Embedding'] = list(description_embeddings)
df['Keyword_Embedding'] = list(keyword_embeddings)
df.head(5)

Unnamed: 0,ID,Title,Description,Adult,Genres,Production_Countries,Keywords,Normalized_Rating,Normalized_Popularity,Normalized_Release_Year,Description_Embedding,Keyword_Embedding
0,0,Three Colors: Red,Krzysztof Kieslowski closes his Three Colors t...,0,"Drama, Mystery, Romance","France, Poland, Switzerland","infidelity, judge, isolation, shadowing, engli...",0.824742,0.006831,0.444444,"[-0.0666364, 0.0040805954, 0.35922006, -0.1488...","[0.15138477, -0.15010078, 0.4445166, -0.036870..."
1,1,The Conformist,"Set in Rome in the 1930s, this re-release of B...",0,Drama,"Germany, France, Italy","paris, france, hitman, italy, fascism, childho...",0.721649,0.00492,0.0,"[-0.09982319, -0.1731106, 0.27626318, -0.17890...","[0.045312237, 0.03634315, -0.08631198, 0.00019..."
2,2,Tokyo Story,Yasujiro Ozu’s Tokyo Story follows an aging co...,0,Drama,Japan,"fish, beach, dream, baby, coma, peace, boat, c...",0.804124,0.006621,0.037037,"[-0.3167529, 0.048170626, 0.24167661, -0.26639...","[0.059226327, 0.13421835, 0.40654075, 0.010912..."
3,3,The Godfather,Francis Ford Coppola's epic features Marlon Br...,0,"Drama, Crime",United States of America,"based on novel or book, loss of loved one, lov...",0.927835,0.052916,0.037037,"[-0.2850091, 0.03263696, 0.15261325, -0.411123...","[0.05989898, -0.06531602, 0.15561125, -0.12018..."
4,4,Boyhood,"Filmed over 12 years with the same cast, Richa...",0,Drama,United States of America,"high school, family's daily life, college, urb...",0.742268,0.006224,0.814815,"[-0.12020515, 0.10880962, 0.20739554, -0.23863...","[-0.0044798157, 0.20708695, 0.36554858, -0.141..."


In [15]:
# Combining the Features into Movie Embeddings
movie_embeddings = pd.concat([df[["ID","Title","Adult","Normalized_Release_Year","Normalized_Rating","Normalized_Popularity"]], genres, production_countries], axis=1)
movie_embeddings['Description_Embedding'] = df['Description_Embedding']
movie_embeddings['Keyword_Embedding'] = df['Keyword_Embedding']

# Save the preprocessed dataset
movie_embeddings.to_pickle("Dataset_Processed/Movie_Embeddings.pkl")