> Importing Necessary Dependencies

---

In [None]:
import os
import re
import ast
import nltk
import zipfile
import gdown
import pickle
import torch
import random
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from langchain_huggingface import HuggingFaceEmbeddings

# Download every NLTK resource used later in the notebook: the stop-word list
# (remove_stop_words), the WordNet data (WordNetLemmatizer) and the Punkt
# tokenizer models (word_tokenize). Newer NLTK releases look up "punkt_tab"
# while older ones use "punkt", so both identifiers are requested.
for resource in ("stopwords", "wordnet", "omw-1.4", "punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:
# Move the working directory from the notebook's folder to the project root.
# os.chdir("../") is relative, so re-running this cell in the same kernel would
# climb one more level each time; the guard makes the cell safe to re-run.
if "_PROJECT_ROOT" not in globals():
    os.chdir("../")
    _PROJECT_ROOT = os.getcwd()

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Report which device was picked
print("Device : ", device)

In [None]:
# Global Variables
# The relative paths below are resolved against the current working directory.

# Google Drive file ID of the dataset archive fetched by download_and_extract
GOOGLE_DRIVE_FILE_ID = "1PMwyTlpKh-1IThTmTHRYxd1pjIsn5HYd"
# CSV file that is read with pd.read_csv once the archive has been extracted
DATA_DOWNLOADED_PATH = "./data/final.csv"
# Pickle file used to cache the computed movie embeddings between runs
SAVED_EMBEDDING_PATH = "./models/movie_embeddings.pkl"
# Model identifier passed to HuggingFaceEmbeddings
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

> Data Ingestion & Loading

---

In [None]:
# Data Ingestion
def download_and_extract(
    file_id: str,
    zip_name: str = "final-movie-data.zip",
    extract_dir: str = "data",
) -> None:
    """Download the dataset archive from Google Drive (if needed) and extract it.

    Args:
        file_id: Google Drive file ID of the zip archive.
        zip_name: Local path where the archive is (or will be) stored.
        extract_dir: Directory the archive is extracted into; it is created,
            including missing parents, when it does not exist yet.

    Raises:
        FileNotFoundError: If the archive is still missing after the download
            attempt.
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
    """
    # is_zipfile() is False for a missing file as well as for a truncated or
    # corrupt one, so an interrupted earlier download is retried instead of
    # being mistaken for a finished one.
    if not zipfile.is_zipfile(zip_name):
        print("Downloading dataset from Google Drive...")
        url = f"https://drive.google.com/uc?id={file_id}"
        gdown.download(url, zip_name, quiet=False)
        if not os.path.exists(zip_name):
            raise FileNotFoundError(
                f"Download failed: '{zip_name}' was not created (file id: {file_id})."
            )
    else:
        print("Zip file already exists. Skipping download.")

    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_name, "r") as zip_ref:
        print("Extracting dataset...")
        zip_ref.extractall(extract_dir)
        print("Extraction complete.")
        
# Fetch the dataset archive (unless it is already present) and unpack it using
# the function's default archive name and extraction directory
download_and_extract(file_id=GOOGLE_DRIVE_FILE_ID)

In [None]:
# Data Loading: read the extracted CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=DATA_DOWNLOADED_PATH)
# Preview the first three records, transposed so that every column is visible
df.head(n=3).transpose()

In [None]:
# Report the dimensions of the loaded dataset
rows = df.shape[0]
cols = df.shape[1]
print("Number of rows : ", rows)
print("Number of columns : ", cols)

> Data Transformation

---

In [None]:
# Drop unnecessary columns that are not used to build the text description
columns_to_drop = [
    "positive_users",
    "positive_count",
    "negative_users",
    "negative_count",
    "vote_average",
    "vote_count",
    "status",
    "release_date",
    "revenue",
    "runtime",
    "budget",
    "poster_path",
]
df = df.drop(columns=columns_to_drop)
df.head(3).T

In [None]:
# Transform and clean the genres & production companies columns
def clean_genres(x):
    """Turn a genres cell into a plain comma-separated string.

    Args:
        x: Either a string holding the Python literal of a list
            (e.g. "['Action', 'Comedy']") or an already-parsed iterable.

    Returns:
        The stripped items joined with ", ". When the value cannot be parsed
        or iterated (plain text, NaN, numbers, ...), ``str(x)`` is returned
        as a best-effort fallback.
    """
    try:
        # Convert string representation of list to actual list
        if isinstance(x, str):
            x = ast.literal_eval(x)
        # A quoted string literal parses to a single string; joining it would
        # split it into characters, so treat it as one genre instead.
        if isinstance(x, str):
            return x.strip()
        # Join list elements into comma-separated string
        return ", ".join(str(item).strip() for item in x)
    except Exception:
        # Deliberate best-effort fallback for malformed cells. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate.
        return str(x)
    
# Normalise the genres column into a plain comma-separated string
df["genres"] = df["genres"].apply(clean_genres)
# Collapse every production company into a single token (all spaces removed)
# and separate the companies with ", ". Removing all spaces and then expanding
# each "," gives the same text as splitting on "," and re-joining, while the
# vectorised .str methods keep missing values as NaN instead of raising
# AttributeError the way x.split(",") does on a float.
df["production_companies"] = (
    df["production_companies"]
    .str.replace(" ", "", regex=False)
    .str.replace(",", ", ", regex=False)
)
df.head(n=3)

In [None]:
# Concatenate all the transformed features into one text column; every feature
# (including the last one) is followed by a single space
feature_texts = [
    df[column].astype(str)
    for column in ("overview", "genres", "production_companies", "original_language")
]
combined_text = feature_texts[0]
for feature_text in feature_texts[1:]:
    combined_text = combined_text + " " + feature_text
df["concat_description"] = combined_text + " "

In [None]:
# Keep only the identifier, title, concatenated description and genres columns
df = df.loc[:, ["tmdb_id", "title", "concat_description", "genres"]]
df.head(n=5)

> Data Preparation

---

In [None]:
## Functions to clean the concatenated description

def make_lower_case(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

# Build the English stop-word set once at definition time (like the shared
# `lemmatizer` object) so it is not rebuilt for every row the function is
# applied to.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

def remove_stop_words(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace, tokens found in NLTK's English stop-word
    list are discarded (case-sensitive comparison, so lower-case the text
    first), and the remaining tokens are joined with single spaces.
    """
    return " ".join(
        word for word in text.split() if word not in _ENGLISH_STOP_WORDS
    )

def remove_numbers(text):
    """Return ``text`` with every ASCII digit (0-9) deleted."""
    return re.sub(r"[0-9]", "", text)

def remove_punctuation(text):
    """Keep only runs of word characters and hyphens, joined by single spaces."""
    word_tokenizer = RegexpTokenizer(r"[\w-]+")
    return " ".join(word_tokenizer.tokenize(text))

# One shared lemmatizer instance reused by every call to lemmatize_text
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """Tokenize ``text`` and return the lower-cased, lemmatized tokens joined by spaces."""
    return " ".join(
        lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text)
    )

In [None]:
## Apply the cleaning functions defined above
# Stop words are removed once BEFORE lemmatization and once after it. WordNet's
# default (noun) lemmatization rewrites some stop words (e.g. "was" -> "wa",
# "has" -> "ha"), and those mangled tokens no longer match the stop-word list,
# so filtering only after lemmatizing lets them leak into the text. The final
# pass still removes any lemma that is itself a stop word.
# NOTE: embeddings already cached at SAVED_EMBEDDING_PATH were computed from
# the previous text; delete that file to regenerate them from the cleaned text.
df_cleaned = df.copy()
df_cleaned["cleaned_description"] = (
    df_cleaned["concat_description"]
    .apply(make_lower_case)
    .apply(remove_punctuation)
    .apply(remove_numbers)
    .apply(remove_stop_words)
    .apply(lemmatize_text)
    .apply(remove_stop_words)
)

In [None]:
# Increase the weight of genres in the concatenated description
def weighted_description(row, genre_weight=3):
    """Append the row's genres, repeated ``genre_weight`` times, to its cleaned text.

    ``row`` must provide a comma-separated ``"genres"`` string and a
    ``"cleaned_description"`` string. The whole genre sequence is repeated as
    a unit and every genre is separated by a single space.
    """
    genre_names = []
    for piece in row["genres"].split(","):
        genre_names.append(piece.strip())
    repeated_genres = " ".join(genre_names * genre_weight)
    return row["cleaned_description"] + " " + repeated_genres

# Build the genre-weighted text for every row (axis=1 hands each row to the
# function) using the default genre_weight
df_cleaned["weighted_description"] = df_cleaned.apply(weighted_description, axis=1)

In [None]:
# Inspect the weighted description of the row labelled 0 as a sanity check
df_cleaned["weighted_description"].loc[0]

In [None]:
# Collect the weighted descriptions into a plain Python list, in row order
descriptions = list(df_cleaned["weighted_description"])

> Modeling

---

In [None]:
# Download and load the pretrained Hugging Face embedding model.
# MODEL_NAME is the only argument given, so every other setting (including the
# compute device) is left to HuggingFaceEmbeddings' defaults; the `device`
# variable computed earlier in the notebook is not passed here.
embedding = HuggingFaceEmbeddings(model_name=MODEL_NAME)

In [None]:
# Compute the movie embeddings for our dataset, reusing the on-disk cache when
# it is present and still matches the number of descriptions
movie_embedding = None
if os.path.exists(SAVED_EMBEDDING_PATH):
    print("Loading precomputed movie embeddings...")
    # SECURITY NOTE: pickle.load can execute arbitrary code. Only load a cache
    # file that this notebook wrote itself, never one from an untrusted source.
    with open(SAVED_EMBEDDING_PATH, "rb") as f:
        movie_embedding = pickle.load(f)
    # A cache built from a different dataset would silently misalign rows and
    # vectors, so discard it when the row count does not match.
    if len(movie_embedding) != len(descriptions):
        print("Cached embeddings do not match the dataset size. Recomputing...")
        movie_embedding = None

if movie_embedding is None:
    print("Computing movie embeddings...")
    movie_embedding = np.array(embedding.embed_documents(descriptions))
    # Make sure the target directory exists; open(..., "wb") does not create it
    os.makedirs(os.path.dirname(SAVED_EMBEDDING_PATH) or ".", exist_ok=True)
    with open(SAVED_EMBEDDING_PATH, "wb") as f:
        pickle.dump(movie_embedding, f)

In [None]:
# Function to recommend movies using the movie embeddings computed above
def content_based_recommend(movie_title, df, embeddings=None, N=10):
    """Recommend the ``N`` movies whose embeddings are closest to ``movie_title``.

    Args:
        movie_title: Exact title to look up in ``df["title"]``; the first
            matching row is used when the title occurs more than once.
        df: DataFrame with a ``"title"`` column whose row order matches the
            rows of ``embeddings``.
        embeddings: 2-D array with one embedding per row of ``df``. Defaults
            to the notebook-level ``movie_embedding``, looked up at call time
            so a recomputed embedding matrix is picked up automatically.
        N: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, cosine_similarity)`` tuples sorted from most to
        least similar, with the similarity rounded to 3 decimals.

    Raises:
        IndexError: If ``movie_title`` is not present in ``df["title"]``.
    """
    if embeddings is None:
        embeddings = movie_embedding

    # Use the row POSITION (not the index label) because `embeddings` is
    # addressed positionally; labels differ from positions on a filtered or
    # re-indexed frame.
    matches = np.flatnonzero((df["title"] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title {movie_title!r} was not found in the dataset")
    idx = int(matches[0])

    movie_vec = embeddings[idx].reshape(1, -1)
    sims = cosine_similarity(movie_vec, embeddings).flatten()

    # Rank by descending similarity and drop the query movie explicitly rather
    # than assuming it is ranked first (another row with an identical
    # embedding can tie with it).
    ranked = np.argsort(-sims, kind="stable")
    top_indices = ranked[ranked != idx][: max(N, 0)]

    titles = df["title"].to_numpy()
    return [(titles[i], round(float(sims[i]), 3)) for i in top_indices]

In [None]:
# Try the recommender on a sample title and show its ten closest movies
movie_title = "The Addams Family"
result = content_based_recommend(movie_title, df_cleaned, movie_embedding, 10)
result