> Importing Necessary Dependencies

---

In [None]:
import os
import re
import ast
import nltk
import zipfile
import gdown
import pickle
import torch
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from langchain_huggingface import HuggingFaceEmbeddings

nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/siddhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Move the notebook file path to the root of the project directory
os.chdir("../")

In [3]:
# Choose GPU if available, else CPU
device = (
    "cuda" if torch.cuda.is_available() else "cpu"
)  
# Print the selected device
print("Device : ", device)  

Device :  cuda


In [4]:
# Global Variables
GOOGLE_DRIVE_FILE_ID = "1PMwyTlpKh-1IThTmTHRYxd1pjIsn5HYd"
DATA_DOWNLOADED_PATH = "./data/final.csv"
SAVED_EMBEDDING_PATH = "./models/movie_embeddings.pkl"
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

> Data Ingestion & Loading

---

In [5]:
# Data Ingestion
def download_and_extract(file_id: str, zip_name="final-movie-data.zip", extract_dir="data"):
    if not os.path.exists(zip_name):
        print("Downloading dataset from Google Drive...")
        url = f"https://drive.google.com/uc?id={file_id}"
        gdown.download(url, zip_name, quiet=False)
    else:
        print("Zip file already exists. Skipping download.")

    if not os.path.exists(extract_dir):
        os.makedirs(extract_dir)
    with zipfile.ZipFile(zip_name, "r") as zip_ref:
        print("Extracting dataset...")
        zip_ref.extractall(extract_dir)
        print("Extraction complete.")
        
download_and_extract(file_id=GOOGLE_DRIVE_FILE_ID)

Zip file already exists. Skipping download.
Extracting dataset...
Extraction complete.


In [6]:
# Data Loading
df = pd.read_csv(DATA_DOWNLOADED_PATH)
df.head(3).T

Unnamed: 0,0,1,2
movieId,169336,98381,113862
title,Michael Ian Black: Noted Expert,Hellraiser: Revelations,"Guest, The"
imdbId,5200506,1716747,2980592
tmdbId,399938.0,70584.0,241848.0
positive_users,[63994],"[44970, 106905, 108905, 172263, 216119]","[305, 2158, 2172, 2402, 3083, 3553, 3786, 3884..."
negative_users,[],"[3083, 11969, 21507, 22009, 45584, 64906, 6565...","[95, 527, 3892, 5616, 6386, 8219, 9082, 21730,..."
positive_count,1,5,442
negative_count,0,44,193
id,399938,70584,241848
vote_average,6.8,3.555,6.634


In [7]:
# Check the shape of our data
rows, cols = df.shape
print("Number of rows : ", rows)
print("Number of columns : ", cols)

Number of rows :  13702
Number of columns :  25


> Data Transformation

---

In [8]:
# Drop unnessary columns
df = df.drop(
    [
        "positive_users",
        "positive_count",
        "negative_users",
        "negative_count",
        "vote_average",
        "vote_count",
        "status",
        "release_date",
        "revenue",
        "runtime",
        "budget",
        "poster_path",
    ],
    axis=1,
)
df.head(n=3).T

Unnamed: 0,0,1,2
movieId,169336,98381,113862
title,Michael Ian Black: Noted Expert,Hellraiser: Revelations,"Guest, The"
imdbId,5200506,1716747,2980592
tmdbId,399938.0,70584.0,241848.0
id,399938,70584,241848
adult,False,False,False
imdb_id,tt5200506,tt1716747,tt2980592
original_language,English,English,English
overview,"Veteran of sketch, television, and film, comed...",Two friends in Mexico discover the Lament Conf...,A soldier introduces himself to the Peterson f...
genres,Comedy,Horror,"Mystery, Thriller, Action"


In [9]:
# Ttansform and clean the genre & prodcution companies columns
def clean_genres(x):
    try:
        # Convert string representation of list to actual list
        if isinstance(x, str):
            x = ast.literal_eval(x)
        # Join list elements into comma-separated string
        return ", ".join([str(i).strip() for i in x])
    except:  # noqa: E722
        return str(x)
    
df["genres"] = df["genres"].apply(clean_genres)
df["production_companies"] = df["production_companies"].apply(
    lambda x: ", ".join([c.replace(" ", "") for c in x.split(",")])
)
df.head(n=3)

Unnamed: 0,movieId,title,imdbId,tmdbId,id,adult,imdb_id,original_language,overview,genres,production_companies,keywords,tmdb_id
0,169336,Michael Ian Black: Noted Expert,5200506,399938.0,399938,False,tt5200506,English,"Veteran of sketch, television, and film, comed...",Comedy,NewWaveEntertainment,stand-up comedy,399938
1,98381,Hellraiser: Revelations,1716747,70584.0,70584,False,tt1716747,English,Two friends in Mexico discover the Lament Conf...,Horror,"NeoArt&Logic, PuzzleboxFilms, DimensionExtreme...","pinhead, demon, puzzle box, cenobite",70584
2,113862,"Guest, The",2980592,241848.0,241848,False,tt2980592,English,A soldier introduces himself to the Peterson f...,"Mystery, Thriller, Action","HanWayFilms, SnootEntertainment","high school, psychopath, harassment, halloween...",241848


In [10]:
# Concat all the transformed features into one column
df["concat_description"] = None
df["concat_description"] = (
    df["overview"].astype(str)
    + " "
    + df["genres"].astype(str)
    + " "
    + df["production_companies"].astype(str)
    + " "
    + df["original_language"].astype(str)
    + " "
)

In [11]:
# Display the concated features
df = df[["tmdb_id", "title", "concat_description", "genres"]]
df.head()

Unnamed: 0,tmdb_id,title,concat_description,genres
0,399938,Michael Ian Black: Noted Expert,"Veteran of sketch, television, and film, comed...",Comedy
1,70584,Hellraiser: Revelations,Two friends in Mexico discover the Lament Conf...,Horror
2,241848,"Guest, The",A soldier introduces himself to the Peterson f...,"Mystery, Thriller, Action"
3,36691,Solitary Man,A car magnate watches his personal and profess...,"Comedy, Drama, Romance"
4,341392,Lolo,"On holiday in the south of France, chic Parisi...",Comedy


> Data Preparation

---

In [12]:
## Functions to clean the concated description

def make_lower_case(text):
    text_lower = None
    text_lower = text.lower()
    return text_lower

def remove_stop_words(text):
    text = text.split()
    stop_words = set(stopwords.words("english"))
    removed_stop_word_text = None
    filtered_words = [word for word in text if word not in stop_words]
    removed_stop_word_text = " ".join(filtered_words)
    return removed_stop_word_text

def remove_numbers(text):
    pattern = r"[0-9]"
    removed_numbers_text = re.sub(pattern, "", text)
    return removed_numbers_text

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r"[\w-]+")
    tokens = tokenizer.tokenize(text)
    removed_punctuation_text = " ".join(tokens)
    return removed_punctuation_text

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    return " ".join(lemmatized)

In [13]:
## Apply the cleaninf functions defined above
df_cleaned = df.copy()
df_cleaned["cleaned_description"] = (
    df["concat_description"]
    .apply(make_lower_case)
    .apply(remove_punctuation)
    .apply(remove_numbers)
    .apply(lemmatize_text)
    .apply(remove_stop_words)
)

In [14]:
# Increase the weight of genres in the concated description
def weighted_description(row, genre_weight=3):
    # split the comma-separated genre string into a list of words
    genres_list = [g.strip() for g in row["genres"].split(",")]
    # repeat genres
    genres_weighted = " ".join(genres_list * genre_weight)
    # concatenate with cleaned description
    return row["cleaned_description"] + " " + genres_weighted

df_cleaned["weighted_description"] = df_cleaned.apply(weighted_description, axis=1)

In [15]:
# View a sample of our feature weighted_description
df_cleaned.loc[0, "weighted_description"]

'veteran sketch television film comedian michael ian black ha mastered delivery equal part dapper deadpan whether discussing pro-choice debate tilt-a-whirl taped john jay college new york city black first comedy special epix includes wry take human experience parenting gender role guilty pleasure shape size comedy newwaveentertainment english Comedy Comedy Comedy'

In [16]:
# Convert your movie descriptions column to a list
descriptions = df_cleaned["weighted_description"].tolist()

> Modeling

---

In [17]:
# Download and load the pretrained huggingface embedding model
embedding = HuggingFaceEmbeddings(model_name=MODEL_NAME)

In [18]:
# Compute movie embedding using our dataset
if os.path.exists(SAVED_EMBEDDING_PATH):
    print("Loading precomputed movie embeddings...")
    with open(SAVED_EMBEDDING_PATH, "rb") as f:
        movie_embedding = pickle.load(f)
else:
    print("Computing movie embeddings...")
    movie_embedding = np.array(embedding.embed_documents(descriptions))
    with open(SAVED_EMBEDDING_PATH, "wb") as f:
        pickle.dump(movie_embedding, f)

Computing movie embeddings...


In [19]:
# Function to recommend movie using the movie embedding computed above
def content_based_recommend(movie_title, df, embeddings=movie_embedding, N=10):
    idx = df[df["title"] == movie_title].index[0]
    movie_vec = embeddings[idx].reshape(1, -1)
    sims = cosine_similarity(movie_vec, embeddings).flatten()
    top_indices = sims.argsort()[::-1][1 : N + 1]
    return [(df.iloc[i]["title"], round(float(sims[i]), 3)) for i in top_indices]

In [20]:
# Test the movie embedding
movie_title = "The Addams Family"
result = content_based_recommend(
    movie_title=movie_title, df=df_cleaned, embeddings=movie_embedding, N=10
)
result

[('The Addams Family 2', 0.81),
 ('Dragons: Gift of the Night Fury', 0.681),
 ('A Talking Cat!?!', 0.67),
 ('Animal Crackers', 0.67),
 ('Auntie Edna', 0.669),
 ('Minions & More Volume 2', 0.669),
 ('Hair Love', 0.656),
 ('Letter to Momo, A (Momo e no tegami)', 0.656),
 ('Happy Family', 0.654),
 ('Kung Fu Panda 3', 0.652)]