> Importing Necessary Dependencies

In [1]:
import os
import re
import ast
import nltk
import zipfile
import gdown
import pickle
import torch
import random
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from langchain_huggingface import HuggingFaceEmbeddings

# Fetch every NLTK resource this notebook relies on, not just the stop words:
#   - "stopwords"          -> stopwords.words("english")
#   - "punkt"/"punkt_tab"  -> word_tokenize (the name depends on the NLTK version)
#   - "wordnet"/"omw-1.4"  -> WordNetLemmatizer
# Without them the cleaning step fails with a LookupError on a fresh machine.
for nltk_resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /home/siddhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Move the working directory to the root of the project directory so that the
# relative paths used below ("./data/...", "./models/...") resolve from there.
# NOTE(review): the target is relative to the current working directory, so
# re-running this cell moves up one more level each time; run it once per kernel.
os.chdir("../")

In [3]:
# Pick the compute device: CUDA when PyTorch detects a usable GPU, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Report which device was picked
print("Device : ", device)

Device :  cuda


In [4]:
# Global Variables
# Google Drive file id of the zipped dataset; passed to download_and_extract().
GOOGLE_DRIVE_FILE_ID = "1r5zmhD4unxkauzRpuEHZXDpMUx5HK13R"
# CSV file that is read into the main DataFrame (relative to the working directory).
DATA_DOWNLOADED_PATH = "./data/final.csv"
# Pickle file used to cache the computed movie embeddings between runs.
SAVED_EMBEDDING_PATH = "./models/movie_embeddings.pkl"
# Model name handed to HuggingFaceEmbeddings when the embedding model is loaded.
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

Data Ingestion & Loading

In [5]:
# Data Ingestion
def download_and_extract(
    file_id: str, zip_name="final-movie-data.zip", extract_dir="data"
) -> None:
    """Download the dataset archive from Google Drive (if needed) and unpack it.

    Args:
        file_id: Google Drive id of the zip archive.
        zip_name: Local path of the archive. A file that already exists at this
            path is reused instead of being downloaded again.
        extract_dir: Directory the archive is extracted into; it is created
            (including missing parents) when it does not exist yet.

    Raises:
        FileNotFoundError: If the download did not produce ``zip_name``.
        zipfile.BadZipFile: If ``zip_name`` is not a valid zip archive.
    """
    if not os.path.exists(zip_name):
        print("Downloading dataset from Google Drive...")
        url = f"https://drive.google.com/uc?id={file_id}"
        gdown.download(url, zip_name, quiet=False)
        # Report a failed download directly rather than through zipfile's error
        if not os.path.exists(zip_name):
            raise FileNotFoundError(
                f"Download failed: '{zip_name}' was not created for file id '{file_id}'."
            )
    else:
        print("Zip file already exists. Skipping download.")

    # A broken archive would otherwise be picked up by the exists-check above on
    # every later run, so validate it before touching the target directory.
    if not zipfile.is_zipfile(zip_name):
        raise zipfile.BadZipFile(
            f"'{zip_name}' is not a valid zip archive; delete it and run again."
        )

    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_name, "r") as zip_ref:
        print("Extracting dataset...")
        zip_ref.extractall(extract_dir)
        print("Extraction complete.")
        
# Fetch (or reuse) the dataset archive and unpack it into the default "data" directory
download_and_extract(file_id=GOOGLE_DRIVE_FILE_ID)

Downloading dataset from Google Drive...


Downloading...
From (original): https://drive.google.com/uc?id=1r5zmhD4unxkauzRpuEHZXDpMUx5HK13R
From (redirected): https://drive.google.com/uc?id=1r5zmhD4unxkauzRpuEHZXDpMUx5HK13R&confirm=t&uuid=17bc619a-00fa-4f9d-91e5-8e95e086bd45
To: /home/siddhu/Desktop/Movie-Recommendation-System/final-movie-data.zip
100%|██████████| 77.3M/77.3M [00:17<00:00, 4.30MB/s]


Extracting dataset...
Extraction complete.


In [6]:
# Data Loading
# Read the dataset CSV, then preview the first three records transposed so that
# every column is visible at once
df = pd.read_csv(DATA_DOWNLOADED_PATH)
df.head(n=3).transpose()

Unnamed: 0,0,1,2
title,Toy Story,Jumanji,Grumpier Old Men
genres,"['adventure', 'animation', 'children', 'comedy...","['adventure', 'children', 'fantasy']","['comedy', 'romance']"
positive_users,"[1, 2, 7, 12, 24, 35, 42, 51, 54, 64, 72, 79, ...","[9, 41, 51, 73, 82, 101, 117, 177, 207, 210, 2...","[9, 41, 200, 314, 367, 473, 475, 540, 775, 940..."
positive_count,50572,10622,5152
negative_users,"[14, 87, 180, 187, 196, 339, 468, 479, 486, 49...","[14, 39, 50, 72, 79, 141, 148, 149, 227, 265, ...","[73, 149, 190, 285, 477, 562, 677, 889, 974, 1..."
negative_count,6299,6539,3574
vote_average,7.971,7.239,6.476
vote_count,17152,9833,347
status,Released,Released,Released
release_date,1995-10-30,1995-12-15,1995-12-22


In [7]:
# Report the size of the loaded dataset
rows = len(df.index)
cols = len(df.columns)
print("Number of rows : ", rows)
print("Number of columns : ", cols)

Number of rows :  46018
Number of columns :  20


Data Transformation

In [8]:
# Drop the columns that are not needed to build the content-based features
columns_to_drop = [
    "positive_users",
    "positive_count",
    "negative_users",
    "negative_count",
    "vote_average",
    "vote_count",
    "status",
    "release_date",
    "revenue",
    "runtime",
    "budget",
    "poster_path",
]
df = df.drop(columns=columns_to_drop)
df.head(3).T

Unnamed: 0,0,1,2
title,Toy Story,Jumanji,Grumpier Old Men
genres,"['adventure', 'animation', 'children', 'comedy...","['adventure', 'children', 'fantasy']","['comedy', 'romance']"
adult,False,False,False
original_language,English,English,English
overview,"Led by Woody, Andy's toys live happily in his ...",When siblings Judy and Peter discover an encha...,A family wedding reignites the ancient feud be...
production_companies,Pixar,"TriStar Pictures, Interscope Communications, T...","Lancaster Gate, Warner Bros. Pictures"
keywords,"rescue, friendship, mission, martial arts, jea...","giant insect, board game, disappearance, jungl...","fishing, sequel, old man, best friend, wedding..."
tmdb_id,862,8844,15602


In [9]:
# Transform and clean the genres & production companies columns
def clean_genres(x):
    """Turn a genre list (or its string representation) into "a, b, c" text.

    Args:
        x: A list of genres, or a string such as "['comedy', 'romance']".

    Returns:
        The stripped items joined by ", ". A string that cannot be parsed as a
        Python literal is returned unchanged, and a value that cannot be
        iterated (e.g. NaN or None) is returned as ``str(x)``.
    """
    if isinstance(x, str):
        try:
            # Convert string representation of list to actual list
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. already "comedy, romance"): keep as-is.
            # Only parsing errors are caught, so KeyboardInterrupt/SystemExit
            # are no longer swallowed.
            return x
    else:
        parsed = x

    # A quoted scalar such as "'drama'" parses to a plain string; joining it
    # would split it into single characters, so return it whole.
    if isinstance(parsed, str):
        return parsed.strip()

    try:
        # Join list elements into comma-separated string
        return ", ".join(str(item).strip() for item in parsed)
    except TypeError:
        # Non-iterable value (NaN, None, a number)
        return str(parsed)
    
# Parse the stringified genre lists into plain comma-separated text
df["genres"] = df["genres"].apply(clean_genres)

# Remove the spaces from the company names so that each company becomes a single
# token, then restore ", " as the separator between companies
df["production_companies"] = df["production_companies"].apply(
    lambda companies: ", ".join(companies.replace(" ", "").split(","))
)
df.head(3)

Unnamed: 0,title,genres,adult,original_language,overview,production_companies,keywords,tmdb_id
0,Toy Story,"adventure, animation, children, comedy, fantasy",False,English,"Led by Woody, Andy's toys live happily in his ...",Pixar,"rescue, friendship, mission, martial arts, jea...",862
1,Jumanji,"adventure, children, fantasy",False,English,When siblings Judy and Peter discover an encha...,"TriStarPictures, InterscopeCommunications, Tei...","giant insect, board game, disappearance, jungl...",8844
2,Grumpier Old Men,"comedy, romance",False,English,A family wedding reignites the ancient feud be...,"LancasterGate, WarnerBros.Pictures","fishing, sequel, old man, best friend, wedding...",15602


In [10]:
# Concatenate all the transformed text features into one column.
# Missing values are replaced with "" first; otherwise astype(str) would turn
# them into the literal token "nan", which would end up in the embedded text.
text_features = df[
    ["overview", "genres", "production_companies", "original_language"]
].fillna("").astype(str)

df["concat_description"] = (
    text_features["overview"]
    + " "
    + text_features["genres"]
    + " "
    + text_features["production_companies"]
    + " "
    + text_features["original_language"]
)

In [11]:
# Keep only the identifier, title, concatenated text and genres, then preview them
selected_columns = ["tmdb_id", "title", "concat_description", "genres"]
df = df[selected_columns]
df.head(n=5)

Unnamed: 0,tmdb_id,title,concat_description,genres
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","adventure, animation, children, comedy, fantasy"
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"adventure, children, fantasy"
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"comedy, romance"
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","comedy, drama, romance"
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,comedy


Data Preparation

In [12]:
## Helper functions that clean the concatenated description

def make_lower_case(text):
    """Return ``text`` converted to lower case."""
    return text.lower()

# Lazily filled cache of the NLTK English stop words, so the corpus is read
# once instead of once per cleaned row.
_ENGLISH_STOP_WORDS = None


def remove_stop_words(text, stop_words=None):
    """Drop stop words from whitespace-separated ``text``.

    Args:
        text: Input string; it is split on whitespace and matched
            case-sensitively, so lower-case it first.
        stop_words: Optional collection of words to remove. Defaults to the
            NLTK English stop-word list, which is loaded on first use and
            then reused.

    Returns:
        The remaining words joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
        stop_words = _ENGLISH_STOP_WORDS
    return " ".join(word for word in text.split() if word not in stop_words)

def remove_numbers(text):
    """Strip every ASCII digit (0-9) from ``text``; all other characters are kept."""
    digit_pattern = re.compile(r"[0-9]")
    return digit_pattern.sub("", text)

# Compiled once instead of building a tokenizer object on every call.
# Matches runs of word characters (letters, digits, underscore) and hyphens.
_WORD_OR_HYPHEN_RUN = re.compile(r"[\w-]+")


def remove_punctuation(text):
    """Remove punctuation by keeping only word/hyphen runs, joined by spaces.

    Every character other than a word character or "-" acts as a separator, so
    "andy's" becomes "andy s" while "sci-fi" stays intact.
    """
    return " ".join(_WORD_OR_HYPHEN_RUN.findall(text))

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    """Tokenize ``text``, lower-case and lemmatize each token, and re-join with spaces."""
    return " ".join(
        lemmatizer.lemmatize(word.lower()) for word in word_tokenize(text)
    )

In [13]:
## Apply the cleaning functions defined above, one after another
cleaning_steps = (
    make_lower_case,
    remove_punctuation,
    remove_numbers,
    lemmatize_text,
    remove_stop_words,
)

cleaned_text = df["concat_description"]
for step in cleaning_steps:
    cleaned_text = cleaned_text.apply(step)

df_cleaned = df.copy()
df_cleaned["cleaned_description"] = cleaned_text

In [14]:
# Give the genres more weight by repeating them after the cleaned description
def weighted_description(row, genre_weight=3):
    """Append the row's genres, repeated ``genre_weight`` times, to its cleaned text.

    ``row`` must provide a comma-separated "genres" string and a
    "cleaned_description" string.
    """
    genre_tokens = []
    for genre in row["genres"].split(","):
        genre_tokens.append(genre.strip())

    # The whole genre sequence is repeated, not each genre individually
    repeated_genres = " ".join(genre_tokens * genre_weight)
    description = row["cleaned_description"]
    return description + " " + repeated_genres

# Build the genre-weighted text for every row (axis=1 passes each row to the function)
df_cleaned["weighted_description"] = df_cleaned.apply(weighted_description, axis=1)

In [15]:
# View the weighted_description of the row labelled 0 as a quick sanity check
df_cleaned.loc[0, "weighted_description"]

'led woody andy toy live happily room andy birthday brings buzz lightyear onto scene afraid losing place andy heart woody plot buzz circumstance separate buzz woody owner duo eventually learns put aside difference adventure animation child comedy fantasy pixar english adventure animation children comedy fantasy adventure animation children comedy fantasy adventure animation children comedy fantasy'

In [16]:
# Convert the weighted movie descriptions column to a plain Python list (in row
# order); this list is what gets passed to embed_documents() below
descriptions = df_cleaned["weighted_description"].tolist()

Modeling

In [17]:
# Download and load the pretrained Hugging Face embedding model.
# The device chosen earlier is passed through explicitly so that the printed
# "Device" is the one the model actually runs on.
embedding = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs={"device": device},
)

In [18]:
# Compute the movie embeddings for our dataset, or reuse the cached copy
movie_embedding = None

if os.path.exists(SAVED_EMBEDDING_PATH):
    print("Loading precomputed movie embeddings...")
    # SECURITY NOTE: pickle.load can execute arbitrary code; only load a cache
    # file that this notebook wrote itself.
    with open(SAVED_EMBEDDING_PATH, "rb") as f:
        movie_embedding = pickle.load(f)
    # A cache built from a different dataset would silently misalign rows and
    # embeddings, so fall back to recomputing when the sizes disagree.
    if len(movie_embedding) != len(descriptions):
        print("Cached embeddings do not match the dataset size. Recomputing...")
        movie_embedding = None

if movie_embedding is None:
    print("Computing movie embeddings...")
    # Create the target directory first so a missing folder is caught before
    # the expensive embedding step rather than after it.
    os.makedirs(os.path.dirname(SAVED_EMBEDDING_PATH) or ".", exist_ok=True)
    movie_embedding = np.array(embedding.embed_documents(descriptions))
    with open(SAVED_EMBEDDING_PATH, "wb") as f:
        pickle.dump(movie_embedding, f)

Loading precomputed movie embeddings...


In [19]:
# Function to recommend movies using the movie embeddings computed above
def content_based_recommend(movie_title, df, embeddings=None, N=10):
    """Return the ``N`` movies whose embeddings are most similar to ``movie_title``.

    Args:
        movie_title: Exact title to look up in ``df["title"]``. When several
            rows share the title, the first one is used.
        df: DataFrame with a ``title`` column whose row order matches the rows
            of ``embeddings``.
        embeddings: NumPy array with one embedding per row of ``df``. When
            omitted, the notebook-level ``movie_embedding`` is looked up at
            call time, so a recomputed array is picked up.
        N: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, similarity)`` tuples, most similar first, with the
        cosine similarity rounded to 3 decimals. The query movie itself is
        never part of the result.

    Raises:
        IndexError: If ``movie_title`` does not occur in ``df["title"]``.
    """
    if embeddings is None:
        embeddings = movie_embedding

    # Use the row *position* (not the index label): both the embeddings array
    # and the df.iloc-style lookups below are positional.
    matches = np.flatnonzero((df["title"] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found in the dataset: {movie_title!r}")
    idx = int(matches[0])

    movie_vec = embeddings[idx].reshape(1, -1)
    sims = cosine_similarity(movie_vec, embeddings).flatten()

    # Exclude the query explicitly instead of assuming it ranks first; another
    # movie with an identical embedding can tie with it at similarity 1.0.
    ranked = sims.argsort()[::-1]
    top_indices = ranked[ranked != idx][: max(N, 0)]

    titles = df["title"]
    return [(titles.iloc[i], round(float(sims[i]), 3)) for i in top_indices]

In [24]:
# Try the recommender on a randomly picked movie title
random_title = random.choice(df["title"].tolist())
print("Random Movie Title : ", random_title)

movie_title = random_title
result = content_based_recommend(movie_title, df_cleaned, movie_embedding, 10)
result

Random Movie Title :  Hidden


[('What We Become', 0.747),
 ('Pod', 0.728),
 ('The Strangers: Prey at Night', 0.725),
 ('Itsy Bitsy', 0.724),
 ('Blood Honey', 0.719),
 ('We Need to Do Something', 0.712),
 ('The Darkness', 0.708),
 ('A Quiet Place', 0.707),
 ('Dawning', 0.707),
 ('Panic in Year Zero!', 0.696)]