In [1]:
# Importing necessary libraries
import os
import pandas as pd
import re
import nltk
import ast
import pickle
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from langchain_huggingface import HuggingFaceEmbeddings

nltk.download('stopwords')
nltk.download("punkt")       # sentence/word tokenizer
nltk.download("punkt_tab")   # required for newer NLTK versions
nltk.download("wordnet")     # lemmatizer dictionary
nltk.download("omw-1.4")     # WordNet multilingual word forms

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to /home/siddhu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/siddhu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/siddhu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/siddhu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/siddhu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
# Dataset paths
FINAL_DATASET_PATH = '../data/final/final.csv'
SAMPLED_DATA_PATH = '../data/final/sample.csv'
# Load and display the data
df = pd.read_csv(SAMPLED_DATA_PATH)
df.head(n=3).T

Unnamed: 0,0,1,2
title,Much Ado About Nothing: Shakespeare's Globe Th...,Piranhaconda,Edge of Fury
genres,"['comedy', 'drama']","['horror', 'sci-fi']",['thriller']
positive_users,"[61947, 172263]","[18184, 177617, 198781]",[]
positive_count,2,3,0
negative_users,[],"[49217, 62537, 68978, 71504, 78715, 80729, 814...",[189614]
negative_count,0,22,1
vote_average,8.0,5.068,5.5
vote_count,2,139,6
status,Released,Released,Released
release_date,2012-10-09,2012-06-16,1958-05-01


In [3]:
# Check Columns
df.columns

Index(['title', 'genres', 'positive_users', 'positive_count', 'negative_users',
       'negative_count', 'vote_average', 'vote_count', 'status',
       'release_date', 'revenue', 'runtime', 'adult', 'budget',
       'original_language', 'overview', 'poster_path', 'production_companies',
       'keywords', 'tmdb_id'],
      dtype='str')

In [4]:
df = df.drop(['positive_users', 'positive_count', 'negative_users',
       'negative_count', 'vote_average', 'vote_count', 'status',
       'release_date', 'revenue', 'runtime', 'budget', 'poster_path']
       , axis=1)

df.head(n=3).T

Unnamed: 0,0,1,2
title,Much Ado About Nothing: Shakespeare's Globe Th...,Piranhaconda,Edge of Fury
genres,"['comedy', 'drama']","['horror', 'sci-fi']",['thriller']
adult,False,False,False
original_language,English,English,English
overview,Much Ado About Nothing is a comedic play by Wi...,A hybrid creature - half piranha and half anac...,A psychopathic young beachcomber pretends to b...
production_companies,Shakespeare's Globe,New Horizons Picture,Wisteria Productions
keywords,theater play,"ransom, hawaii, water monster, filmmaking, kil...",summer house
tmdb_id,210695,115084,35128


In [5]:
def clean_genres(x):
    try:
        # Convert string representation of list to actual list
        if isinstance(x, str):
            x = ast.literal_eval(x)
        # Join list elements into comma-separated string
        return ", ".join([str(i).strip() for i in x])
    except:  # noqa: E722
        return str(x)

In [6]:
df['genres'] = df['genres'].apply(clean_genres)
df["production_companies"] = df["production_companies"].apply(lambda x: ", ".join([c.replace(" ", "") for c in x.split(",")]))
df.head(n=3)

Unnamed: 0,title,genres,adult,original_language,overview,production_companies,keywords,tmdb_id
0,Much Ado About Nothing: Shakespeare's Globe Th...,"comedy, drama",False,English,Much Ado About Nothing is a comedic play by Wi...,Shakespeare'sGlobe,theater play,210695
1,Piranhaconda,"horror, sci-fi",False,English,A hybrid creature - half piranha and half anac...,NewHorizonsPicture,"ransom, hawaii, water monster, filmmaking, kil...",115084
2,Edge of Fury,thriller,False,English,A psychopathic young beachcomber pretends to b...,WisteriaProductions,summer house,35128


In [7]:
df['concat_description'] = None
df['concat_description'] = (
    df['overview'].astype(str) + " " +
    df['genres'].astype(str) + " " +
    df['production_companies'].astype(str) + " " +
    df['original_language'].astype(str) + " " 
)

In [8]:
df = df[['tmdb_id','title', 'concat_description', 'genres']]
df.head()

Unnamed: 0,tmdb_id,title,concat_description,genres
0,210695,Much Ado About Nothing: Shakespeare's Globe Th...,Much Ado About Nothing is a comedic play by Wi...,"comedy, drama"
1,115084,Piranhaconda,A hybrid creature - half piranha and half anac...,"horror, sci-fi"
2,35128,Edge of Fury,A psychopathic young beachcomber pretends to b...,thriller
3,92341,Bird of Paradise,When a young South Seas sailor falls overboard...,"adventure, drama, romance"
4,10127,Critters 2: The Main Course,A batch of unhatched critter eggs are mistaken...,"comedy, horror, sci-fi"


### Data Preprocessing

In [9]:
def make_lower_case(text):
    text_lower = None
    text_lower = text.lower()
    return text_lower

In [10]:
def remove_stop_words(text):
    text = text.split()
    stop_words = set(stopwords.words("english"))
    removed_stop_word_text = None
    filtered_words = [word for word in text if word not in stop_words]
    removed_stop_word_text = " ".join(filtered_words)
    return removed_stop_word_text


In [11]:
def remove_numbers(text):
    pattern = r'[0-9]'
    removed_numbers_text = re.sub(pattern, '', text)
    return removed_numbers_text

In [12]:
def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'[\w-]+')
    tokens = tokenizer.tokenize(text)
    removed_punctuation_text = " ".join(tokens)
    return removed_punctuation_text

In [13]:

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text) 
    lemmatized = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    return " ".join(lemmatized)

In [14]:
df_cleaned = df.copy()
df_cleaned['cleaned_description'] = (
    df['concat_description']
      .apply(make_lower_case)
      .apply(remove_punctuation)
      .apply(remove_numbers)
      .apply(lemmatize_text)
      .apply(remove_stop_words)
)

In [15]:
def weighted_description(row, genre_weight=3):
    # split the comma-separated genre string into a list of words
    genres_list = [g.strip() for g in row['genres'].split(",")]
    # repeat genres
    genres_weighted = " ".join(genres_list * genre_weight)
    # concatenate with cleaned description
    return row['cleaned_description'] + " " + genres_weighted

df_cleaned['weighted_description'] = df_cleaned.apply(weighted_description, axis=1)

In [16]:
df_cleaned.loc[0, 'weighted_description']

'much ado nothing comedic play william shakespeare thought written shakespeare wa approaching middle career play wa included first folio published much ado nothing generally considered one shakespeare best comedy combine element robust hilarity serious meditation honor shame court politics like like twelfth night much ado nothing though interspersed darker concern joyful comedy end multiple marriage death also known globe screen much ado nothing comedy drama shakespeare sglobe english comedy drama comedy drama comedy drama'

In [17]:
# Word counts
df_cleaned['concat_word_len'] = df['concat_description'].apply(lambda x: len(x.split()))
df_cleaned['cleaned_word_len'] = df_cleaned['cleaned_description'].apply(lambda x: len(x.split()))

# Minimum and maximum
concat_min_len = df_cleaned['concat_word_len'].min()
concat_max_len = df_cleaned['concat_word_len'].max()

clean_min_len = df_cleaned['cleaned_word_len'].min()
clean_max_len = df_cleaned['cleaned_word_len'].max()

df_cleaned.drop(['concat_word_len', 'cleaned_word_len'], axis=1, inplace=True)

print(f"Minimum description length (words) before cleaning: {concat_min_len}")
print(f"Maximum description length (words) before cleaning: {concat_max_len}")
print('--------------------------------------------------------')
print(f"Minimum description length (words) after cleaning: {clean_min_len}")
print(f"Maximum description length (words) after cleaning: {clean_max_len }")

Minimum description length (words) before cleaning: 7
Maximum description length (words) before cleaning: 194
--------------------------------------------------------
Minimum description length (words) after cleaning: 6
Maximum description length (words) after cleaning: 117


In [18]:
# Convert your movie descriptions column to a list
descriptions = df_cleaned["weighted_description"].tolist()

`Model Training (Mini-LM and Mpnet)`

In [19]:
EMBED_MODEL_MINILM_PATH = "../models/Content-Based-Model/embeddings_minilm.pkl"
EMBED_MODEL_MPNET_PATH = "../models/Content-Based-Model/embeddings_mpnet.pkl"
EMBED_MODEL_SENTT5_PATH = "../models/Content-Based-Model/embeddings_sent5large.pkl"

In [20]:
model_minilm = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
model_mpnet = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)
model_sent5 = HuggingFaceEmbeddings(
    model_name="sentence-transformers/sentence-t5-large"
)

`all-MiniLM-L6-v2 (Mini-Lm)`

In [21]:
if os.path.exists(EMBED_MODEL_MINILM_PATH):
    print("Loading precomputed embeddings...")
    with open(EMBED_MODEL_MINILM_PATH, "rb") as f:
        minilm_embeddings = pickle.load(f)
else:
    print("Computing embeddings...")
    descriptions = df_cleaned["weighted_description"].tolist()
    minilm_embeddings = np.array(model_minilm.embed_documents(descriptions))
    with open(EMBED_MODEL_MINILM_PATH, "wb") as f:
        pickle.dump(minilm_embeddings, f)

Loading precomputed embeddings...


`all-mpnet-base-v2 (Mpnet)`

In [22]:
if os.path.exists(EMBED_MODEL_MPNET_PATH):
    print("Loading precomputed MPNet embeddings...")
    with open(EMBED_MODEL_MPNET_PATH, "rb") as f:
        mpnet_embeddings = pickle.load(f)
else:
    print("Computing MPNet embeddings...")
    mpnet_embeddings = np.array(model_mpnet.embed_documents(descriptions))
    with open(EMBED_MODEL_MPNET_PATH, "wb") as f:
        pickle.dump(mpnet_embeddings, f)

Loading precomputed MPNet embeddings...


`sentence-t5-large (Sent5)`

In [23]:
# ------------------- Sentence-T5-Large -------------------
if os.path.exists(EMBED_MODEL_SENTT5_PATH):
    print("Loading precomputed Sentence-T5 embeddings...")
    with open(EMBED_MODEL_SENTT5_PATH, "rb") as f:
        sentT5_embeddings = pickle.load(f)
else:
    print("Computing Sentence-T5-Large embeddings...")
    sentT5_embeddings = np.array(model_sent5.embed_documents(descriptions))
    with open(EMBED_MODEL_SENTT5_PATH, "wb") as f:
        pickle.dump(sentT5_embeddings, f)

Loading precomputed Sentence-T5 embeddings...


In [24]:
def content_based_recommend(movie_title, df, embeddings=minilm_embeddings, N=10):
    idx = df[df['title'] == movie_title].index[0]
    movie_vec = embeddings[idx].reshape(1, -1)
    sims = cosine_similarity(movie_vec, embeddings).flatten()
    top_indices = sims.argsort()[::-1][1:N+1]
    return [(df.iloc[i]['title'], round(float(sims[i]), 3)) for i in top_indices]

In [25]:
import random

random_title = random.choice(df.title.to_list())
print("Random Movie Title : ", random_title)

Random Movie Title :  Fear of Rain


In [26]:
movie_title = random_title
result1 = content_based_recommend(
    movie_title=movie_title, df=df_cleaned, embeddings=minilm_embeddings, N=15
)
result2 = content_based_recommend(
    movie_title=movie_title, df=df_cleaned, embeddings=mpnet_embeddings, N=15
)
result3 = content_based_recommend(
    movie_title=movie_title, df=df_cleaned, embeddings=sentT5_embeddings, N=15
)

In [27]:
# Combine all results into a single DataFrame
df1 = pd.DataFrame(result1, columns=["Movie", "minilm_score"])
df2 = pd.DataFrame(result2, columns=["Movie", "mpnet_score"])
df3 = pd.DataFrame(result3, columns=["Movie", "sent5_score"])

In [28]:
print(f"Movie : {movie_title}")
# Create a MultiIndex column structure
results_df = pd.concat(
    [df1, df2, df3],
    axis=1,
    keys=["all-MiniLM-L6-v2", "all-mpnet-base-v2 ", "sentence-t5-large"],
)
results_df

Movie : Fear of Rain


Unnamed: 0_level_0,all-MiniLM-L6-v2,all-MiniLM-L6-v2,all-mpnet-base-v2,all-mpnet-base-v2,sentence-t5-large,sentence-t5-large
Unnamed: 0_level_1,Movie,minilm_score,Movie,mpnet_score,Movie,sent5_score
0,"Grudge 2, The",0.716,Super Dark Times,0.808,Super Dark Times,0.919
1,Amusement,0.712,Wake Up,0.801,"Messengers, The",0.914
2,"Nightmare on Elm Street, A",0.709,Paperhouse,0.786,The Channel,0.914
3,Another Me,0.691,Incendiary,0.785,The Carrier,0.909
4,Shut In,0.682,The Paperboy,0.781,The Harvest,0.909
5,Wake Up,0.682,Goodnight Mommy (Ich seh ich seh),0.78,Split,0.908
6,The Boogeyman,0.677,Shut In,0.776,Monstrous,0.908
7,"Clean, Shaven",0.675,"Possession of Joel Delaney, The",0.775,It's Just A Game,0.908
8,Mother's Day,0.67,Mother's Day,0.774,Mysterious Skin,0.908
9,Leatherface,0.667,Another Me,0.774,"Innocents, The",0.907
