In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the dataset; the `clean_text` column is the text that gets vectorised.
df = pd.read_csv("../data/cleaned_netflix.csv")

# TF-IDF settings:
#   max_df=0.85        -> drop terms that appear in more than 85% of documents
#   min_df=2           -> drop terms that appear in fewer than 2 documents
#   ngram_range=(1, 2) -> use both unigrams and bigrams as features
vectorizer = TfidfVectorizer(
    max_df=0.85,
    min_df=2,
    ngram_range=(1, 2)
)

# `pd.read_csv` parses empty fields as NaN, and TfidfVectorizer raises on NaN
# documents, so treat missing text as an empty document. Filling (rather than
# dropping) keeps row i of `tfidf` aligned with row i of `df`.
tfidf = vectorizer.fit_transform(df['clean_text'].fillna(""))
tfidf.shape



(8807, 19059)

In [3]:
from sklearn.decomposition import TruncatedSVD

# Number of latent dimensions kept by the SVD; this is the main knob of the
# LSA step and also the width of `lsa_matrix` below.
n_components = 100  # hyperparameter (important)

# random_state is fixed so repeated runs produce the same embedding.
svd = TruncatedSVD(n_components=n_components, random_state=42)
# Fit on the TF-IDF matrix and project every document into the reduced space;
# row i of `lsa_matrix` is the embedding of row i of `tfidf`.
lsa_matrix = svd.fit_transform(tfidf)

lsa_matrix.shape


(8807, 100)

In [4]:
# Total share of variance retained by the kept components (sum of the
# per-component ratios). The recorded run shows roughly 0.138, i.e. most of the
# TF-IDF variance is discarded at n_components = 100.
svd.explained_variance_ratio_.sum()


np.float64(0.1376672426135253)

In [5]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two 1-D vectors.

    Parameters
    ----------
    vec1, vec2 : array-like
        Vectors of equal length.

    Returns
    -------
    numpy.float64
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``. If either vector has
        zero norm the cosine is undefined; ``0.0`` is returned in that case
        instead of NaN, so results stay sortable and no divide-by-zero
        warning is emitted.
    """
    denom = norm(vec1) * norm(vec2)
    if denom == 0:
        # A zero vector has no direction: treat it as "not similar".
        return np.float64(0.0)
    return np.dot(vec1, vec2) / denom


In [6]:
def recommend_lsa(title, top_k=5):
    """Return the titles whose LSA embeddings are most similar to ``title``.

    Reads the notebook-level ``df`` (for titles) and ``lsa_matrix`` (for
    embeddings). Row ``i`` of ``lsa_matrix`` is matched with the ``i``-th row
    of ``df`` by position.

    Parameters
    ----------
    title : str
        Exact value to look up in ``df['title']``. If it occurs more than
        once, the first occurrence is used as the query.
    top_k : int, default 5
        Number of recommendations to return (applied as a ``[:top_k]`` slice).

    Returns
    -------
    list of (title, similarity) tuples
        Sorted by descending cosine similarity, excluding the query row
        itself. Rows with a zero-norm embedding score ``0.0``.
    str
        The string ``"Title not found"`` when ``title`` is not in ``df``.
    """
    titles = df['title'].to_numpy()
    matches = np.flatnonzero((df['title'] == title).to_numpy())
    if matches.size == 0:
        return "Title not found"

    # Use the row *position*, not the index label: `lsa_matrix` is a plain
    # array and is only aligned with `df` positionally.
    idx = matches[0]

    # Cosine similarity of the query against every row in one shot.
    norms = np.linalg.norm(lsa_matrix, axis=1)
    dots = lsa_matrix @ lsa_matrix[idx]
    denom = norms * norms[idx]
    # Zero-norm rows would give 0/0 = NaN and corrupt the ordering; score
    # them as 0.0 instead.
    sims = np.divide(
        dots, denom, out=np.zeros_like(dots, dtype=float), where=denom > 0
    )

    # Every row except the query, ranked by descending similarity. A stable
    # sort keeps ties in their original row order.
    others = np.delete(np.arange(len(titles)), idx)
    order = np.argsort(-sims[others], kind="stable")
    ranked = others[order][:top_k]
    return [(titles[i], sims[i]) for i in ranked]


In [15]:
# Example query: show the default top-5 nearest titles in LSA space.
recommend_lsa("It's Okay to Not Be Okay")


[('Magic Phone', np.float64(0.9165439766650283)),
 ('This Is My Love', np.float64(0.8677525375368532)),
 ('Oh My Ghost', np.float64(0.8615169287964746)),
 ('Chocolate', np.float64(0.8560417508765805)),
 ('Momo Salon', np.float64(0.8545225574986102))]

In [16]:
import numpy as np

# Write the document embeddings to disk in NumPy's .npy format. Only the array
# is saved, so a consumer must rely on row order to map rows back to titles
# (presumably the row order of cleaned_netflix.csv -- confirm with the reader).
np.save("../data/lsa_embeddings.npy", lsa_matrix)
