In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
links = pd.read_csv("./data/links.csv")
movies = pd.read_csv("./data/movies.csv")
ratings = pd.read_csv("./data/ratings.csv")
tags = pd.read_csv("./data/tags.csv")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


# Question 2

In [7]:
uidx = {u: i for i, u in enumerate(ratings.userId.unique())}
midx = {m: i for i, m in enumerate(ratings.movieId.unique())}
ridx = {i: m for m, i in midx.items()}

In [10]:
rows = ratings.movieId.map(midx).values # converting movie ids to indexes
cols = ratings.userId.map(uidx).values
data = ratings.rating.values

In [11]:
csr_mat = csr_matrix((data, (rows, cols)), shape = (len(midx), len(uidx)))

In [64]:
# Train KNN model
model = NearestNeighbors(metric = "cosine", algorithm = "brute", n_neighbors = 10)
model.fit(csr_mat)


In [13]:
# Get similar movies for list

movie_ids = [260, 1407, 4993]
k = 10
out = {}
for mid in movie_ids:
    if mid not in midx:
        out[mid] = []
        continue
    idx = midx[mid]
    _, neighbors = model.kneighbors(csr_mat[idx], n_neighbors = k+1)
    out[mid] = [ridx[i] for i in neighbors[0][1:]] # skipping self

out

{260: [1196, 1210, 1198, 2571, 1291, 1270, 2628, 1240, 858, 2028],
 1407: [1717, 2710, 1387, 1573, 2115, 3499, 1517, 2502, 1994, 1393],
 4993: [7153, 5952, 6539, 2571, 4306, 2959, 4226, 5349, 3578, 33794]}

In [66]:
model.kneighbors(csr_mat[260], n_neighbors = 11)

(array([[0.        , 0.56945929, 0.57363978, 0.59089154, 0.59488697,
         0.61811933, 0.61974849, 0.62130753, 0.6242686 , 0.62439215,
         0.62627967]]),
 array([[ 260,  705,   57,  509, 2396,  325,   63, 4899,  611,  474,  487]]))

In [14]:
class ItemCF:
    def __init__(self, k: int = 10, metric: str = "cosine"):
        self.k = k
        self.nn = NearestNeighbors(metric=metric, algorithm="brute")

    def fit(self, ratings: pd.DataFrame) -> None:
        """Expects columns: userId, movieId, rating"""
        
        self.uidx = {u: i for i, u in enumerate(ratings.userId.unique())}
        self.midx = {m: i for i, m in enumerate(ratings.movieId.unique())}
        self.ridx = {i: m for m, i in self.midx.items()}

        rows = ratings.movieId.map(self.midx).values
        cols = ratings.userId.map(self.uidx).values
        data = ratings.rating.values
        csr_mat = csr_matrix((data, (rows, cols)),
                         shape=(len(self.midx), len(self.uidx)))
        self.nn.fit(csr_mat)
        self._mat = csr_mat                     

    def most_similar(self, movie_ids, top: int = 10):
        """Return {movie_id: [similar_movie_ids]}"""
        out = {}
        for mid in movie_ids:
            if mid not in self.midx:
                out[mid] = []
                continue
            i = self.midx[mid]
            _, idx = self.nn.kneighbors(self._mat[i], n_neighbors=top + 1)
            out[mid] = [self.ridx[j] for j in idx[0][1:]]   # drop itself
        return out


In [15]:
cf = ItemCF(k=10); cf.fit(ratings)
similar = cf.most_similar([260, 1407, 4993])
similar


{260: [1196, 1210, 1198, 2571, 1291, 1270, 2628, 1240, 858, 2028],
 1407: [1717, 2710, 1387, 1573, 2115, 3499, 1517, 2502, 1994, 1393],
 4993: [7153, 5952, 6539, 2571, 4306, 2959, 4226, 5349, 3578, 33794]}

# Question 3

In [17]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [18]:
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

In [19]:
ratings['u_idx'] = ratings.userId.astype('category').cat.codes
ratings['m_idx'] = ratings.movieId.astype('category').cat.codes
uidx = dict(enumerate(ratings.userId.astype('category').cat.categories))
midx = dict(enumerate(ratings.movieId.astype('category').cat.categories))
ridx = {i: r for i, r in midx.items()}

In [20]:
user_tensor = torch.tensor(ratings['u_idx'].values, dtype=torch.long)
movie_tensor = torch.tensor(ratings['m_idx'].values, dtype=torch.long)
rating_tensor = torch.tensor(ratings.rating.values, dtype=torch.float32)

In [21]:
# model parameters

num_users = len(uidx)
num_movies = len(midx)
n_factors= 20

# Learnable parameters
mu = rating_tensor.mean()  # scalar constant, not learnable
bu = nn.Parameter(torch.zeros(num_users))                         # user bias
bi = nn.Parameter(torch.zeros(num_movies))                        # item bias
pu = nn.Parameter(torch.randn(num_users, n_factors) * 0.01)       # user latent
qi = nn.Parameter(torch.randn(num_movies, n_factors) * 0.01)      # item latent

In [22]:
optimizer = optim.Adam([bu, bi, pu, qi], lr=0.05)
epochs = 30
lambda_ = 0.01 


for epoch in range(epochs):
    preds = mu + bu[user_tensor] + bi[movie_tensor] + (pu[user_tensor] * qi[movie_tensor]).sum(1)
    
    reg_term = (
        lambda_ * (
            bu[user_tensor].pow(2).sum() +
            bi[movie_tensor].pow(2).sum() +
            pu[user_tensor].pow(2).sum() +
            qi[movie_tensor].pow(2).sum()
        )
    )

    loss = ((preds - rating_tensor) ** 2).mean() + reg_term / len(user_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")

Epoch 1: Loss = 1.0869
Epoch 2: Loss = 1.0143
Epoch 3: Loss = 0.9233
Epoch 4: Loss = 0.8332
Epoch 5: Loss = 0.7400
Epoch 6: Loss = 0.6488
Epoch 7: Loss = 0.5688
Epoch 8: Loss = 0.5082
Epoch 9: Loss = 0.4670
Epoch 10: Loss = 0.4391
Epoch 11: Loss = 0.4170
Epoch 12: Loss = 0.3958
Epoch 13: Loss = 0.3742
Epoch 14: Loss = 0.3534
Epoch 15: Loss = 0.3350
Epoch 16: Loss = 0.3198
Epoch 17: Loss = 0.3077
Epoch 18: Loss = 0.2982
Epoch 19: Loss = 0.2905
Epoch 20: Loss = 0.2837
Epoch 21: Loss = 0.2774
Epoch 22: Loss = 0.2713
Epoch 23: Loss = 0.2652
Epoch 24: Loss = 0.2594
Epoch 25: Loss = 0.2540
Epoch 26: Loss = 0.2491
Epoch 27: Loss = 0.2446
Epoch 28: Loss = 0.2407
Epoch 29: Loss = 0.2372
Epoch 30: Loss = 0.2340


In [23]:
mu + bu[uidx[1]] + bi + (pu[uidx[1]] * qi).sum(1)

tensor([1.6777, 1.5493, 1.8262,  ..., 2.4948, 2.4837, 3.2667],
       grad_fn=<AddBackward0>)

In [24]:
(pu[uidx[1]] * qi).dim()

2

In [25]:
user_ids_to_recommend = [1, 2, 3]
for uid in user_ids_to_recommend:
    uidx_i = uidx[uid]
    
    scores = mu + bu[uidx_i] + bi + (pu[uidx_i] * qi).sum(1)
    
    seen = set(ratings[ratings.userId == uid].movieId.map(midx))  # model indices
    sorted_idxs = scores.argsort(descending=True)

    #top_idxs = [i.item() for i in sorted_idxs if i.item() not in seen and i.item() in ridx][:10]
    top_idxs = [i.item() for i in sorted_idxs if i.item() in ridx][:10]
    top_movie_ids = [ridx[i] for i in top_idxs]

    print(f"\nTop 10 recommendations for user {uid}: {top_movie_ids}")


Top 10 recommendations for user 1: [4467, 71530, 49932, 1274, 213, 26258, 2491, 32892, 183897, 106489]

Top 10 recommendations for user 2: [3089, 3608, 27611, 6993, 2290, 3556, 1982, 3435, 176371, 3754]

Top 10 recommendations for user 3: [4041, 52435, 3841, 1408, 2126, 31878, 745, 1251, 86347, 1212]


## OOP refactoring

In [27]:
import torch, torch.nn as nn

class MFModel(nn.Module):
    def __init__(self, n_users:int, n_items:int, n_latent:int=32):
        super().__init__()
        self.user_p = nn.Embedding(n_users, n_latent)
        self.item_q = nn.Embedding(n_items, n_latent)
        self.user_b = nn.Embedding(n_users, 1)
        self.item_b = nn.Embedding(n_items, 1)
        self.mu     = nn.Parameter(torch.zeros(1), requires_grad=False)

    def forward(self, u, i):
        pred = (self.mu +
                self.user_b(u).squeeze() + self.item_b(i).squeeze() +
                (self.user_p(u) * self.item_q(i)).sum(1))
        return pred


    def train(model, df, n_epochs=10, lr=5e-3, lambda_=1e-4):
        opt = torch.optim.Adam(model.parameters(), lr=lr)
        users = torch.tensor(df.u_idx.values, dtype=torch.long)
        items = torch.tensor(df.m_idx.values, dtype=torch.long)
        ratings = torch.tensor(df.rating.values, dtype=torch.float32)
    
        model.mu.data.fill_(ratings.mean())  # initialize global mean
    
        for epoch in range(n_epochs):
            opt.zero_grad()
            preds = model(users, items)
    
            # Regularization manually
            reg = (
                model.user_b(users).pow(2).sum() +
                model.item_b(items).pow(2).sum() +
                model.user_p(users).pow(2).sum() +
                model.item_q(items).pow(2).sum()
            )
            loss = ((ratings - preds) ** 2).mean() + lambda_ * reg / len(users)
    
            loss.backward()
            opt.step()
            print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")


    def recommend_top_k(model, user_ids, k=10):
        with torch.no_grad():
            all_items = torch.arange(model.item_q.num_embeddings)
            out = {}
            for uid in user_ids:
                u = torch.tensor([uid], dtype=torch.long)
                preds = model(u.repeat(len(all_items)), all_items)
                top = preds.argsort(descending=True)[:k]
                out[int(uid)] = [int(item) for item in top]
            return out


In [28]:
# Encode user and movie IDs as categorical indices
ratings['u_idx'] = ratings.userId.astype('category').cat.codes  # maps userId to internal index
ratings['m_idx'] = ratings.movieId.astype('category').cat.codes  # maps movieId to internal index

# Initialize the matrix factorization model
n_users = ratings['u_idx'].nunique()
n_items = ratings['m_idx'].nunique()
model = MFModel(n_users, n_items, n_latent=32)

# Train the model on the ratings data
MFModel.train(model, ratings, n_epochs=30)

# Generate top 10 movie recommendations for user IDs 1, 2, and 3
# map original user IDs to internal indices used in training
original_user_ids = [1, 2, 3]
internal_user_ids = [ratings[ratings.userId == uid].u_idx.values[0] for uid in original_user_ids]

# Get recommendations
recs = MFModel.recommend_top_k(model, internal_user_ids, k=10)

# Map back internal user IDs to original user IDs for display
user_id_map = dict(enumerate(ratings.userId.astype('category').cat.categories))
final_recommendations = {
    user_id_map[uid]: recs[uid] for uid in recs
}

# Display recommendations
print("Top 10 movie recommendations for users 1, 2, and 3:")
for user, movie_ids in final_recommendations.items():
    print(f"User {user}: {movie_ids}")


Epoch 1: Loss = 35.8938
Epoch 2: Loss = 35.1456
Epoch 3: Loss = 34.4127
Epoch 4: Loss = 33.6949
Epoch 5: Loss = 32.9922
Epoch 6: Loss = 32.3046
Epoch 7: Loss = 31.6319
Epoch 8: Loss = 30.9741
Epoch 9: Loss = 30.3310
Epoch 10: Loss = 29.7024
Epoch 11: Loss = 29.0882
Epoch 12: Loss = 28.4882
Epoch 13: Loss = 27.9022
Epoch 14: Loss = 27.3300
Epoch 15: Loss = 26.7713
Epoch 16: Loss = 26.2260
Epoch 17: Loss = 25.6938
Epoch 18: Loss = 25.1745
Epoch 19: Loss = 24.6678
Epoch 20: Loss = 24.1734
Epoch 21: Loss = 23.6912
Epoch 22: Loss = 23.2208
Epoch 23: Loss = 22.7620
Epoch 24: Loss = 22.3145
Epoch 25: Loss = 21.8781
Epoch 26: Loss = 21.4525
Epoch 27: Loss = 21.0375
Epoch 28: Loss = 20.6327
Epoch 29: Loss = 20.2379
Epoch 30: Loss = 19.8529
Top 10 movie recommendations for users 1, 2, and 3:
User 1: [7832, 1694, 5577, 4050, 5686, 5103, 6174, 1099, 3711, 8574]
User 2: [1516, 7318, 7433, 266, 6334, 3137, 6505, 4751, 3116, 6960]
User 3: [9302, 3540, 9097, 7361, 8884, 9494, 2165, 2739, 1596, 3857]


# Question 4

With access to item features and an open-source LLM, one effective way to improve the recommendation system is to enhance the matrix factorization model by incorporating content-based item embeddings. We can use the LLM to generate dense feature vectors from item metadata (e.g., movie descriptions, tags, or genres), and then integrate these vectors directly into the model’s item representation (item_q). By combining learned latent embeddings with LLM-derived semantic features, we create a hybrid model that captures both collaborative and content-based signals. This improves personalization, reduces cold-start issues, and makes recommendations more robust and explainable.
