# 1.0 Loading Libraries and Data

In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [3]:
links = pd.read_csv("./data/links.csv")
movies = pd.read_csv("./data/movies.csv")
ratings = pd.read_csv("./data/ratings.csv")
tags = pd.read_csv("./data/tags.csv")

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


# 2.0 Question 2

## 2.1 Creating CSR Matrix

In [9]:
uidx = {u: i for i, u in enumerate(ratings.userId.unique())}
midx = {m: i for i, m in enumerate(ratings.movieId.unique())}
ridx = {i: m for m, i in midx.items()}

In [10]:
rows = ratings.movieId.map(midx).values # converting movie ids to indexes
cols = ratings.userId.map(uidx).values
data = ratings.rating.values

In [11]:
csr_mat = csr_matrix((data, (rows, cols)), shape = (len(midx), len(uidx)))

## 2.2 Fitting the model with CSR Matrix

In [13]:
# Train KNN model
model = NearestNeighbors(metric = "cosine", algorithm = "brute", n_neighbors = 10)
model.fit(csr_mat)


## 2.3 Generating Similar Movies

In [15]:
# Get similar movies for list

movie_ids = [260, 1407, 4993]
k = 10
out = {}
for mid in movie_ids:
    if mid not in midx:
        out[mid] = []
        continue
    idx = midx[mid]
    _, neighbors = model.kneighbors(csr_mat[idx], n_neighbors = k+1)
    out[mid] = [ridx[i] for i in neighbors[0][1:]] # skipping self

out

{260: [1196, 1210, 1198, 2571, 1291, 1270, 2628, 1240, 858, 2028],
 1407: [1717, 2710, 1387, 1573, 2115, 3499, 1517, 2502, 1994, 1393],
 4993: [7153, 5952, 6539, 2571, 4306, 2959, 4226, 5349, 3578, 33794]}

## 2.4 Printing Similar Movie Titles

In [17]:
# Reference movie
ref_movie = movies[movies.movieId == 260][['movieId', 'title']]

# Get neighbor movieIds (excluding the original movie itself)
neighbor_ids = [ridx[i] for i in neighbors[0][1:]]

# Titles of similar movies
similar_movies = movies[movies.movieId.isin(neighbor_ids)][['movieId', 'title']]
similar_movies = similar_movies.reset_index(drop=True)

# Print table
print(f"\nFor movie:\n{ref_movie.iloc[0]['title']} (movieId: {ref_movie.iloc[0]['movieId']})")
print("\nTop 10 Similar Movies:")
print(similar_movies)



For movie:
Star Wars: Episode IV - A New Hope (1977) (movieId: 260)

Top 10 Similar Movies:
   movieId                                              title
0     2571                                 Matrix, The (1999)
1     2959                                  Fight Club (1999)
2     3578                                   Gladiator (2000)
3     4226                                     Memento (2000)
4     4306                                       Shrek (2001)
5     5349                                  Spider-Man (2002)
6     5952      Lord of the Rings: The Two Towers, The (2002)
7     6539  Pirates of the Caribbean: The Curse of the Bla...
8     7153  Lord of the Rings: The Return of the King, The...
9    33794                               Batman Begins (2005)


## 2.5 Refactoring Code to OOP 

In [19]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

class ItemCF:
    def __init__(self, k: int = 10, metric: str = "cosine"):
        self.k = k
        self.nn = NearestNeighbors(metric=metric, algorithm="brute")

    def fit(self, ratings: pd.DataFrame) -> None:
        """Expects columns: userId, movieId, rating"""
        
        self.uidx = {u: i for i, u in enumerate(ratings.userId.unique())}
        self.midx = {m: i for i, m in enumerate(ratings.movieId.unique())}
        self.ridx = {i: m for m, i in self.midx.items()}

        rows = ratings.movieId.map(self.midx).values
        cols = ratings.userId.map(self.uidx).values
        data = ratings.rating.values
        csr_mat = csr_matrix((data, (rows, cols)),
                         shape=(len(self.midx), len(self.uidx)))
        self.nn.fit(csr_mat)
        self._mat = csr_mat                     

    def most_similar(self, movie_ids, top: int = 10):
        """Return {movie_id: [similar_movie_ids]}"""
        out = {}
        for mid in movie_ids:
            if mid not in self.midx:
                out[mid] = []
                continue
            i = self.midx[mid]
            _, idx = self.nn.kneighbors(self._mat[i], n_neighbors=top + 1)
            out[mid] = [self.ridx[j] for j in idx[0][1:]]   # drop itself
        return out

    
    def print_similar_table(self, movie_id: int, movies_df: pd.DataFrame) -> None:
        """Print a table of similar movies for the given movie_id"""
        if movie_id not in self.midx:
            print(f"Movie ID {movie_id} not found in training data.")
            return

        i = self.midx[movie_id]
        _, neighbors = self.nn.kneighbors(self._mat[i], n_neighbors=self.k + 1)

        # Get original movie and neighbor IDs
        neighbor_ids = [self.ridx[j] for j in neighbors[0][1:]]
        ref_movie = movies_df[movies_df.movieId == movie_id][['movieId', 'title']]

        # Get titles of neighbors
        similar_movies = movies_df[movies_df.movieId.isin(neighbor_ids)][['movieId', 'title']]
        similar_movies = similar_movies.reset_index(drop=True)

        # Display
        print(f"\nFor movie:\n{ref_movie.iloc[0]['title']} (movieId: {ref_movie.iloc[0]['movieId']})")
        print("\nTop", self.k, "Similar Movies:\n")
        print(similar_movies)

In [20]:
cf = ItemCF(k=10)
cf.fit(ratings)
similar = cf.most_similar([260, 1407, 4993])

similar


{260: [1196, 1210, 1198, 2571, 1291, 1270, 2628, 1240, 858, 2028],
 1407: [1717, 2710, 1387, 1573, 2115, 3499, 1517, 2502, 1994, 1393],
 4993: [7153, 5952, 6539, 2571, 4306, 2959, 4226, 5349, 3578, 33794]}

In [21]:
cf.print_similar_table(260, movies)


For movie:
Star Wars: Episode IV - A New Hope (1977) (movieId: 260)

Top 10 Similar Movies:

   movieId                                              title
0      858                              Godfather, The (1972)
1     1196  Star Wars: Episode V - The Empire Strikes Back...
2     1198  Raiders of the Lost Ark (Indiana Jones and the...
3     1210  Star Wars: Episode VI - Return of the Jedi (1983)
4     1240                             Terminator, The (1984)
5     1270                          Back to the Future (1985)
6     1291          Indiana Jones and the Last Crusade (1989)
7     2028                         Saving Private Ryan (1998)
8     2571                                 Matrix, The (1999)
9     2628   Star Wars: Episode I - The Phantom Menace (1999)


# 3.0 Question 3

## 3.1 Creating User, Item (Movies), Ratings Tensors

In [24]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

In [25]:
ratings['u_idx'] = ratings.userId.astype('category').cat.codes
ratings['m_idx'] = ratings.movieId.astype('category').cat.codes
uidx = dict(enumerate(ratings.userId.astype('category').cat.categories))
midx = dict(enumerate(ratings.movieId.astype('category').cat.categories))
ridx = {i: r for i, r in midx.items()}

In [26]:
# Split the data 
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)

In [27]:
# Convert train set to tensors
user_tensor = torch.tensor(train_df['u_idx'].values, dtype=torch.long)
movie_tensor = torch.tensor(train_df['m_idx'].values, dtype=torch.long)
rating_tensor = torch.tensor(train_df.rating.values, dtype=torch.float32)

# Convert test set to tensors
test_user_tensor = torch.tensor(test_df['u_idx'].values, dtype=torch.long)
test_item_tensor = torch.tensor(test_df['m_idx'].values, dtype=torch.long)
test_rating_tensor = torch.tensor(test_df['rating'].values, dtype=torch.float32)

## 3.2 Initiating Learnable Parameters

In [29]:
# model parameters
num_users = len(uidx)
num_movies = len(midx)
n_factors= 10

# Learnable parameters
mu = rating_tensor.mean()                                     # scalar constant, not learnable
bu = nn.Parameter(torch.zeros(num_users))                     # user bias
bi = nn.Parameter(torch.zeros(num_movies))                    # item bias
pu = nn.Parameter(torch.randn(num_users, n_factors) * 0.01)   # user latent
qi = nn.Parameter(torch.randn(num_movies, n_factors) * 0.01)  # item latent

## 3.3 Training Model

In [31]:
optimizer = optim.Adam([bu, bi, pu, qi], lr=0.05)
epochs = 15
lambda_ = 0.9 

train_losses = []
test_rmses = []

for epoch in range(epochs):
    preds = mu + bu[user_tensor] + bi[movie_tensor] + (pu[user_tensor] * qi[movie_tensor]).sum(1)

    reg_term = (
        lambda_ * (
            bu[user_tensor].pow(2).sum() +
            bi[movie_tensor].pow(2).sum() +
            pu[user_tensor].pow(2).sum() +
            qi[movie_tensor].pow(2).sum()
        )
    )

    loss = ((preds - rating_tensor) ** 2).mean() + reg_term / len(user_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())

    # Evaluate RMSE on test set
    with torch.no_grad():
        test_preds = mu + bu[test_user_tensor] + bi[test_item_tensor] + (pu[test_user_tensor] * qi[test_item_tensor]).sum(1)
        test_rmse = torch.sqrt(((test_preds - test_rating_tensor) ** 2).mean()).item()
        test_rmses.append(test_rmse)

    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f} | Test RMSE = {test_rmse:.4f}")

Epoch 1: Loss = 1.0853 | Test RMSE = 1.0190
Epoch 2: Loss = 1.0488 | Test RMSE = 0.9930
Epoch 3: Loss = 0.9647 | Test RMSE = 0.9714
Epoch 4: Loss = 0.9315 | Test RMSE = 0.9537
Epoch 5: Loss = 0.9160 | Test RMSE = 0.9393
Epoch 6: Loss = 0.8915 | Test RMSE = 0.9280
Epoch 7: Loss = 0.8696 | Test RMSE = 0.9191
Epoch 8: Loss = 0.8607 | Test RMSE = 0.9124
Epoch 9: Loss = 0.8609 | Test RMSE = 0.9073
Epoch 10: Loss = 0.8600 | Test RMSE = 0.9036
Epoch 11: Loss = 0.8544 | Test RMSE = 0.9010
Epoch 12: Loss = 0.8482 | Test RMSE = 0.8994
Epoch 13: Loss = 0.8456 | Test RMSE = 0.8985
Epoch 14: Loss = 0.8461 | Test RMSE = 0.8981
Epoch 15: Loss = 0.8461 | Test RMSE = 0.8983


## 3.4 Generating Recomendations

In [33]:
user_ids_to_recommend = [1, 2, 3]

for uid in user_ids_to_recommend:
    uidx_i = uidx[uid]
    
    scores = mu + bu[uidx_i] + bi + (pu[uidx_i] * qi).sum(1)

    sorted_idxs = scores.argsort(descending=True)

    top_idxs = [i.item() for i in sorted_idxs if i.item() in ridx][:10]
    top_movie_ids = [ridx[i] for i in top_idxs]

    print(f"\nTop 10 recommendations for user {uid}: {top_movie_ids}")


Top 10 recommendations for user 1: [6818, 6835, 5746, 5181, 3567, 132333, 5490, 25947, 4789, 156605]

Top 10 recommendations for user 2: [6818, 5181, 6835, 5746, 3567, 132333, 5490, 25947, 4789, 156605]

Top 10 recommendations for user 3: [6818, 5181, 5746, 6835, 3567, 132333, 5490, 25947, 4789, 156605]


## 3.5 Printing Recomended Movie Titles

In [35]:
# Get neighbor movieIds (excluding the original movie itself)
user_id = 1
uidx_i = uidx[user_id]
scores = mu + bu[uidx_i] + bi + (pu[uidx_i] * qi).sum(1)
sorted_idxs = scores.argsort(descending=True)
top_idxs = [i.item() for i in sorted_idxs if i.item() in ridx][:10]
top_movie_ids = [ridx[i] for i in top_idxs]

# Titles of recomended movies
recomended_movies = movies[movies.movieId.isin(top_movie_ids)][['movieId', 'title']]
recomended_movies = similar_movies.reset_index(drop=True)

# Top 5 Rated Movies by User
user_rated = ratings[ratings.userId == user_id]
top_rated = user_rated.sort_values(by="rating", ascending=False).head(5)
top_rated_movies = top_rated.merge(movies, on="movieId")[['movieId', 'title', 'rating']]
top_rated_movies = top_rated_movies.reset_index(drop=True)

# Print user liked movies
print(f"\nTop 5 movies rated by user {user_id}:\n")
print(top_rated_movies)

# Print recomended movies 
print(f"For user {user_id} Top 10 Recomended Movies:\n")
print(recomended_movies)


Top 5 movies rated by user 1:

   movieId                                      title  rating
0     5060               M*A*S*H (a.k.a. MASH) (1970)     5.0
1     2872                           Excalibur (1981)     5.0
2     1291  Indiana Jones and the Last Crusade (1989)     5.0
3     1298                Pink Floyd: The Wall (1982)     5.0
4     2948               From Russia with Love (1963)     5.0
For user 1 Top 10 Recomended Movies:

   movieId                                              title
0     2571                                 Matrix, The (1999)
1     2959                                  Fight Club (1999)
2     3578                                   Gladiator (2000)
3     4226                                     Memento (2000)
4     4306                                       Shrek (2001)
5     5349                                  Spider-Man (2002)
6     5952      Lord of the Rings: The Two Towers, The (2002)
7     6539  Pirates of the Caribbean: The Curse of the Bla...


## 3.6 OOP refactoring

In [37]:
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

class MFModel(nn.Module):
    def __init__(self, n_users: int, n_items: int, n_latent: int = 20):
        super().__init__()
        self.user_p = nn.Embedding(n_users, n_latent)
        self.item_q = nn.Embedding(n_items, n_latent)
        self.user_b = nn.Embedding(n_users, 1)
        self.item_b = nn.Embedding(n_items, 1)
        self.mu = nn.Parameter(torch.zeros(1), requires_grad=False)

    def forward(self, u, i):
        return (
            self.mu +
            self.user_b(u).squeeze() +
            self.item_b(i).squeeze() +
            (self.user_p(u) * self.item_q(i)).sum(1)
        )

    def train_model(self, train_df, test_df, n_epochs=30, lr=0.05, lambda_=0.01):
        optimizer = optim.Adam(self.parameters(), lr=lr)
        users = torch.tensor(train_df.u_idx.values, dtype=torch.long)
        items = torch.tensor(train_df.m_idx.values, dtype=torch.long)
        ratings = torch.tensor(train_df.rating.values, dtype=torch.float32)

        test_users = torch.tensor(test_df.u_idx.values, dtype=torch.long)
        test_items = torch.tensor(test_df.m_idx.values, dtype=torch.long)
        test_ratings = torch.tensor(test_df.rating.values, dtype=torch.float32)

        self.mu.data.fill_(ratings.mean())

        for epoch in range(n_epochs):
            self.train()
            optimizer.zero_grad()
            preds = self(users, items)

            reg = (
                self.user_b(users).pow(2).sum() +
                self.item_b(items).pow(2).sum() +
                self.user_p(users).pow(2).sum() +
                self.item_q(items).pow(2).sum()
            )
            loss = ((ratings - preds) ** 2).mean() + lambda_ * reg / len(users)
            loss.backward()
            optimizer.step()

            # Evaluate on test set
            self.eval()
            with torch.no_grad():
                test_preds = self(test_users, test_items)
                test_rmse = torch.sqrt(((test_preds - test_ratings) ** 2).mean()).item()

            print(f"Epoch {epoch+1}: Loss = {loss.item():.4f} | Test RMSE = {test_rmse:.4f}")

    def recommend_top_k(self, user_ids, k=10):
        self.eval()
        with torch.no_grad():
            all_items = torch.arange(self.item_q.num_embeddings)
            out = {}
            for uid in user_ids:
                u = torch.tensor([uid], dtype=torch.long)
                preds = self(u.repeat(len(all_items)), all_items)
                top = preds.argsort(descending=True)[:k]
                out[int(uid)] = [int(item) for item in top]
            return out

    def display_user_recommendations(self, user_id, ratings_df, movies_df, ridx, k=10):
        uidx_i = int(ratings_df[ratings_df.userId == user_id].u_idx.values[0])
        user_idx_tensor = torch.tensor([uidx_i], dtype=torch.long)

        scores = self.mu + self.user_b(user_idx_tensor).squeeze() + \
         self.item_b.weight.squeeze() + \
         (self.user_p(user_idx_tensor) * self.item_q.weight).sum(1)

        sorted_idxs = scores.argsort(descending=True)
        top_idxs = [i.item() for i in sorted_idxs if i.item() in ridx][:k]
        top_movie_ids = [ridx[i] for i in top_idxs]

        recommended = movies_df[movies_df.movieId.isin(top_movie_ids)][['movieId', 'title']]
        recommended = recommended.reset_index(drop=True)

        # Top 5 user-rated movies
        user_ratings = ratings_df[ratings_df.userId == user_id]
        top_rated = user_ratings.sort_values(by="rating", ascending=False).head(5)
        top_rated_movies = top_rated.merge(movies_df, on="movieId")[['movieId', 'title', 'rating']]
        top_rated_movies = top_rated_movies.reset_index(drop=True)

        print(f"\nTop 5 movies rated by user {user_id}:\n")
        print(top_rated_movies)

        print(f"\nTop {k} movie recommendations for user {user_id}:\n")
        print(recommended)


In [71]:
# Encode IDs
ratings['u_idx'] = ratings.userId.astype('category').cat.codes
ratings['m_idx'] = ratings.movieId.astype('category').cat.codes
uidx = dict(enumerate(ratings.userId.astype('category').cat.categories))
midx = dict(enumerate(ratings.movieId.astype('category').cat.categories))
ridx = {i: r for i, r in midx.items()}

# Train-test split
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)

# Model init + train
n_users = ratings.u_idx.nunique()
n_items = ratings.m_idx.nunique()
model = MFModel(n_users, n_items, n_latent=20)
model.train_model(train_df, test_df, n_epochs=30, lambda_=0.05)

print("\nTop 10 recommended movies for users 1, 2 and 3:")
model.recommend_top_k(user_ids=[1, 2, 3], k=10)


Epoch 1: Loss = 25.0415 | Test RMSE = 4.5321
Epoch 2: Loss = 20.8639 | Test RMSE = 4.2839
Epoch 3: Loss = 17.4195 | Test RMSE = 4.0554
Epoch 4: Loss = 14.5974 | Test RMSE = 3.8457
Epoch 5: Loss = 12.2935 | Test RMSE = 3.6537
Epoch 6: Loss = 10.4160 | Test RMSE = 3.4783
Epoch 7: Loss = 8.8869 | Test RMSE = 3.3180
Epoch 8: Loss = 7.6406 | Test RMSE = 3.1716
Epoch 9: Loss = 6.6225 | Test RMSE = 3.0378
Epoch 10: Loss = 5.7883 | Test RMSE = 2.9152
Epoch 11: Loss = 5.1020 | Test RMSE = 2.8029
Epoch 12: Loss = 4.5345 | Test RMSE = 2.6998
Epoch 13: Loss = 4.0627 | Test RMSE = 2.6049
Epoch 14: Loss = 3.6682 | Test RMSE = 2.5175
Epoch 15: Loss = 3.3361 | Test RMSE = 2.4368
Epoch 16: Loss = 3.0546 | Test RMSE = 2.3621
Epoch 17: Loss = 2.8144 | Test RMSE = 2.2929
Epoch 18: Loss = 2.6080 | Test RMSE = 2.2286
Epoch 19: Loss = 2.4294 | Test RMSE = 2.1688
Epoch 20: Loss = 2.2739 | Test RMSE = 2.1130
Epoch 21: Loss = 2.1375 | Test RMSE = 2.0609
Epoch 22: Loss = 2.0171 | Test RMSE = 2.0121
Epoch 23: Los

{1: [7555, 4261, 3122, 2833, 5292, 1650, 8860, 3445, 3743, 6498],
 2: [5064, 591, 5503, 8918, 42, 3762, 8829, 8270, 634, 3802],
 3: [8721, 1456, 6062, 2563, 3664, 2487, 6287, 3471, 6185, 4499]}

In [39]:
# Recommend & display
model.display_user_recommendations(user_id=1, ratings_df=ratings, movies_df=movies, ridx=ridx, k=10)


Top 5 movies rated by user 1:

   movieId                                      title  rating
0     5060               M*A*S*H (a.k.a. MASH) (1970)     5.0
1     2872                           Excalibur (1981)     5.0
2     1291  Indiana Jones and the Last Crusade (1989)     5.0
3     1298                Pink Floyd: The Wall (1982)     5.0
4     2948               From Russia with Love (1963)     5.0

Top 10 movie recommendations for user 1:

   movieId                                  title
0      737                       Barb Wire (1996)
1     5901                          Empire (2002)
2     6656     Attack of the Puppet People (1958)
3     6748                      Brood, The (1979)
4    64032                Four Christmases (2008)
5    78103      Shake Hands with the Devil (2007)
6    81535  Saw VII 3D - The Final Chapter (2010)
7   134246                        Survivor (2015)
8   151745                      Reptilicus (1961)
9   184253         The Cloverfield Paradox (2018)


# 4.0 Question 4

With access to item features and an open-source LLM, one effective way to improve the recommendation system is to enhance the matrix factorization model by incorporating content-based item embeddings. We can use the LLM to generate dense feature vectors from item metadata (e.g., movie descriptions, tags, or genres), and then integrate these vectors directly into the model’s item representation (item_q). By combining learned latent embeddings with LLM-derived semantic features, we create a hybrid model that captures both collaborative and content-based signals. This improves personalization, reduces cold-start issues, and makes recommendations more robust and explainable.
