In [1]:
import pandas as pd
from cf import ItemCF
from mf import MFModel
from sklearn.model_selection import train_test_split

In [2]:
# Read each CSV under ./data into its own DataFrame, in a fixed order.
csv_paths = (
    "./data/links.csv",
    "./data/movies.csv",
    "./data/ratings.csv",
    "./data/tags.csv",
)
links, movies, ratings, tags = [pd.read_csv(path) for path in csv_paths]

# Question 2
## Item-Based Collaborative Filtering model with similarity scores

In [4]:
# Movie IDs whose nearest neighbours we want to inspect.
query_movie_ids = [260, 1407, 4993]

# Build the item-based CF model (k=10) and fit it on the ratings table.
cf = ItemCF(k=10)
cf.fit(ratings)

# Look up the similar movies for every query ID in one call.
similar = cf.most_similar(query_movie_ids)

print("Top 10 similar movies for ids 260, 1407 and 4993:")
similar

Top 10 similar movies for ids 260, 1407 and 4993:


{260: [1196, 1210, 1198, 2571, 1291, 1270, 2628, 1240, 858, 2028],
 1407: [1717, 2710, 1387, 1573, 2115, 3499, 1517, 2502, 1994, 1393],
 4993: [7153, 5952, 6539, 2571, 4306, 2959, 4226, 5349, 3578, 33794]}

In [5]:
# Print the similar-movie table for movieId 260, using `movies` to resolve titles.
# NOTE(review): the printed table is in ascending movieId order, whereas the list
# returned by most_similar for 260 is ordered differently -- confirm whether the
# table is meant to preserve the ranking from most_similar.
cf.print_similar_table(260, movies)


For movie:
Star Wars: Episode IV - A New Hope (1977) (movieId: 260)

Top 10 Similar Movies:

   movieId                                              title
0      858                              Godfather, The (1972)
1     1196  Star Wars: Episode V - The Empire Strikes Back...
2     1198  Raiders of the Lost Ark (Indiana Jones and the...
3     1210  Star Wars: Episode VI - Return of the Jedi (1983)
4     1240                             Terminator, The (1984)
5     1270                          Back to the Future (1985)
6     1291          Indiana Jones and the Last Crusade (1989)
7     2028                         Saving Private Ryan (1998)
8     2571                                 Matrix, The (1999)
9     2628   Star Wars: Episode I - The Phantom Menace (1999)


# Question 3
## Matrix Factorization model

In [27]:
# Encode raw user/movie IDs as integer codes.
# Each column is converted to a categorical exactly once so that the codes
# written to `ratings` and the code -> ID lookup tables below are guaranteed
# to come from the same encoding.
user_cat = ratings.userId.astype('category')
movie_cat = ratings.movieId.astype('category')
ratings['u_idx'] = user_cat.cat.codes
ratings['m_idx'] = movie_cat.cat.codes

# Lookup tables: integer code -> original ID.
uidx = dict(enumerate(user_cat.cat.categories))
midx = dict(enumerate(movie_cat.cat.categories))
# `ridx` holds the same code -> movieId mapping as `midx`; it is kept as a
# separate dict because later cells pass it by this name.
ridx = dict(midx)

# Train-test split: hold out 20% of the rating rows, with a fixed seed.
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)

# Model init + train: the model is sized by the number of distinct codes.
n_users = ratings.u_idx.nunique()
n_items = ratings.m_idx.nunique()
model = MFModel(n_users, n_items, n_latent=20)
model.train_model(train_df, test_df, n_epochs=30, lambda_=0.05)

# Recommended movie IDs display
# NOTE(review): the model is sized by the encoded u_idx/m_idx codes; confirm
# whether recommend_top_k expects raw userIds or u_idx codes here, and whether
# it returns movieIds or m_idx codes (`midx` maps codes back to movieIds).
print("\nTop 10 recommended movies for users 1, 2 and 3:")
model.recommend_top_k(user_ids=[1, 2, 3], k=10)


Epoch 1: Loss = 25.2011 | Test RMSE = 4.5230
Epoch 2: Loss = 20.9938 | Test RMSE = 4.2732
Epoch 3: Loss = 17.5252 | Test RMSE = 4.0435
Epoch 4: Loss = 14.6826 | Test RMSE = 3.8331
Epoch 5: Loss = 12.3609 | Test RMSE = 3.6407
Epoch 6: Loss = 10.4684 | Test RMSE = 3.4653
Epoch 7: Loss = 8.9264 | Test RMSE = 3.3053
Epoch 8: Loss = 7.6692 | Test RMSE = 3.1593
Epoch 9: Loss = 6.6423 | Test RMSE = 3.0260
Epoch 10: Loss = 5.8011 | Test RMSE = 2.9042
Epoch 11: Loss = 5.1093 | Test RMSE = 2.7926
Epoch 12: Loss = 4.5378 | Test RMSE = 2.6902
Epoch 13: Loss = 4.0630 | Test RMSE = 2.5960
Epoch 14: Loss = 3.6662 | Test RMSE = 2.5092
Epoch 15: Loss = 3.3325 | Test RMSE = 2.4291
Epoch 16: Loss = 3.0497 | Test RMSE = 2.3549
Epoch 17: Loss = 2.8085 | Test RMSE = 2.2860
Epoch 18: Loss = 2.6013 | Test RMSE = 2.2218
Epoch 19: Loss = 2.4219 | Test RMSE = 2.1620
Epoch 20: Loss = 2.2657 | Test RMSE = 2.1062
Epoch 21: Loss = 2.1286 | Test RMSE = 2.0539
Epoch 22: Loss = 2.0078 | Test RMSE = 2.0049
Epoch 23: Los

{1: [7576, 8840, 8011, 348, 6883, 2527, 3805, 8320, 8249, 3930],
 2: [8002, 9534, 5203, 6081, 7702, 5229, 6534, 4122, 8708, 5353],
 3: [1864, 7813, 1688, 5455, 1926, 1714, 8213, 3856, 4284, 8160]}

In [19]:
# Recommended movie title display: show user 1's top-rated movies next to the
# model's top-10 recommendations, with titles looked up in `movies`.
# `ridx` is the code -> movieId lookup built in the training cell; presumably it
# is used to translate the model's item indices back to movieIds -- confirm in mf.py.
model.display_user_recommendations(user_id=1, ratings_df=ratings, movies_df=movies, ridx=ridx, k=10)


Top 5 movies rated by user 1:

   movieId                                      title  rating
0     5060               M*A*S*H (a.k.a. MASH) (1970)     5.0
1     2872                           Excalibur (1981)     5.0
2     1291  Indiana Jones and the Last Crusade (1989)     5.0
3     1298                Pink Floyd: The Wall (1982)     5.0
4     2948               From Russia with Love (1963)     5.0

Top 10 movie recommendations for user 1:

   movieId                                  title
0     6598                Step Into Liquid (2002)
1     7839                      Love Crazy (1941)
2     8191       Anne of the Thousand Days (1969)
3    59731       Bigger, Stronger, Faster* (2008)
4    97194  Thing: Terror Takes Shape, The (1998)
5    98083                     Jackass 3.5 (2011)
6   101884                       Dark Tide (2012)
7   104760                         Getaway (2013)
8   105835                     Double, The (2013)
9   181659    Craig Ferguson: Tickle Fight (2017)


# Question 4
## Improvements to the recommendation system

With access to item features and an open-source LLM, one effective way to improve the recommendation system is to enhance the matrix factorization model with content-based item embeddings. We can use the LLM to generate dense feature vectors from item metadata (e.g., movie descriptions, tags, or genres) and then integrate these vectors directly into the model's item representation (`item_q`). By combining the learned latent embeddings with LLM-derived semantic features, we obtain a hybrid model that captures both collaborative and content-based signals. This improves personalization, mitigates the cold-start problem for items with few or no ratings, and makes recommendations more robust and explainable.