In [1]:
import pandas as pd
from cf import ItemCF
from mf import MFModel

In [2]:
links = pd.read_csv("./data/links.csv")
movies = pd.read_csv("./data/movies.csv")
ratings = pd.read_csv("./data/ratings.csv")
tags = pd.read_csv("./data/tags.csv")

# Question 2
## Item-Based Collaborative Filtering model with similarity scores

In [4]:
cf = ItemCF(k=10)
cf.fit(ratings)
similar = cf.most_similar([260, 1407, 4993])
print("Top 10 similar movies for ids 260, 1407 and 4993:")
similar

Top 10 similar movies for ids 260, 1407 and 4993:


{260: [1196, 1210, 1198, 2571, 1291, 1270, 2628, 1240, 858, 2028],
 1407: [1717, 2710, 1387, 1573, 2115, 3499, 1517, 2502, 1994, 1393],
 4993: [7153, 5952, 6539, 2571, 4306, 2959, 4226, 5349, 3578, 33794]}

# Question 3
## Matrix Factorization model

In [13]:
# Encode user and movie IDs as categorical indices
ratings['u_idx'] = ratings.userId.astype('category').cat.codes  # maps userId to internal index
ratings['m_idx'] = ratings.movieId.astype('category').cat.codes  # maps movieId to internal index

# Initialize the matrix factorization model
n_users = ratings['u_idx'].nunique()
n_items = ratings['m_idx'].nunique()
model = MFModel(n_users, n_items, n_latent=32)

# Train the model on the ratings data
MFModel.train(model, ratings, n_epochs=30)

# Generate top 10 movie recommendations for user IDs 1, 2, and 3
# map original user IDs to internal indices used in training
original_user_ids = [1, 2, 3]
internal_user_ids = [ratings[ratings.userId == uid].u_idx.values[0] for uid in original_user_ids]

# Get recommendations
recs = MFModel.recommend_top_k(model, internal_user_ids, k=10)

# Map back internal user IDs to original user IDs for display
user_id_map = dict(enumerate(ratings.userId.astype('category').cat.categories))
final_recommendations = {
    user_id_map[uid]: recs[uid] for uid in recs
}

# Display recommendations
print("")
print("Top 10 movie recommendations for users 1, 2, and 3:")
for user, movie_ids in final_recommendations.items():
    print(f"User {user}: {movie_ids}")

Epoch 5: Loss = 30.8307
Epoch 10: Loss = 27.7287
Epoch 15: Loss = 24.9674
Epoch 20: Loss = 22.5225
Epoch 25: Loss = 20.3646
Epoch 30: Loss = 18.4625

Top 10 movie recommendations for users 1, 2, and 3:
User 1: [3371, 5011, 9501, 4453, 8470, 3149, 4467, 1786, 5327, 3051]
User 2: [2284, 9170, 843, 1342, 5519, 623, 753, 8682, 8850, 8536]
User 3: [114, 881, 2823, 8242, 3225, 8583, 1342, 8941, 5396, 7883]


# Question 4
## improvements in the recommendation system

With access to item features and an open-source LLM, one effective way to improve the recommendation system is to enhance the matrix factorization model by incorporating content-based item embeddings. We can use the LLM to generate dense feature vectors from item metadata (e.g., movie descriptions, tags, or genres), and then integrate these vectors directly into the model’s item representation (item_q). By combining learned latent embeddings with LLM-derived semantic features, we create a hybrid model that captures both collaborative and content-based signals. This improves personalization, reduces cold-start issues, and makes recommendations more robust and explainable.