In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
links = pd.read_csv("./data/links.csv")
movies = pd.read_csv("./data/movies.csv")
ratings = pd.read_csv("./data/ratings.csv")
tags = pd.read_csv("./data/tags.csv")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


# Question 2

In [7]:
uidx = {u: i for i, u in enumerate(ratings.userId.unique())}
midx = {m: i for i, m in enumerate(ratings.movieId.unique())}
ridx = {i: m for m, i in midx.items()}

In [8]:
#ridx

In [9]:
#midx

In [10]:
rows = ratings.movieId.map(midx).values # converting movie ids to indexes
cols = ratings.userId.map(uidx).values
data = ratings.rating.values

In [11]:
csr_mat = csr_matrix((data, (rows, cols)), shape = (len(midx), len(uidx)))

In [12]:
# Train KNN model
model = NearestNeighbors(metric = "cosine", algorithm = "brute", n_neighbors = 11)
model.fit(csr_mat)


In [13]:
# Get similar movies for list

movie_ids = [260, 1407, 4993]
k = 10
out = {}
for mid in movie_ids:
    if mid not in midx:
        out[mid] = []
        continue
    idx = midx[mid]
    _, neighbors = model.kneighbors(csr_mat[idx], n_neighbors = k+1)
    out[mid] = [ridx[i] for i in neighbors[0][1:]] # skipping self

out

{260: [1196, 1210, 1198, 2571, 1291, 1270, 2628, 1240, 858, 2028],
 1407: [1717, 2710, 1387, 1573, 2115, 3499, 1517, 2502, 1994, 1393],
 4993: [7153, 5952, 6539, 2571, 4306, 2959, 4226, 5349, 3578, 33794]}

In [14]:
class ItemCF:
    def __init__(self, k: int = 10, metric: str = "cosine"):
        self.k = k
        self.nn = NearestNeighbors(metric=metric, algorithm="brute")

    def fit(self, ratings: pd.DataFrame) -> None:
        """Expects columns: userId, movieId, rating"""
        
        self.uidx = {u: i for i, u in enumerate(ratings.userId.unique())}
        self.midx = {m: i for i, m in enumerate(ratings.movieId.unique())}
        self.ridx = {i: m for m, i in self.midx.items()}

        rows = ratings.movieId.map(self.midx).values
        cols = ratings.userId.map(self.uidx).values
        data = ratings.rating.values
        csr_mat = csr_matrix((data, (rows, cols)),
                         shape=(len(self.midx), len(self.uidx)))
        self.nn.fit(csr_mat)
        self._mat = csr_mat                     

    def most_similar(self, movie_ids, top: int = 10):
        """Return {movie_id: [similar_movie_ids]}"""
        out = {}
        for mid in movie_ids:
            if mid not in self.midx:
                out[mid] = []
                continue
            i = self.midx[mid]
            _, idx = self.nn.kneighbors(self._mat[i], n_neighbors=top + 1)
            out[mid] = [self.ridx[j] for j in idx[0][1:]]   # drop itself
        return out


In [15]:
cf = ItemCF(k=10); cf.fit(ratings)
similar = cf.most_similar([260, 1407, 4993])
similar


{260: [1196, 1210, 1198, 2571, 1291, 1270, 2628, 1240, 858, 2028],
 1407: [1717, 2710, 1387, 1573, 2115, 3499, 1517, 2502, 1994, 1393],
 4993: [7153, 5952, 6539, 2571, 4306, 2959, 4226, 5349, 3578, 33794]}

# Question 3

In [17]:
import torch
import torch.nn as nn
import torch.optim as optim

In [18]:
uidx = {u : i for i, u in enumerate(ratings.userId.unique())}
midx = {m : i for i, m in enumerate(ratings.movieId.unique())}
ridx = {i : r for i, r in midx.items()}

In [19]:
user_tensor = torch.tensor(ratings.userId.map(uidx).values)
movie_tensor = torch.tensor(ratings.movieId.map(midx).values)
rating_tensor = torch.tensor(ratings.rating.values, dtype=torch.float32)

In [20]:
# model parameters

num_users = len(uidx)
num_movies = len(midx)
n_factors= 20

mu = rating_tensor.mean()
bu = torch.zeros(num_users, requires_grad = True)
bi = torch.zeros(num_movies, requires_grad = True)
pu = nn.Parameter(torch.randn(num_users, n_factors) * 0.01)
qi = nn.Parameter(torch.randn(num_movies, n_factors) * 0.01)

In [21]:
optimizer = optim.Adam([bu, bi, pu, qi], lr=0.05)
epochs = 30
lambda_ = 0.01 
for epoch in range(epochs):
    preds = mu + bu[user_tensor] + bi[movie_tensor] + (pu[user_tensor] * qi[movie_tensor]).sum(1)
    
    reg_term = (
        lambda_ * (
            bu[user_tensor].pow(2).sum() +
            bi[movie_tensor].pow(2).sum() +
            pu[user_tensor].pow(2).sum() +
            qi[movie_tensor].pow(2).sum()
        )
    )

    loss = ((preds - rating_tensor) ** 2).mean() + reg_term / len(user_tensor)
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")

Epoch 1: Loss = 1.0869
Epoch 2: Loss = 1.0134
Epoch 3: Loss = 0.9214
Epoch 4: Loss = 0.8291
Epoch 5: Loss = 0.7335
Epoch 6: Loss = 0.6417
Epoch 7: Loss = 0.5645
Epoch 8: Loss = 0.5080
Epoch 9: Loss = 0.4686
Epoch 10: Loss = 0.4395
Epoch 11: Loss = 0.4155
Epoch 12: Loss = 0.3940
Epoch 13: Loss = 0.3736
Epoch 14: Loss = 0.3546
Epoch 15: Loss = 0.3376
Epoch 16: Loss = 0.3232
Epoch 17: Loss = 0.3112
Epoch 18: Loss = 0.3014
Epoch 19: Loss = 0.2932
Epoch 20: Loss = 0.2860
Epoch 21: Loss = 0.2794
Epoch 22: Loss = 0.2732
Epoch 23: Loss = 0.2674
Epoch 24: Loss = 0.2618
Epoch 25: Loss = 0.2566
Epoch 26: Loss = 0.2517
Epoch 27: Loss = 0.2472
Epoch 28: Loss = 0.2431
Epoch 29: Loss = 0.2395
Epoch 30: Loss = 0.2361


In [22]:
user_ids_to_recommend = [1, 2, 3]
for uid in user_ids_to_recommend:
    uidx_i = uidx[uid]
    
    scores = mu + bu[uidx_i] + bi + (pu[uidx_i] * qi).sum(1)
    
    seen = set(ratings[ratings.userId == uid].movieId.map(midx))  # model indices
    sorted_idxs = scores.argsort(descending=True)

    #top_idxs = [i.item() for i in sorted_idxs if i.item() not in seen and i.item() in ridx][:10]
    top_idxs = [i.item() for i in sorted_idxs if i.item() in ridx][:10]
    top_movie_ids = [ridx[i] for i in top_idxs]

    print(f"\nTop 10 recommendations for user {uid}: {top_movie_ids}")


Top 10 recommendations for user 1: [4072, 2702, 6710, 1551, 312, 66, 2597, 1567, 1735, 4910]

Top 10 recommendations for user 2: [5322, 527, 5078, 9370, 2560, 2196, 2921, 6592, 8075, 6736]

Top 10 recommendations for user 3: [3605, 5075, 850, 4889, 6197, 2683, 4065, 687, 9416, 4046]
