In [1]:
!nvidia-smi

Fri Oct  3 18:20:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   57C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
!pip install kagglehub gradio torch torchvision torchtext torchmetrics tqdm pandas numpy scikit-learn

Collecting torchtext
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Collecting torchmetrics
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchmetrics-1.8.2-py3-none-any.whl (983 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m983.2/983.2 kB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.15.2-py3-none-any.whl (29 kB)
Installing collected packages: lightning-utilities, torchtext, torchmetrics
Successfully installed lightning-utilities-0.15.2 torchmetrics-1.8.2 torchtext-0.18.0


In [3]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [4]:
import kagglehub

path = kagglehub.dataset_download("grouplens/movielens-20m-dataset")
print(f"Path to dataset files: {path}")

RATINGS_CSV = os.path.join(path, "rating.csv")
MOVIES_CSV = os.path.join(path, "rating.csv")


Downloading from https://www.kaggle.com/api/v1/datasets/download/grouplens/movielens-20m-dataset?dataset_version_number=1...


100%|██████████| 195M/195M [00:05<00:00, 39.1MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1


In [5]:
ratings = pd.read_csv(RATINGS_CSV)
movies = pd.read_csv(MOVIES_CSV)

print("Missing values in ratings:\n", ratings.isnull().sum())
print("Missing values in movies:\n", movies.isnull().sum())

for col in ratings.columns:
  if ratings[col].isnull().sum() > 0:
    if ratings[col].dtype in [np.float64, np.int64]:
      ratings[col].fillna(ratings[col].mean(), inplace=True)
    else:
      ratings.drop(columns=[col], inplace=True)

for col in movies.columns:
  if movies[col].isnull().sum() > 0:
    if movies[col].dtype == 'object':
      movies.drop(column=[col], inplace=True)


user2idx = {u:i for i, u in enumerate(ratings['userId'].unique())}
movie2idx = {m:i for i, m in enumerate(ratings['movieId'].unique())}
idx2movie = {i:m for m, i in movie2idx.items()}

n_users = len(user2idx)
n_movies = len(movie2idx)

print(f"Users: {n_users}, Movies: {n_movies}, Ratings: {len(ratings)}")


Missing values in ratings:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Missing values in movies:
 userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Users: 138493, Movies: 26744, Ratings: 20000263


In [6]:
train_val , test = train_test_split(ratings, test_size=0.20, random_state=42)
train, val = train_test_split(train_val, test_size=0.125, random_state=42)

print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")


Train: 14000183, Val: 2000027, Test: 4000053


In [7]:
class MovieLensDataset(Dataset):
    def __init__(self, df, user2idx, movie2idx):
        # Map raw IDs to contiguous indices
        self.users = torch.tensor(df['userId'].map(user2idx).values, dtype=torch.long)
        self.movies = torch.tensor(df['movieId'].map(movie2idx).values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.movies[idx], self.ratings[idx]


train_ds = MovieLensDataset(train, user2idx, movie2idx)
val_ds   = MovieLensDataset(val, user2idx, movie2idx)
test_ds  = MovieLensDataset(test, user2idx, movie2idx)

train_dl = DataLoader(train_ds, batch_size=1024, shuffle=True)
val_dl   = DataLoader(val_ds, batch_size=1024)
test_dl  = DataLoader(test_ds, batch_size=1024)


In [8]:
class MatrixFactorization(nn.Module):
  def __init__(self, n_users, n_items, n_factors=100):
    super().__init__()
    self.user_emb = nn.Embedding(n_users, n_factors)
    self.item_emb = nn.Embedding(n_items, n_factors)
    self.user_bias = nn.Embedding(n_users, 1)
    self.item_bias = nn.Embedding(n_items, 1)

  def forward(self, users, items):
    u = self.user_emb(users)
    v = self.item_emb(items)
    dot = (u*v).sum(1, keepdim=True)
    return dot + self.user_bias(users) + self.item_bias(items)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MatrixFactorization(n_users, n_movies, n_factors=64).to(device)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

def train_one_epoch():
  model.train()
  total_loss = 0
  for u, m, r in train_dl:
    u, m, r = u.to(device), m.to(device), r.to(device)
    pred = model(u, m).squeeze()
    loss = loss_fn(pred, r)
    opt.zero_grad()
    loss.backward()
    opt.step()
    total_loss += loss.item() * len(r)
  return total_loss / len(train_dl.dataset)

def evaluate(dl):
  model.eval()
  total_loss = 0
  with torch.no_grad():
    for u, m, r in dl:
      u, m, r = u.to(device), m.to(device), r.to(device)
      pred = model(u, m).squeeze()
      loss = loss_fn(pred, r)
      total_loss += loss.item() * len(r)
  return total_loss / len(dl.dataset)


for epoch in range(6):
  train_loss = train_one_epoch()
  val_loss = evaluate(val_dl)
  print(f"Epoch {epoch+1}: Train Loss:{train_loss:.4f} Validation Loss: {val_loss:.4f}")

Epoch 1: Train Loss:31.0421 Validation Loss: 9.3297
Epoch 2: Train Loss:4.5868 Validation Loss: 3.3463
Epoch 3: Train Loss:1.7320 Validation Loss: 2.0332
Epoch 4: Train Loss:1.0606 Validation Loss: 1.5986
Epoch 5: Train Loss:0.8439 Validation Loss: 1.4015
Epoch 6: Train Loss:0.7456 Validation Loss: 1.2898


In [18]:
def recommend_topN_fast(model, user_idx, seen_indices, N=10, batch_size=1024):
    model.eval()
    recs = []

    all_items = torch.arange(n_movies).to(device)
    scores_list = []

    with torch.no_grad():
        for i in range(0, n_movies, batch_size):
            items_batch = all_items[i:i+batch_size]
            users_batch = torch.tensor([user_idx]*len(items_batch)).to(device)
            scores_batch = model(users_batch, items_batch).squeeze()
            scores_list.append(scores_batch)

    scores = torch.cat(scores_list).cpu().numpy()
    scores[list(seen_indices)] = -np.inf  # ignore seen items
    top_idx = np.argsort(-scores)[:N]
    return top_idx

In [15]:
import random
sample_users = random.sample(list(test_truth.keys()), min(2000, len(test_truth)))
test_truth_sample = {u: test_truth[u] for u in sample_users}


In [16]:
# Precompute movie indices for test users
test_truth_idx = {}
seen_indices = {}
for user_id in sample_users:
    test_truth_idx[user_id] = set(movie2idx[m] for m in test_truth[user_id] if m in movie2idx)
    seen_indices[user_id] = set(movie2idx[m] for m in train[train['userId']==user_id]['movieId'] if m in movie2idx)


In [23]:
import math

def precision_recall_ndcg_fast(model, test_users, test_truth_idx, seen_indices, K=10):
    precisions, recalls, ndcgs = [], [], []

    for user_id in test_users:
        recs = recommend_topN_fast(model, user2idx[user_id], seen_indices[user_id], N=K)
        relevant = test_truth_idx[user_id]

        # metrics
        prec = len([r for r in recs if r in relevant]) / K
        rec = len([r for r in recs if r in relevant]) / len(relevant)
        dcg = sum([1 / math.log2(i + 2) for i, r in enumerate(recs) if r in relevant])
        idcg = sum([1 / math.log2(i + 2) for i in range(min(len(relevant), K))])
        ndcg = dcg / idcg if idcg>0 else 0

        precisions.append(prec)
        recalls.append(rec)
        ndcgs.append(ndcg)

    return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)

# Evaluate
p10, r10, n10 = precision_recall_ndcg_fast(model, sample_users, test_truth_idx, seen_indices)
print(f"Precision@10: {p10:.4f}, Recall@10: {r10:.4f}, NDCG@10: {n10:.4f}")

Precision@10: 0.0000, Recall@10: 0.0000, NDCG@10: 0.0000


In [26]:
def recommend_movies(user_id, N=10):
  if user_id not in user2idx:
    print(f"User ID {user_id} not found.")
    return []

  uidx = user2idx[user_id]

  user_train_ratings = train[train['userId'] == user_id]
  seen_movie_ids = set(user_train_ratings['movieId'].values)

  seen_movie_indices = set(movie2idx[movie_id] for movie_id in seen_movie_ids if movie_id in movie2idx)

  rec_idx = recommend_topN_fast(model, uidx, seen_movie_indices, N=N)
  rec_titles = []
  for idx in rec_idx:
    movie_id = idx2movie[idx]
    title_row = movies[movies['movieId']==movie_id]
    if not title_row.empty:
      rec_titles.append(f"Movie ID: {movie_id}")
    else:
      rec_titles.append(f"Movie ID: {movie_id} (Title not found)")
  return rec_titles

user_to_recommend = 1
recommendations = recommend_movies(user_to_recommend, N=10)
print(f"Top 10 recommendations for user {user_to_recommend}:")
for rec in recommendations:
  print(rec)

Top 10 recommendations for user 1:
Movie ID: 94904
Movie ID: 47482
Movie ID: 83138
Movie ID: 131262
Movie ID: 101920
Movie ID: 114340
Movie ID: 130060
Movie ID: 91921
Movie ID: 83823
Movie ID: 117488


In [27]:
import torch, pickle

torch.save(model.state_dict(), "mf_model.pt")
with open("mappings.pkl", "wb") as f:
  pickle.dump({"user2idx": user2idx, "movie2idx": movie2idx, "idx2movie": idx2movie}, f)
movies.to_csv("movies_metadata.csv", index=False)