In [17]:
!nvidia-smi

Sat Oct  4 14:06:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   78C    P0             33W /   70W |     334MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Package Installation

In [18]:
!pip install kagglehub gradio torch torchvision torchtext torchmetrics tqdm pandas numpy scikit-learn



## Package Import

In [19]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import kagglehub

## Find and Download Dataset

In [20]:
path = kagglehub.dataset_download("grouplens/movielens-20m-dataset")
print(f"Dataset downloaded to: {path}")

ratings_files = [f for f in os.listdir(path) if "rating" in f.lower() and f.endswith(".csv")]
movies_files = [f for f in os.listdir(path) if "movie" in f.lower() and f.endswith(".csv")]

if not ratings_files or not movies_files:
    raise FileNotFoundError("Ratings or movies CSV not found in the dataset folder.")

RATINGS_CSV = os.path.join(path, ratings_files[0])
MOVIES_CSV = os.path.join(path, movies_files[0])

print(f"Using ratings file: {RATINGS_CSV}")
print(f"Using movies file: {MOVIES_CSV}")

ratings_full = pd.read_csv(RATINGS_CSV)
movies_full = pd.read_csv(MOVIES_CSV)

print(f"Full ratings shape: {ratings_full.shape}")
print(f"Full movies shape: {movies_full.shape}")

## Taking a fraction of dataset for easiertraining purpose.
fraction = 0.2
ratings = ratings_full.sample(frac=fraction, random_state=42)

movies= movies_full[movies_full['movieId'].isin(ratings['movieId'].unique())].copy()

print(f"Fraction ratings shape: {ratings.shape}")
print(f"Fraction movies shape: {movies.shape}")

print(f"Missing values in fraction ratings:\n{ratings.isnull().sum()}")
print(f"Missing values in fraction movies:\n{movies.isnull().sum()}")

Using Colab cache for faster access to the 'movielens-20m-dataset' dataset.
Dataset downloaded to: /kaggle/input/movielens-20m-dataset
Using ratings file: /kaggle/input/movielens-20m-dataset/rating.csv
Using movies file: /kaggle/input/movielens-20m-dataset/movie.csv
Full ratings shape: (20000263, 4)
Full movies shape: (27278, 3)
Fraction ratings shape: (4000053, 4)
Fraction movies shape: (20357, 3)
Missing values in fraction ratings:
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64
Missing values in fraction movies:
movieId    0
title      0
genres     0
dtype: int64


In [21]:
user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()

user2idx = {u:i for i,u in enumerate(user_ids)}
movie2idx = {m:i for i,m in enumerate(movie_ids)}
idx2movie = {i:m for m, i in movie2idx.items()}

movieid2title = movies.set_index("movieId")['title'].to_dict()

n_users = len(user2idx)
n_movies = len(movie2idx)

print(f"Users:{n_users}, Movies: {n_movies}, Ratings: {len(ratings)}")

Users:138339, Movies: 20357, Ratings: 4000053


In [22]:
# Train, Test, Validation Split -- 70% /20% / 10%

train_val, test = train_test_split(ratings, test_size=0.20, random_state=42)
train, val = train_test_split(train_val, test_size=0.125, random_state=42)

print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")

Train: 2800036, Val: 400006, Test: 800011


## Defining Dataset

In [23]:
class MovieLensDataset(Dataset):
  def __init__(self, df, user2idx, movie2idx):
    self.users = torch.tensor(df['userId'].map(user2idx).values, dtype=torch.long)
    self.movies = torch.tensor(df['movieId'].map(movie2idx).values, dtype=torch.long)
    self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

  def __len__(self):
    return len(self.ratings)

  def __getitem__(self, idx):
    return self.users[idx], self.movies[idx], self.ratings[idx]



train_ds = MovieLensDataset(train, user2idx, movie2idx)
val_ds = MovieLensDataset(val, user2idx, movie2idx)
test_ds = MovieLensDataset(test, user2idx, movie2idx)

train_dl = DataLoader(train_ds, batch_size=1024, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=1024)
test_dl = DataLoader(test_ds, batch_size=1024)

## Defining Matrix Factorization Model

In [24]:
class MatrixFactorization(nn.Module):
  def __init__(self, n_users, n_items, n_factors=64):
    super().__init__()
    self.user_emb = nn.Embedding(n_users, n_factors)
    self.item_emb = nn.Embedding(n_items, n_factors)
    self.user_bias = nn.Embedding(n_users, 1)
    self.item_bias = nn.Embedding(n_items, 1)

    nn.init.normal_(self.user_emb.weight, std=0.01)
    nn.init.normal_(self.item_emb.weight, std=0.01)
    nn.init.zeros_(self.user_bias.weight)
    nn.init.zeros_(self.item_bias.weight)

  def forward(self, users, items):
    u = self.user_emb(users)
    v = self.item_emb(items)
    dot = (u * v).sum(1)
    return dot + self.user_bias(users).squeeze() + self.item_bias(items).squeeze()


## Training Process

In [25]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MatrixFactorization(n_users, n_movies, n_factors=64).to(device)
opt = torch.optim.Adam(model.parameters(), lr = 1e-3, weight_decay = 1e-5)
loss_fn = nn.MSELoss()

def train_one_epoch():
  model.train()
  total_loss = 0.0
  for u, m, r in train_dl:
    u, m, r = u.to(device), m.to(device), r.to(device)
    pred = model(u, m)
    loss = loss_fn(pred, r)
    opt.zero_grad()
    loss.backward()
    opt.step()
    total_loss += loss.item() * r.size(0)
  return total_loss / len(train_dl.dataset)

def evaluate(dataloader):
  model.eval()
  total_loss = 0.0
  with torch.no_grad():
    for u, m, r in dataloader:
      u, m, r = u.to(device), m.to(device), r.to(device)
      pred = model(u, m)
      loss = loss_fn(pred, r)
      total_loss += loss.item() * r.size(0)
  return total_loss / len(dataloader.dataset)


for epoch in range(60):
  train_loss = train_one_epoch()
  val_loss = evaluate(val_dl)
  print(f"Epoch {epoch+1}: Train Loss: {train_loss:.4f} ; Validation Loss: {val_loss:.4f}")

Epoch 1: Train Loss: 4.4203 ; Validation Loss: 1.3748
Epoch 2: Train Loss: 1.1550 ; Validation Loss: 1.1062
Epoch 3: Train Loss: 1.0361 ; Validation Loss: 1.0522
Epoch 4: Train Loss: 0.9897 ; Validation Loss: 1.0192
Epoch 5: Train Loss: 0.9469 ; Validation Loss: 0.9894
Epoch 6: Train Loss: 0.9030 ; Validation Loss: 0.9648
Epoch 7: Train Loss: 0.8590 ; Validation Loss: 0.9416
Epoch 8: Train Loss: 0.8130 ; Validation Loss: 0.9237
Epoch 9: Train Loss: 0.7650 ; Validation Loss: 0.9101
Epoch 10: Train Loss: 0.7178 ; Validation Loss: 0.8982
Epoch 11: Train Loss: 0.6749 ; Validation Loss: 0.8907
Epoch 12: Train Loss: 0.6393 ; Validation Loss: 0.8846
Epoch 13: Train Loss: 0.6088 ; Validation Loss: 0.8805
Epoch 14: Train Loss: 0.5837 ; Validation Loss: 0.8775
Epoch 15: Train Loss: 0.5638 ; Validation Loss: 0.8751
Epoch 16: Train Loss: 0.5473 ; Validation Loss: 0.8735
Epoch 17: Train Loss: 0.5337 ; Validation Loss: 0.8721
Epoch 18: Train Loss: 0.5234 ; Validation Loss: 0.8714
Epoch 19: Train Los

## Ground Truth for model evaluation

In [26]:
from collections import defaultdict

test_truth = defaultdict(list)
for row in test.itertuples(index=False):
  uid = row.userId
  mid = row.movieId
  if row.rating >= 4.0:
    if uid in user2idx and mid in movie2idx:
      test_truth[uid].append(mid)

test_truth_idx = {}
for uid, mids in test_truth.items():
  test_truth_idx[uid] = set(movie2idx[m] for m in mids if m in movie2idx)

seen_indicies = {}
train_grouped = train.groupby('userId')['movieId'].apply(list).to_dict()
for uid in train_grouped:
  seen_indicies[uid] = set(movie2idx[m] for m in train_grouped[uid] if m in movie2idx)

print(f"Prepared test_truth and seen_indicies for evaluation.")

Prepared test_truth and seen_indicies for evaluation.


## Top N movie recommendation for a given user

In [27]:
def recommend_topN_fast(model, user_idx, seen_indicies_set, N=10, batch_size=2048):
  model.eval()
  all_items = torch.arange(n_movies, device=device)
  scores_chunks = []

  with torch.no_grad():
    for start in range(0, n_movies, batch_size):
      end = min(start + batch_size, n_movies)
      items_batch = all_items[start:end]
      users_batch = torch.full((end - start,), user_idx, dtype = torch.long, device = device)
      scores_batch = model(users_batch, items_batch)
      scores_chunks.append(scores_batch.cpu())


  scores = torch.cat(scores_chunks).numpy()

  if seen_indicies_set:
    mask = np.array(list(seen_indicies_set))
    scores[list(seen_indicies_set)] = -np.inf


  top_idx = np.argpartition(-scores, N)[:N]
  top_idx = top_idx[np.argsort(-scores[top_idx])]
  return top_idx.tolist()




## Evaluation: Precision@K, Recall@K, NDCG@K

In [28]:
import math
import random

def precision_recall_ndcg_fast(model, user_ids, test_truth_idx, seen_indicies, K=10):
  precisions, recalls, ndcgs = [], [], []
  for uid in user_ids:
    if uid not in user2idx:
      continue
    relevent = test_truth_idx.get(uid, set())
    if len(relevent) == 0:
      continue

    recs = recommend_topN_fast(model, user2idx[uid], seen_indicies.get(uid, set()), N=K)
    hit_count = sum(1 for r in recs if r in relevent)
    prec = hit_count / K
    rec = hit_count / len(relevent)
    dcg = sum(1 / math.log2(i+2) for i, r in enumerate(recs) if r in relevent)
    idcg = sum(1 / math.log2(i+2) for i in range(min(len(relevent), K)))
    ndcg = dcg / idcg if idcg > 0 else 0
    precisions.append(prec); recalls.append(rec); ndcgs.append(ndcg)
  return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)

In [29]:
all_test_users = list(test_truth_idx.keys())
sample_users = random.sample(all_test_users, min(2000, len(all_test_users)))
p10, r10, n10 = precision_recall_ndcg_fast(model, sample_users, test_truth_idx, seen_indicies, K=10)
print(f"Precision@10: {p10:.4f}, Recall@10: {r10:.4f}, NDCG@10: {n10:.4f}")

Precision@10: 0.0167, Recall@10: 0.0564, NDCG@10: 0.0380


## Movie Recommendation for user ID input

In [32]:
def recommend_movies(user_id, N=10):
  if user_id not in user2idx:
    print(f"User ID {user_id} not found.0")
    return []

  uidx = user2idx[user_id]
  seen_set = seen_indicies.get(user_id, set())

  top_movies_indicies = recommend_topN_fast(model, uidx, seen_set, N=N)

  titles = []

  for midx in top_movies_indicies:
    raw_movie_id = idx2movie.get(midx, None)
    title = movieid2title.get(raw_movie_id, None)
    if title:
      titles.append(title)
    else:
      titles.append(f"Movie ID {raw_movie_id}")
  return titles


try:
    user_to_recommend = int(input('Enter user ID to get movie recommendation: '))
    recs = recommend_movies(user_to_recommend, N=10)

    if len(recs) == 0:
      print("No recommendations found.")
    else:
      print(f"\nTop{len(recs)} recommendations for user {user_to_recommend}: \n")
      for i, t in enumerate(recs, 1):
        print(f"{i}. {t}")
except ValueError:
       print("Invalid Input. Please enter a numeric user ID...")

Enter user ID to get movie recommendation: 16

Top10 recommendations for user 16: 

1. Shawshank Redemption, The (1994)
2. Schindler's List (1993)
3. Godfather, The (1972)
4. Star Wars: Episode IV - A New Hope (1977)
5. American History X (1998)
6. Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
7. Apollo 13 (1995)
8. Forrest Gump (1994)
9. Usual Suspects, The (1995)
10. Fargo (1996)


# For Deployment

In [33]:
import torch
import pickle

# Save model weights
torch.save(model.state_dict(), "mf_model_64.pt")  # n_factors=64 included in filename for clarity

# Save mappings
with open("mapping_64.pkl", "wb") as f:
    pickle.dump({
        "user2idx": user2idx,
        "movie2idx": movie2idx,
        "idx2movie": idx2movie,
        "movieid2title": movieid2title
    }, f)

# Save movies metadata
movies.to_csv("movies_metadata.csv", index=False)

print("Saved: mf_model_64.pt, mapping_64.pkl, movies_metadata.csv")


Saved: mf_model_64.pt, mapping_64.pkl, movies_metadata.csv
