In [1]:
# Configuration parameters
CFG = {
    'n_neighbors': 12,
    'tfidf_max_features': 5000,
    'emb_dim': 64,
    'hidden_dim': 256,
    'dropout': 0.2,
    'epochs': 2,
    'batch_size': 2048,
    'alpha': 0.8,
    'beta': 0.5,
    'T': 1,
    'precomputed_graph_path': 'processed_data/umap_movie_graph_truncated.csv',
    'symmetrize_graph': True
}
print(CFG)

{'n_neighbors': 12, 'tfidf_max_features': 5000, 'emb_dim': 64, 'hidden_dim': 256, 'dropout': 0.2, 'epochs': 2, 'batch_size': 2048, 'alpha': 0.8, 'beta': 0.5, 'T': 1, 'precomputed_graph_path': 'processed_data/umap_movie_graph_truncated.csv', 'symmetrize_graph': True}


In [2]:
# Imports
import pandas as pd
import numpy as np
import ast
import json
from tqdm import tqdm
from scipy.sparse import csr_matrix, coo_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', DEVICE)

Using device: cpu


In [3]:
# Loading 
movies = pd.read_csv('processed_data/movies_processed.csv')
ratings = pd.read_csv('processed_data/ratings_with_tmdb.csv')
print('Shapes:', movies.shape, ratings.shape)

movie_ids = sorted(movies['id'].unique().tolist())
id_to_idx = {mid:i for i, mid in enumerate(movie_ids)}
print('Unique movies:', len(movie_ids))

Shapes: (46611, 10) (26010786, 3)
Unique movies: 45430


In [4]:
# Preprocessing movie features
def ensure_list(x):
    if isinstance(x, list):
        return x
    if pd.isna(x) or x == '':
        return []
    try:
        val = ast.literal_eval(str(x))
        if isinstance(val, list):
            return val
        return []
    except Exception:
        return []

movies['genres_list'] = movies['genres'].apply(ensure_list)
movies['keywords_list'] = movies['keywords'].apply(ensure_list)
movies['overview'] = movies['overview'].fillna('')
movies['adult'] = movies['adult'].fillna('False').map({'True':1,'False':0}).fillna(0).astype(int)

# Select only movies present in ratings
movies_sel = movies[movies['id'].isin(movie_ids)].copy().set_index('id').loc[movie_ids].reset_index()
print('Selected movies for features:', movies_sel.shape[0])

Selected movies for features: 46611


In [5]:
# Extracting features X (sparse matrix)

# 1) Overview TF-IDF
tfidf = TfidfVectorizer(max_features=CFG['tfidf_max_features'], stop_words='english')
overview_m = tfidf.fit_transform(movies_sel['overview'])

# 2) Genres OHE
mlb_genres = MultiLabelBinarizer()
genres_m = mlb_genres.fit_transform(movies_sel['genres_list'])

# 3) Keywords OHE
mlb_keywords = MultiLabelBinarizer()
keywords_m = mlb_keywords.fit_transform(movies_sel['keywords_list'])

# 4) Adult
adult_m = movies_sel['adult'].values.reshape(-1,1)

# 5) Numeric
num_cols = [c for c in ['popularity','runtime','vote_average','vote_count'] if c in movies_sel.columns]
if len(num_cols) > 0:
    numeric = movies_sel[num_cols].copy()
    if 'vote_count' in numeric.columns:
        numeric['vote_count'] = np.log1p(pd.to_numeric(numeric['vote_count'], errors='coerce').fillna(0).clip(lower=0))
    for c in numeric.columns:
        numeric[c] = pd.to_numeric(numeric[c], errors='coerce').fillna(0)
    scaler = StandardScaler()
    numeric_m = scaler.fit_transform(numeric)
else:
    from scipy.sparse import csr_matrix
    numeric_m = csr_matrix((movies_sel.shape[0], 0))

from scipy.sparse import csr_matrix, hstack
X = hstack([
    overview_m, 
    csr_matrix(genres_m), 
    csr_matrix(keywords_m),
    csr_matrix(adult_m),
    csr_matrix(numeric_m)
], format='csr')

print('Feature matrix shape:', X.shape)

Feature matrix shape: (46611, 24980)


In [6]:
path = CFG['precomputed_graph_path']
print(f"Loading precomputed graph from {path} ...")
edge_df = pd.read_csv(path)

src_col = [c for c in edge_df.columns if c.lower()=='source'][0]
dst_col = [c for c in edge_df.columns if c.lower()=='target'][0]
w_col = [c for c in edge_df.columns if c.lower()=='weight'][0]


# Filter for present films
edge_df = edge_df[edge_df[src_col].isin(movie_ids) & edge_df[dst_col].isin(movie_ids)]

src_idx = edge_df[src_col].map(id_to_idx)
dst_idx = edge_df[dst_col].map(id_to_idx)
weights = edge_df[w_col].astype(float)

A = coo_matrix((weights, (src_idx, dst_idx)), shape=(X.shape[0], X.shape[0])).tocsr()

if CFG['symmetrize_graph']:
    print('Symmetrizing graph ...')
    A = (A + A.T) * 0.5

# Row-normalize adjacency
row_sums = np.asarray(A.sum(axis=1)).reshape(-1)
row_sums[row_sums == 0] = 1.0
D_inv = csr_matrix((1/row_sums, (np.arange(len(row_sums)), np.arange(len(row_sums)))), shape=A.shape)
A_norm = D_inv.dot(A)
print('Adjacency ready. Nodes:', A_norm.shape[0], 'Edges:', A_norm.nnz)

Loading precomputed graph from processed_data/umap_movie_graph_truncated.csv ...
Symmetrizing graph ...
Adjacency ready. Nodes: 46611 Edges: 1005092


In [7]:
# Preprocessing ratings
ratings = ratings[ratings['tmdbId'].isin(movie_ids)].copy()

# Binarize labels based on individual user's average rating
ratings = ratings.groupby('userId', group_keys=False).apply(
    lambda df: df.assign(label=(df['rating'] >= df['rating'].mean()).astype(int))
)

user_ids = np.sort(ratings['userId'].unique())
user_to_idx = {u:i for i, u in enumerate(user_ids)}
ratings['user_idx'] = ratings['userId'].map(user_to_idx)
ratings['movie_idx'] = ratings['tmdbId'].map(id_to_idx)

n_users = len(user_ids)
print('Users:', n_users, 'Interactions:', len(ratings))

  ratings = ratings.groupby('userId', group_keys=False).apply(


Users: 270883 Interactions: 25981567


In [8]:
# Model: ItemUserModel (shared item encoder + user embeddings)
class ItemUserModel(nn.Module):
    def __init__(self, in_dim, emb_dim, n_users, hidden_dim=256, dropout=0.2):
        super().__init__()
        self.item_net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, emb_dim)
        )
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.user_bias = nn.Embedding(n_users, 1)
        nn.init.normal_(self.user_emb.weight, std=0.02)
        nn.init.zeros_(self.user_bias.weight)

    def forward(self, Xb, user_idx):
        h = self.item_net(Xb)                 # [B, D]
        u = self.user_emb(user_idx)           # [B, D]
        b = self.user_bias(user_idx).squeeze(-1)  # [B]
        logit = (h * u).sum(dim=1) + b
        return logit, h

In [9]:
# Preparing data for training the global model (memory-efficient)
# We do not convert the entire feature matrix X to dense to avoid memory overflow.
# We will extract batches from the sparse CSR row-wise.

X_sparse = X  # CSR

movie_idx_all = torch.tensor(ratings['movie_idx'].values, dtype=torch.long)
user_idx_all  = torch.tensor(ratings['user_idx'].values, dtype=torch.long)
y_all         = torch.tensor(ratings['label'].values, dtype=torch.float32)

dataset = TensorDataset(movie_idx_all, user_idx_all, y_all)

# Collate function: converts selected CSR rows into a dense batch tensor
def collate_fn(batch):
    mids = torch.stack([b[0] for b in batch])
    uids = torch.stack([b[1] for b in batch])
    ys   = torch.stack([b[2] for b in batch]).float()
    # Extract rows from the sparse matrix and convert to dense
    rows = [X_sparse[int(m.item())].toarray() for m in mids]
    Xb = torch.tensor(np.vstack(rows), dtype=torch.float32)
    return Xb.to(DEVICE), uids.to(DEVICE), ys.to(DEVICE)

batch_size = CFG['batch_size']
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

model = ItemUserModel(in_dim=X_sparse.shape[1], emb_dim=CFG['emb_dim'], n_users=n_users, hidden_dim=CFG['hidden_dim'], dropout=CFG['dropout']).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss()

# Training with tqdm
epochs = CFG['epochs']
for epoch in range(epochs):
    model.train()
    running = 0.0
    pbar = tqdm(loader, desc=f"Epoch {epoch+1}/{epochs}")
    for Xb, uids, ys in pbar:
        optimizer.zero_grad()
        logits, _ = model(Xb, uids)
        loss = criterion(logits, ys)
        loss.backward()
        optimizer.step()
        running += loss.item() * Xb.size(0)
        pbar.set_postfix({"loss": f"{loss.item():.4f}"})
    print(f"epoch loss: {running/len(dataset):.4f}")

# Precompute item embeddings (batched for GPU memory efficiency)
model.eval()
item_repr_all = []
with torch.no_grad():
    step = 2048
    for start in tqdm(range(0, X_sparse.shape[0], step), desc='Precompute item embeddings'):
        end = min(start + step, X_sparse.shape[0])
        rows = [X_sparse[i].toarray() for i in range(start, end)]
        X_block = torch.tensor(np.vstack(rows), dtype=torch.float32).to(DEVICE)
        emb = model.item_net(X_block).cpu()
        item_repr_all.append(emb)
item_repr_all = torch.vstack(item_repr_all)
print('Item repr shape:', item_repr_all.shape)

Epoch 1/2: 100%|██████████| 12687/12687 [2:53:07<00:00,  1.22it/s, loss=0.5649]    


epoch loss: 0.6065


Epoch 2/2: 100%|██████████| 12687/12687 [48:37<00:00,  4.35it/s, loss=0.5731] 


epoch loss: 0.5811


Precompute item embeddings: 100%|██████████| 23/23 [00:03<00:00,  7.26it/s]

Item repr shape: torch.Size([46611, 64])





In [10]:
# Correct-and-Smooth by user
alpha = CFG['alpha']
beta = CFG['beta']
T = CFG['T']

base_results = []   # metrics for the model only (without correction and smoothing)
cs_results = []     # metrics after correct + smooth

user_emb_weight = model.user_emb.weight.detach().cpu()
user_bias_weight = model.user_bias.weight.detach().cpu().squeeze(-1)

def infer_user_scores(user_idx_int):
    u = user_emb_weight[user_idx_int]
    b = user_bias_weight[user_idx_int]
    logits = (item_repr_all * u).sum(dim=1) + b
    return torch.sigmoid(logits).numpy()

user_groups = ratings.groupby('userId')
for user, user_df in tqdm(user_groups, desc='Users Eval (base & C+S)'):
    if user_df.shape[0] < 10:
        continue
    mean_r = user_df['rating'].mean()
    user_df = user_df.assign(label=(user_df['rating'] >= mean_r).astype(int))

    train_df, test_df = train_test_split(user_df, test_size=0.2, random_state=42)
    train_idx = [id_to_idx[mid] for mid in train_df['tmdbId']]
    test_idx  = [id_to_idx[mid] for mid in test_df['tmdbId']]
    y_train   = train_df['label'].values.astype(float)
    y_true    = test_df['label'].values.astype(int)
    if len(test_idx) < 5:
        continue

    uidx = user_to_idx[user]
    y_soft = infer_user_scores(uidx)  # base model probabilities

    # --- Baseline (model only) ---
    y_base_scores = np.nan_to_num(y_soft[test_idx], nan=0.5)
    y_base_pred = (y_base_scores >= 0.5).astype(int)
    has_both = len(np.unique(y_true)) > 1
    base_results.append({
        'userId': user,
        'accuracy': accuracy_score(y_true, y_base_pred),
        'f1': f1_score(y_true, y_base_pred) if has_both else np.nan,
        'roc_auc': roc_auc_score(y_true, y_base_scores) if has_both else np.nan,
        'precision': precision_score(y_true, y_base_pred) if has_both else np.nan,
        'recall': recall_score(y_true, y_base_pred) if has_both else np.nan,
        'train_size': len(train_idx),
        'test_size': len(test_idx),
        'stage': 'base'
    })

    # --- Correct ---
    y_corr = y_soft.copy()
    y_corr[train_idx] += alpha * (y_train - y_soft[train_idx])

    # --- Smooth (T iterations) ---
    y_sm = y_corr.copy()
    for _ in range(T):
        y_neigh = A_norm.dot(y_sm)
        y_sm = (1-beta)*y_sm + beta*y_neigh

    y_cs_scores = np.nan_to_num(y_sm[test_idx], nan=0.5)
    y_cs_pred = (y_cs_scores >= 0.5).astype(int)
    cs_results.append({
        'userId': user,
        'accuracy': accuracy_score(y_true, y_cs_pred),
        'f1': f1_score(y_true, y_cs_pred) if has_both else np.nan,
        'roc_auc': roc_auc_score(y_true, y_cs_scores) if has_both else np.nan,
        'precision': precision_score(y_true, y_cs_pred) if has_both else np.nan,
        'recall': recall_score(y_true, y_cs_pred) if has_both else np.nan,
        'train_size': len(train_idx),
        'test_size': len(test_idx),
        'stage': 'correct_smooth'
    })

results_df_base = pd.DataFrame(base_results)
results_df_cs   = pd.DataFrame(cs_results)

# Save summaries

results_df_base.to_csv('correct_smooth_results_base.csv', index=False)
results_df_cs.to_csv('correct_smooth_results_correct_smooth.csv', index=False)
print("Saved: correct_smooth_results_base.csv")
print("Saved: correct_smooth_results_correct_smooth.csv")

print('Baseline head:')
display(results_df_base.head())
print('Correct+Smooth head:')
display(results_df_cs.head())



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

Saved: correct_smooth_results_base.csv
Saved: correct_smooth_results_correct_smooth.csv
Baseline head:


Unnamed: 0,userId,accuracy,f1,roc_auc,precision,recall,train_size,test_size,stage
0,1,0.333333,0.333333,0.5,0.25,0.5,21,6,base
1,2,0.6,0.5,1.0,0.333333,1.0,17,5,base
2,4,0.538462,0.5,0.666667,0.5,0.5,49,13,base
3,5,0.333333,0.333333,0.5,0.25,0.5,20,6,base
4,7,0.727273,0.823529,0.714286,0.7,1.0,42,11,base


Correct+Smooth head:


Unnamed: 0,userId,accuracy,f1,roc_auc,precision,recall,train_size,test_size,stage
0,1,0.333333,0.333333,0.375,0.25,0.5,21,6,correct_smooth
1,2,0.6,0.5,1.0,0.333333,1.0,17,5,correct_smooth
2,4,0.538462,0.625,0.666667,0.5,0.833333,49,13,correct_smooth
3,5,0.666667,0.666667,0.75,0.5,1.0,20,6,correct_smooth
4,7,0.727273,0.823529,0.75,0.7,1.0,42,11,correct_smooth


In [11]:
if len(results_df_base) and len(results_df_cs):
    metrics = ['accuracy','f1','roc_auc','precision','recall']
    base_mean = results_df_base[metrics].mean(numeric_only=True)
    cs_mean   = results_df_cs[metrics].mean(numeric_only=True)
    diff = (cs_mean - base_mean).to_frame('delta')
    print('\nMean baseline:')
    display(base_mean)
    print('\nMean correct+smooth:')
    display(cs_mean)
    print('\nDelta (C+S - base):')
    display(diff)


Mean baseline:


accuracy     0.706760
f1           0.692148
roc_auc      0.754867
precision    0.685370
recall       0.751366
dtype: float64


Mean correct+smooth:


accuracy     0.695607
f1           0.653698
roc_auc      0.742436
precision    0.685258
recall       0.690635
dtype: float64


Delta (C+S - base):


Unnamed: 0,delta
accuracy,-0.011153
f1,-0.03845
roc_auc,-0.012431
precision,-0.000111
recall,-0.060731
