In [None]:
# Configuration and Environment Setup
# Every tunable value used by the notebook is collected in this single mapping.
CFG = dict(
    # --- similarity graph ---
    graph_path='/kaggle/input/moviesdatasetprocessed/movie_similarity_graph.csv',
    symmetrize_graph=True,
    add_self_loops=False,
    # --- node features ---
    tfidf_max_features=5000,
    svd_dim=256,  # SVD dimensionality for dense node features
    # --- model and optimisation ---
    hidden_dim=256,
    dropout=0.2,
    epochs=5,
    patience=2,
    lr=1e-3,
    # --- per-user data handling ---
    min_interactions=10,  # per-user min interactions
    test_size=0.2,
    random_state=42,
    # --- user sampling ---
    sample_user_count=500,  # number of users to sample for training/eval
    user_sample_seed=123,  # seed for user sampling
    strat_bins_max=20,  # maximum number of quantile bins for stratification
    # --- outputs ---
    results_dir='results/gnn',
)

print('CFG:', CFG)

# Report early whether PyTorch Geometric can be imported in this environment.
try:
    import torch_geometric
except Exception as exc:
    print('PyG not available:', exc)
else:
    print('PyG available')

CFG: {'graph_path': '/kaggle/input/moviesdatasetprocessed/movie_similarity_graph.csv', 'symmetrize_graph': True, 'add_self_loops': False, 'tfidf_max_features': 5000, 'svd_dim': 256, 'hidden_dim': 256, 'dropout': 0.2, 'epochs': 5, 'patience': 2, 'lr': 0.001, 'min_interactions': 10, 'test_size': 0.2, 'random_state': 42, 'sample_user_count': 500, 'user_sample_seed': 123, 'strat_bins_max': 20, 'results_dir': 'results/gnn'}
PyG available


In [None]:
# Imports
import warnings, os
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, SAGEConv
from tqdm import tqdm

# Run on the GPU when CUDA is usable, otherwise on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)
print('Using device:', DEVICE)

Using device: cuda


In [None]:
# Load Processed Data
# Movie metadata and ratings come from fixed Kaggle input files; the graph path is read from CFG.
movies, ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/moviesdatasetprocessed/movies_processed.csv',
        '/kaggle/input/moviesdatasetprocessed/ratings_with_tmdb.csv',
    )
)
edges_raw = pd.read_csv(CFG['graph_path'])
print('Loaded shapes:', movies.shape, ratings.shape, edges_raw.shape)
print(edges_raw.head())

Loaded shapes: (46611, 10) (26010786, 3) (885616, 3)
   source  target    weight
0     862   10681  0.784691
1     862   72105  0.762008
2     862   20760  0.756459
3     862  324852  0.756147
4     862  109439  0.754481


In [None]:
# Build Movie Index Mappings and Filter Ratings
# The sorted, de-duplicated movie ids define the node order: node i <-> movie_ids[i].
movie_ids = sorted(movies['id'].dropna().unique().tolist())
id_to_idx = dict(zip(movie_ids, range(len(movie_ids))))  # movie id -> node index
idx_to_id = dict(enumerate(movie_ids))                   # node index -> movie id

In [None]:
# Filter ratings to present movies
# Rows whose tmdbId has no entry in movie_ids are dropped; the result is an independent copy.
has_known_movie = ratings['tmdbId'].isin(movie_ids)
ratings = ratings.loc[has_known_movie].copy()

In [None]:
#  Construct edge_index and edge_weight
# Column names are matched case-insensitively; a missing column raises IndexError here.
src_col = [c for c in edges_raw.columns if c.lower()=='source'][0]
dst_col = [c for c in edges_raw.columns if c.lower()=='target'][0]
w_col   = [c for c in edges_raw.columns if c.lower()=='weight'][0]

# Keep only edges whose two endpoints are known movies, then translate ids to node indices.
endpoints_known = edges_raw[src_col].isin(movie_ids) & edges_raw[dst_col].isin(movie_ids)
edges_df = edges_raw[endpoints_known].copy()
edges_df['src_idx'] = edges_df[src_col].map(id_to_idx)
edges_df['dst_idx'] = edges_df[dst_col].map(id_to_idx)

# Work on whole columns instead of iterating rows: this is much faster for large edge
# lists, does not depend on the column names being valid Python attribute names, and
# yields a well-formed [2, 0] edge_index when no edge survives the filter.
src_nodes = edges_df['src_idx'].to_numpy(dtype=np.int64)
dst_nodes = edges_df['dst_idx'].to_numpy(dtype=np.int64)
weights = edges_df[w_col].to_numpy(dtype=np.float32)

if CFG['symmetrize_graph']:
    # Append the reversed copy of every edge (same weight) after the original edges.
    src_nodes, dst_nodes = (
        np.concatenate([src_nodes, dst_nodes]),
        np.concatenate([dst_nodes, src_nodes]),
    )
    weights = np.concatenate([weights, weights])

edge_index = torch.tensor(np.stack([src_nodes, dst_nodes]), dtype=torch.long).contiguous()  # [2, E]
edge_weight = torch.tensor(weights, dtype=torch.float)  # [E]

In [None]:
# Optionally append one weight-1 self loop per node to the edge tensors.
# NOTE(review): GCNConv may already add self loops on its own by default - confirm that
# enabling this flag does not count them twice.
if CFG['add_self_loops']:
    n_nodes = len(movie_ids)
    loop_nodes = torch.arange(n_nodes, dtype=torch.long)
    edge_index = torch.cat([edge_index, torch.stack([loop_nodes, loop_nodes], dim=0)], dim=1)
    edge_weight = torch.cat([edge_weight, torch.ones(n_nodes)], dim=0)
    print('Added self loops. New edge count:', edge_index.size(1))

In [None]:
# Sanity check after filtering: number of nodes, surviving ratings, surviving edges,
# and the shapes of the edge tensors built from them.
print('Unique movies:', len(movie_ids))
print('Filtered ratings shape:', ratings.shape)
print('Filtered edges:', edges_df.shape)
print('edge_index shape:', edge_index.shape, 'edge_weight shape:', edge_weight.shape)

Unique movies: 45430
Filtered ratings shape: (25981567, 3)
Filtered edges: (885616, 5)
edge_index shape: torch.Size([2, 1771232]) edge_weight shape: torch.Size([1771232])


In [None]:
# Build Sparse Movie Features
def ensure_list(x):
    """Coerce a raw cell value into a Python list.

    Args:
        x: A list, a missing value / empty string, or the string form of a
            Python list literal (e.g. "['Drama', 'Comedy']").

    Returns:
        ``x`` itself when it already is a list; the parsed list when ``x`` is
        the text of a list literal; otherwise an empty list (missing values,
        empty strings, unparsable text, or literals that are not lists).
    """
    if isinstance(x, list):
        return x
    if pd.isna(x) or x == '':
        return []
    # Local import keeps this function self-contained. literal_eval only accepts
    # Python literals, so text read from the CSV can never execute code (eval could).
    import ast
    try:
        val = ast.literal_eval(str(x))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    return val if isinstance(val, list) else []

# Parse list-like columns and normalise text / flag columns (helper columns are added to `movies`).
movies['genres_list'] = movies['genres'].apply(ensure_list)
movies['keywords_list'] = movies['keywords'].apply(ensure_list)
movies['overview'] = movies['overview'].fillna('')
# Compare on the lower-cased string form so the flag is right whether read_csv parsed
# `adult` as booleans or left it as 'True'/'False' text; anything else (incl. NaN) maps to 0.
movies['adult_flag'] = (
    movies['adult'].astype(str).str.strip().str.lower()
    .map({'true': 1, 'false': 0}).fillna(0).astype(int)
)

# Align movie order
# Exactly one feature row per node, in movie_ids order. `id` is not unique in `movies`,
# and .loc on a non-unique index returns every matching row, which would shift feature
# rows away from the node indices in id_to_idx. Keep the first row of each id.
movies_sel = (
    movies.dropna(subset=['id'])
    .drop_duplicates(subset='id', keep='first')
    .set_index('id')
    .loc[movie_ids]
    .reset_index()
)
assert movies_sel['id'].tolist() == movie_ids, 'feature rows are not aligned with movie_ids'
print('Movies selected for features:', movies_sel.shape)

# Text: TF-IDF over the overview.
tfidf = TfidfVectorizer(max_features=CFG['tfidf_max_features'], stop_words='english')
overview_m = tfidf.fit_transform(movies_sel['overview'])
# Multi-hot genres / keywords; sparse output avoids materialising a huge dense matrix.
mlb_genres = MultiLabelBinarizer(sparse_output=True)
genres_m = mlb_genres.fit_transform(movies_sel['genres_list'])
mlb_keywords = MultiLabelBinarizer(sparse_output=True)
keywords_m = mlb_keywords.fit_transform(movies_sel['keywords_list'])
adult_m = movies_sel['adult_flag'].values.reshape(-1,1)

# Numeric columns (whichever of these exist): coerced to numbers, missing -> 0, standardised.
num_cols = [c for c in ['popularity','runtime','vote_average','vote_count'] if c in movies_sel.columns]
if len(num_cols):
    numeric = movies_sel[num_cols].copy()
    if 'vote_count' in numeric.columns:
        # log1p compresses the vote_count range; negatives are clipped to 0 first.
        numeric['vote_count'] = np.log1p(pd.to_numeric(numeric['vote_count'], errors='coerce').fillna(0).clip(lower=0))
    for c in numeric.columns:
        numeric[c] = pd.to_numeric(numeric[c], errors='coerce').fillna(0)
    scaler_num = StandardScaler()
    numeric_m = scaler_num.fit_transform(numeric)
else:
    numeric_m = np.zeros((movies_sel.shape[0],0))

# Column order: overview TF-IDF | genres | keywords | adult flag | numeric.
X_sparse = hstack([overview_m, csr_matrix(genres_m), csr_matrix(keywords_m), csr_matrix(adult_m), csr_matrix(numeric_m)], format='csr')
print('Sparse feature matrix shape:', X_sparse.shape)

Movies selected for features: (46611, 13)
Sparse feature matrix shape: (46611, 24980)


In [None]:
# Reduce Features with TruncatedSVD and Scale (dense X for GNN)
# Pipeline: sparse features -> SVD projection -> per-column standardisation -> float32 tensor.
svd = TruncatedSVD(n_components=CFG['svd_dim'], random_state=CFG['random_state'])
scaler_dense = StandardScaler()
X_dense = scaler_dense.fit_transform(svd.fit_transform(X_sparse)).astype(np.float32)
x = torch.tensor(X_dense, dtype=torch.float)
print('Dense feature matrix:', x.shape)

Dense feature matrix: torch.Size([46611, 256])


In [None]:
# Create PyG Data Object
# Bundle node features and weighted edges, show the summary, then move it to DEVICE.
data = Data(
    x=x,
    edge_index=edge_index,
    edge_weight=edge_weight,
)
print(data)
data = data.to(DEVICE)

Data(x=[46611, 256], edge_index=[2, 1771232], edge_weight=[1771232])


In [None]:
def build_user_masks(user_df):
    """Build node-level labels and train/test masks for one user.

    A rating is labelled 1 when it is at or above the user's own mean rating,
    otherwise 0. The user's rated movies are split into train and test sets with
    ``CFG['test_size']`` / ``CFG['random_state']``.

    Args:
        user_df: The user's ratings; needs ``rating`` and ``tmdbId`` columns, and
            every ``tmdbId`` must be a key of ``id_to_idx`` (KeyError otherwise).

    Returns:
        ``None`` when the user cannot be split or ends up with fewer than 5 test
        rows; otherwise ``(y, train_mask, test_mask, train_df, test_df)`` where
        ``y`` holds the label per node (-1.0 for unlabelled nodes) and the masks
        are boolean tensors over all nodes. The three tensors are on ``DEVICE``.
    """
    # Compute labels by user mean (the mean is taken over all of the user's ratings).
    mean_r = user_df['rating'].mean()
    labelled = user_df.assign(label=(user_df['rating'] >= mean_r).astype(int))
    # One row per movie: a movie rated more than once could otherwise land in both the
    # train and the test split, overlapping the masks and leaking its label. The last
    # occurrence wins, matching the "later write overwrites" order of the label fill.
    labelled = labelled.drop_duplicates(subset='tmdbId', keep='last')
    if labelled.shape[0] < 2:
        return None  # nothing to split

    train_df, test_df = train_test_split(labelled, test_size=CFG['test_size'], random_state=CFG['random_state'])
    if test_df.shape[0] < 5:
        return None

    train_idx = torch.tensor([id_to_idx[mid] for mid in train_df['tmdbId']], dtype=torch.long)
    test_idx = torch.tensor([id_to_idx[mid] for mid in test_df['tmdbId']], dtype=torch.long)

    # Labels: -1.0 marks nodes the user never rated; rated nodes get 0.0 / 1.0.
    y = torch.full((data.num_nodes,), -1.0)
    y[train_idx] = torch.tensor(train_df['label'].to_numpy(), dtype=torch.float)
    y[test_idx] = torch.tensor(test_df['label'].to_numpy(), dtype=torch.float)

    # Masks
    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_idx] = True
    test_mask[test_idx] = True
    return y.to(DEVICE), train_mask.to(DEVICE), test_mask.to(DEVICE), train_df, test_df

# Utilities: Seeding, Metrics, Early Stopping, Train/Eval Helpers
def seed_all(seed=CFG['random_state']):
    """Seed NumPy's global RNG and PyTorch (CPU, plus every CUDA device when CUDA is available).

    Args:
        seed: Integer seed; defaults to ``CFG['random_state']`` as read when this
            function is defined.
    """
    for seeder in (np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def compute_metrics(y_true, y_score):
    """Score binary predictions obtained by thresholding ``y_score`` at 0.5.

    Args:
        y_true: Array of ground-truth labels.
        y_score: NumPy array of predicted scores, same length as ``y_true``.

    Returns:
        Dict with keys ``accuracy``, ``f1``, ``roc_auc``, ``precision`` and
        ``recall`` (in that order). Everything except accuracy is NaN when
        ``y_true`` contains a single distinct value.
    """
    y_pred = (y_score >= 0.5).astype(int)
    both_classes_present = len(np.unique(y_true)) > 1

    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    # (result key, scorer, what the scorer is fed) - ROC AUC uses the raw scores.
    class_dependent = (
        ('f1', f1_score, y_pred),
        ('roc_auc', roc_auc_score, y_score),
        ('precision', precision_score, y_pred),
        ('recall', recall_score, y_pred),
    )
    for key, scorer, predicted in class_dependent:
        metrics[key] = scorer(y_true, predicted) if both_classes_present else np.nan
    return metrics

class EarlyStopping:
    """Tracks the lowest value seen and flags a stop after too many non-improving steps."""

    def __init__(self, patience=3):
        # patience: number of consecutive non-improving steps that triggers the stop flag
        self.patience = patience
        self.best = None    # lowest value observed so far (None until the first step)
        self.count = 0      # consecutive steps without a strictly lower value
        self.stop = False   # set once count reaches patience; never cleared here

    def step(self, val):
        """Record one observation; only a strictly lower value counts as an improvement."""
        improved = self.best is None or val < self.best
        if improved:
            self.best, self.count = val, 0
            return
        self.count += 1
        if self.count >= self.patience:
            self.stop = True

In [None]:
# Prepare Per-User Labels and Splits
user_groups = ratings.groupby('userId')
# Keep an independent copy of the ratings of every user with enough interactions.
user_interactions = {
    user: df.copy()
    for user, df in user_groups
    if df.shape[0] >= CFG['min_interactions']
}
eligible_users = list(user_interactions)
print('Eligible users:', len(eligible_users))
if not eligible_users:
    raise ValueError('No user has at least CFG["min_interactions"] ratings; nothing to sample.')

seed_all()

# Stratified sampling of users preserving interaction count distribution
import math  # not used in this cell; kept in case another cell relies on it
all_counts = np.array([user_interactions[u].shape[0] for u in eligible_users])
counts_df = pd.DataFrame({'userId': eligible_users, 'n_interactions': all_counts})

# Determine number of bins (quantile-based)
unique_counts = counts_df['n_interactions'].nunique()
q_bins = min(CFG['strat_bins_max'], unique_counts)

# Use qcut for quantile bins (handle duplicates)
counts_df['bin'] = pd.qcut(counts_df['n_interactions'], q=q_bins, duplicates='drop')

# Compute desired sample size per bin proportional to original bin frequency
bin_sizes = counts_df['bin'].value_counts().sort_index()
proportions = bin_sizes / bin_sizes.sum()
raw_alloc = proportions * CFG['sample_user_count']

# Largest-remainder apportionment: give every bin the floor of its quota, then hand the
# seats still missing to the bins with the largest fractional parts. Unlike rounding
# followed by a one-pass correction, this always reaches the exact total and never
# takes a seat away from the bins that were closest to earning one.
alloc = np.floor(raw_alloc).astype(int)
shortfall = int(CFG['sample_user_count'] - alloc.sum())
if shortfall > 0:
    fractional = raw_alloc.to_numpy() - alloc.to_numpy()
    winners = np.argsort(-fractional, kind='stable')[:shortfall]
    alloc.iloc[winners] += 1

sampled_users = []
rng = np.random.default_rng(CFG['user_sample_seed'])  # not consumed below; sampling uses random_state
for b, quota in alloc.items():
    subset = counts_df[counts_df['bin'] == b]
    k = min(int(quota), subset.shape[0])  # a bin cannot give more users than it has
    chosen = subset.sample(n=k, random_state=CFG['user_sample_seed'])['userId'].tolist()
    sampled_users.extend(chosen)

sampled_users = sampled_users[:CFG['sample_user_count']]  # safety trim
print(f"Sampled users: {len(sampled_users)}")

# Save sampled users with counts
os.makedirs(CFG['results_dir'], exist_ok=True)
sampled_df = counts_df[counts_df['userId'].isin(sampled_users)].copy()
sampled_path = os.path.join(CFG['results_dir'], f"selected_users_{CFG['sample_user_count']}.csv")
sampled_df.to_csv(sampled_path, index=False)
print('Saved sampled users to:', sampled_path)

# Distribution summaries
orig_stats = counts_df['n_interactions'].describe(percentiles=[0.1,0.25,0.5,0.75,0.9])
sample_stats = sampled_df['n_interactions'].describe(percentiles=[0.1,0.25,0.5,0.75,0.9])
print('\nOriginal distribution (eligible users):')
print(orig_stats)
print('\nSample distribution:')
print(sample_stats)

print('\nBin allocation (bin interval -> original count -> sample count):')
for b in bin_sizes.sort_index().index:
    orig_c = bin_sizes.loc[b]
    samp_c = sampled_df[sampled_df['bin'] == b].shape[0]
    print(f"{b}: orig={orig_c}, sample={samp_c}")

Eligible users: 233814
Sampled users: 500
Saved sampled users to: results/gnn/selected_users_500.csv

Original distribution (eligible users):
count    233814.000000
mean        110.362365
std         217.503322
min          10.000000
10%          15.000000
25%          18.000000
50%          40.000000
75%         110.000000
90%         265.000000
max       18230.000000
Name: n_interactions, dtype: float64

Sample distribution:
count     500.00000
mean      110.91800
std       196.80391
min        10.00000
10%        15.00000
25%        17.00000
50%        40.00000
75%       107.75000
90%       265.30000
max      1541.00000
Name: n_interactions, dtype: float64

Bin allocation (bin interval -> original count -> sample count):
(9.999, 12.0]: orig=14821, sample=32
(12.0, 15.0]: orig=25532, sample=55
(15.0, 16.0]: orig=10033, sample=21
(16.0, 18.0]: orig=12526, sample=27
(18.0, 20.0]: orig=9532, sample=20
(20.0, 23.0]: orig=9882, sample=21
(23.0, 28.0]: orig=12921, sample=28
(28.0, 33.0]: o

In [None]:
# GCN Model
class GCNClassifier(nn.Module):
    """Two stacked GCNConv layers producing a single logit per node.

    Args:
        in_dim: Size of the input node features.
        hidden_dim: Width of the hidden layer.
        dropout: Dropout probability applied after the hidden activation.
    """

    def __init__(self, in_dim, hidden_dim=256, dropout=0.2):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, 1)  # output logits per node
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index, edge_weight=None):
        """Return one logit per node; ``edge_weight`` is passed to both layers."""
        hidden = self.conv1(x, edge_index, edge_weight=edge_weight)
        hidden = self.dropout(F.relu(hidden))
        logits = self.conv2(hidden, edge_index, edge_weight=edge_weight)
        return logits.squeeze(-1)  # [N] logits

In [None]:
# Train and Evaluate GCN Per User
# For every sampled user a fresh GCN is trained on that user's training nodes and then
# scored on the user's test nodes; one metrics row per user is collected and saved to CSV.
os.makedirs(CFG['results_dir'], exist_ok=True)
gcn_results = []

for user in tqdm(sampled_users, desc='Users (GCN)'):
    user_df = user_interactions[user]
    masks = build_user_masks(user_df)
    if masks is None:
        # build_user_masks rejected this user (e.g. too few test rows) - skip it.
        continue
    y, train_mask, test_mask, train_df, test_df = masks
    # Model, optimizer and early-stopping state are created anew for each user.
    model = GCNClassifier(in_dim=data.x.size(-1), hidden_dim=CFG['hidden_dim'], dropout=CFG['dropout']).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=CFG['lr'], weight_decay=1e-5)
    criterion = nn.BCEWithLogitsLoss()
    stopper = EarlyStopping(patience=CFG['patience'])

    for epoch in range(CFG['epochs']):
        model.train()
        optimizer.zero_grad()
        # Forward pass over the whole graph; the loss only uses the user's training nodes.
        out = model(data.x, data.edge_index, data.edge_weight)
        loss = criterion(out[train_mask], y[train_mask])
        loss.backward()
        optimizer.step()
        # Early stopping is driven by the training loss itself (no validation split here).
        stopper.step(loss.item())
        if stopper.stop:
            break

    # Evaluation: dropout off, no gradients, probabilities for every node.
    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index, data.edge_weight)
        probs = torch.sigmoid(logits)
    # Only the user's held-out nodes are scored.
    test_probs = probs[test_mask].detach().cpu().numpy()
    test_labels = y[test_mask].detach().cpu().numpy().astype(int)
    metrics = compute_metrics(test_labels, test_probs)
    metrics.update({'userId': user,'train_size': int(train_mask.sum().item()),'test_size': int(test_mask.sum().item()),'model':'GCN'})
    gcn_results.append(metrics)

# One row per evaluated user.
gcn_df = pd.DataFrame(gcn_results)
gcn_path = os.path.join(CFG['results_dir'], 'gnn_gcn_results.csv')
gcn_df.to_csv(gcn_path, index=False)
print('Saved GCN results ->', gcn_path)
display(gcn_df.head())

Users (GCN): 100%|██████████| 500/500 [03:42<00:00,  2.25it/s] 

Saved GCN results -> results/gnn/gnn_gcn_results.csv





Unnamed: 0,accuracy,f1,roc_auc,precision,recall,userId,train_size,test_size,model
0,0.6,0.75,0.833333,0.6,1.0,267903,16,5,GCN
1,0.6,0.75,0.5,0.6,1.0,263665,18,5,GCN
2,0.0,,,,,98503,18,5,GCN
3,1.0,,,,,204829,18,5,GCN
4,0.6,0.666667,1.0,0.5,1.0,81787,16,5,GCN


In [None]:
# GraphSAGE Model
class GraphSAGEClassifier(nn.Module):
    """Two stacked SAGEConv layers producing a single logit per node.

    Args:
        in_dim: Size of the input node features.
        hidden_dim: Width of the hidden layer.
        dropout: Dropout probability applied after the hidden activation.
    """

    def __init__(self, in_dim, hidden_dim=256, dropout=0.2):
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index, edge_weight=None):
        """Return one logit per node.

        ``edge_weight`` is accepted so the call signature matches the GCN model,
        but it is not forwarded to the SAGEConv layers.
        """
        hidden = self.dropout(F.relu(self.conv1(x, edge_index)))
        logits = self.conv2(hidden, edge_index)
        return logits.squeeze(-1)

In [None]:
# Train and Evaluate GraphSAGE Per User
# For every sampled user a fresh GraphSAGE model is trained on that user's training nodes
# and scored on the user's test nodes; one metrics row per user is collected and saved to CSV.
sage_results = []

for user in tqdm(sampled_users, desc='Users (GraphSAGE)'):
    user_df = user_interactions[user]
    masks = build_user_masks(user_df)
    if masks is None:
        # build_user_masks rejected this user (e.g. too few test rows) - skip it.
        continue
    y, train_mask, test_mask, train_df, test_df = masks
    # Model, optimizer and early-stopping state are created anew for each user.
    model = GraphSAGEClassifier(in_dim=data.x.size(-1), hidden_dim=CFG['hidden_dim'], dropout=CFG['dropout']).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=CFG['lr'], weight_decay=1e-5)
    criterion = nn.BCEWithLogitsLoss()
    stopper = EarlyStopping(patience=CFG['patience'])

    for epoch in range(CFG['epochs']):
        model.train()
        optimizer.zero_grad()
        # Forward pass over the whole graph; the loss only uses the user's training nodes.
        # edge_weight is passed for signature parity but the model does not use it.
        out = model(data.x, data.edge_index, data.edge_weight)
        loss = criterion(out[train_mask], y[train_mask])
        loss.backward()
        optimizer.step()
        # Early stopping is driven by the training loss itself (no validation split here).
        stopper.step(loss.item())
        if stopper.stop:
            break

    # Evaluation: dropout off, no gradients, probabilities for every node.
    model.eval()
    with torch.no_grad():
        logits = model(data.x, data.edge_index, data.edge_weight)
        probs = torch.sigmoid(logits)
    # Only the user's held-out nodes are scored.
    test_probs = probs[test_mask].detach().cpu().numpy()
    test_labels = y[test_mask].detach().cpu().numpy().astype(int)
    metrics = compute_metrics(test_labels, test_probs)
    metrics.update({'userId': user,'train_size': int(train_mask.sum().item()),'test_size': int(test_mask.sum().item()),'model':'GraphSAGE'})
    sage_results.append(metrics)

# One row per evaluated user.
sage_df = pd.DataFrame(sage_results)
sage_path = os.path.join(CFG['results_dir'], 'gnn_graphsage_results.csv')
sage_df.to_csv(sage_path, index=False)
print('Saved GraphSAGE results ->', sage_path)
display(sage_df.head())

Users (GraphSAGE): 100%|██████████| 500/500 [03:59<00:00,  2.09it/s]

Saved GraphSAGE results -> results/gnn/gnn_graphsage_results.csv





Unnamed: 0,accuracy,f1,roc_auc,precision,recall,userId,train_size,test_size,model
0,0.8,0.857143,1.0,0.75,1.0,267903,16,5,GraphSAGE
1,0.4,0.571429,0.333333,0.5,0.666667,263665,18,5,GraphSAGE
2,0.4,,,,,98503,18,5,GraphSAGE
3,1.0,,,,,204829,18,5,GraphSAGE
4,0.6,0.666667,0.333333,0.5,1.0,81787,16,5,GraphSAGE


In [None]:
# Mean Metrics and Comparison
# Average each metric over users for both models and show the difference side by side.
metrics_cols = ['accuracy','f1','roc_auc','precision','recall']
if not (len(gcn_df) and len(sage_df)):
    print('Insufficient results to summarize.')
else:
    gcn_mean, sage_mean = (
        frame[metrics_cols].mean(numeric_only=True) for frame in (gcn_df, sage_df)
    )
    comp = pd.DataFrame({
        'GCN': gcn_mean,
        'GraphSAGE': sage_mean,
        'Delta(SAGE-GCN)': sage_mean - gcn_mean,
    })
    print('Mean metrics comparison:')
    display(comp)
    # Show the first per-user rows of each model for a quick visual check.
    for banner, frame in (('\nGCN head:', gcn_df), ('\nGraphSAGE head:', sage_df)):
        print(banner)
        display(frame.head())

Mean metrics comparison:


Unnamed: 0,GCN,GraphSAGE,Delta(SAGE-GCN)
accuracy,0.598064,0.577583,-0.020481
f1,0.587176,0.556989,-0.030187
roc_auc,0.597168,0.563802,-0.033366
precision,0.57449,0.550423,-0.024067
recall,0.711477,0.63676,-0.074717



GCN head:


Unnamed: 0,accuracy,f1,roc_auc,precision,recall,userId,train_size,test_size,model
0,0.6,0.75,0.833333,0.6,1.0,267903,16,5,GCN
1,0.6,0.75,0.5,0.6,1.0,263665,18,5,GCN
2,0.0,,,,,98503,18,5,GCN
3,1.0,,,,,204829,18,5,GCN
4,0.6,0.666667,1.0,0.5,1.0,81787,16,5,GCN



GraphSAGE head:


Unnamed: 0,accuracy,f1,roc_auc,precision,recall,userId,train_size,test_size,model
0,0.8,0.857143,1.0,0.75,1.0,267903,16,5,GraphSAGE
1,0.4,0.571429,0.333333,0.5,0.666667,263665,18,5,GraphSAGE
2,0.4,,,,,98503,18,5,GraphSAGE
3,1.0,,,,,204829,18,5,GraphSAGE
4,0.6,0.666667,0.333333,0.5,1.0,81787,16,5,GraphSAGE
