In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# --- 1. Dataset (Optimized for Speed) ---
class CoffeeDataset(Dataset):
    """Torch dataset of (user features, recipe features, binary target, raw rating).

    Each item corresponds to one row of ``interactions`` whose ``user_id`` is
    present in ``users`` and whose ``recipe_id`` is present in ``recipes``;
    rows referencing unknown ids are dropped at construction time.

    Args:
        interactions: DataFrame with ``user_id``, ``recipe_id`` and ``rating`` columns.
        recipes: DataFrame with ``recipe_id`` and the four ``taste_*`` columns.
        users: DataFrame with ``user_id`` and the four ``taste_pref_*`` columns.

    Attributes set on the instance:
        user_map / recipe_map: dict mapping an id to its list of 4 feature values.
        data: the filtered interactions (original index and dtypes preserved).
        u_ids / r_ids: id arrays aligned with ``data``.
        targets: float32 array, 1.0 where rating >= 4 and 0.0 otherwise.
        raw_ratings: float32 array of the original ratings.
    """

    # Feature columns, in the order they appear in the returned vectors.
    USER_FEATURE_COLS = ['taste_pref_bitterness', 'taste_pref_sweetness', 'taste_pref_acidity', 'taste_pref_body']
    RECIPE_FEATURE_COLS = ['taste_bitterness', 'taste_sweetness', 'taste_acidity', 'taste_body']

    def __init__(self, interactions, recipes, users):
        # Create fast lookup dictionaries: id -> [feature values]
        self.user_map = users.set_index('user_id')[self.USER_FEATURE_COLS].T.to_dict('list')
        self.recipe_map = recipes.set_index('recipe_id')[self.RECIPE_FEATURE_COLS].T.to_dict('list')

        # Filter valid interactions with a vectorized mask instead of a Python
        # row loop. Besides being much faster, this keeps the column layout
        # intact when nothing matches, so the column lookups below still work
        # on an empty result.
        has_user = interactions['user_id'].isin(list(self.user_map))
        has_recipe = interactions['recipe_id'].isin(list(self.recipe_map))
        self.data = interactions[has_user & has_recipe]

        self.u_ids = self.data['user_id'].values
        self.r_ids = self.data['recipe_id'].values

        # --- CRITICAL CHANGE: BINARY TARGETS ---
        # We teach the model: 1 if rating >= 4 (Good), 0 otherwise.
        self.targets = (self.data['rating'].values >= 4).astype(np.float32)

        # We keep original ratings for NDCG calculation
        self.raw_ratings = self.data['rating'].values.astype(np.float32)

    def __len__(self):
        """Number of valid interactions."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``(user_features, recipe_features, binary_target, raw_rating)`` for row ``idx``.

        The two feature vectors are float32 arrays of length 4.
        """
        u_feat = np.array(self.user_map[self.u_ids[idx]], dtype=np.float32)
        r_feat = np.array(self.recipe_map[self.r_ids[idx]], dtype=np.float32)
        return u_feat, r_feat, self.targets[idx], self.raw_ratings[idx]

# --- 2. The Concatenation Model (MLP) ---
# Feeding the concatenated features through an MLP lets the model learn
# non-linear user-recipe interactions that a plain dot product cannot represent.
class ConcatModel(nn.Module):
    """MLP that scores a (user, item) pair from their concatenated feature vectors.

    Args:
        input_dim: width of the concatenated input (default 8 = 4 user
            features + 4 recipe features).

    The network emits one raw logit per pair; no sigmoid is applied here.
    """

    def __init__(self, input_dim=8): # 4 user feats + 4 recipe feats
        super().__init__()
        # Layer order is kept exactly as before so existing state_dict keys
        # (layers.0, layers.2, ...) still load.
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(0.2), # Helps prevent overfitting

            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),

            nn.Linear(32, 1) # Output one score (logit)
        )

    def forward(self, user_features, item_features):
        """Return a 1-D tensor of logits, one per row of the batch.

        Args:
            user_features: tensor of shape (batch, n_user_features).
            item_features: tensor of shape (batch, n_item_features).
        """
        # Combine user and item features into one vector
        combined = torch.cat([user_features, item_features], dim=1)
        # Drop only the trailing unit dimension. A bare squeeze() would also
        # remove the batch dimension for a batch of one and return a 0-d
        # tensor, which no longer matches a (1,)-shaped target.
        return self.layers(combined).squeeze(-1)

# --- 3. NDCG Evaluation ---
def evaluate_ndcg(model, val_df, recipes, users, k=5):
    """Mean per-user NDCG@k of ``model`` on ``val_df``.

    For every user with at least two scorable interactions, the user's rated
    recipes are ranked by the model's score and compared with the ideal
    ordering by true rating. The raw rating is used as the (linear) gain and
    ``log2(rank + 1)`` as the discount.

    Args:
        model: module called as ``model(user_feats, recipe_feats)`` that
            returns one score per row (shape ``(n,)`` or ``(n, 1)``). It is
            switched to eval mode and left in eval mode.
        val_df: DataFrame with ``user_id``, ``recipe_id`` and ``rating`` columns.
        recipes: DataFrame with ``recipe_id`` and the four ``taste_*`` columns.
        users: DataFrame with ``user_id`` and the four ``taste_pref_*`` columns.
        k: ranking cut-off.

    Returns:
        The mean NDCG@k over evaluated users, or ``0.0`` if no user qualified.
    """
    model.eval()
    user_ndcgs = []

    # Fast Map Setup: id -> [feature values]
    u_map = users.set_index('user_id')[['taste_pref_bitterness', 'taste_pref_sweetness', 'taste_pref_acidity', 'taste_pref_body']].T.to_dict('list')
    r_map = recipes.set_index('recipe_id')[['taste_bitterness', 'taste_sweetness', 'taste_acidity', 'taste_body']].T.to_dict('list')

    # Filter valid data: only rows whose user and recipe both have features
    known_rows = val_df['user_id'].isin(list(u_map)) & val_df['recipe_id'].isin(list(r_map))
    grouped = val_df[known_rows].groupby('user_id')

    # Build inputs on the model's own device so a model moved to an
    # accelerator can be evaluated too; parameter-free models fall back to CPU.
    try:
        device = next(model.parameters()).device
    except (StopIteration, AttributeError):
        device = torch.device('cpu')

    with torch.no_grad():
        for user_id, group in grouped:
            # A single item cannot be mis-ranked, so it carries no signal.
            if len(group) < 2:
                continue

            # Prepare Batch: the same user vector repeated once per recipe
            u_feat = torch.tensor([u_map[user_id]] * len(group), dtype=torch.float32, device=device)
            r_feat = torch.tensor([r_map[r] for r in group['recipe_id'].values], dtype=torch.float32, device=device)

            # Ground Truth (Original 1-5 Ratings)
            true_ratings = torch.tensor(group['rating'].values, dtype=torch.float32)

            # Get Scores (Logits). Flatten to 1-D so a model returning (n, 1)
            # is sorted across items rather than along its unit axis, and
            # bring the scores back to CPU to index the CPU rating tensor.
            preds = model(u_feat, r_feat).reshape(-1).cpu()

            # Sort by Model's Score
            _, indices = torch.sort(preds, descending=True)
            relevance_at_k = true_ratings[indices[:k]]

            # Ideal Sort
            ideal_relevance, _ = torch.sort(true_ratings, descending=True)
            ideal_relevance = ideal_relevance[:k]

            # Calc NDCG (both lists have min(k, n) entries, so one discount vector serves both)
            discounts = torch.log2(torch.arange(2, len(relevance_at_k) + 2).float())
            dcg = torch.sum(relevance_at_k / discounts)
            idcg = torch.sum(ideal_relevance / discounts)

            ndcg = (dcg / idcg) if idcg > 0 else torch.tensor(0.0)
            user_ndcgs.append(ndcg.item())

    return np.mean(user_ndcgs) if user_ndcgs else 0.0

# --- 4. Main Training Loop ---
if __name__ == '__main__':
    # Run settings, gathered in one place so they are easy to tune.
    SEED = 42
    BATCH_SIZE = 64
    NUM_EPOCHS = 10
    EVAL_EVERY = 2  # report NDCG on the internal hold-out every N epochs

    # Seed torch so weight init, batch shuffling and dropout are repeatable.
    torch.manual_seed(SEED)

    # Load Data
    # NOTE(review): fillna fills missing values in every column, not only the
    # feature / rating columns -- confirm that ids are never missing.
    try:
        users_df = pd.read_csv('student_data/users.csv').fillna(0)
        recipes_df = pd.read_csv('student_data/recipes.csv').fillna(0)
        interactions_df = pd.read_csv('student_data/interactions_train.csv').fillna(2.5)
        val_csv = pd.read_csv('student_data/interactions_val.csv').fillna(2.5) 
    except Exception as e:
        print(f"Error: {e}")
        # exit() is the interactive-shell helper (it shuts the kernel down in
        # a notebook); raising SystemExit aborts the run in both contexts.
        raise SystemExit(1) from e

    # Split Train: 10% of the training interactions are held out for monitoring
    train_df, internal_val = train_test_split(interactions_df, test_size=0.1, random_state=SEED)
    
    # Setup
    train_dataset = CoffeeDataset(train_df, recipes_df, users_df)
    # BatchNorm1d cannot run in training mode on a single sample, so drop the
    # trailing batch only in the case where it would contain exactly one row.
    drop_singleton_tail = len(train_dataset) % BATCH_SIZE == 1
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_singleton_tail)
    
    # Initialize MLP Model
    model = ConcatModel(input_dim=8) # 4 user + 4 recipe features
    
    # BCEWithLogitsLoss is standard for Binary Classification
    criterion = nn.BCEWithLogitsLoss() 
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5) # Added weight_decay
    
    print(f"Training MLP on {len(train_df)} samples...")

    for epoch in range(NUM_EPOCHS):
        # evaluate_ndcg leaves the model in eval mode, so re-enable training each epoch.
        model.train()
        total_loss = 0
        
        for u_feat, r_feat, target, _ in train_loader:
            optimizer.zero_grad()
            logits = model(u_feat, r_feat)
            loss = criterion(logits, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            
        # Eval
        if (epoch+1) % EVAL_EVERY == 0:
            val_ndcg = evaluate_ndcg(model, internal_val, recipes_df, users_df)
            print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f} | Train-Val NDCG: {val_ndcg:.4f}")

    print("-" * 30)
    # FINAL TEST ON YOUR SEPARATE VALIDATION FILE
    final_ndcg = evaluate_ndcg(model, val_csv, recipes_df, users_df)
    print(f"FINAL NDCG (Provided Val Set): {final_ndcg:.4f}")

Training MLP on 68305 samples...
Epoch 2 | Loss: 0.4945 | Train-Val NDCG: 0.9067
Epoch 4 | Loss: 0.4903 | Train-Val NDCG: 0.9086
Epoch 6 | Loss: 0.4881 | Train-Val NDCG: 0.9089
Epoch 8 | Loss: 0.4875 | Train-Val NDCG: 0.9092
Epoch 10 | Loss: 0.4884 | Train-Val NDCG: 0.9084
------------------------------
FINAL NDCG (Provided Val Set): 0.8133


In [2]:
# Cold-split check: score the model trained in the previous cell on the
# separate "cold" validation file.
val_cold_csv = pd.read_csv('student_data/interactions_val_cold.csv')
val_cold_csv = val_cold_csv.fillna(2.5)  # fill missing values the same way as the other interaction files

final_ndcg = evaluate_ndcg(
    model=model,
    val_df=val_cold_csv,
    recipes=recipes_df,
    users=users_df,
)
print(f"FINAL NDCG (Provided Val Cold Set): {final_ndcg:.4f}")

FINAL NDCG (Provided Val Cold Set): 0.8243
