## Configuration

In [None]:
# ========== CONFIGURATION ==========
import os

import torch

# Root directory that is searched recursively for 'part-*.parquet' files.
# The default is the original local path; set the DATATHON_TRAIN_PATH
# environment variable to run the notebook elsewhere without editing it.
TRAIN_PATH = os.environ.get(
    'DATATHON_TRAIN_PATH',
    '/home/stargix/Desktop/hackathons/datathon/train/train',
)
TARGET_COL = "iap_revenue_d7"  # regression target column
TRAIN_SAMPLE_FRAC = 0.10  # Adjust for more/less data

# PyTorch settings
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 256
TEACHER_EPOCHS = 5
STUDENT_EPOCHS = 5
LEARNING_RATE = 1e-3
DISTILL_ALPHA = 0.6  # weight for hard loss

print(f"Device: {DEVICE}")
print(f"Sample fraction: {TRAIN_SAMPLE_FRAC}")

## Imports

In [None]:
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
from sklearn.metrics import mean_squared_log_error, roc_auc_score
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import gc
import os
from glob import glob

# Reproducibility
# RSEED seeds NumPy's and PyTorch's global RNGs here and is reused later as
# `random_state` for the Dask sampling calls.
RSEED = 42
np.random.seed(RSEED)
torch.manual_seed(RSEED)

# Turn off Dask's "dataframe.convert-string" option.
# NOTE(review): presumably this keeps string columns as plain object dtype
# instead of Arrow-backed strings - confirm against the Dask docs.
dask.config.set({"dataframe.convert-string": False})
print("Libraries imported successfully")

## Helper Functions

In [None]:
# Problematic columns (lists/dicts) that are ignored.
# They are dropped from the Dask dataframes before `.compute()` in the
# data-loading cell.
IGNORE_BIG_COLS = [
    "bundles_ins", "user_bundles", "user_bundles_l28d",
    "city_hist", "country_hist", "region_hist",
    "dev_language_hist", "dev_osv_hist",
    "bcat", "bcat_bottom_taxonomy",
    "bundles_cat", "bundles_cat_bottom_taxonomy",
    "first_request_ts_bundle", "first_request_ts_category_bottom_taxonomy",
    "last_buy_ts_bundle", "last_buy_ts_category",
    "last_install_ts_bundle", "last_install_ts_category",
    "advertiser_actions_action_count", "advertiser_actions_action_last_timestamp",
    "user_actions_bundles_action_count", "user_actions_bundles_action_last_timestamp",
    "new_bundles",
    "whale_users_bundle_num_buys_prank", "whale_users_bundle_revenue_prank",
    "whale_users_bundle_total_num_buys", "whale_users_bundle_total_revenue",
]

# Label/outcome columns. They are excluded from the feature set when
# X_train / X_valid are built; "iap_revenue_d7" and "buyer_d7" are the two
# that this notebook uses as targets.
LABEL_COLS = [
    "buyer_d1", "buyer_d7", "buyer_d14", "buyer_d28",
    "buy_d7", "buy_d14", "buy_d28",
    "iap_revenue_d7", "iap_revenue_d14", "iap_revenue_d28",
    "registration",
    "retention_d1_to_d7", "retention_d3_to_d7", "retention_d7_to_d14",
    "retention_d1", "retention_d3", "retention_d7",
]

def reduce_memory(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast 64-bit numeric columns to 32-bit to save memory.

    Args:
        df: Input frame. It is not modified; a copy is returned.

    Returns:
        A copy of ``df`` in which every ``float64`` column is ``float32`` and
        every ``int64`` column whose values fit in the ``int32`` range is
        ``int32``. ``int64`` columns holding larger values (e.g. epoch
        timestamps or big ids) are left untouched, because a blind cast
        would silently wrap them around.
    """
    df = df.copy()
    int32_range = np.iinfo(np.int32)
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == "float64":
            df[col] = df[col].astype("float32")
        elif col_type == "int64":
            values = df[col]
            # Only downcast when no value would overflow int32.
            fits = values.empty or (
                values.min() >= int32_range.min and values.max() <= int32_range.max
            )
            if fits:
                df[col] = values.astype("int32")
    return df

def detect_listlike_columns(df: pd.DataFrame, cols=None):
    """Detect columns whose cells hold containers instead of scalars.

    Args:
        df: Frame to inspect.
        cols: Column names to check; defaults to all columns of ``df``.

    Returns:
        List of column names (in the order given) for which at least one of
        the first 100 non-null values is a ``list``, ``dict``, ``tuple``,
        ``set`` or ``numpy.ndarray``.
    """
    if cols is None:
        cols = df.columns
    # Nested values may arrive as numpy arrays rather than Python lists,
    # so ndarray has to count as list-like too.
    container_types = (list, dict, tuple, set, np.ndarray)
    listlike = []
    for c in cols:
        # Drop nulls first so a column that starts with missing values is
        # still judged on real content.
        sample_vals = df[c].dropna().head(100)
        if any(isinstance(v, container_types) for v in sample_vals):
            listlike.append(c)
    return listlike

def preprocess_train_valid(X_train, X_valid, num_cols, cat_cols):
    """Impute numeric columns and integer-encode categorical columns.

    Args:
        X_train: Training features; the encoders are fitted on this frame.
        X_valid: Validation features; encoded with the training mappings.
        num_cols: Names of numeric columns (NaN is replaced by 0).
        cat_cols: Names of categorical columns.

    Returns:
        ``(X_train, X_valid, cat_mappings)`` where both frames are processed
        copies and ``cat_mappings[col]`` maps each training category string
        to its integer code ``0 .. n-1``. Missing categorical values become
        the string ``"unknown"`` before encoding. Validation values never
        seen in training get the code ``n`` (``len(cat_mappings[col])``),
        i.e. the extra slot that ``get_embedding_sizes`` reserves, so every
        code is a valid ``nn.Embedding`` index.
    """
    X_train = X_train.copy()
    X_valid = X_valid.copy()

    # Numeric: fill NaN with 0
    for c in num_cols:
        X_train[c] = X_train[c].fillna(0)
        X_valid[c] = X_valid[c].fillna(0)

    # Categorical: convert to strings and encode as integers
    cat_mappings = {}
    for c in cat_cols:
        X_train[c] = X_train[c].astype("object").fillna("unknown").astype(str)
        X_train[c] = X_train[c].astype("category")

        # Create mapping (category string -> code, same order as cat.codes)
        cats = X_train[c].cat.categories
        cat_mappings[c] = {cat: i for i, cat in enumerate(cats)}

        # Encode train
        X_train[c] = X_train[c].cat.codes

        # Encode valid. Unseen categories map to the first index after the
        # known ones; a negative code would be out of range for an embedding.
        unseen_code = len(cat_mappings[c])
        X_valid[c] = X_valid[c].astype("object").fillna("unknown").astype(str)
        X_valid[c] = X_valid[c].map(cat_mappings[c]).fillna(unseen_code).astype(np.int32)

    return X_train, X_valid, cat_mappings

print("Helper functions loaded.")

## Load and Prepare Data

In [None]:
# Train: Oct 1-5, Valid: Oct 6
# Predicates on the "datetime" column, passed as `filters=` to dd.read_parquet.
filters_train = [("datetime", ">=", "2025-10-01-00-00"),
                 ("datetime", "<",  "2025-10-06-00-00")]
filters_valid = [("datetime", ">=", "2025-10-06-00-00"),
                 ("datetime", "<",  "2025-10-07-00-00")]

# Get list of parquet files.
# glob() returns matches in a filesystem-dependent order, so sort them to get
# a stable base ordering before picking the subset.
parquet_files_all = sorted(glob(os.path.join(TRAIN_PATH, '**/part-*.parquet'), recursive=True))
if not parquet_files_all:
    raise FileNotFoundError(
        f"No 'part-*.parquet' files found under TRAIN_PATH={TRAIN_PATH!r}"
    )

# Reduce number of files for faster training.
# The subset is drawn with a private RNG seeded by RSEED, so the same files
# are used on every run and machine without touching the global NumPy RNG.
num_files_train = max(1, int(len(parquet_files_all) * 0.15))
_file_order = np.random.RandomState(RSEED).permutation(len(parquet_files_all))
parquet_files_train = [parquet_files_all[i] for i in sorted(_file_order[:num_files_train])]

print(f"Using {num_files_train} out of {len(parquet_files_all)} train files")

# Columns to drop early
cols_to_drop_early = IGNORE_BIG_COLS + ["row_id", "datetime"]

# Load TRAIN
print("Loading train data...")
dd_train = dd.read_parquet(
    parquet_files_train,
    filters=filters_train,
    engine='pyarrow'
)

# Drop heavy columns BEFORE compute
existing_cols = [c for c in cols_to_drop_early if c in dd_train.columns]
dd_train = dd_train.drop(columns=existing_cols)

# Sample in Dask
train_sample = dd_train.sample(frac=TRAIN_SAMPLE_FRAC, random_state=RSEED).compute()
train_sample = reduce_memory(train_sample)

print(f"Train loaded: {train_sample.shape}, Memory: {train_sample.memory_usage(deep=True).sum() / 1e9:.2f} GB")

# Clean memory
del dd_train
gc.collect()

# Load VALID
# NOTE(review): validation rows come from the same file subset as training
# and are separated only by the datetime filter. If the files turn out to be
# partitioned by date, the subset may contain few or no Oct 6 rows - confirm.
print("\nLoading validation data...")
dd_valid = dd.read_parquet(
    parquet_files_train,
    filters=filters_valid,
    engine='pyarrow'
)

existing_cols = [c for c in cols_to_drop_early if c in dd_valid.columns]
dd_valid = dd_valid.drop(columns=existing_cols)

# Validation sampling fraction is capped at 0.5; for TRAIN_SAMPLE_FRAC <= 0.5
# it equals the training fraction.
valid_df = dd_valid.sample(frac=min(0.5, TRAIN_SAMPLE_FRAC), random_state=RSEED).compute()
valid_df = reduce_memory(valid_df)

print(f"Valid loaded: {valid_df.shape}, Memory: {valid_df.memory_usage(deep=True).sum() / 1e9:.2f} GB")

del dd_valid
gc.collect()

print(f"\n✓ Data loaded successfully")
print(f"Total memory: ~{(train_sample.memory_usage(deep=True).sum() + valid_df.memory_usage(deep=True).sum()) / 1e9:.2f} GB")

In [None]:
# Extract targets
# Raw revenue values for the regression target (TARGET_COL).
y_train = train_sample[TARGET_COL].values
y_valid = valid_df[TARGET_COL].values

# Extract buyer labels
# Used as the target of the teacher's classification head.
y_train_buyer = train_sample["buyer_d7"].values
y_valid_buyer = valid_df["buyer_d7"].values

print(f"Buyer ratio in train: {y_train_buyer.mean():.4f}")
print(f"Buyer ratio in valid: {y_valid_buyer.mean():.4f}")

# Target transform: log1p for stability (MSLE)
# Negative revenues are clipped to 0 before the transform.
y_train_log = np.log1p(y_train.clip(min=0.0))
y_valid_log = np.log1p(y_valid.clip(min=0.0))

# Prepare features
# Everything that is not an identifier/timestamp or a label column is a feature.
cols_to_drop = ["row_id", "datetime"] + LABEL_COLS
feature_cols = [c for c in train_sample.columns if c not in cols_to_drop]

# The validation frame is indexed with the training feature list, so it must
# contain every one of those columns.
X_train = train_sample[feature_cols].copy()
X_valid = valid_df[feature_cols].copy()

# Detect and remove list-like columns
# Detection inspects X_train only; the same columns are then dropped from both frames.
listlike_cols = detect_listlike_columns(X_train, cols=feature_cols)
print(f"Removing {len(listlike_cols)} list-like columns: {listlike_cols}")
X_train = X_train.drop(columns=listlike_cols)
X_valid = X_valid.drop(columns=listlike_cols)

# Identify numeric and categorical columns
# Any column that is not of a numeric dtype is treated as categorical.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

print(f"Features: {len(X_train.columns)} ({len(num_cols)} numeric, {len(cat_cols)} categorical)")

# Preprocess
# Returns processed copies plus the per-column category -> code mappings.
X_train_prep, X_valid_prep, cat_mappings = preprocess_train_valid(X_train, X_valid, num_cols, cat_cols)
print(f"Data prepared: X_train {X_train_prep.shape}, X_valid {X_valid_prep.shape}")

## PyTorch Dataset

In [None]:
class TabularDataset(Dataset):
    """Map-style dataset over an encoded feature frame and its targets.

    Each sample is a dict that always holds ``'y'`` (float32 scalar) and, when
    the corresponding data exists, ``'cat'`` (int64 vector), ``'num'``
    (float32 vector) and ``'buyer'`` (float32 scalar).
    """

    def __init__(self, df, cat_cols, num_cols, y, y_buyer=None):
        """Materialise the requested columns of ``df`` as NumPy arrays.

        Args:
            df: Frame holding the already integer-encoded categorical columns
                and the numeric columns.
            cat_cols: Categorical column names; may be empty.
            num_cols: Numeric column names; may be empty.
            y: NumPy array with the regression target.
            y_buyer: Optional NumPy array with the buyer label.
        """
        self.cat = None
        self.num = None
        if len(cat_cols) > 0:
            self.cat = df[cat_cols].values.astype(np.int64)
        if len(num_cols) > 0:
            self.num = df[num_cols].values.astype(np.float32)
        self.y = y.astype(np.float32)
        self.buyer = None if y_buyer is None else y_buyer.astype(np.float32)

    def __len__(self):
        """Number of samples, taken from the target array."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors (absent parts omitted)."""
        # (key, backing array, tensor dtype) in the order the keys are emitted.
        layout = (
            ('cat', self.cat, torch.long),
            ('num', self.num, torch.float32),
            ('y', self.y, torch.float32),
            ('buyer', self.buyer, torch.float32),
        )
        return {
            key: torch.tensor(values[idx], dtype=dtype)
            for key, values, dtype in layout
            if values is not None
        }

# Create datasets and dataloaders
train_ds = TabularDataset(X_train_prep, cat_cols, num_cols, y_train_log, y_train_buyer)
val_ds = TabularDataset(X_valid_prep, cat_cols, num_cols, y_valid_log, y_valid_buyer)

# Both models contain BatchNorm1d layers, which cannot compute batch statistics
# from a single sample in training mode. Drop the trailing training batch only
# when it would hold exactly one row; otherwise every row is kept as before.
drop_singleton_tail = len(train_ds) % BATCH_SIZE == 1
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_singleton_tail)
# Validation runs in eval mode, so all rows are kept and the order is fixed.
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

print(f"Dataloaders ready. Batches per epoch: train={len(train_loader)}, val={len(val_loader)}")

## Model Definitions

In [None]:
def get_embedding_sizes(cat_cols, cat_mappings, max_emb_dim=50):
    """Derive ``(num_embeddings, embedding_dim)`` for each categorical column.

    Args:
        cat_cols: Categorical column names, in model input order.
        cat_mappings: Per-column dict of category -> integer code.
        max_emb_dim: Upper bound on the embedding width.

    Returns:
        One tuple per column. ``num_embeddings`` is the number of known
        categories plus one slot for unseen values; ``embedding_dim`` is a
        tenth of that (integer division), kept within ``[1, max_emb_dim]``.
    """
    def _size_for(col):
        cardinality = len(cat_mappings[col]) + 1  # extra slot for unseen values
        width = max(1, cardinality // 10)
        return cardinality, min(width, max_emb_dim)

    return [_size_for(col) for col in cat_cols]

class TeacherModel(nn.Module):
    """Teacher network: shared MLP trunk with a regression and a buyer head.

    Categorical inputs go through one embedding table per column and are
    concatenated with the numeric inputs before entering the trunk.
    """

    def __init__(self, emb_sizes, num_len):
        """Build the embeddings, trunk and heads.

        Args:
            emb_sizes: ``(num_embeddings, embedding_dim)`` per categorical column.
            num_len: Number of numeric input features.
        """
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(n_cat, width) for n_cat, width in emb_sizes])

        # Trunk input width = all embedding widths + numeric feature count.
        embedded_width = sum(width for _, width in emb_sizes)
        numeric_width = max(num_len, 0)
        input_dim = embedded_width + numeric_width

        # Shared trunk; layer order fixes the state_dict keys (net.0 .. net.9).
        trunk = [
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.15),
            nn.Linear(256, 128),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*trunk)

        # Revenue head: one linear output, trained against the log1p target.
        self.reg_head = nn.Linear(128, 1)

        # Buyer head: small MLP ending in a sigmoid, so it emits probabilities.
        buyer_layers = [
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.buyer_head = nn.Sequential(*buyer_layers)

    def forward(self, x_cat, x_num):
        """Return ``(revenue_prediction, buyer_probability)``, each flattened to 1-D.

        Args:
            x_cat: Integer tensor with one column per embedding table, or None.
            x_num: Float tensor of numeric features, or None.
        """
        use_embeddings = x_cat is not None and len(self.embs) > 0
        if not use_embeddings:
            x = x_num
        else:
            pieces = [table(x_cat[:, col]) for col, table in enumerate(self.embs)]
            if x_num is not None:
                pieces.append(x_num)
            x = torch.cat(pieces, dim=1)

        feat = self.net(x)
        revenue = self.reg_head(feat)
        buyer_prob = self.buyer_head(feat)
        return revenue.view(-1), buyer_prob.view(-1)

class StudentModel(nn.Module):
    """Compact student network with a single regression output.

    It takes the same inputs as the teacher but halves every embedding width
    and uses a much shallower MLP.
    """

    def __init__(self, emb_sizes, num_len):
        """Build the reduced embeddings and the MLP.

        Args:
            emb_sizes: ``(num_embeddings, embedding_dim)`` per categorical
                column; each dim is halved here (never below 1).
            num_len: Number of numeric input features.
        """
        super().__init__()
        # Same vocabularies as the teacher, half the embedding width.
        halved = [(n_cat, max(1, width // 2)) for n_cat, width in emb_sizes]
        self.embs = nn.ModuleList([nn.Embedding(n_cat, width) for n_cat, width in halved])

        embedded_width = sum(width for _, width in halved)
        numeric_width = max(num_len, 0)
        input_dim = embedded_width + numeric_width

        # Layer order fixes the state_dict keys (net.0 .. net.5).
        layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x_cat, x_num):
        """Return the prediction flattened to 1-D.

        Args:
            x_cat: Integer tensor with one column per embedding table, or None.
            x_num: Float tensor of numeric features, or None.
        """
        use_embeddings = x_cat is not None and len(self.embs) > 0
        if not use_embeddings:
            x = x_num
        else:
            pieces = [table(x_cat[:, col]) for col, table in enumerate(self.embs)]
            if x_num is not None:
                pieces.append(x_num)
            x = torch.cat(pieces, dim=1)

        return self.net(x).view(-1)

print("Model classes defined.")

In [None]:
# Initialize models
# Both networks are built from the same embedding specification; the student
# halves the embedding widths internally.
emb_sizes = get_embedding_sizes(cat_cols, cat_mappings, max_emb_dim=50)
print(f"Embedding sizes (categories, dim): {emb_sizes[:5]}...")  # Show first 5

teacher = TeacherModel(emb_sizes, len(num_cols)).to(DEVICE)
student = StudentModel(emb_sizes, len(num_cols)).to(DEVICE)

# Count trainable parameters only (those with requires_grad=True).
teacher_params = sum(p.numel() for p in teacher.parameters() if p.requires_grad)
student_params = sum(p.numel() for p in student.parameters() if p.requires_grad)

print(f"\nTeacher params: {teacher_params:,}")
print(f"Student params: {student_params:,}")
# Ratio of teacher to student parameter counts.
print(f"Compression ratio: {teacher_params / student_params:.2f}x")

## Train Teacher Model

In [None]:
def train_teacher(model, train_loader, val_loader, epochs=5, lr=1e-3, device='cpu'):
    """Train the dual-head teacher and save its weights.

    The objective per batch is ``MSE(revenue) + 0.5 * BCE(buyer)``. After each
    epoch the sample-weighted mean training losses and the validation loss are
    printed. When training ends the state dict is written to
    ``teacher_model_v2.pt`` in the current working directory.

    Args:
        model: Module whose forward returns ``(revenue_pred, buyer_prob)``.
        train_loader: Loader yielding dict batches with ``'y'`` and ``'buyer'``
            and optionally ``'cat'`` / ``'num'``.
        val_loader: Loader with the same batch layout, used for evaluation.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device that the model and the batches are moved to.

    Returns:
        None. The model is updated in place.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    revenue_criterion = nn.MSELoss()
    buyer_criterion = nn.BCELoss()

    def _batch_losses(batch):
        """Forward one batch; return (total, revenue term, buyer term, batch size)."""
        cat_in = batch['cat'].to(device) if 'cat' in batch else None
        num_in = batch['num'].to(device) if 'num' in batch else None
        target = batch['y'].to(device)
        buyer_target = batch['buyer'].to(device)

        revenue_out, buyer_out = model(cat_in, num_in)
        revenue_term = revenue_criterion(revenue_out, target)
        buyer_term = buyer_criterion(buyer_out, buyer_target)
        return revenue_term + 0.5 * buyer_term, revenue_term, buyer_term, len(target)

    model.to(device)

    for epoch in range(epochs):
        # ---- optimisation pass ----
        model.train()
        running_total = 0.0
        running_reg = 0.0
        running_buyer = 0.0

        for batch in train_loader:
            optimizer.zero_grad()
            total, revenue_term, buyer_term, n_rows = _batch_losses(batch)
            total.backward()
            optimizer.step()

            # Weight by batch size so the epoch figure is a per-sample mean.
            running_total += total.item() * n_rows
            running_reg += revenue_term.item() * n_rows
            running_buyer += buyer_term.item() * n_rows

        n_train = len(train_loader.dataset)
        train_loss = running_total / n_train
        train_loss_reg = running_reg / n_train
        train_loss_buyer = running_buyer / n_train

        # ---- evaluation pass ----
        model.eval()
        running_val = 0.0
        with torch.no_grad():
            for batch in val_loader:
                total, _, _, n_rows = _batch_losses(batch)
                running_val += total.item() * n_rows

        val_loss = running_val / len(val_loader.dataset)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"  Train - Total: {train_loss:.6f}, Reg: {train_loss_reg:.6f}, Buyer: {train_loss_buyer:.6f}")
        print(f"  Valid - Total: {val_loss:.6f}")

    torch.save(model.state_dict(), 'teacher_model_v2.pt')
    print("\n✓ Teacher saved to teacher_model_v2.pt")

# Run teacher training with the notebook-level settings
# (TEACHER_EPOCHS, LEARNING_RATE, DEVICE).
print("Teacher training function defined. Running training...")
train_teacher(teacher, train_loader, val_loader, epochs=TEACHER_EPOCHS, lr=LEARNING_RATE, device=DEVICE)

## Knowledge Distillation

In [None]:
def train_student_with_distillation(student, teacher, train_loader, val_loader, epochs=5, lr=1e-3, alpha=0.6, device='cpu'):
    """Fit the student on true labels and on the teacher's predictions.

    Per-batch objective::

        loss = alpha * MSE(student, y) + (1 - alpha) * MSE(student, teacher)

    The teacher is put in eval mode and only queried under ``no_grad``; only
    the student's parameters are optimised. After each epoch the
    sample-weighted mean training losses and the validation MSE against the
    true labels are printed. At the end the student's state dict is written to
    ``student_model_v2.pt`` in the current working directory.

    Args:
        student: Module whose forward returns a 1-D prediction tensor.
        teacher: Module whose forward returns a tuple; its first element is
            used as the soft target.
        train_loader: Loader yielding dict batches with ``'y'`` and optionally
            ``'cat'`` / ``'num'``.
        val_loader: Loader with the same batch layout, used for evaluation.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        alpha: Weight of the hard (true-label) loss; the soft loss gets
            ``1 - alpha``.
        device: Device that both models and the batches are moved to.

    Returns:
        None. The student is updated in place.
    """
    optimizer = torch.optim.Adam(student.parameters(), lr=lr, weight_decay=1e-5)
    criterion = nn.MSELoss()

    def _unpack(batch):
        """Move one batch to the device; return (cat, num, target)."""
        cat_in = batch['cat'].to(device) if 'cat' in batch else None
        num_in = batch['num'].to(device) if 'num' in batch else None
        return cat_in, num_in, batch['y'].to(device)

    teacher.to(device)
    teacher.eval()
    student.to(device)

    for epoch in range(epochs):
        # ---- optimisation pass ----
        student.train()
        running_total = 0.0
        running_hard = 0.0
        running_soft = 0.0

        for batch in train_loader:
            cat_in, num_in, target = _unpack(batch)

            # Soft targets: the teacher's revenue output, without gradients.
            with torch.no_grad():
                soft_target, _ = teacher(cat_in, num_in)

            student_out = student(cat_in, num_in)

            hard_term = criterion(student_out, target)
            soft_term = criterion(student_out, soft_target)
            combined = alpha * hard_term + (1.0 - alpha) * soft_term

            optimizer.zero_grad()
            combined.backward()
            optimizer.step()

            # Weight by batch size so the epoch figure is a per-sample mean.
            n_rows = len(target)
            running_total += combined.item() * n_rows
            running_hard += hard_term.item() * n_rows
            running_soft += soft_term.item() * n_rows

        n_train = len(train_loader.dataset)
        total_loss = running_total / n_train
        hard_loss_sum = running_hard / n_train
        soft_loss_sum = running_soft / n_train

        # ---- evaluation pass (true labels only) ----
        student.eval()
        running_val = 0.0
        with torch.no_grad():
            for batch in val_loader:
                cat_in, num_in, target = _unpack(batch)
                batch_mse = criterion(student(cat_in, num_in), target)
                running_val += batch_mse.item() * len(target)

        val_loss = running_val / len(val_loader.dataset)

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"  Train - Total: {total_loss:.6f}, Hard: {hard_loss_sum:.6f}, Soft: {soft_loss_sum:.6f}")
        print(f"  Valid - MSE: {val_loss:.6f}")

    torch.save(student.state_dict(), 'student_model_v2.pt')
    print("\n✓ Student saved to student_model_v2.pt")

# Run distillation with the notebook-level settings; the teacher passed in is
# the one trained in the previous section.
print("Distillation training function defined. Running distillation...")
train_student_with_distillation(student, teacher, train_loader, val_loader, 
                                epochs=STUDENT_EPOCHS, lr=LEARNING_RATE, 
                                alpha=DISTILL_ALPHA, device=DEVICE)

## Evaluation

In [None]:
def predict_model(model, loader, device='cpu', return_buyer=False):
    """Run ``model`` over ``loader`` and collect predictions and targets.

    Works for both model kinds: a forward that returns a tuple is treated as
    ``(prediction, buyer_prediction)``, anything else as the prediction alone.

    Args:
        model: Module called as ``model(x_cat, x_num)``.
        loader: Loader yielding dict batches with ``'y'`` and optionally
            ``'cat'``, ``'num'`` and ``'buyer'``.
        device: Device that the model and the inputs are moved to.
        return_buyer: When True and the model returns a tuple, also collect
            the buyer predictions.

    Returns:
        ``(preds, trues, buyers_true, buyers_pred)`` as flat NumPy arrays.
        ``buyers_true`` is None if no batch had a ``'buyer'`` entry;
        ``buyers_pred`` is None if no buyer predictions were collected.
    """
    model.to(device)
    model.eval()

    pred_chunks, true_chunks = [], []
    buyer_true_chunks, buyer_pred_chunks = [], []

    def _flatten(chunks):
        return np.concatenate(chunks).ravel()

    with torch.no_grad():
        for batch in loader:
            cat_in = batch['cat'].to(device) if 'cat' in batch else None
            num_in = batch['num'].to(device) if 'num' in batch else None
            target = batch['y'].to(device)

            output = model(cat_in, num_in)
            if isinstance(output, tuple):
                # Dual-head (teacher-style) output.
                output, buyer_out = output
                if return_buyer:
                    buyer_pred_chunks.append(buyer_out.cpu().numpy())

            pred_chunks.append(output.cpu().numpy())
            true_chunks.append(target.cpu().numpy())

            if 'buyer' in batch:
                buyer_true_chunks.append(batch['buyer'].numpy())

    preds = _flatten(pred_chunks)
    trues = _flatten(true_chunks)
    buyers_true = _flatten(buyer_true_chunks) if buyer_true_chunks else None
    buyers_pred = _flatten(buyer_pred_chunks) if buyer_pred_chunks else None

    return preds, trues, buyers_true, buyers_pred

def msle_from_log_predictions(pred_log1p, true_log1p):
    """Compute MSLE directly from values that are already in log1p space.

    MSLE is ``mean((log1p(true) - log1p(pred)) ** 2)`` with negative raw
    values clipped to 0. Because ``log1p(max(expm1(x), 0)) == max(x, 0)``,
    the metric is evaluated on the log-space inputs themselves. This skips
    the ``expm1`` -> ``log1p`` round trip, which overflows to ``inf`` for
    large float32 inputs and loses precision.

    Args:
        pred_log1p: Array-like of predictions in log1p space.
        true_log1p: Array-like of targets in log1p space, same number of
            elements as ``pred_log1p``.

    Returns:
        The MSLE as a Python float.

    Raises:
        ValueError: If the inputs are empty, differ in size, or contain
            NaN/inf (the metric is undefined in those cases).
    """
    pred = np.asarray(pred_log1p, dtype=np.float64).ravel()
    true = np.asarray(true_log1p, dtype=np.float64).ravel()

    if pred.size != true.size:
        raise ValueError(
            f"pred_log1p and true_log1p differ in size: {pred.size} vs {true.size}"
        )
    if pred.size == 0:
        raise ValueError("MSLE is undefined for empty inputs")
    if not (np.isfinite(pred).all() and np.isfinite(true).all()):
        raise ValueError("MSLE inputs must not contain NaN or infinity")

    # Clipping at 0 in log space equals clipping the raw value at 0.
    diff = np.clip(true, 0.0, None) - np.clip(pred, 0.0, None)
    return float(np.mean(diff ** 2))

print("Evaluation utilities defined.")

In [None]:
# Evaluate Teacher
print("=" * 50)
print("TEACHER MODEL EVALUATION")
print("=" * 50)

# Predictions and targets come back in log1p space, together with the true
# buyer labels and the buyer head's outputs.
teacher_preds_log, teacher_trues_log, buyers_true, buyers_pred = predict_model(
    teacher, val_loader, device=DEVICE, return_buyer=True
)

teacher_msle = msle_from_log_predictions(teacher_preds_log, teacher_trues_log)
print(f"Teacher MSLE: {teacher_msle:.6f}")

# AUC of the teacher's buyer head against the true buyer labels.
if buyers_pred is not None:
    buyer_auc = roc_auc_score(buyers_true, buyers_pred)
    print(f"Teacher Buyer AUC: {buyer_auc:.4f}")

# Evaluate Student
print("\n" + "=" * 50)
print("STUDENT MODEL EVALUATION")
print("=" * 50)

# The student has a single output, so only predictions and targets are used.
student_preds_log, student_trues_log, _, _ = predict_model(
    student, val_loader, device=DEVICE
)

student_msle = msle_from_log_predictions(student_preds_log, student_trues_log)
print(f"Student MSLE: {student_msle:.6f}")

# Baseline (all zeros)
# A zero in log1p space corresponds to predicting zero revenue for every row.
baseline_msle = msle_from_log_predictions(
    np.zeros_like(student_trues_log), student_trues_log
)
print(f"\nBaseline (all zeros) MSLE: {baseline_msle:.6f}")

# Comparison
# Improvements are relative to the all-zeros baseline; the gap is relative to
# the teacher.
# NOTE(review): these ratios are undefined when baseline_msle or teacher_msle is 0.
print("\n" + "=" * 50)
print("COMPARISON")
print("=" * 50)
print(f"Teacher improvement: {((baseline_msle - teacher_msle) / baseline_msle * 100):.2f}%")
print(f"Student improvement: {((baseline_msle - student_msle) / baseline_msle * 100):.2f}%")
print(f"Student vs Teacher gap: {((student_msle - teacher_msle) / teacher_msle * 100):.2f}%")

# Distribution stats
# Convert the log1p predictions back to the raw revenue scale.
teacher_preds = np.expm1(teacher_preds_log)
student_preds = np.expm1(student_preds_log)

print("\nTeacher predictions:")
print(f"  Mean: {teacher_preds.mean():.4f}, Median: {np.median(teacher_preds):.4f}, Max: {teacher_preds.max():.4f}")
# "Non-zero" counts strictly positive predictions.
print(f"  % Non-zero: {(teacher_preds > 0).mean() * 100:.2f}%")

print("\nStudent predictions:")
print(f"  Mean: {student_preds.mean():.4f}, Median: {np.median(student_preds):.4f}, Max: {student_preds.max():.4f}")
print(f"  % Non-zero: {(student_preds > 0).mean() * 100:.2f}%")

## Summary

Este notebook combina:
1. **Carga de datos robusta** del `simplified_model_comparison.ipynb` (manejo de parquet con Dask, sampling, preprocesado)
2. **Modelos de deep learning** (Teacher-Student distillation en PyTorch)

### Ventajas del Student Model:
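- **Menos parámetros** que el teacher: embeddings con la mitad de dimensión y capas ocultas más pequeñas (la celda de inicialización imprime el *Compression ratio* exacto)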
- **Inferencia más rápida** para producción
- **Aprende del teacher** (soft targets) además de los labels reales

### Para producción:
- Exportar el student a ONNX: `torch.onnx.export(student, ...)`
- Aplicar quantization para reducir tamaño y acelerar más
- Usar solo el student model (descartar teacher)

### Próximos pasos:
- Ajustar hiperparámetros (learning rate, arquitectura, alpha)
- Probar diferentes embedding dimensions
- Añadir más épocas si hay suficiente memoria/tiempo
- Implementar early stopping basado en validation loss