In [2]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from pandas.api.types import is_numeric_dtype

# ===========================
# Configuración general
# ===========================
RANDOM_STATE = 42
TARGET_COL   = "iap_revenue_d7"

TRAIN_PATH = "/kaggle/input/smadex-challenge-predict-the-revenue/train/train"
TEST_PATH  = "/kaggle/input/smadex-challenge-predict-the-revenue/test/test"

pd.set_option("display.max_columns", 200)

# ===========================
# Funciones auxiliares
# ===========================
def reduce_memory(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast numéricas para ahorrar memoria."""
    df = df.copy()
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == "float64":
            df[col] = df[col].astype("float32")
        elif col_type == "int64":
            df[col] = df[col].astype("int32")
    return df

def preprocess_train_valid(X_train, X_valid, num_cols, cat_cols):
    """Preprocesado para train/valid."""
    X_train = X_train.copy()
    X_valid = X_valid.copy()
    
    # Numéricas: NaN -> 0
    for c in num_cols:
        X_train[c] = X_train[c].fillna(0)
        X_valid[c] = X_valid[c].fillna(0)
    
    # Categóricas: strings + categorías fijas basadas en TRAIN
    for c in cat_cols:
        X_train[c] = X_train[c].astype("object").fillna("unknown").astype(str)
        X_train[c] = X_train[c].astype("category")
        
        cats = X_train[c].cat.categories
        
        X_valid[c] = X_valid[c].astype("object").fillna("unknown").astype(str)
        X_valid[c] = X_valid[c].astype(
            pd.api.types.CategoricalDtype(categories=cats)
        )
    
    return X_train, X_valid

def preprocess_new(X_new, num_cols, cat_cols, cat_ref_df):
    """Preprocesado para test usando las categorías de train."""
    X_new = X_new.copy()
    
    for c in num_cols:
        if c in X_new.columns:
            X_new[c] = X_new[c].fillna(0)
    
    for c in cat_cols:
        if c in X_new.columns:
            X_new[c] = X_new[c].astype("object").fillna("unknown").astype(str)
            cats = cat_ref_df[c].cat.categories
            X_new[c] = X_new[c].astype(
                pd.api.types.CategoricalDtype(categories=cats)
            )
    
    return X_new

# ===========================
# 0) Definir tus features
# ===========================
BASE_FEATURE_COLS = [
    "advertiser_bundle",
    "advertiser_subcategory",
    "country",
    "release_msrp",
    "dev_model",
    "advertiser_category",
    "dev_osv",
    "release_date",
    "hour",
    "dev_make",
    "dev_os",
    "weekday",
]

# Columnas que REALMENTE vamos a leer del parquet
# (features + target + opcionales row_id/datetime)
NEEDED_COLS = sorted(set(
    BASE_FEATURE_COLS
    + [TARGET_COL, "row_id", "datetime"]
))

print("Columnas que se van a leer del parquet:", NEEDED_COLS)
print("Total columnas leídas:", len(NEEDED_COLS))

# ===========================
# 1) Filtros por fecha (como antes)
# ===========================
filters_train = [("datetime", ">=", "2025-10-01-00-00"),
                 ("datetime", "<",  "2025-10-06-00-00")]

filters_valid = [("datetime", ">=", "2025-10-06-00-00"),
                 ("datetime", "<",  "2025-10-07-00-00")]

# ===========================
# 2) Leer train/valid SOLO con esas columnas
# ===========================
dd_train = dd.read_parquet(TRAIN_PATH, filters=filters_train, columns=NEEDED_COLS)
dd_valid = dd.read_parquet(TRAIN_PATH, filters=filters_valid, columns=NEEDED_COLS)

print("dd_train columns:", list(dd_train.columns))

# ===========================
# 3) Muestreo + paso a pandas
# ===========================
frac_train = 1

train_df = dd_train.sample(frac=frac_train, random_state=RANDOM_STATE).compute()
valid_df = dd_valid.compute()

train_df = reduce_memory(train_df)
valid_df = reduce_memory(valid_df)

print("Train shape:", train_df.shape)
print("Valid shape:", valid_df.shape)
print("Train memory (GB):", train_df.memory_usage(deep=True).sum() / (1024**3))
print("Valid memory (GB):", valid_df.memory_usage(deep=True).sum() / (1024**3))

# ===========================
# 4) y_train / y_valid (escala ORIGINAL)
# ===========================
assert TARGET_COL in train_df.columns, "Falta iap_revenue_d7 en train_df"
assert TARGET_COL in valid_df.columns, "Falta iap_revenue_d7 en valid_df"

y_train = train_df[TARGET_COL].astype("float32").values
y_valid = valid_df[TARGET_COL].astype("float32").values

# ===========================
# 5) Features (solo las que tú quieres y existen)
# ===========================
feature_cols = [c for c in BASE_FEATURE_COLS if c in train_df.columns]

print("Features usadas realmente:", feature_cols)

X_train = train_df[feature_cols].copy()
X_valid = valid_df[feature_cols].copy()

print("X_train shape (antes de prep):", X_train.shape)
print("X_valid shape (antes de prep):", X_valid.shape)

# ===========================
# 6) Numéricas / categóricas + preprocesado
# ===========================
num_cols = [c for c in X_train.columns if is_numeric_dtype(X_train[c])]
cat_cols = [c for c in X_train.columns if not is_numeric_dtype(X_train[c])]

print("Numéricas:", len(num_cols), "->", num_cols)
print("Categóricas:", len(cat_cols), "->", cat_cols)

X_train_prep, X_valid_prep = preprocess_train_valid(X_train, X_valid, num_cols, cat_cols)

print("X_train_prep shape:", X_train_prep.shape)
print("X_valid_prep shape:", X_valid_prep.shape)

Columnas que se van a leer del parquet: ['advertiser_bundle', 'advertiser_category', 'advertiser_subcategory', 'country', 'datetime', 'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'hour', 'iap_revenue_d7', 'release_date', 'release_msrp', 'row_id', 'weekday']
Total columnas leídas: 15
dd_train columns: ['advertiser_bundle', 'advertiser_category', 'advertiser_subcategory', 'country', 'datetime', 'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'hour', 'iap_revenue_d7', 'release_date', 'release_msrp', 'row_id', 'weekday']
Train shape: (17294102, 15)
Valid shape: (3306478, 15)
Train memory (GB): 3.9395919451490045
Valid memory (GB): 0.7526155570521951
Features usadas realmente: ['advertiser_bundle', 'advertiser_subcategory', 'country', 'release_msrp', 'dev_model', 'advertiser_category', 'dev_osv', 'release_date', 'hour', 'dev_make', 'dev_os', 'weekday']
X_train shape (antes de prep): (17294102, 12)
X_valid shape (antes de prep): (3306478, 12)
Numéricas: 2 -> ['release_msrp', 'weekday']
Categó

In [3]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from pandas.api.types import CategoricalDtype

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device, "| Num GPUs:", torch.cuda.device_count())

# ===========================
# 1) Preparar X_train_nn / X_valid_nn con códigos categóricos
# ===========================
X_train_nn = X_train_prep.copy()
X_valid_nn = X_valid_prep.copy()

# 1.1 Asegurar dtype 'category' y categorías consistentes
for c in cat_cols:
    if c not in X_train_nn.columns:
        print(f"[AVISO] '{c}' no está en X_train_nn, la salto.")
        continue

    # TRAIN
    X_train_nn[c] = (
        X_train_nn[c]
        .astype("object")
        .fillna("unknown")
        .astype(str)
        .astype("category")
    )
    cats = X_train_nn[c].cat.categories

    # VALID con mismas categorías
    if c in X_valid_nn.columns:
        X_valid_nn[c] = (
            X_valid_nn[c]
            .astype("object")
            .fillna("unknown")
            .astype(str)
        )
        X_valid_nn[c] = X_valid_nn[c].astype(
            CategoricalDtype(categories=cats)
        )
    else:
        print(f"[AVISO] '{c}' no está en X_valid_nn, la salto en valid.")

# 1.2 Pasar categóricas a códigos enteros y clamped (sin -1)
for c in cat_cols:
    if c not in X_train_nn.columns:
        continue

    train_codes = X_train_nn[c].cat.codes.astype("int64")
    valid_codes = X_valid_nn[c].cat.codes.astype("int64") if c in X_valid_nn.columns else None

    # -1 -> 0 (categoría "desconocida")
    train_codes = train_codes.where(train_codes >= 0, 0)
    if valid_codes is not None:
        valid_codes = valid_codes.where(valid_codes >= 0, 0)

    X_train_nn[c] = train_codes
    if valid_codes is not None:
        X_valid_nn[c] = valid_codes

# 1.3 Cardinalidades de cada columna categórica
cat_cardinalities = []
for c in cat_cols:
    if c not in X_train_nn.columns:
        continue

    max_train = int(X_train_nn[c].max())
    if c in X_valid_nn.columns:
        max_valid = int(X_valid_nn[c].max())
    else:
        max_valid = max_train

    max_code = max(max_train, max_valid)
    card = max_code + 1
    cat_cardinalities.append(card)

    print(f"[CARD] {c}: max_train={max_train}, max_valid={max_valid}, card={card}")

print("Num cols numéricas:", len(num_cols))
print("Num cols categóricas:", len(cat_cardinalities))

assert len(cat_cardinalities) == len(cat_cols), \
    f"Descuadre: len(cat_cardinalities)={len(cat_cardinalities)}, len(cat_cols)={len(cat_cols)}"

for i, card in enumerate(cat_cardinalities):
    assert isinstance(card, int) and card > 0, f"Cardinalidad inválida en índice {i}: {card}"

# ===========================
# 2) Numpy arrays para la red
# ===========================
# Numéricas
X_train_num = X_train_nn[num_cols].to_numpy(dtype="float32")
X_valid_num = X_valid_nn[num_cols].to_numpy(dtype="float32")

# Categóricas (si no hay, dejamos matrices vacías)
if len(cat_cols) > 0:
    X_train_cat = X_train_nn[cat_cols].to_numpy(dtype="int64")
    X_valid_cat = X_valid_nn[cat_cols].to_numpy(dtype="int64")
else:
    X_train_cat = np.zeros((X_train_num.shape[0], 0), dtype="int64")
    X_valid_cat = np.zeros((X_valid_num.shape[0], 0), dtype="int64")

# Targets en LOG1P (como antes)
y_train_log = np.log1p(y_train.astype("float32"))
y_valid_log = np.log1p(y_valid.astype("float32"))

print("Shapes -> X_num:", X_train_num.shape, "X_cat:", X_train_cat.shape)
print("y_train_log shape:", y_train_log.shape)

# ===========================
# 3) Dataset y DataLoader
# ===========================
class TabularDataset(Dataset):
    def __init__(self, X_num, X_cat, y_log):
        self.X_num = torch.from_numpy(X_num.astype("float32"))
        self.X_cat = torch.from_numpy(X_cat.astype("int64"))
        # y en log, guardado como (N,1) para evitar broadcasting raro
        self.y     = torch.from_numpy(y_log.astype("float32")).view(-1, 1)
    
    def __len__(self):
        return self.X_num.shape[0]
    
    def __getitem__(self, idx):
        return (
            self.X_num[idx],
            self.X_cat[idx],
            self.y[idx],
        )

train_dataset = TabularDataset(X_train_num, X_train_cat, y_train_log)
valid_dataset = TabularDataset(X_valid_num, X_valid_cat, y_valid_log)

BATCH_SIZE = 4096  # ajustable

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=False,
    num_workers=2
)

valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=2
)

print("Tamaño train_dataset:", len(train_dataset))
print("Tamaño valid_dataset:", len(valid_dataset))

# ===========================
# 4) Definición del Teacher
# ===========================
class TeacherNet(nn.Module):
    def __init__(self, num_numeric, cat_cardinalities, emb_max_dim=64):
        super().__init__()
        
        self.num_numeric = num_numeric
        self.n_cat = len(cat_cardinalities)
        
        # Embeddings para cada columna categórica
        emb_layers = []
        emb_dims = []
        for card in cat_cardinalities:
            emb_dim = min(emb_max_dim, max(4, card // 2))  # regla simple
            emb_layers.append(nn.Embedding(card, emb_dim))
            emb_dims.append(emb_dim)
        
        self.emb_layers = nn.ModuleList(emb_layers)
        self.emb_dims = emb_dims
        
        input_dim = num_numeric + sum(emb_dims)
        
        hidden_sizes = [512, 512, 256, 128]  # Teacher "grande"
        layers = []
        in_dim = input_dim
        
        for h in hidden_sizes:
            layers.append(nn.Linear(in_dim, h))
            layers.append(nn.ReLU())
            layers.append(nn.BatchNorm1d(h))
            layers.append(nn.Dropout(0.2))
            in_dim = h
        
        layers.append(nn.Linear(in_dim, 1))  # salida escalar (log revenue)
        
        self.mlp = nn.Sequential(*layers)
    
    def forward(self, x_num, x_cat):
        # x_num: (B, num_numeric)
        # x_cat: (B, n_cat)
        if self.n_cat > 0:
            emb_list = []
            for i, emb in enumerate(self.emb_layers):
                emb_i = emb(x_cat[:, i])
                emb_list.append(emb_i)
            x_emb = torch.cat(emb_list, dim=1)
            x = torch.cat([x_num, x_emb], dim=1)
        else:
            x = x_num
        out = self.mlp(x)  # (B, 1) en log-space
        return out

num_numeric_features = len(num_cols)
model_teacher = TeacherNet(num_numeric_features, cat_cardinalities).to(device)
print(model_teacher)

# Test rápido de forward en GPU
x_num_dbg, x_cat_dbg, y_dbg = next(iter(train_loader))
x_num_dbg = x_num_dbg.to(device)
x_cat_dbg = x_cat_dbg.to(device)

with torch.no_grad():
    out_dbg = model_teacher(x_num_dbg, x_cat_dbg)
    print("GPU forward OK. out shape:", out_dbg.shape)

# ===========================
# 5) Entrenamiento en log-space (MSE pequeño como antes)
# ===========================
criterion = nn.MSELoss()  # MSE sobre log1p(revenue)
optimizer = torch.optim.AdamW(model_teacher.parameters(), lr=1e-3, weight_decay=1e-4)

EPOCHS = 20
best_val_loss = np.inf
patience = 5
patience_counter = 0

for epoch in range(1, EPOCHS + 1):
    # --------- TRAIN ----------
    model_teacher.train()
    train_loss_sum = 0.0
    n_train = 0
    
    for x_num, x_cat, y_log in train_loader:
        x_num = x_num.to(device)
        x_cat = x_cat.to(device)
        y_log = y_log.to(device)
        
        optimizer.zero_grad()
        y_pred_log = model_teacher(x_num, x_cat)
        
        loss = criterion(y_pred_log, y_log)
        loss.backward()
        optimizer.step()
        
        train_loss_sum += loss.item() * y_log.size(0)
        n_train += y_log.size(0)
    
    train_loss = train_loss_sum / max(n_train, 1)
    
    # --------- VALID ----------
    model_teacher.eval()
    val_loss_sum = 0.0
    n_val = 0
    
    with torch.no_grad():
        for x_num, x_cat, y_log in valid_loader:
            x_num = x_num.to(device)
            x_cat = x_cat.to(device)
            y_log = y_log.to(device)
            
            y_pred_log = model_teacher(x_num, x_cat)
            loss = criterion(y_pred_log, y_log)
            
            val_loss_sum += loss.item() * y_log.size(0)
            n_val += y_log.size(0)
    
    val_loss = val_loss_sum / max(n_val, 1)
    
    print(f"Epoch {epoch:02d} | train MSE_log: {train_loss:.6f} | valid MSE_log: {val_loss:.6f}")
    
    # Early stopping
    if val_loss < best_val_loss - 1e-4:
        best_val_loss = val_loss
        patience_counter = 0
        torch.save(model_teacher.state_dict(), "teacher_best.pth")
        print("  -> New best model saved.")
    else:
        patience_counter += 1
        print(f"  -> No improvement. Patience: {patience_counter}/{patience}")
        if patience_counter >= patience:
            print("Early stopping activado.")
            break

# Cargar el mejor modelo al final
model_teacher.load_state_dict(torch.load("teacher_best.pth", map_location=device))
model_teacher.eval()

# ===========================
# 6) Métricas: log-space y escala original
# ===========================
def eval_metrics(model, loader):
    model.eval()
    preds_log_list = []
    targets_log_list = []
    
    with torch.no_grad():
        for x_num, x_cat, y_log in loader:
            x_num = x_num.to(device)
            x_cat = x_cat.to(device)
            y_log = y_log.to(device)
            
            y_pred_log = model(x_num, x_cat)
            preds_log_list.append(y_pred_log.cpu().numpy())
            targets_log_list.append(y_log.cpu().numpy())
    
    preds_log = np.concatenate(preds_log_list, axis=0).ravel()
    targets_log = np.concatenate(targets_log_list, axis=0).ravel()
    
    mse_log = np.mean((preds_log - targets_log) ** 2)
    
    preds = np.expm1(preds_log)
    targets = np.expm1(targets_log)
    
    mse_orig = np.mean((preds - targets) ** 2)
    rmse_orig = np.sqrt(mse_orig)
    return mse_log, mse_orig, rmse_orig

mse_log_valid, mse_valid_orig, rmse_valid_orig = eval_metrics(model_teacher, valid_loader)
print("Valid MSE_log         :", float(mse_log_valid))
print("Valid MSE  (original) :", float(mse_valid_orig))
print("Valid RMSE (original) :", float(rmse_valid_orig))

Device: cuda | Num GPUs: 2
[CARD] advertiser_bundle: max_train=513, max_valid=513, card=514
[CARD] advertiser_subcategory: max_train=55, max_valid=55, card=56
[CARD] country: max_train=238, max_valid=238, card=239
[CARD] dev_model: max_train=14669, max_valid=14662, card=14670
[CARD] advertiser_category: max_train=21, max_valid=21, card=22
[CARD] dev_osv: max_train=238, max_valid=238, card=239
[CARD] release_date: max_train=196, max_valid=196, card=197
[CARD] hour: max_train=23, max_valid=23, card=24
[CARD] dev_make: max_train=957, max_valid=957, card=958
[CARD] dev_os: max_train=2, max_valid=2, card=3
Num cols numéricas: 2
Num cols categóricas: 10
Shapes -> X_num: (17294102, 2) X_cat: (17294102, 10)
y_train_log shape: (17294102,)
Tamaño train_dataset: 17294102
Tamaño valid_dataset: 3306478
TeacherNet(
  (emb_layers): ModuleList(
    (0): Embedding(514, 64)
    (1): Embedding(56, 28)
    (2): Embedding(239, 64)
    (3): Embedding(14670, 64)
    (4): Embedding(22, 11)
    (5): Embedding(