In [1]:
import pandas as pd
import dask.dataframe as dd
import dask
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from glob import glob
import os
import gc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import tqdm
from time import time

# Force a garbage-collection pass before any data is loaded; as the cell's last
# expression, the notebook displays the value returned by gc.collect().
gc.collect()

27

In [2]:
# Root directories of the training and test parquet data.
# NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
# `test_path` is not read by the inference cell, which defines its own TEST_PATH.
train_path = '/home/stargix/Desktop/hackathons/datathon/train/train'
test_path = '/home/stargix/Desktop/hackathons/datathon/test/test'

In [3]:
# Columns requested from every training parquet read (passed as `columns=` to
# dd.read_parquet). The list mixes identifiers ('row_id', 'datetime'), the
# targets ('buyer_d7', 'iap_revenue_d7') and the candidate feature columns;
# the split into features vs. excluded columns happens later, when the first
# batch is loaded.
required_columns = [
    'row_id', 'datetime',
    'buyer_d7', 'iap_revenue_d7',
    'advertiser_bundle', 'advertiser_category', 'advertiser_subcategory', 
    'advertiser_bottom_taxonomy_level',
    'country', 'region',
    'dev_make', 'dev_model', 'dev_os', 'dev_osv',
    'carrier',
    'hour', 'weekday', 'weekend_ratio', 'hour_ratio',
    'release_date', 'release_msrp',
    'avg_act_days', 'avg_daily_sessions', 'avg_days_ins', 'avg_duration',
    'weeks_since_first_seen', 'wifi_ratio',
    'retentiond7',
    'city_hist', 'country_hist', 'region_hist', 'dev_language_hist', 'dev_osv_hist',
    'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk',
    'iap_revenue_usd_bundle', 'iap_revenue_usd_category',
    'num_buys_bundle', 'num_buys_category',
    'last_buy', 'last_ins',
    'bcat', 'bcat_bottom_taxonomy',
    'bundles_cat', 'bundles_cat_bottom_taxonomy', 
    'bundles_ins',
    'new_bundles', 'user_bundles', 'user_bundles_l28d',
    'advertiser_actions_action_count', 'advertiser_actions_action_last_timestamp',
    'user_actions_bundles_action_count', 'user_actions_bundles_action_last_timestamp',
    'last_advertiser_action',
    'first_request_ts', 'first_request_ts_bundle', 
    'first_request_ts_category_bottom_taxonomy',
    'rwd_prank',
    'whale_users_bundle_num_buys_prank', 'whale_users_bundle_revenue_prank'
]

# Discover every parquet part file under the training directory.
# glob() yields paths in a filesystem-dependent order, so sort them: the batch
# composition (and in particular which files make up the first batch, used to
# fit the encoders and the scaler) must be identical on every run.
parquet_files_train = sorted(glob(os.path.join(train_path, '**/part-*.parquet'), recursive=True))
print(f"Total archivos disponibles: {len(parquet_files_train)}")
if not parquet_files_train:
    # Fail here with a clear message instead of letting a later cell crash on
    # an empty file list.
    raise FileNotFoundError(f"No parquet part files found under {train_path!r}")

# Process roughly 5% of the files per batch to avoid running out of memory
# (always at least one file per batch).
batch_size = max(1, int(len(parquet_files_train) * 0.05))
num_batches = (len(parquet_files_train) + batch_size - 1) // batch_size  # ceiling division
print(f"Batch size: {batch_size} archivos (5% por batch)")
print(f"N√∫mero de batches: {num_batches}")

Total archivos disponibles: 144
Batch size: 7 archivos (5% por batch)
N√∫mero de batches: 21


In [4]:
# Funciones de preprocesamiento
import ast

# Columns whose cells preprocess_dataframe() collapses to a single number by
# applying sum_values() to each cell.
columns_to_sum = [
    'iap_revenue_usd_bundle',
    'num_buys_bundle',
    'rwd_prank',
    'whale_users_bundle_num_buys_prank',
    'whale_users_bundle_revenue_prank'
]

def sum_values(x):
    """Collapse a list-of-pairs cell into the sum of the pairs' second elements.

    Args:
        x: The raw cell value. May be ``None``, a float NaN, a ``list`` of
            tuples, or a string containing the Python literal of such a list.

    Returns:
        The sum of ``item[1]`` over every element that is a tuple with at
        least two entries. Returns ``0`` for missing values, empty lists,
        values of any other type, and anything that cannot be parsed or
        summed (best-effort behaviour).
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return 0
    try:
        if isinstance(x, str):
            x = ast.literal_eval(x)
        if isinstance(x, list) and len(x) > 0:
            return sum(item[1] for item in x if isinstance(item, tuple) and len(item) > 1)
        return 0
    except Exception:
        # Malformed cells count as 0. `Exception` (rather than a bare
        # `except:`) keeps KeyboardInterrupt / SystemExit propagating, so a
        # long `.apply(sum_values)` can still be interrupted.
        return 0

def preprocess_dataframe(df, label_encoders=None, fit_encoders=False):
    """Prepare a raw batch for modelling; ``df`` is modified in place.

    Steps, in order:
      1. Every column listed in ``columns_to_sum`` that is present is reduced
         to a scalar per row with ``sum_values``.
      2. The categorical columns present in ``df`` are label-encoded.
      3. int64 columns whose values fit are downcast to int32 and float64
         columns are cast to float32.

    Args:
        df: DataFrame to preprocess. Columns are overwritten in place.
        label_encoders: Mapping ``column -> fitted LabelEncoder``. Required
            when ``fit_encoders`` is False and ``df`` contains categorical
            columns; ignored (replaced) when ``fit_encoders`` is True.
        fit_encoders: When True, fit a new ``LabelEncoder`` per categorical
            column on ``df``. When False, reuse ``label_encoders`` and encode
            values that were not seen during fitting as ``-1``.

    Returns:
        ``(df, label_encoders, cat_features)`` where ``cat_features`` is the
        list of categorical columns actually present in ``df``.

    Raises:
        TypeError: if ``fit_encoders`` is False, ``label_encoders`` is None
            and ``df`` contains at least one categorical column.
    """
    # Collapse list-valued columns to a single number per row.
    for col in columns_to_sum:
        if col in df.columns:
            df[col] = df[col].apply(sum_values)

    # Label encoding
    cat_features = [
        'advertiser_bundle', 'advertiser_category', 'advertiser_subcategory',
        'country', 'region', 'dev_make', 'dev_model', 'dev_os', 'dev_osv'
    ]
    cat_features = [c for c in cat_features if c in df.columns]

    # NOTE(review): astype(str) runs before fillna, so missing values have
    # most likely already been turned into text by the time fillna sees them
    # and the "__NA__" fill never triggers. The expression is left unchanged
    # so the codes stay consistent with how the inference cell maps values.
    if fit_encoders:
        label_encoders = {}
        for col in cat_features:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str).fillna("__NA__"))
            label_encoders[col] = le
    else:
        if label_encoders is None and cat_features:
            raise TypeError("label_encoders must be provided when fit_encoders=False")
        for col in cat_features:
            if col in label_encoders:
                # LabelEncoder.transform returns each value's position in
                # `classes_`; build that lookup once per column instead of
                # calling transform() (and scanning classes_) for every row.
                class_to_code = {
                    cls: code for code, cls in enumerate(label_encoders[col].classes_)
                }
                values = df[col].astype(str).fillna("__NA__")
                # Values unseen at fit time have no entry and become -1.
                df[col] = values.map(class_to_code).fillna(-1).astype('int64')

    # Memory optimisation: use narrower dtypes where the values allow it.
    for col in df.select_dtypes(include=['int64']).columns:
        if df[col].max() < 2147483647 and df[col].min() > -2147483648:
            df[col] = df[col].astype('int32')

    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')

    return df, label_encoders, cat_features

# Status message printed once the preprocessing helpers above have been defined.
print("‚úì Funciones de preprocesamiento definidas")

‚úì Funciones de preprocesamiento definidas


In [5]:
# ARQUITECTURA RESNET 1D
class ResidualBlock(nn.Module):
    """Fully connected residual unit computing ``relu(x + F(x))``.

    ``F`` is Linear -> BatchNorm1d -> ReLU -> Dropout -> Linear -> BatchNorm1d,
    with every layer keeping the feature width ``dim`` so the skip connection
    can be added directly.

    Args:
        dim: Number of input (and output) features.
        dropout: Dropout probability used inside ``F``.
    """

    def __init__(self, dim, dropout=0.2):
        super().__init__()
        layers = [
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
        ]
        # Attribute names (`block`, `relu`) determine the state_dict keys.
        self.block = nn.Sequential(*layers)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the activated sum of ``x`` and its transformed version."""
        transformed = self.block(x)
        return self.relu(x + transformed)

class RevenueResNet(nn.Module):
    """Residual MLP that maps a feature vector to a single regression output.

    The network is an input projection to ``hidden_dim``, a stack of
    ``num_blocks`` ``ResidualBlock`` modules, and a three-layer regression
    head (hidden_dim -> 128 -> 64 -> 1).

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of the projection and of every residual block.
        num_blocks: Number of residual blocks in the stack.
        dropout: Dropout probability for the projection and the residual
            blocks (the head uses a fixed 0.3).
    """

    def __init__(self, input_dim, hidden_dim=256, num_blocks=4, dropout=0.2):
        super().__init__()

        # Initial projection into the hidden width
        self.proj = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        # Residual stack
        residual_stack = []
        for _ in range(num_blocks):
            residual_stack.append(ResidualBlock(hidden_dim, dropout=dropout))
        self.blocks = nn.Sequential(*residual_stack)

        # Regression head
        self.head = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        """Run projection, residual stack and head; output has one column."""
        return self.head(self.blocks(self.proj(x)))

# Status message printed once the model classes above have been defined.
print("‚úì Arquitectura ResNet definida")

‚úì Arquitectura ResNet definida


In [6]:
# Bootstrap from the first batch of files: fit the label encoders and the
# StandardScaler, decide which columns are features, and persist all three
# artifacts to disk.

import pickle
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("=" * 60, "CARGANDO PRIMER BATCH PARA CONFIGURACI√ìN", "=" * 60, sep="\n")

first_batch_files = parquet_files_train[:batch_size]
try:
    first_ddf = dd.read_parquet(first_batch_files, engine='pyarrow', columns=required_columns)
except Exception as e:
    # The column-restricted read failed: fall back to reading every column.
    print(f"‚ö†Ô∏è Cargando todas las columnas: {e}")
    first_ddf = dd.read_parquet(first_batch_files, engine='pyarrow')

first_df = first_ddf.compute(scheduler='synchronous')
print(f"‚úì Primer batch cargado: {first_df.shape}")

# Preprocess the batch and fit one label encoder per categorical column
first_df, label_encoders, cat_features = preprocess_dataframe(first_df, fit_encoders=True)

# Columns that must never be used as model inputs: targets and other label
# columns, identifiers, and raw timestamps.
labels_to_exclude = [
    'buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28',
    'buy_d7', 'buy_d14', 'buy_d28',
    'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28',
    'registration', 'retention_d1_to_d7', 'retention_d3_to_d7',
    'retention_d7_to_d14', 'retention_d1', 'retention_d3', 'retention_d7',
    'row_id', 'datetime',
    'advertiser_actions_action_last_timestamp',
    'user_actions_bundles_action_last_timestamp',
    'first_request_ts', 'first_request_ts_bundle',
    'first_request_ts_category_bottom_taxonomy'
]

# Numeric features: every remaining column with a plain int/float dtype.
# Kept as a tuple (not a set) so membership is tested with dtype == name.
numeric_dtype_names = ('int64', 'int32', 'int16', 'int8', 'float32', 'float64')
numeric_features = []
for c in first_df.columns:
    if c in labels_to_exclude or c in cat_features:
        continue
    if first_df[c].dtype in numeric_dtype_names:
        numeric_features.append(c)

features = numeric_features + cat_features
print(f"‚úì Features definidas: {len(features)} ({len(numeric_features)} num√©ricas + {len(cat_features)} categ√≥ricas)")

# Fit the StandardScaler on the numeric features (missing values become 0)
first_df[numeric_features] = first_df[numeric_features].fillna(0)
scaler = StandardScaler().fit(first_df[numeric_features])
print(f"‚úì StandardScaler fiteado con {len(first_df)} muestras")

# Persist the fitted artifacts
artifacts = (
    ('/home/stargix/Desktop/hackathons/datathon/scaler_resnet.pkl', scaler),
    ('/home/stargix/Desktop/hackathons/datathon/label_encoders_resnet.pkl', label_encoders),
    ('/home/stargix/Desktop/hackathons/datathon/features_resnet.pkl',
     {'features': features, 'numeric_features': numeric_features, 'cat_features': cat_features}),
)
for artifact_path, payload in artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(payload, f)
print("‚úì Artifacts guardados")

# Release the first batch
del first_df, first_ddf
gc.collect()

CARGANDO PRIMER BATCH PARA CONFIGURACI√ìN


‚úì Primer batch cargado: (1026857, 62)
‚úì Features definidas: 24 (15 num√©ricas + 9 categ√≥ricas)
‚úì StandardScaler fiteado con 1026857 muestras
‚úì Artifacts guardados
‚úì Features definidas: 24 (15 num√©ricas + 9 categ√≥ricas)
‚úì StandardScaler fiteado con 1026857 muestras
‚úì Artifacts guardados


0

In [7]:
# Initialise the model, loss, optimiser and the training hyper-parameters.
# Every name defined here is a notebook global read by the training,
# validation and inference cells.
INPUT_DIM = len(features)
HIDDEN_DIM = 128  # reduced from 256 to save memory
NUM_BLOCKS = 3     # reduced from 4
BATCH_SIZE = 1024  # reduced from 2048 so it fits in memory
LEARNING_RATE = 0.001
EPOCHS_PER_BATCH = 2  # reduced from 3 to process faster
# Rows whose date equals this day are held out of training and used for validation.
VALIDATION_DATE = pd.Timestamp('2025-10-06', tz='UTC')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RevenueResNet(input_dim=INPUT_DIM, hidden_dim=HIDDEN_DIM, num_blocks=NUM_BLOCKS).to(device)
# MSE is computed against log1p(iap_revenue_d7): the training cell applies log1p to the target.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
# NOTE(review): no cell visible in this notebook calls scheduler.step(), so the
# learning rate is never actually reduced.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

print("=" * 60)
print("MODELO RESNET INICIALIZADO (OPTIMIZADO)")
print("=" * 60)
print(f"Input dim: {INPUT_DIM}")
print(f"Hidden dim: {HIDDEN_DIM} (reducido para memoria)")
print(f"Num blocks: {NUM_BLOCKS} (reducido para memoria)")
print(f"Batch size: {BATCH_SIZE} (reducido para memoria)")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Epochs per batch: {EPOCHS_PER_BATCH}")
print(f"Device: {device}")
print("=" * 60)

# Count parameters (total, and those that receive gradients)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total par√°metros: {total_params:,}")
print(f"Par√°metros entrenables: {trainable_params:,}")
print("=" * 60)

MODELO RESNET INICIALIZADO (OPTIMIZADO)
Input dim: 24
Hidden dim: 128 (reducido para memoria)
Num blocks: 3 (reducido para memoria)
Batch size: 1024 (reducido para memoria)
Learning rate: 0.001
Epochs per batch: 2
Device: cuda
Total par√°metros: 128,897
Par√°metros entrenables: 128,897


In [8]:
# BATCH-WISE TRAINING (MEMORY-OPTIMISED)
# For each group of parquet files: load it, preprocess it with the fitted
# encoders and scaler, hold out the validation day, train for
# EPOCHS_PER_BATCH epochs, then free everything before loading the next group.
print("\n" + "=" * 60)
print("ENTRENAMIENTO POR BATCHES (MEMORIA OPTIMIZADA)")
print("=" * 60)

best_val_loss = float('inf')  # NOTE(review): never updated in this cell
global_epoch = 0

for batch_idx in range(num_batches):
    print(f"\n{'='*60}")
    print(f"BATCH {batch_idx + 1}/{num_batches}")
    print(f"{'='*60}")

    # Free memory before loading the next group of files
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Load this batch's files
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, len(parquet_files_train))
    batch_files = parquet_files_train[start_idx:end_idx]

    try:
        batch_ddf = dd.read_parquet(batch_files, engine='pyarrow', columns=required_columns)
    except Exception:
        # The column-restricted read failed: fall back to reading every
        # column. `Exception` (not a bare `except:`) so that interrupting the
        # kernel is not mistaken for a read failure.
        batch_ddf = dd.read_parquet(batch_files, engine='pyarrow')

    batch_df = batch_ddf.compute(scheduler='synchronous')
    print(f"‚úì Batch cargado: {batch_df.shape}")
    del batch_ddf  # release immediately

    # Preprocess with the encoders fitted on the first batch
    batch_df, _, _ = preprocess_dataframe(batch_df, label_encoders=label_encoders, fit_encoders=False)

    # Temporal split: rows from VALIDATION_DATE are excluded from training
    batch_df['datetime'] = pd.to_datetime(batch_df['datetime'].astype(str))
    val_mask = batch_df['datetime'].dt.date == VALIDATION_DATE.date()
    train_mask = ~val_mask

    if train_mask.sum() == 0:
        print("‚ö†Ô∏è No hay datos de entrenamiento en este batch, saltando...")
        del batch_df
        gc.collect()
        continue

    # Build the training matrices (training rows only)
    X_train_batch = batch_df[train_mask][features].copy()
    y_train_batch = batch_df[train_mask]['iap_revenue_d7'].copy()
    del batch_df  # release the full DataFrame

    # Fill missing values and standardise the numeric features
    X_train_batch[numeric_features] = X_train_batch[numeric_features].fillna(0)
    X_train_batch[numeric_features] = scaler.transform(X_train_batch[numeric_features])
    X_train_batch = X_train_batch.fillna(0)

    # Cap very large batches at 500k randomly chosen rows
    if len(X_train_batch) > 500000:
        print(f"‚ö†Ô∏è Batch muy grande ({len(X_train_batch):,}), usando solo 500k samples...")
        indices = np.random.choice(len(X_train_batch), 500000, replace=False)
        X_train_batch = X_train_batch.iloc[indices]
        y_train_batch = y_train_batch.iloc[indices]

    # To tensors; the target is trained in log1p space
    X_tensor = torch.FloatTensor(X_train_batch.values).to(device)
    y_tensor = torch.FloatTensor(np.log1p(y_train_batch.values)).reshape(-1, 1).to(device)
    del X_train_batch, y_train_batch  # release the DataFrames

    train_dataset = TensorDataset(X_tensor, y_tensor)
    # BatchNorm1d cannot compute batch statistics from a single row in
    # training mode and raises; when the last mini-batch would hold exactly
    # one sample, drop it instead of crashing mid-run.
    drop_singleton = len(train_dataset) % BATCH_SIZE == 1
    train_loader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0,
        drop_last=drop_singleton,
    )

    # Train EPOCHS_PER_BATCH epochs on this group of files
    for epoch in range(EPOCHS_PER_BATCH):
        model.train()
        train_loss = 0.0
        samples_seen = 0

        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * batch_X.size(0)
            samples_seen += batch_X.size(0)

        # Average over the samples actually used (equals the dataset size
        # unless a singleton mini-batch was dropped).
        train_loss /= max(samples_seen, 1)
        global_epoch += 1

        print(f"  Epoch {epoch+1}/{EPOCHS_PER_BATCH} (Global: {global_epoch}) | Loss: {train_loss:.6f}")

    print(f"‚úì Batch {batch_idx + 1} completado")

    # Release tensors and cached GPU memory
    del X_tensor, y_tensor, train_dataset, train_loader
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

print("\n" + "=" * 60)
print("‚úì ENTRENAMIENTO COMPLETADO")
print(f"Total epochs: {global_epoch}")
print("=" * 60)

# Save the trained weights
torch.save(model.state_dict(), '/home/stargix/Desktop/hackathons/datathon/resnet_model.pt')
print("‚úì Modelo guardado: resnet_model.pt")


ENTRENAMIENTO POR BATCHES (MEMORIA OPTIMIZADA)

BATCH 1/21
‚úì Batch cargado: (1026857, 62)
‚úì Batch cargado: (1026857, 62)


KeyboardInterrupt: 

In [None]:
# VALIDATION EVALUATION
# Re-reads every group of training files, keeps only the rows from
# VALIDATION_DATE, predicts them, and reports MSLE / RMSE in the original
# (non-log) revenue scale.
print("\n" + "=" * 60)
print("EVALUACI√ìN EN VALIDACI√ìN")
print("=" * 60)

# Validation is processed group by group so memory is never filled
val_predictions = []
val_targets = []

# Inference mode (disables dropout, uses BatchNorm running statistics);
# set once instead of on every iteration.
model.eval()

for batch_idx in range(num_batches):
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, len(parquet_files_train))
    batch_files = parquet_files_train[start_idx:end_idx]

    try:
        batch_ddf = dd.read_parquet(batch_files, engine='pyarrow', columns=required_columns)
    except Exception:
        # Column-restricted read failed: fall back to every column.
        # `Exception` keeps a kernel interrupt from being swallowed here.
        batch_ddf = dd.read_parquet(batch_files, engine='pyarrow')

    batch_df = batch_ddf.compute(scheduler='synchronous')
    del batch_ddf

    batch_df, _, _ = preprocess_dataframe(batch_df, label_encoders=label_encoders, fit_encoders=False)

    batch_df['datetime'] = pd.to_datetime(batch_df['datetime'].astype(str))
    val_mask = batch_df['datetime'].dt.date == VALIDATION_DATE.date()

    if val_mask.sum() > 0:
        X_val_batch = batch_df[val_mask][features].copy()
        y_val_batch = batch_df[val_mask]['iap_revenue_d7'].copy()

        # Same feature preparation as in training
        X_val_batch[numeric_features] = X_val_batch[numeric_features].fillna(0)
        X_val_batch[numeric_features] = scaler.transform(X_val_batch[numeric_features])
        X_val_batch = X_val_batch.fillna(0)

        # Predict
        X_val_tensor = torch.FloatTensor(X_val_batch.values).to(device)

        with torch.no_grad():
            pred_log = model(X_val_tensor).cpu().numpy().flatten()

        # The model predicts log1p(revenue): invert it and clip at 0
        val_predictions.append(np.expm1(pred_log).clip(0, None))
        val_targets.append(y_val_batch.values)

        del X_val_batch, y_val_batch, X_val_tensor, pred_log

    del batch_df
    gc.collect()

# np.concatenate([]) fails with an unhelpful message; say what actually happened.
if not val_predictions:
    raise ValueError(
        f"No validation rows found for {VALIDATION_DATE.date()}; cannot compute metrics"
    )

# Combine the per-group results
pred = np.concatenate(val_predictions)
y_val = np.concatenate(val_targets)

print(f"‚úì Datos de validaci√≥n: {len(y_val):,} muestras")

# Metrics
msle = mean_squared_log_error(y_val, pred)
# The `squared=False` argument was deprecated and later removed from
# scikit-learn's mean_squared_error; taking the root explicitly works on
# every version.
rmse = np.sqrt(mean_squared_error(y_val, pred))

print("=" * 60)
print("RESULTADOS EN VALIDACI√ìN (RESNET)")
print("=" * 60)
print(f"MSLE: {msle:.6f}")
print(f"RMSE: ${rmse:.2f}")
print(f"Revenue promedio predicho: ${pred.mean():.2f}")
print(f"Revenue promedio real: ${y_val.mean():.2f}")
print("=" * 60)

del val_predictions, val_targets, pred, y_val
gc.collect()

In [None]:
# FAST TEST-SET INFERENCE
# Streams the test set partition by partition, applies the same feature
# preparation as training, predicts, and appends each chunk to the
# submission CSV immediately so memory use stays bounded.
import csv
# The notebook imports `time` as a function (`from time import time`), so
# `time.perf_counter` does not exist; import the timer explicitly.
from time import perf_counter

TEST_PATH = "/home/stargix/Desktop/hackathons/datathon/test/test"
SUBMISSION_PATH = "/home/stargix/Desktop/hackathons/datathon/submission_resnet.csv"

print("=" * 60)
print("INFERENCIA R√ÅPIDA EN TEST (OPTIMIZADA)")
print("=" * 60)

# Create the CSV with its header row; chunks are appended below
with open(SUBMISSION_PATH, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['row_id', 'iap_revenue_d7'])

# Column names come from the dask metadata, so no partition has to be
# computed just to learn the schema.
test_columns = set(dd.read_parquet(TEST_PATH, engine="pyarrow", index=False).columns)
available_cols = [c for c in ["row_id"] + features if c in test_columns]

# Read only the needed columns and split the test set into small chunks
dd_test = dd.read_parquet(
    TEST_PATH,
    engine="pyarrow",
    columns=available_cols,
    blocksize="64MB"  # reduced from 128MB to 64MB
)
dd_test = dd_test.repartition(npartitions=512)  # more partitions, smaller chunks
delayed_parts = dd_test.to_delayed()
print(f"Chunks: {len(delayed_parts)} (m√°s peque√±os para evitar crash)\n")

# value -> integer code lookup for each fitted label encoder
le_maps = {
    col: {cls: idx for idx, cls in enumerate(enc.classes_)}
    for col, enc in label_encoders.items()
}

def sum_list_series(series: pd.Series) -> pd.Series:
    """Reduce each cell of ``series`` with ``sum_values``.

    Delegating to the helper used during training keeps both paths
    identical, including string-encoded lists and malformed cells.
    """
    return series.apply(sum_values)

model.eval()
total_rows = 0
total_time = 0

for i, delayed_part in enumerate(delayed_parts, 1):
    t0 = perf_counter()

    # Free memory before each chunk
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    part_df = delayed_part.compute()
    row_ids = part_df["row_id"].values

    # Feature columns missing from the test set are created and filled with 0
    X_part = part_df.reindex(columns=features, fill_value=0)

    # Collapse list-valued columns to a single number per row
    for col in columns_to_sum:
        if col in X_part.columns:
            X_part[col] = sum_list_series(X_part[col])

    # Numeric features: coerce, fill and standardise
    if numeric_features:
        X_part[numeric_features] = X_part[numeric_features].apply(pd.to_numeric, errors="coerce")
        X_part[numeric_features] = X_part[numeric_features].fillna(0)
        X_part[numeric_features] = scaler.transform(X_part[numeric_features])

    # Categorical features: values unseen at fit time become -1
    for col in cat_features:
        if col in X_part.columns:
            mapped = X_part[col].astype(str).map(le_maps[col]).fillna(-1).astype("int32")
            X_part[col] = mapped

    X_part = X_part.fillna(0)

    # Predict
    X_tensor = torch.FloatTensor(X_part.values).to(device)
    with torch.no_grad():
        pred_log = model(X_tensor).cpu().numpy().flatten()

    # The model predicts log1p(revenue): invert it and clip at 0
    pred = np.expm1(pred_log).clip(0, None)

    # Append this chunk to the submission file right away
    chunk_df = pd.DataFrame({"row_id": row_ids, "iap_revenue_d7": pred})
    chunk_df.to_csv(SUBMISSION_PATH, mode='a', header=False, index=False)

    elapsed = perf_counter() - t0
    total_rows += len(row_ids)
    total_time += elapsed

    if i % 10 == 0:  # report every 10 chunks to keep the output short
        print(f"[{i}/{len(delayed_parts)}] {total_rows:,} filas totales | {total_rows/total_time:,.0f} samples/s")

    # Release everything tied to this chunk
    del part_df, X_part, X_tensor, pred_log, pred, row_ids, chunk_df
    gc.collect()

# Guard the average in case no time was accumulated
avg_speed = total_rows / total_time if total_time > 0 else 0.0

print("\n" + "=" * 60)
print("‚úì SUBMISSION COMPLETADO")
print("=" * 60)
print(f"Total filas: {total_rows:,}")
print(f"Tiempo total: {total_time:.2f}s")
print(f"Velocidad promedio: {avg_speed:,.0f} samples/s")
print(f"Archivo: {SUBMISSION_PATH}")
print("=" * 60)

# Sanity-check the written file
submission = pd.read_csv(SUBMISSION_PATH)
print(f"\n‚úì Verificaci√≥n:")
print(f"  Filas: {len(submission):,}")
print(f"  Row IDs √∫nicos: {submission['row_id'].nunique():,}")
print(f"  Duplicados: {submission['row_id'].duplicated().sum()}")
print(f"  NaNs: {submission['iap_revenue_d7'].isna().sum()}")
print(f"\nEstad√≠sticas:")
print(submission['iap_revenue_d7'].describe())
submission.head(10)