In [1]:
# ========== CONFIGURATION ==========
MODEL_VERSION = "single"  # Options: "single" or "two_step"

TRAIN_PATH = '/home/stargix/Desktop/hackathons/datathon/train/train'
TEST_PATH = '/home/stargix/Desktop/hackathons/datathon/test/test'
TARGET_COL = "iap_revenue_d7"
TRAIN_SAMPLE_FRAC = 0.10  # Adjust for more/less data

print(f"Selected model: {MODEL_VERSION}")

Selected model: single


In [2]:
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import mean_squared_log_error
import gc
import os

dask.config.set({"dataframe.convert-string": False})

<dask.config.set at 0x7683f2bbed10>

## Helper Functions

In [3]:
# Columnas problem√°ticas (listas/dicts) que se ignoran
IGNORE_BIG_COLS = [
    "bundles_ins", "user_bundles", "user_bundles_l28d",
    "city_hist", "country_hist", "region_hist",
    "dev_language_hist", "dev_osv_hist",
    "bcat", "bcat_bottom_taxonomy",
    "bundles_cat", "bundles_cat_bottom_taxonomy",
    "first_request_ts_bundle", "first_request_ts_category_bottom_taxonomy",
    "last_buy_ts_bundle", "last_buy_ts_category",
    "last_install_ts_bundle", "last_install_ts_category",
    "advertiser_actions_action_count", "advertiser_actions_action_last_timestamp",
    "user_actions_bundles_action_count", "user_actions_bundles_action_last_timestamp",
    "new_bundles",
    "whale_users_bundle_num_buys_prank", "whale_users_bundle_revenue_prank",
    "whale_users_bundle_total_num_buys", "whale_users_bundle_total_revenue",
]

LABEL_COLS = [
    "buyer_d1", "buyer_d7", "buyer_d14", "buyer_d28",
    "buy_d7", "buy_d14", "buy_d28",
    "iap_revenue_d7", "iap_revenue_d14", "iap_revenue_d28",
    "registration",
    "retention_d1_to_d7", "retention_d3_to_d7", "retention_d7_to_d14",
    "retention_d1", "retention_d3", "retention_d7",
]

# Variables cr√≠ticas seg√∫n EDA (features m√°s importantes)
TIER1_FEATURES = [
    "iap_revenue_usd_bundle", "iap_revenue_usd_category",
    "num_buys_bundle", "num_buys_category",
    "advertiser_bundle", "advertiser_category",
    "country", "cpm", "ctr", "avg_daily_sessions"
]

def reduce_memory(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast numeric columns to save memory."""
    df = df.copy()
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == "float64":
            df[col] = df[col].astype("float32")
        elif col_type == "int64":
            df[col] = df[col].astype("int32")
    return df

def detect_listlike_columns(df: pd.DataFrame, cols=None):
    """Detect columns containing lists or dicts."""
    if cols is None:
        cols = df.columns
    listlike = []
    for c in cols:
        sample_vals = df[c].head(100)
        if sample_vals.apply(lambda v: isinstance(v, (list, dict))).any():
            listlike.append(c)
    return listlike

def create_engineered_features(df):
    """Feature engineering basado en EDA."""
    df = df.copy()
    
    # Helper para verificar que columna es num√©rica
    def is_numeric_col(df, col):
        return col in df.columns and pd.api.types.is_numeric_dtype(df[col])
    
    # 1. Flags para missing (informaci√≥n valiosa: "usuario nuevo")
    if is_numeric_col(df, "iap_revenue_usd_bundle"):
        df["has_previous_revenue"] = (df["iap_revenue_usd_bundle"].fillna(0) > 0).astype(int)
    
    if is_numeric_col(df, "num_buys_bundle"):
        df["has_previous_buys"] = (df["num_buys_bundle"].fillna(0) > 0).astype(int)
    
    # 2. Ratios (muy predictivos seg√∫n EDA)
    if is_numeric_col(df, "iap_revenue_usd_bundle") and is_numeric_col(df, "num_buys_bundle"):
        df["revenue_per_buy"] = df["iap_revenue_usd_bundle"] / (df["num_buys_bundle"] + 1)
    
    if is_numeric_col(df, "avg_daily_sessions") and is_numeric_col(df, "avg_duration"):
        df["session_intensity"] = df["avg_daily_sessions"] * df["avg_duration"]
    
    # 3. Features temporales (importantes seg√∫n EDA)
    if is_numeric_col(df, "weekday"):
        df["is_weekend"] = df["weekday"].isin([5, 6]).astype(int)
    
    if is_numeric_col(df, "hour"):
        df["is_peak_hour"] = df["hour"].isin([18, 19, 20, 21, 22]).astype(int)
        df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
        df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    
    # 4. Bucketing de variables continuas
    if is_numeric_col(df, "release_msrp"):
        try:
            df["device_price_tier"] = pd.cut(df["release_msrp"], 
                                              bins=[0, 200, 500, 1000, 10000], 
                                              labels=[0, 1, 2, 3]).astype(float)
        except:
            # Si falla el binning, crear feature simple
            df["device_price_tier"] = 0
    
    return df

def remove_low_variance_features(X_train, X_valid, threshold=0.01):
    """Eliminar features con baja varianza (casi constantes)."""
    low_var_cols = []
    for col in X_train.select_dtypes(include=[np.number]).columns:
        unique_ratio = X_train[col].nunique() / len(X_train)
        if unique_ratio < threshold:
            low_var_cols.append(col)
    
    if low_var_cols:
        print(f"Removing {len(low_var_cols)} low variance features: {low_var_cols[:5]}...")
        X_train = X_train.drop(columns=low_var_cols)
        X_valid = X_valid.drop(columns=[c for c in low_var_cols if c in X_valid.columns])
    
    return X_train, X_valid

def preprocess_train_valid(X_train, X_valid, num_cols, cat_cols, y_train=None):
    """Preprocess train and validation sets with improved strategy."""
    X_train = X_train.copy()
    X_valid = X_valid.copy()
    
    # Feature engineering primero
    X_train = create_engineered_features(X_train)
    X_valid = create_engineered_features(X_valid)
    
    # Actualizar listas de columnas despu√©s de feature engineering
    new_num_cols = [c for c in X_train.columns if pd.api.types.is_numeric_dtype(X_train[c])]
    new_cat_cols = [c for c in X_train.columns if c not in new_num_cols]
    
    # Numeric: estrategia mejorada de imputaci√≥n
    for c in new_num_cols:
        # Para features de usuario (historial), -1 indica "no data" vs 0 "no activity"
        is_user_feature = any(x in c for x in ["revenue", "buys", "sessions", "days", "avg_"])
        
        if is_user_feature:
            X_train[c] = X_train[c].fillna(-1)
            X_valid[c] = X_valid[c].fillna(-1)
        else:
            # Para otras features, usar mediana
            median_val = X_train[c].median()
            X_train[c] = X_train[c].fillna(median_val)
            X_valid[c] = X_valid[c].fillna(median_val)
    
    # Categorical: frequency encoding para alta cardinalidad
    freq_encoding_cols = []
    for c in new_cat_cols:
        try:
            # Convertir a string primero para evitar problemas de tipos
            X_train[c] = X_train[c].astype(str)
            X_valid[c] = X_valid[c].astype(str)
            
            n_unique = X_train[c].nunique()
            
            # Alta cardinalidad (>100 valores) ‚Üí frequency encoding
            if n_unique > 100:
                freq_encoding_cols.append(c)
                freq_map = X_train[c].value_counts(normalize=True).to_dict()
                X_train[c + "_freq"] = X_train[c].map(freq_map).fillna(0).astype(np.float32)
                X_valid[c + "_freq"] = X_valid[c].map(freq_map).fillna(0).astype(np.float32)
                # Eliminar original
                X_train = X_train.drop(columns=[c])
                X_valid = X_valid.drop(columns=[c])
            else:
                # Baja cardinalidad ‚Üí categorical encoding normal
                X_train[c] = X_train[c].fillna("MISSING")
                X_train[c] = X_train[c].astype("category")
                
                cats = X_train[c].cat.categories
                X_valid[c] = X_valid[c].fillna("MISSING")
                X_valid[c] = X_valid[c].astype(pd.api.types.CategoricalDtype(categories=cats))
        except Exception as e:
            print(f"Warning: Could not process categorical column {c}: {e}")
            # Si falla, eliminar la columna
            if c in X_train.columns:
                X_train = X_train.drop(columns=[c])
            if c in X_valid.columns:
                X_valid = X_valid.drop(columns=[c])
    
    if freq_encoding_cols:
        print(f"Applied frequency encoding to {len(freq_encoding_cols)} high-cardinality features")
    
    # Eliminar features de baja varianza
    X_train, X_valid = remove_low_variance_features(X_train, X_valid)
    
    return X_train, X_valid

def preprocess_new(X_new, num_cols, cat_cols, cat_ref_df, freq_encoding_info=None):
    """Preprocess test data using train statistics."""
    X_new = X_new.copy()
    
    # Feature engineering
    X_new = create_engineered_features(X_new)
    
    # Actualizar listas despu√©s de feature engineering
    new_num_cols = [c for c in X_new.columns if pd.api.types.is_numeric_dtype(X_new[c])]
    new_cat_cols = [c for c in X_new.columns if c not in new_num_cols]
    
    # Numeric imputation
    for c in new_num_cols:
        if c in X_new.columns:
            is_user_feature = any(x in c for x in ["revenue", "buys", "sessions", "days", "avg_"])
            if is_user_feature:
                X_new[c] = X_new[c].fillna(-1)
            else:
                # Usar mediana del train si est√° disponible
                if c in cat_ref_df.columns and pd.api.types.is_numeric_dtype(cat_ref_df[c]):
                    median_val = cat_ref_df[c].median()
                else:
                    median_val = 0
                X_new[c] = X_new[c].fillna(median_val)
    
    # Categorical
    for c in new_cat_cols:
        if c in X_new.columns:
            try:
                X_new[c] = X_new[c].astype(str).fillna("MISSING")
                
                if c in cat_ref_df.columns and hasattr(cat_ref_df[c], 'cat'):
                    cats = cat_ref_df[c].cat.categories
                    X_new[c] = X_new[c].astype(pd.api.types.CategoricalDtype(categories=cats))
            except Exception as e:
                print(f"Warning: Could not process test column {c}: {e}")
                # Si falla, eliminar
                if c in X_new.columns:
                    X_new = X_new.drop(columns=[c])
    
    return X_new

print("Helper functions loaded.")

Helper functions loaded.


## Load and Prepare Data

In [4]:
from glob import glob
import os

# Train: Oct 1-5, Valid: Oct 6
filters_train = [("datetime", ">=", "2025-10-01-00-00"),
                 ("datetime", "<",  "2025-10-06-00-00")]
filters_valid = [("datetime", ">=", "2025-10-06-00-00"),
                 ("datetime", "<",  "2025-10-07-00-00")]

# Obtener lista de archivos parquet
parquet_files_all = glob(os.path.join(TRAIN_PATH, '**/part-*.parquet'), recursive=True)

# üî• REDUCIR M√ÅS: Solo 5-10% de archivos
num_files_train = max(1, int(len(parquet_files_all) * 0.15))  # Cambi√© a 5%
parquet_files_train = parquet_files_all[:num_files_train]

print(f"Using {num_files_train} out of {len(parquet_files_all)} train files")

# üî• DROPEAR COLUMNAS ANTES DE COMPUTE
cols_to_drop_early = IGNORE_BIG_COLS + ["row_id", "datetime"]

# Cargar TRAIN
print("Loading train data...")
dd_train = dd.read_parquet(
    parquet_files_train, 
    filters=filters_train,
    engine='pyarrow'
)

# Dropear columnas pesadas ANTES de compute
existing_cols = [c for c in cols_to_drop_early if c in dd_train.columns]
dd_train = dd_train.drop(columns=existing_cols)

# üî• SAMPLE EN DASK (no en pandas)
train_sample = dd_train.sample(frac=TRAIN_SAMPLE_FRAC, random_state=42).compute()
train_sample = reduce_memory(train_sample)

print(f"Train loaded: {train_sample.shape}, Memory: {train_sample.memory_usage(deep=True).sum() / 1e9:.2f} GB")

# Limpiar memoria
del dd_train
gc.collect()

# Cargar VALID (despu√©s de liberar train)
print("\nLoading validation data...")
dd_valid = dd.read_parquet(
    parquet_files_train,  # Mismos archivos
    filters=filters_valid,
    engine='pyarrow'
)

existing_cols = [c for c in cols_to_drop_early if c in dd_valid.columns]
dd_valid = dd_valid.drop(columns=existing_cols)

# üî• SAMPLE MENOS EN VALID (solo necesitas evaluar, no entrenar)
valid_df = dd_valid.sample(frac=min(0.5, TRAIN_SAMPLE_FRAC), random_state=42).compute()
valid_df = reduce_memory(valid_df)

print(f"Valid loaded: {valid_df.shape}, Memory: {valid_df.memory_usage(deep=True).sum() / 1e9:.2f} GB")

del dd_valid
gc.collect()

print(f"\n‚úì Data loaded successfully")
print(f"Total memory: ~{(train_sample.memory_usage(deep=True).sum() + valid_df.memory_usage(deep=True).sum()) / 1e9:.2f} GB")

Using 21 out of 144 train files
Loading train data...
Train loaded: (271487, 56), Memory: 0.40 GB

Loading validation data...
Valid loaded: (28373, 56), Memory: 0.04 GB

‚úì Data loaded successfully
Total memory: ~0.44 GB


In [5]:
# Extract targets
y_train = train_sample[TARGET_COL].values
y_valid = valid_df[TARGET_COL].values

# Always extract buyer labels (needed for two-step model)
y_train_buyer = train_sample["buyer_d7"].values
y_valid_buyer = valid_df["buyer_d7"].values

if MODEL_VERSION == "two_step":
    print(f"Buyer ratio in train: {y_train_buyer.mean():.4f}")
    print(f"Buyer ratio in valid: {y_valid_buyer.mean():.4f}")

# Prepare features
cols_to_drop = ["row_id", "datetime"] + LABEL_COLS
feature_cols = [c for c in train_sample.columns if c not in cols_to_drop]

X_train = train_sample[feature_cols].copy()
X_valid = valid_df[feature_cols].copy()

# Detect and remove list-like columns
listlike_cols = detect_listlike_columns(X_train, cols=feature_cols)
print(f"Removing {len(listlike_cols)} list-like columns: {listlike_cols}")
X_train = X_train.drop(columns=listlike_cols)
X_valid = X_valid.drop(columns=listlike_cols)

# Identify numeric and categorical columns BEFORE preprocessing
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

print(f"Features before preprocessing: {len(X_train.columns)} ({len(num_cols)} numeric, {len(cat_cols)} categorical)")

# Preprocess with improved strategy
X_train_prep, X_valid_prep = preprocess_train_valid(X_train, X_valid, num_cols, cat_cols, y_train)

# Actualizar num_cols y cat_cols despu√©s del preprocessing
final_num_cols = X_train_prep.select_dtypes(include=[np.number]).columns.tolist()
final_cat_cols = [c for c in X_train_prep.columns if c not in final_num_cols]

print(f"Features after preprocessing: {len(X_train_prep.columns)} ({len(final_num_cols)} numeric, {len(final_cat_cols)} categorical)")
print(f"Data prepared: X_train {X_train_prep.shape}, X_valid {X_valid_prep.shape}")

Removing 14 list-like columns: ['avg_daily_sessions', 'avg_duration', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk', 'hour_ratio', 'iap_revenue_usd_bundle', 'iap_revenue_usd_category', 'iap_revenue_usd_category_bottom_taxonomy', 'num_buys_bundle', 'num_buys_category', 'num_buys_category_bottom_taxonomy', 'rwd_prank']
Features before preprocessing: 26 (11 numeric, 15 categorical)
Applied frequency encoding to 9 high-cardinality features
Removing 18 low variance features: ['retentiond7', 'release_msrp', 'weekday', 'avg_act_days', 'avg_days_ins']...
Features after preprocessing: 10 (4 numeric, 6 categorical)
Data prepared: X_train (271487, 10), X_valid (28373, 10)


## Train Models

In [6]:
MODEL_VERSION == "single"

if MODEL_VERSION == "single":
    print("=" * 50)
    print("TRAINING SINGLE LGBM REGRESSOR")
    print("=" * 50)
    
    # Transform target to log space
    y_train_log = np.log1p(y_train)
    y_valid_log = np.log1p(y_valid)
    
    # Train single regressor with optimized params (based on EDA recommendations)
    model = LGBMRegressor(
        objective="regression",
        metric="rmse",
        n_estimators=800,
        learning_rate=0.05,
        num_leaves=127,  # Reducido para evitar overfitting
        max_depth=10,    # Limitado seg√∫n EDA
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,   # L1 regularization (ayuda con features irrelevantes)
        reg_lambda=0.1,  # L2 regularization
        min_child_samples=20,  # Evitar hojas con muy pocos samples
        verbosity=-1,
        random_state=42,
        n_jobs=-1
    )
    
    model.fit(
        X_train_prep, y_train_log,
        eval_set=[(X_valid_prep, y_valid_log)],
        eval_metric='rmse'
    )
    print("‚úì Model trained")
    
    # Predict
    valid_pred_log = model.predict(X_valid_prep)
    valid_pred = np.expm1(valid_pred_log)
    valid_pred = np.clip(valid_pred, 0, None)
    
    # Store for submission
    models = {"regressor": model}
    
    # Feature importance (top 20)
    importance = pd.DataFrame({
        'feature': X_train_prep.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nüî• Top 20 Most Important Features:")
    print(importance.head(20).to_string(index=False))
    
elif MODEL_VERSION == "two_step":
    print("=" * 50)
    print("TRAINING TWO-STEP MODEL (CLASSIFIER + REGRESSOR)")
    print("=" * 50)
    
    # Step 1: Train buyer classifier
    print("\n[1/2] Training buyer classifier...")
    buyer_classifier = LGBMClassifier(
        objective="binary",
        n_estimators=600,
        learning_rate=0.05,
        num_leaves=127,
        max_depth=10,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        min_child_samples=20,
        verbosity=-1,
        random_state=42,
        n_jobs=-1
    )
    
    buyer_classifier.fit(X_train_prep, y_train_buyer)
    print("‚úì Buyer classifier trained")
    
    # Get buyer probabilities
    buyer_prob_train = buyer_classifier.predict_proba(X_train_prep)[:, 1]
    buyer_prob_valid = buyer_classifier.predict_proba(X_valid_prep)[:, 1]
    
    # Step 2: Train revenue regressor on buyers only
    print("\n[2/2] Training revenue regressor (buyers only)...")
    buyer_mask_train = y_train > 0
    
    X_train_buyers = X_train_prep[buyer_mask_train]
    y_train_buyers_log = np.log1p(y_train[buyer_mask_train])
    
    print(f"Training on {buyer_mask_train.sum()} buyers out of {len(y_train)} samples")
    
    revenue_regressor = LGBMRegressor(
        objective="regression",
        metric="rmse",
        n_estimators=800,
        learning_rate=0.05,
        num_leaves=127,
        max_depth=10,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.1,
        reg_lambda=0.1,
        min_child_samples=20,
        verbosity=-1,
        random_state=42,
        n_jobs=-1
    )
    
    revenue_regressor.fit(X_train_buyers, y_train_buyers_log)
    print("‚úì Revenue regressor trained")
    
    # Step 3: Combined predictions
    valid_revenue_pred_log = revenue_regressor.predict(X_valid_prep)
    valid_revenue_pred = np.expm1(valid_revenue_pred_log)
    valid_revenue_pred = np.clip(valid_revenue_pred, 0, None)
    
    # Final prediction: P(buyer) * E[revenue | buyer]
    valid_pred = buyer_prob_valid * valid_revenue_pred
    
    # Store for submission
    models = {
        "classifier": buyer_classifier,
        "regressor": revenue_regressor
    }
    
    # Additional metrics
    buyer_acc = (buyer_classifier.predict(X_valid_prep) == y_valid_buyer).mean()
    print(f"\nBuyer classifier accuracy: {buyer_acc:.4f}")
    print(f"Average predicted buyer probability: {buyer_prob_valid.mean():.4f}")

else:
    raise ValueError(f"Invalid MODEL_VERSION: {MODEL_VERSION}")

TRAINING SINGLE LGBM REGRESSOR
‚úì Model trained

üî• Top 20 Most Important Features:
                         feature  importance
                   weekend_ratio       23492
                      wifi_ratio       22193
                        last_ins       12300
                        last_buy        9256
                            hour        6444
          advertiser_subcategory        5726
advertiser_bottom_taxonomy_level        5429
                          dev_os        2126
             advertiser_category        1949
          last_advertiser_action         473


## Evaluate

In [7]:
# Calculate MSLE
msle_model = mean_squared_log_error(y_valid, valid_pred)
msle_baseline = mean_squared_log_error(y_valid, np.zeros_like(y_valid))

print("\n" + "=" * 50)
print("VALIDATION RESULTS")
print("=" * 50)
print(f"Model: {MODEL_VERSION.upper()}")
print(f"MSLE: {msle_model:.6f}")
print(f"Baseline (all zeros): {msle_baseline:.6f}")
print(f"Improvement: {((msle_baseline - msle_model) / msle_baseline * 100):.2f}%")

# Distribution stats
print(f"\nPrediction stats:")
print(f"  Mean: {valid_pred.mean():.4f}")
print(f"  Median: {np.median(valid_pred):.4f}")
print(f"  Max: {valid_pred.max():.4f}")
print(f"  % Non-zero: {(valid_pred > 0).mean() * 100:.2f}%")


VALIDATION RESULTS
Model: SINGLE
MSLE: 0.204424
Baseline (all zeros): 0.228072
Improvement: 10.37%

Prediction stats:
  Mean: 0.0960
  Median: 0.0084
  Max: 65.5665
  % Non-zero: 69.90%


In [8]:
## Grid Search for Single Model

import itertools
from datetime import datetime

# Verificar que estamos en modo single
if MODEL_VERSION != "single":
    print("‚ö†Ô∏è Grid search only works with MODEL_VERSION='single'")
    print("Please change MODEL_VERSION in the first cell and re-run from the beginning")
else:
    print("=" * 60)
    print("GRID SEARCH - SINGLE LGBM REGRESSOR")
    print("=" * 60)
    
    # Define parameter grid
    param_grid = {
        'n_estimators': [400, 600, 800],
        'learning_rate': [0.03, 0.05, 0.07],
        'num_leaves': [127, 255, 511],
        'max_depth': [-1, 10, 15],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.7, 0.8, 0.9],
    }
    
    # Generate all combinations
    keys = param_grid.keys()
    values = param_grid.values()
    combinations = list(itertools.product(*values))
    
    print(f"\nTotal combinations: {len(combinations)}")
    print(f"Estimated time: ~{len(combinations) * 2} minutes (aprox 2 min/model)\n")
    
    # Store results
    results = []
    
    # Transform target once
    y_train_log = np.log1p(y_train)
    
    # Grid search
    for i, params in enumerate(combinations[:10], 1):  # Limit to first 10 for testing
        param_dict = dict(zip(keys, params))
        
        # üî• FIX: Convertir par√°metros int expl√≠citamente
        param_dict['n_estimators'] = int(param_dict['n_estimators'])
        param_dict['num_leaves'] = int(param_dict['num_leaves'])
        param_dict['max_depth'] = int(param_dict['max_depth'])
        
        print(f"[{i}/{min(10, len(combinations))}] Testing: {param_dict}")
        
        # Train model
        model = LGBMRegressor(
            objective="regression",
            reg_alpha=0.0,
            reg_lambda=0.0,
            verbosity=-1,
            random_state=42,
            **param_dict
        )
        
        start_time = datetime.now()
        model.fit(X_train_prep, y_train_log)
        train_time = (datetime.now() - start_time).total_seconds()
        
        # Predict
        valid_pred_log = model.predict(X_valid_prep)
        valid_pred = np.expm1(valid_pred_log)
        valid_pred = np.clip(valid_pred, 0, None)
        
        # Evaluate
        msle = mean_squared_log_error(y_valid, valid_pred)
        
        # Store result
        result = {
            **param_dict,
            'msle': msle,
            'train_time': train_time
        }
        results.append(result)
        
        print(f"  MSLE: {msle:.6f} | Time: {train_time:.1f}s\n")
    
    # Convert to DataFrame and sort
    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values('msle')
    
    print("\n" + "=" * 60)
    print("TOP 5 BEST MODELS")
    print("=" * 60)
    print(results_df.head(5).to_string(index=False))
    
    print("\n" + "=" * 60)
    print("BEST MODEL PARAMETERS")
    print("=" * 60)
    best_params = results_df.iloc[0].to_dict()
    for key, value in best_params.items():
        print(f"  {key}: {value}")
    
    # Train final model with best params
    print("\n" + "=" * 60)
    print("TRAINING FINAL MODEL WITH BEST PARAMETERS")
    print("=" * 60)
    
    best_model_params = {k: v for k, v in best_params.items() 
                         if k not in ['msle', 'train_time']}
    
    # üî• FIX: Asegurar que los par√°metros finales tambi√©n son int
    best_model_params['n_estimators'] = int(best_model_params['n_estimators'])
    best_model_params['num_leaves'] = int(best_model_params['num_leaves'])
    best_model_params['max_depth'] = int(best_model_params['max_depth'])
    
    final_model = LGBMRegressor(
        objective="regression",
        reg_alpha=0.0,
        reg_lambda=0.0,
        verbosity=-1,
        random_state=42,
        **best_model_params
    )
    
    final_model.fit(X_train_prep, y_train_log)
    
    # Final predictions
    valid_pred_log = final_model.predict(X_valid_prep)
    valid_pred = np.expm1(valid_pred_log)
    valid_pred = np.clip(valid_pred, 0, None)
    
    msle_final = mean_squared_log_error(y_valid, valid_pred)
    msle_baseline = mean_squared_log_error(y_valid, np.zeros_like(y_valid))
    
    print(f"\n‚úì Final Model MSLE: {msle_final:.6f}")
    print(f"  Baseline MSLE: {msle_baseline:.6f}")
    print(f"  Improvement: {((msle_baseline - msle_final) / msle_baseline * 100):.2f}%")
    
    # Update models dict for submission
    models = {"regressor": final_model}
    
    # Save results to CSV
    results_df.to_csv("grid_search_results.csv", index=False)
    print(f"\n‚úì Results saved to: grid_search_results.csv")

GRID SEARCH - SINGLE LGBM REGRESSOR

Total combinations: 729
Estimated time: ~1458 minutes (aprox 2 min/model)

[1/10] Testing: {'n_estimators': 400, 'learning_rate': 0.03, 'num_leaves': 127, 'max_depth': -1, 'subsample': 0.7, 'colsample_bytree': 0.7}
  MSLE: 0.202018 | Time: 2.5s

[2/10] Testing: {'n_estimators': 400, 'learning_rate': 0.03, 'num_leaves': 127, 'max_depth': -1, 'subsample': 0.7, 'colsample_bytree': 0.8}
  MSLE: 0.202734 | Time: 2.3s

[3/10] Testing: {'n_estimators': 400, 'learning_rate': 0.03, 'num_leaves': 127, 'max_depth': -1, 'subsample': 0.7, 'colsample_bytree': 0.9}
  MSLE: 0.203288 | Time: 2.5s

[4/10] Testing: {'n_estimators': 400, 'learning_rate': 0.03, 'num_leaves': 127, 'max_depth': -1, 'subsample': 0.8, 'colsample_bytree': 0.7}
  MSLE: 0.202018 | Time: 2.4s

[5/10] Testing: {'n_estimators': 400, 'learning_rate': 0.03, 'num_leaves': 127, 'max_depth': -1, 'subsample': 0.8, 'colsample_bytree': 0.8}
  MSLE: 0.202734 | Time: 2.2s

[6/10] Testing: {'n_estimators': 

## Generate Submission

In [None]:
print("Generating test predictions...")

dd_test = dd.read_parquet(TEST_PATH, engine='pyarrow')
existing_big_cols_test = [c for c in IGNORE_BIG_COLS if c in dd_test.columns]
dd_test = dd_test.drop(columns=existing_big_cols_test)

delayed_parts = dd_test.to_delayed()
print(f"Processing {len(delayed_parts)} test chunks...")

pred_dfs = []
feature_cols_final = X_train_prep.columns.tolist()

for i, d in enumerate(delayed_parts):
    if (i + 1) % 10 == 0:
        print(f"  Chunk {i+1}/{len(delayed_parts)}...")
    
    part_df = d.compute()
    part_df = reduce_memory(part_df)
    
    row_ids = part_df["row_id"].values
    
    # üî• IMPORTANTE: Solo seleccionar columnas que existen en test
    available_cols = [c for c in feature_cols if c in part_df.columns]
    X_part = part_df[available_cols].copy()
    
    # üî• A√±adir columnas faltantes con valores por defecto
    for col in feature_cols:
        if col not in X_part.columns:
            if col in num_cols:
                X_part[col] = 0  # Numeric ‚Üí 0
            else:
                X_part[col] = "MISSING"  # Categorical ‚Üí MISSING
    
    # Reordenar columnas para que coincidan con el train original
    X_part = X_part[feature_cols]
    
    # Aplicar el mismo preprocessing (con frequency encoding, feature engineering, etc.)
    X_part_prep = preprocess_new(X_part, num_cols, cat_cols, X_train_prep)
    
    # Asegurar que tiene exactamente las mismas columnas que train
    missing_in_test = set(feature_cols_final) - set(X_part_prep.columns)
    if missing_in_test:
        for col in missing_in_test:
            if col in final_num_cols:
                X_part_prep[col] = 0
            else:
                X_part_prep[col] = "MISSING"
    
    # Reordenar para match exacto
    X_part_prep = X_part_prep[feature_cols_final]
    
    # Predict based on model version
    if MODEL_VERSION == "single":
        part_pred_log = models["regressor"].predict(X_part_prep)
        part_pred = np.expm1(part_pred_log)
        part_pred = np.clip(part_pred, 0, None)
        
    elif MODEL_VERSION == "two_step":
        buyer_prob = models["classifier"].predict_proba(X_part_prep)[:, 1]
        revenue_pred_log = models["regressor"].predict(X_part_prep)
        revenue_pred = np.expm1(revenue_pred_log)
        revenue_pred = np.clip(revenue_pred, 0, None)
        part_pred = buyer_prob * revenue_pred
    
    pred_dfs.append(pd.DataFrame({
        "row_id": row_ids,
        "iap_revenue_d7": part_pred
    }))
    
    del part_df, X_part, X_part_prep, row_ids, part_pred
    gc.collect()

# Combine and save
submission = pd.concat(pred_dfs, ignore_index=True)
output_file = f"submission_{MODEL_VERSION}.csv"
submission.to_csv(output_file, index=False)

print(f"\n‚úì Submission saved: {output_file}")
print(f"Shape: {submission.shape}")
print(f"Sample:\n{submission.head()}")

# Validation checks
print(f"\nValidation checks:")
print(f"  NaN values: {submission.isna().sum().sum()}")
print(f"  Negative values: {(submission['iap_revenue_d7'] < 0).sum()}")

Generating test predictions...
Processing 96 test chunks...
  Chunk 10/96...
  Chunk 20/96...
  Chunk 30/96...
  Chunk 40/96...
  Chunk 50/96...
  Chunk 60/96...
  Chunk 70/96...
  Chunk 80/96...
  Chunk 90/96...


## Summary

**To switch models**: Change `MODEL_VERSION` in the first code cell to:
- `"single"` - Single LightGBM regressor (direct approach)
- `"two_step"` - Classifier + Regressor (probabilistic approach)

**To adjust data size**: Change `TRAIN_SAMPLE_FRAC` (default 0.10 = 10%)

Then re-run all cells!