In [1]:
# ========== CONFIGURATION ==========
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import joblib
from time import time
from glob import glob
import dask.dataframe as dd
import gc

# Paths
TRAIN_PATH = '/home/stargix/Desktop/hackathons/datathon/train/train'
TEST_PATH = '/home/stargix/Desktop/hackathons/datathon/test/test'

# Data config
SAMPLE_FRAC = 0.15  # Adjust for more/less data (0.15 = 15%)
RANDOM_STATE = 42
TARGET = "iap_revenue_d7"
ID_COL = "row_id"
OUT_MODEL = "lgbm_simple.joblib"

# Columnas a ignorar (complejas: listas/dicts)
IGNORE_BIG_COLS = [
    "bundles_ins", "user_bundles", "user_bundles_l28d",
    "city_hist", "country_hist", "region_hist",
    "dev_language_hist", "dev_osv_hist",
    "bcat", "bcat_bottom_taxonomy",
    "bundles_cat", "bundles_cat_bottom_taxonomy",
    "first_request_ts_bundle", "first_request_ts_category_bottom_taxonomy",
    "last_buy_ts_bundle", "last_buy_ts_category",
    "last_install_ts_bundle", "last_install_ts_category",
    "advertiser_actions_action_count", "advertiser_actions_action_last_timestamp",
    "user_actions_bundles_action_count", "user_actions_bundles_action_last_timestamp",
    "new_bundles",
    "whale_users_bundle_num_buys_prank", "whale_users_bundle_revenue_prank",
    "whale_users_bundle_total_num_buys", "whale_users_bundle_total_revenue",
]

LABEL_COLS = [
    "buyer_d1", "buyer_d7", "buyer_d14", "buyer_d28",
    "buy_d7", "buy_d14", "buy_d28",
    "iap_revenue_d7", "iap_revenue_d14", "iap_revenue_d28",
    "registration",
    "retention_d1_to_d7", "retention_d3_to_d7", "retention_d7_to_d14",
    "retention_d1", "retention_d3", "retention_d7",
]

print(f"Configuration: SAMPLE_FRAC={SAMPLE_FRAC}, TARGET={TARGET}")

Configuration: SAMPLE_FRAC=0.15, TARGET=iap_revenue_d7


In [2]:
# ========== UTILITY FUNCTIONS ==========

def frequency_encoding(df, col):
    """Frequency encoding para columnas de alta cardinalidad."""
    freqs = df[col].value_counts(dropna=False)
    return df[col].map(freqs).astype(np.float32)

def safe_label_encode(train_ser, valid_ser, test_ser=None):
    """Label encoding safe para categorÃ­as desconocidas."""
    le = LabelEncoder()
    # manejar nulos como string
    train_vals = train_ser.fillna("__NA__").astype(str)
    le.fit(train_vals)
    def transform(s):
        return le.transform(s.fillna("__NA__").astype(str))
    return transform(train_ser), transform(valid_ser), transform(test_ser) if test_ser is not None else None, le

def detect_listlike_columns(df, sample_size=100):
    """Detecta columnas con tipos complejos (listas, dicts)."""
    listlike = []
    for c in df.columns:
        sample_vals = df[c].head(sample_size)
        if sample_vals.apply(lambda v: isinstance(v, (list, dict))).any():
            listlike.append(c)
    return listlike

def reduce_memory(df):
    """Downcast numeric columns para ahorrar memoria."""
    df = df.copy()
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == "float64":
            df[col] = df[col].astype("float32")
        elif col_type == "int64":
            df[col] = df[col].astype("int32")
    return df

print("âœ“ Utility functions loaded")

âœ“ Utility functions loaded


## Load Data

In [3]:
print("Loading training data with Dask...")

# Obtener archivos parquet
parquet_files_all = glob(os.path.join(TRAIN_PATH, '**/part-*.parquet'), recursive=True)
num_files = max(1, int(len(parquet_files_all) * SAMPLE_FRAC))
parquet_files_train = parquet_files_all[:num_files]

print(f"Using {num_files} out of {len(parquet_files_all)} files")

# Leer con Dask y dropear columnas pesadas antes de compute
cols_to_drop_early = IGNORE_BIG_COLS + ["datetime"]

dd_train = dd.read_parquet(parquet_files_train, engine='pyarrow')
existing_cols = [c for c in cols_to_drop_early if c in dd_train.columns]
dd_train = dd_train.drop(columns=existing_cols)

# Compute a pandas
df = dd_train.compute()
df = reduce_memory(df)

print(f"âœ“ Data loaded: {df.shape}")
print(f"  Memory: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB")

del dd_train
gc.collect()

Loading training data with Dask...
Using 21 out of 144 files
âœ“ Data loaded: (2998607, 57)
âœ“ Data loaded: (2998607, 57)
  Memory: 2.79 GB
  Memory: 2.79 GB


0

## Feature Engineering

In [4]:
print("Feature Engineering...")

# Detectar y remover columnas complejas
listlike_cols = detect_listlike_columns(df)
print(f"Removing {len(listlike_cols)} list-like columns")
df = df.drop(columns=listlike_cols, errors='ignore')

# Remover target columns y ID
cols_to_drop = [ID_COL] + LABEL_COLS
df = df.drop(columns=[c for c in cols_to_drop if c in df.columns], errors='ignore')

# Numeric columns: fill with 0
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Numeric columns: {len(numeric_cols)}")
for c in numeric_cols:
    df[c] = df[c].fillna(0).astype(np.float32)

# Categorical columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"Categorical columns: {len(cat_cols)}")

# Strategy: 
# - High cardinality (>100) -> frequency encoding
# - Low cardinality (<=100) -> label encoding
fe_cols = []  # frequency encoded
le_cols = []  # label encoded

for c in cat_cols:
    nunique = df[c].nunique(dropna=False)
    if nunique > 100:
        df[f"{c}_freq"] = frequency_encoding(df, c)
        fe_cols.append(f"{c}_freq")
    else:
        le_cols.append(c)

# Build feature list
features = numeric_cols + fe_cols + le_cols

print(f"Total features: {len(features)}")
print(f"  Numeric: {len(numeric_cols)}")
print(f"  Frequency encoded: {len(fe_cols)}")
print(f"  Label encoded: {len(le_cols)}")

# Remove rows with NaN target
df = df[~df[TARGET].isna()].reset_index(drop=True)
print(f"âœ“ After removing NaN targets: {df.shape}")

Feature Engineering...
Removing 13 list-like columns
Numeric columns: 11
Categorical columns: 2
Numeric columns: 11
Categorical columns: 2


TypeError: unhashable type: 'list'

In [None]:
print("Train/Valid Split and Encoding...")

# Split 80/20
train_df, valid_df = train_test_split(df, test_size=0.2, random_state=RANDOM_STATE)

print(f"Train: {train_df.shape}, Valid: {valid_df.shape}")

# Label encode small-cardinality categories
label_encoders = {}
for col in le_cols:
    train_vals = train_df[col].fillna("__NA__").astype(str)
    le = LabelEncoder()
    le.fit(train_vals)
    
    train_df[col] = le.transform(train_vals)
    valid_df[col] = le.transform(valid_df[col].fillna("__NA__").astype(str))
    
    label_encoders[col] = le

# Target transform (log1p)
y_train = np.log1p(train_df[TARGET].values.astype(np.float32))
y_valid = np.log1p(valid_df[TARGET].values.astype(np.float32))

# Prepare feature matrices
X_train = train_df[features].astype(np.float32)
X_valid = valid_df[features].astype(np.float32)

print(f"X_train: {X_train.shape}, X_valid: {X_valid.shape}")
print(f"y_train: {y_train.shape}, y_valid: {y_valid.shape}")
print(f"âœ“ Ready for training")

Removing 14 list-like columns: ['avg_daily_sessions', 'avg_duration', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk', 'hour_ratio', 'iap_revenue_usd_bundle', 'iap_revenue_usd_category', 'iap_revenue_usd_category_bottom_taxonomy', 'num_buys_bundle', 'num_buys_category', 'num_buys_category_bottom_taxonomy', 'rwd_prank']
Features: 26 (11 numeric, 15 categorical)

Applying preprocessing (KNN imputation + StandardScaler + categorical encoding)...
  Filling 11 numeric columns with 0 for missing values...
  Applying StandardScaler normalization...
Data prepared: X_train (271487, 26), X_valid (28373, 26)
âœ“ Memory optimized
âœ“ Scaler saved for test predictions
Data prepared: X_train (271487, 26), X_valid (28373, 26)
âœ“ Memory optimized
âœ“ Scaler saved for test predictions


## Train LightGBM

In [None]:
print("=" * 60)
print("TRAINING LIGHTGBM REGRESSOR")
print("=" * 60)

# LightGBM params - simple y balanceado
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "n_estimators": 2000,
    "learning_rate": 0.05,
    "num_leaves": 64,
    "max_depth": -1,
    "subsample": 0.8,
    "colsample_bytree": 0.6,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "random_state": RANDOM_STATE,
    "n_jobs": 8,
    "verbose": -1
}

# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# Train with early stopping
start = time()
model = lgb.train(
    params=lgb_params,
    train_set=train_data,
    num_boost_round=lgb_params["n_estimators"],
    valid_sets=[train_data, valid_data],
    valid_names=["train", "valid"],
    early_stopping_rounds=50,
    verbose_eval=100
)
elapsed = time() - start

print(f"\nâœ“ Training completed in {elapsed:.1f}s")
print(f"âœ“ Best iteration: {model.best_iteration}")

# Predictions
pred_valid_log = model.predict(X_valid, num_iteration=model.best_iteration)
pred_valid = np.expm1(pred_valid_log)
pred_valid = np.clip(pred_valid, 0, None)

# Evaluation
msle = mean_squared_log_error(valid_df[TARGET].values, pred_valid)
print(f"\n{'=' * 60}")
print(f"VALIDATION METRICS")
print(f"{'=' * 60}")
print(f"MSLE: {msle:.6f}")

# Additional metrics
frac_zero_true = (valid_df[TARGET] == 0).mean()
frac_zero_pred = (pred_valid == 0).mean()
print(f"True zero fraction: {frac_zero_true:.3f}")
print(f"Pred zero fraction: {frac_zero_pred:.3f}")
print(f"Mean prediction: {pred_valid.mean():.4f}")
print(f"Max prediction: {pred_valid.max():.4f}")

TRAINING SINGLE LGBM REGRESSOR
âœ“ Model trained
âœ“ Model trained


## Feature Importance

In [None]:
import matplotlib.pyplot as plt

print("Top 20 Most Important Features:")
print("=" * 60)

importance_df = pd.DataFrame({
    'Feature': features,
    'Importance': model.feature_importance()
}).sort_values('Importance', ascending=False)

print(importance_df.head(20).to_string(index=False))

# Visualize
plt.figure(figsize=(12, 6))
top_20 = importance_df.head(20)
plt.barh(range(len(top_20)), top_20['Importance'].values)
plt.yticks(range(len(top_20)), top_20['Feature'].values)
plt.xlabel('Importance')
plt.title('Top 20 Features')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('feature_importance.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nâœ“ Saved to feature_importance.png")


VALIDATION RESULTS
Model: SINGLE
MSLE: 0.185813
Baseline (all zeros): 0.228072
Improvement: 18.53%

Prediction stats:
  Mean: 0.1316
  Median: 0.0011
  Max: 50.3014
  % Non-zero: 60.28%


## Generate Submission

In [None]:
print("Saving model and artifacts...")

# Save model + metadata
artifacts = {
    "model": model,
    "features": features,
    "label_encoders": label_encoders,
    "numeric_cols": numeric_cols,
    "fe_cols": fe_cols,
    "le_cols": le_cols,
    "lgb_params": lgb_params
}

joblib.dump(artifacts, OUT_MODEL)
print(f"âœ“ Saved to {OUT_MODEL}")

# Function for batch prediction
def predict_batch(df_batch, artifacts_path=OUT_MODEL):
    """Predict on new batch using saved artifacts."""
    art = joblib.load(artifacts_path)
    mdl = art["model"]
    feats = art["features"]
    les = art["label_encoders"]
    num_cols_saved = art["numeric_cols"]
    fe_cols_saved = art["fe_cols"]
    le_cols_saved = art["le_cols"]
    
    df_batch = df_batch.copy()
    
    # Fill numeric
    for c in num_cols_saved:
        if c in df_batch.columns:
            df_batch[c] = df_batch[c].fillna(0).astype(np.float32)
        else:
            df_batch[c] = 0.0
    
    # Frequency encode high-cardinality cols
    for c in fe_cols_saved:
        orig_c = c.replace("_freq", "")
        if orig_c in df_batch.columns:
            df_batch[c] = frequency_encoding(df_batch, orig_c)
        else:
            df_batch[c] = 0.0
    
    # Label encode small-cardinality cols
    for col in le_cols_saved:
        if col in df_batch.columns:
            le = les[col]
            df_batch[col] = le.transform(df_batch[col].fillna("__NA__").astype(str)).astype(np.float32)
        else:
            df_batch[col] = 0.0
    
    # Ensure all features are present
    for f in feats:
        if f not in df_batch.columns:
            df_batch[f] = 0.0
    
    X = df_batch[feats].astype(np.float32)
    pred_log = mdl.predict(X, num_iteration=mdl.best_iteration)
    pred = np.expm1(pred_log)
    pred = np.clip(pred, 0, None)
    return pred

print("âœ“ Prediction function ready")
print("\nGenerating test predictions...")

# Process test in chunks (memory-efficient)
dd_test = dd.read_parquet(TEST_PATH, engine='pyarrow')
existing_cols = [c for c in IGNORE_BIG_COLS if c in dd_test.columns]
dd_test = dd_test.drop(columns=existing_cols, errors='ignore')

delayed_parts = dd_test.to_delayed()
print(f"Processing {len(delayed_parts)} test chunks...")

pred_dfs = []

for i, d in enumerate(delayed_parts):
    if (i + 1) % 20 == 0:
        print(f"  Chunk {i+1}/{len(delayed_parts)}...")
    
    part_df = d.compute()
    part_df = reduce_memory(part_df)
    
    row_ids = part_df[ID_COL].values
    
    # Predict
    part_pred = predict_batch(part_df)
    
    pred_dfs.append(pd.DataFrame({
        ID_COL: row_ids,
        TARGET: part_pred
    }))
    
    del part_df, part_pred
    gc.collect()

# Combine
submission = pd.concat(pred_dfs, ignore_index=True)
submission = submission.sort_values(ID_COL).reset_index(drop=True)

output_file = "submission.csv"
submission.to_csv(output_file, index=False)

print(f"\n{'=' * 60}")
print(f"âœ“ Submission saved: {output_file}")
print(f"Shape: {submission.shape}")
print(f"Sample:\n{submission.head(10)}")

# Validation
print(f"\nValidation checks:")
print(f"  NaN values: {submission.isna().sum().sum()}")
print(f"  Negative values: {(submission[TARGET] < 0).sum()}")
print(f"  Min: {submission[TARGET].min():.4f}, Max: {submission[TARGET].max():.4f}")
print(f"  Mean: {submission[TARGET].mean():.4f}")

Generating test predictions...
Processing 96 test chunks...


TypeError: preprocess_new() got an unexpected keyword argument 'scaler'

## Summary

This simplified pipeline:

1. **Loads data** efficiently with Dask
2. **Removes complex columns** (lists/dicts)
3. **Feature Engineering**:
   - Numeric: fill with 0
   - Categorical high-cardinality (>100): frequency encoding
   - Categorical low-cardinality (â‰¤100): label encoding
4. **Trains LightGBM** with early stopping on log-transformed target
5. **Generates submissions** in chunks (memory-efficient)
6. **Saves artifacts** for future inference

### Key Parameters to Adjust:
- `SAMPLE_FRAC`: Data fraction to use (default 0.15 = 15%)
- `lgb_params`: LightGBM hyperparameters
- Test threshold for frequency encoding (currently >100)