In [None]:
# IMPORTANT: some Kaggle data sources are private.
# Run this cell first so the Kaggle data sources can be imported afterwards.
# NOTE(review): kagglehub.login() presumably asks for credentials
# interactively — confirm before relying on an unattended "Run All".
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the competition data and keep whatever competition_download returns.
# NOTE(review): this variable is not read anywhere else in the visible
# notebook; TRAIN_PATH / TEST_PATH are hard-coded under /kaggle/input, so when
# running outside Kaggle those paths probably need to be built from this value.
smadex_challenge_predict_the_revenue_path = kagglehub.competition_download('smadex-challenge-predict-the-revenue')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
# The middle element of each os.walk tuple (the sub-directory names) is not
# needed here, hence the throwaway name `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import dask
import dask.dataframe as dd

# Switch off dask's "dataframe.convert-string" option.
# NOTE(review): presumably this keeps text columns as plain Python objects
# instead of pyarrow-backed strings, which is what the object/str handling in
# the preprocessing helpers expects — confirm for the dask version in use.
dask.config.set({"dataframe.convert-string": False})

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_log_error
import gc

# Dataset locations; both are read later with dd.read_parquet.
TRAIN_PATH = "/kaggle/input/smadex-challenge-predict-the-revenue/train/train"
TEST_PATH  = "/kaggle/input/smadex-challenge-predict-the-revenue/test/test"

# Column used as the regression target.
TARGET_COL = "iap_revenue_d7"

In [None]:
import numpy as np
import pandas as pd

# Very large columns that almost certainly hold lists/maps/histograms.
# They are removed from the baseline to save RAM and keep things simple.
# Every consumer filters this list against the columns actually present
# before dropping, so names missing from a dataset are ignored.
ignore_big_cols = [
    "bundles_ins",
    "user_bundles",
    "user_bundles_l28d",
    "city_hist",
    "country_hist",
    "region_hist",
    "dev_language_hist",
    "dev_osv_hist",
    "bcat",
    "bcat_bottom_taxonomy",
    "bundles_cat",
    "bundles_cat_bottom_taxonomy",
    "first_request_ts_bundle",
    "first_request_ts_category_bottom_taxonomy",
    "last_buy_ts_bundle",
    "last_buy_ts_category",
    "last_install_ts_bundle",
    "last_install_ts_category",
    "advertiser_actions_action_count",
    "advertiser_actions_action_last_timestamp",
    "user_actions_bundles_action_count",
    "user_actions_bundles_action_last_timestamp",
    "new_bundles",
    "whale_users_bundle_num_buys_prank",
    "whale_users_bundle_revenue_prank",
    "whale_users_bundle_total_num_buys",
    "whale_users_bundle_total_revenue",
]

def reduce_memory(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with 64-bit numeric columns downcast to 32 bits.

    * ``float64`` columns are always cast to ``float32`` (precision is reduced).
    * ``int64`` columns are cast to ``int32`` only when every value fits in the
      int32 range; otherwise the column is left as ``int64``. A blind cast
      would silently wrap large values (ids, epoch timestamps, ...) around.

    Columns of any other dtype are returned unchanged. The input frame is not
    modified.

    Args:
        df: Frame to shrink.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    int32_range = np.iinfo(np.int32)
    df = df.copy()
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == "float64":
            df[col] = df[col].astype("float32")
        elif col_type == "int64":
            values = df[col]
            # min()/max() of an empty column are NaN, so treat "no rows" as
            # trivially safe instead of comparing against NaN.
            fits_int32 = values.empty or (
                values.min() >= int32_range.min and values.max() <= int32_range.max
            )
            if fits_int32:
                df[col] = values.astype("int32")
    return df

def detect_listlike_columns(df: pd.DataFrame, cols=None, sample_size: int = 100):
    """Find columns whose cells hold containers (lists, dicts, tuples, sets, arrays).

    Only the first ``sample_size`` *non-null* values of each column are
    inspected. Sampling non-null values (rather than the first rows of the
    frame) matters because a column that starts with a run of missing values
    would otherwise never be recognised.

    Args:
        df: Frame to inspect.
        cols: Column names to check; defaults to every column of ``df``.
        sample_size: Maximum number of non-null values examined per column.

    Returns:
        List of the column names, in the order given, for which at least one
        sampled value is a container.
    """
    if cols is None:
        cols = df.columns
    # np.ndarray is included because nested values may be materialised as
    # arrays rather than Python lists depending on how the frame was loaded.
    container_types = (list, dict, tuple, set, np.ndarray)
    listlike = []
    for c in cols:
        sample_vals = df[c].dropna().head(sample_size)
        if any(isinstance(v, container_types) for v in sample_vals):
            listlike.append(c)
    return listlike

def preprocess_train_valid(X_train, X_valid, num_cols, cat_cols):
    """Prepare the train and validation feature frames for the model.

    Numeric columns get their missing values replaced by 0. Categorical
    columns are turned into strings (missing -> "unknown") and then into
    pandas categoricals; the validation frame reuses the category set learned
    on train, so values unseen in train become NaN there.

    Neither input frame is modified; two new frames are returned as
    ``(train, valid)``.
    """
    def _as_text(series):
        # object -> fill missing -> str, so every value is a plain string
        return series.astype("object").fillna("unknown").astype(str)

    train_out, valid_out = X_train.copy(), X_valid.copy()

    # Numeric features: NaN -> 0 on both frames.
    for frame in (train_out, valid_out):
        for name in num_cols:
            frame[name] = frame[name].fillna(0)

    # Categorical features: categories are fixed from the train frame.
    for name in cat_cols:
        train_cat = _as_text(train_out[name]).astype("category")
        train_out[name] = train_cat

        shared_dtype = pd.api.types.CategoricalDtype(
            categories=train_cat.cat.categories
        )
        valid_out[name] = _as_text(valid_out[name]).astype(shared_dtype)

    return train_out, valid_out

def preprocess_new(X_new, num_cols, cat_cols, cat_ref_df):
    """Prepare unseen data (e.g. test) using the categories of a reference frame.

    Columns listed in ``num_cols`` / ``cat_cols`` that are absent from
    ``X_new`` are skipped. Numeric columns get NaN -> 0; categorical columns
    become strings (missing -> "unknown") and are then cast to the category
    set of the same-named column in ``cat_ref_df``, so unknown values end up
    as NaN. ``X_new`` itself is left untouched.
    """
    prepared = X_new.copy()

    for name in (c for c in num_cols if c in prepared.columns):
        prepared[name] = prepared[name].fillna(0)

    for name in cat_cols:
        if name not in prepared.columns:
            continue
        as_text = prepared[name].astype("object").fillna("unknown").astype(str)
        reference_dtype = pd.api.types.CategoricalDtype(
            categories=cat_ref_df[name].cat.categories
        )
        prepared[name] = as_text.astype(reference_dtype)

    return prepared

In [None]:
# Train: October 1–5 (lower bound inclusive, upper bound exclusive)
filters_train = [("datetime", ">=", "2025-10-01-00-00"),
                 ("datetime", "<",  "2025-10-06-00-00")]

# Validation: October 6
filters_valid = [("datetime", ">=", "2025-10-06-00-00"),
                 ("datetime", "<",  "2025-10-07-00-00")]

# Both splits are read lazily from the same train dataset; only the filters differ.
dd_train = dd.read_parquet(TRAIN_PATH, filters=filters_train)
dd_valid = dd.read_parquet(TRAIN_PATH, filters=filters_valid)

# Drop the very large columns if they are present
existing_big_cols_train = [c for c in ignore_big_cols if c in dd_train.columns]
existing_big_cols_valid = [c for c in ignore_big_cols if c in dd_valid.columns]

dd_train = dd_train.drop(columns=existing_big_cols_train)
dd_valid = dd_valid.drop(columns=existing_big_cols_valid)

# Train sampling: ADJUST THIS if you want more data
frac_train = 0.10  # 10% of train; can be raised to 0.2 if that runs fine

# .compute() materialises the dask frames as pandas DataFrames. Train is
# subsampled with a fixed seed; the validation day is loaded in full.
train_sample = dd_train.sample(frac=frac_train, random_state=42).compute()
valid_df     = dd_valid.compute()

train_sample = reduce_memory(train_sample)
valid_df     = reduce_memory(valid_df)

print("Train sample shape:", train_sample.shape)
print("Valid shape:", valid_df.shape)
print("Train memory (GB):", train_sample.memory_usage(deep=True).sum() / (1024**3))
print("Valid memory (GB):", valid_df.memory_usage(deep=True).sum() / (1024**3))

Train sample shape: (1729408, 58)
Valid shape: (3306478, 58)
Train memory (GB): 2.52042169123888
Valid memory (GB): 4.839879894629121


In [None]:
# All auxiliary label columns, which exist ONLY in train
LABEL_COLS = [
    "buyer_d1",
    "buyer_d7",
    "buyer_d14",
    "buyer_d28",
    "buy_d7",
    "buy_d14",
    "buy_d28",
    "iap_revenue_d7",   # main target
    "iap_revenue_d14",
    "iap_revenue_d28",
    "registration",
    "retention_d1_to_d7",
    "retention_d3_to_d7",
    "retention_d7_to_d14",
    "retention_d1",
    "retention_d3",
    "retentiond7",
]

TARGET_COL = "iap_revenue_d7"

assert TARGET_COL in train_sample.columns, "No está iap_revenue_d7 en train"

# y_train / y_valid: main target only
y_train = train_sample[TARGET_COL].values
y_valid = valid_df[TARGET_COL].values

# Columns we do NOT want as features (identifiers plus every label column)
cols_to_drop_from_X = ["row_id", "datetime"] + LABEL_COLS

# Features = everything else
feature_cols = [c for c in train_sample.columns if c not in cols_to_drop_from_X]

print("Número de features:", len(feature_cols))

X_train = train_sample[feature_cols].copy()
X_valid = valid_df[feature_cols].copy()

# 1) Detect columns holding lists/dicts and drop them.
#    Detection runs on the train frame only; the same columns are then
#    removed from validation so both frames keep identical columns.
listlike_cols = detect_listlike_columns(X_train, cols=feature_cols)
print("Columnas con listas/dicts:", listlike_cols)

X_train = X_train.drop(columns=listlike_cols)
X_valid = X_valid.drop(columns=listlike_cols)

# 2) Recompute the numeric and categorical column lists:
#    anything that is not a numpy-numeric dtype is treated as categorical.
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

print("Numéricas:", len(num_cols))
print("Categóricas:", len(cat_cols))

# 3) Preprocess
X_train_prep, X_valid_prep = preprocess_train_valid(X_train, X_valid, num_cols, cat_cols)

print("X_train_prep shape:", X_train_prep.shape)
print("X_valid_prep shape:", X_valid_prep.shape)

Número de features: 39
Columnas con listas/dicts: ['avg_daily_sessions', 'avg_duration', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk', 'hour_ratio', 'iap_revenue_usd_bundle', 'iap_revenue_usd_category', 'iap_revenue_usd_category_bottom_taxonomy', 'num_buys_bundle', 'num_buys_category', 'num_buys_category_bottom_taxonomy', 'rev_by_adv', 'rwd_prank']
Numéricas: 10
Categóricas: 14
X_train_prep shape: (1729408, 24)
X_valid_prep shape: (3306478, 24)


In [None]:
from lightgbm import LGBMRegressor

# Train on log1p(revenue): a squared-error objective in log space targets the
# same quantity that mean_squared_log_error reports on the original scale.
y_train_log = np.log1p(y_train)
y_valid_log = np.log1p(y_valid)

model = LGBMRegressor(
    objective="regression",
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=255,
    max_depth=-1,
    subsample=0.8,
    # LightGBM only applies `subsample` (bagging_fraction) when bagging is
    # enabled through a positive `subsample_freq` (bagging_freq). With the
    # default of 0 the 0.8 above is silently ignored, so resample every
    # iteration to get the row subsampling that was asked for.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=0.0,
    verbosity=-1
)

model.fit(X_train_prep, y_train_log)

print("Modelo entrenado.")

Modelo entrenado.


In [None]:
# Predict in log space, then map back to the original scale. expm1 may give
# slightly negative values, so predictions are floored at zero.
valid_pred_log = model.predict(X_valid_prep)
valid_pred = np.clip(np.expm1(valid_pred_log), 0, None)

# Reference predictor: zero revenue for every row.
zeros_pred = np.zeros_like(y_valid)

# Model score on the validation day.
msle_model = mean_squared_log_error(y_valid, valid_pred)
print("MSLE modelo:", msle_model)

# Score of the all-zeros baseline, for comparison.
msle_zeros = mean_squared_log_error(y_valid, zeros_pred)
print("MSLE baseline (todo 0):", msle_zeros)

MSLE modelo: 0.17942944557952253
MSLE baseline (todo 0): 0.21498893


In [None]:
# Keep ONLY what the test-time cells need: model, X_train_prep, num_cols, cat_cols
to_keep = {"model", "X_train_prep", "num_cols", "cat_cols"}


def purge_namespace(namespace, keep):
    """Remove every public, non-module name from ``namespace`` except ``keep``.

    Names starting with an underscore and module objects (dask, pd, np, ...)
    are always left alone. A handful of names that IPython places in the user
    namespace are protected as well, since code may look them up there.

    The names to remove are collected first and deleted afterwards, so the
    function never depends on a variable it is in the middle of deleting and
    never needs a catch-all ``except``.

    Args:
        namespace: Mapping of name -> object to clean in place (e.g. ``globals()``).
        keep: Iterable of names that must survive.

    Returns:
        List of the names that were removed, in namespace order.
    """
    # NOTE(review): list of IPython-injected names; extend if the front end
    # in use injects others.
    protected = {"In", "Out", "get_ipython", "exit", "quit", "open"}
    survivors = set(keep) | protected
    module_type = type(os)
    doomed = [
        name
        for name, value in list(namespace.items())
        if not name.startswith("_")
        and name not in survivors
        and not isinstance(value, module_type)
    ]
    for name in doomed:
        namespace.pop(name, None)
    return doomed


# The helper protects itself so it is still callable if this cell is re-run.
purge_namespace(globals(), to_keep | {"purge_namespace"})

gc.collect()

4

In [None]:
import numpy as np
import pandas as pd

# Very large columns that almost certainly hold lists/maps/histograms.
# They are removed from the baseline to save RAM and keep things simple.
# Re-declared here because the preceding cleanup cell deletes every
# non-module global that is not in its keep-list.
ignore_big_cols = [
    "bundles_ins",
    "user_bundles",
    "user_bundles_l28d",
    "city_hist",
    "country_hist",
    "region_hist",
    "dev_language_hist",
    "dev_osv_hist",
    "bcat",
    "bcat_bottom_taxonomy",
    "bundles_cat",
    "bundles_cat_bottom_taxonomy",
    "first_request_ts_bundle",
    "first_request_ts_category_bottom_taxonomy",
    "last_buy_ts_bundle",
    "last_buy_ts_category",
    "last_install_ts_bundle",
    "last_install_ts_category",
    "advertiser_actions_action_count",
    "advertiser_actions_action_last_timestamp",
    "user_actions_bundles_action_count",
    "user_actions_bundles_action_last_timestamp",
    "new_bundles",
    "whale_users_bundle_num_buys_prank",
    "whale_users_bundle_revenue_prank",
    "whale_users_bundle_total_num_buys",
    "whale_users_bundle_total_revenue",
]

def reduce_memory(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with 64-bit numeric columns downcast to 32 bits.

    * ``float64`` columns are always cast to ``float32`` (precision is reduced).
    * ``int64`` columns are cast to ``int32`` only when every value fits in the
      int32 range; otherwise the column is left as ``int64``. A blind cast
      would silently wrap large values (ids, epoch timestamps, ...) around.

    Columns of any other dtype are returned unchanged. The input frame is not
    modified.

    Args:
        df: Frame to shrink.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    int32_range = np.iinfo(np.int32)
    df = df.copy()
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == "float64":
            df[col] = df[col].astype("float32")
        elif col_type == "int64":
            values = df[col]
            # min()/max() of an empty column are NaN, so treat "no rows" as
            # trivially safe instead of comparing against NaN.
            fits_int32 = values.empty or (
                values.min() >= int32_range.min and values.max() <= int32_range.max
            )
            if fits_int32:
                df[col] = values.astype("int32")
    return df

def detect_listlike_columns(df: pd.DataFrame, cols=None, sample_size: int = 100):
    """Find columns whose cells hold containers (lists, dicts, tuples, sets, arrays).

    Only the first ``sample_size`` *non-null* values of each column are
    inspected. Sampling non-null values (rather than the first rows of the
    frame) matters because a column that starts with a run of missing values
    would otherwise never be recognised.

    Args:
        df: Frame to inspect.
        cols: Column names to check; defaults to every column of ``df``.
        sample_size: Maximum number of non-null values examined per column.

    Returns:
        List of the column names, in the order given, for which at least one
        sampled value is a container.
    """
    if cols is None:
        cols = df.columns
    # np.ndarray is included because nested values may be materialised as
    # arrays rather than Python lists depending on how the frame was loaded.
    container_types = (list, dict, tuple, set, np.ndarray)
    listlike = []
    for c in cols:
        sample_vals = df[c].dropna().head(sample_size)
        if any(isinstance(v, container_types) for v in sample_vals):
            listlike.append(c)
    return listlike

def preprocess_train_valid(X_train, X_valid, num_cols, cat_cols):
    """Prepare the train and validation feature frames for the model.

    Numeric columns get their missing values replaced by 0. Categorical
    columns are turned into strings (missing -> "unknown") and then into
    pandas categoricals; the validation frame reuses the category set learned
    on train, so values unseen in train become NaN there.

    Neither input frame is modified; two new frames are returned as
    ``(train, valid)``.
    """
    def _as_text(series):
        # object -> fill missing -> str, so every value is a plain string
        return series.astype("object").fillna("unknown").astype(str)

    train_out, valid_out = X_train.copy(), X_valid.copy()

    # Numeric features: NaN -> 0 on both frames.
    for frame in (train_out, valid_out):
        for name in num_cols:
            frame[name] = frame[name].fillna(0)

    # Categorical features: categories are fixed from the train frame.
    for name in cat_cols:
        train_cat = _as_text(train_out[name]).astype("category")
        train_out[name] = train_cat

        shared_dtype = pd.api.types.CategoricalDtype(
            categories=train_cat.cat.categories
        )
        valid_out[name] = _as_text(valid_out[name]).astype(shared_dtype)

    return train_out, valid_out

def preprocess_new(X_new, num_cols, cat_cols, cat_ref_df):
    """Prepare unseen data (e.g. test) using the categories of a reference frame.

    Columns listed in ``num_cols`` / ``cat_cols`` that are absent from
    ``X_new`` are skipped. Numeric columns get NaN -> 0; categorical columns
    become strings (missing -> "unknown") and are then cast to the category
    set of the same-named column in ``cat_ref_df``, so unknown values end up
    as NaN. ``X_new`` itself is left untouched.
    """
    prepared = X_new.copy()

    for name in (c for c in num_cols if c in prepared.columns):
        prepared[name] = prepared[name].fillna(0)

    for name in cat_cols:
        if name not in prepared.columns:
            continue
        as_text = prepared[name].astype("object").fillna("unknown").astype(str)
        reference_dtype = pd.api.types.CategoricalDtype(
            categories=cat_ref_df[name].cat.categories
        )
        prepared[name] = as_text.astype(reference_dtype)

    return prepared

In [None]:
import dask
import dask.dataframe as dd

# Location of the test dataset (re-declared: the cleanup cell removed the earlier constant).
TEST_PATH = "/kaggle/input/smadex-challenge-predict-the-revenue/test/test"

# Open the test set lazily and drop the very large columns that are present.
dd_test = dd.read_parquet(TEST_PATH)
existing_big_cols_test = [c for c in ignore_big_cols if c in dd_test.columns]
dd_test = dd_test.drop(columns=existing_big_cols_test)

# One delayed object per dask partition, so partitions can be computed one at
# a time instead of materialising the whole test set at once.
delayed_parts = dd_test.to_delayed()
print("Número de chunks de test:", len(delayed_parts))

# Use exactly the columns (and order) of the preprocessed training frame.
feature_cols = X_train_prep.columns.tolist()

# Per-chunk prediction frames, concatenated once after the loop.
pred_dfs = []

for i, d in enumerate(delayed_parts):
    print(f"Procesando chunk {i+1}/{len(delayed_parts)}...")

    # Materialise this partition as pandas and apply the same downcast as train.
    part_df = d.compute()
    part_df = reduce_memory(part_df)

    row_ids = part_df["row_id"].values
    X_part = part_df[feature_cols].copy()   # all of them DO exist now

    # Training frame is passed as the reference for the categorical levels.
    X_part_prep = preprocess_new(X_part, num_cols, cat_cols, X_train_prep)

    # The model predicts log1p(revenue): invert with expm1 and floor at zero.
    part_pred_log = model.predict(X_part_prep)
    part_pred = np.expm1(part_pred_log)
    part_pred = np.clip(part_pred, 0, None)

    pred_dfs.append(pd.DataFrame({
        "row_id": row_ids,
        "iap_revenue_d7": part_pred
    }))

    # Drop the per-chunk references before loading the next partition.
    del part_df, X_part, X_part_prep, row_ids, part_pred_log, part_pred
    gc.collect()

# Assemble the submission, write it to the working directory, and preview it.
submission = pd.concat(pred_dfs, ignore_index=True)
submission.to_csv("/kaggle/working/submission.csv", index=False)
submission.head()

Número de chunks de test: 96
Procesando chunk 1/96...
Procesando chunk 2/96...
Procesando chunk 3/96...
Procesando chunk 4/96...
Procesando chunk 5/96...
Procesando chunk 6/96...
Procesando chunk 7/96...
Procesando chunk 8/96...
Procesando chunk 9/96...
Procesando chunk 10/96...
Procesando chunk 11/96...
Procesando chunk 12/96...
Procesando chunk 13/96...
Procesando chunk 14/96...
Procesando chunk 15/96...
Procesando chunk 16/96...
Procesando chunk 17/96...
Procesando chunk 18/96...
Procesando chunk 19/96...
Procesando chunk 20/96...
Procesando chunk 21/96...
Procesando chunk 22/96...
Procesando chunk 23/96...
Procesando chunk 24/96...
Procesando chunk 25/96...
Procesando chunk 26/96...
Procesando chunk 27/96...
Procesando chunk 28/96...
Procesando chunk 29/96...
Procesando chunk 30/96...
Procesando chunk 31/96...
Procesando chunk 32/96...
Procesando chunk 33/96...
Procesando chunk 34/96...
Procesando chunk 35/96...
Procesando chunk 36/96...
Procesando chunk 37/96...
Procesando chunk 3

Unnamed: 0,row_id,iap_revenue_d7
0,e2f514a9-d922-4a17-bf94-f228bf4cd82f,0.0
1,4bfc70d3-d619-410a-9683-4cd759f30f32,0.059406
2,ad433b66-b41e-4157-a6fd-24cd30701f6a,0.0
3,5ed964d6-ddce-42e8-9fad-276eb7f64c2f,0.007885
4,81b73a45-c395-4d08-a4a3-513873440db3,0.000545


In [None]:
# Preview of the submission followed by its (rows, columns) shape.
print(submission.head(), submission.shape, sep="\n")

# Sanity checks: per-column NaN counts (none expected) and the number of
# negative predictions (expected to be 0).
print(submission.isna().sum(), submission["iap_revenue_d7"].lt(0).sum(), sep="\n")

                                 row_id  iap_revenue_d7
0  e2f514a9-d922-4a17-bf94-f228bf4cd82f        0.000000
1  4bfc70d3-d619-410a-9683-4cd759f30f32        0.059406
2  ad433b66-b41e-4157-a6fd-24cd30701f6a        0.000000
3  5ed964d6-ddce-42e8-9fad-276eb7f64c2f        0.007885
4  81b73a45-c395-4d08-a4a3-513873440db3        0.000545
(13188409, 2)
row_id            0
iap_revenue_d7    0
dtype: int64
0
