In [6]:
# ----------------------------------------------
# 04_Model_CAC_30.ipynb
# Entrenamiento con split temporal para CAC_source_30
# ----------------------------------------------

#  Inicialización
import os
import sys
import pandas as pd

# Añadir src al path para importar los scripts
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

#  Imports del script
from train import (
    load_and_prepare_data,
    build_preprocessor,
    train_models,
    train_stacking_model,
    save_models
)

# Load the processed dataset and build a manual temporal train/test split.
# The target name and both date boundaries are declared once here so the
# filters, the dropped columns and the target selection cannot drift apart.
target = "CAC_source_30"
TRAIN_END = "2018-01-01"  # train: first_session <  TRAIN_END
TEST_END = "2018-07-01"   # test:  TRAIN_END <= first_session < TEST_END

df = pd.read_csv("../data/processed/final_dataset.csv", parse_dates=["first_session"])

# Keep only rows inside the study window that have a known target value.
# A single combined mask selects the same rows as two successive filters.
df = df[(df["first_session"] < TEST_END) & df[target].notna()]

# Manual temporal split. `df` is already capped at TEST_END above, so the
# test set only needs the lower bound.
train_df = df[df["first_session"] < TRAIN_END]
test_df = df[df["first_session"] >= TRAIN_END]

# Fail early with a clear message instead of deep inside model training.
assert not train_df.empty, f"No training rows with first_session before {TRAIN_END}"
assert not test_df.empty, f"No test rows with first_session in [{TRAIN_END}, {TEST_END})"
assert len(train_df) + len(test_df) == len(df), "Train/test split does not partition df"

# Columns excluded from the feature matrix. The target itself must never be
# a feature. NOTE(review): the remaining entries look like identifiers, raw
# timestamps, the other target (LTV_180) and cohort-level aggregates that
# could leak information -- confirm against the dataset schema.
# errors='ignore' below tolerates columns that are absent from the CSV.
drop_cols = ['uid', 'first_session', 'last_session', 'first_order', 'last_order',
             'LTV_180', target, 'ltv_cohort_avg', 'cac_cohort_avg', 'conversion_rate_cohort']

# Features and target for each split.
X_train = train_df.drop(columns=drop_cols, errors='ignore')
y_train = train_df[target]

X_test = test_df.drop(columns=drop_cols, errors='ignore')
y_test = test_df[target]

# Build the preprocessor from the training features only.
preprocessor = build_preprocessor(X_train)



In [7]:
# Fit the individual models on the training split.
print("Entrenando modelos base y avanzados...")
modelos = train_models(X_train, y_train, preprocessor)

# Fit the stacked ensemble from the already-trained models, then register it
# in `modelos` under the "stacking" key (the RHS is evaluated before either
# assignment, so the ensemble is built without itself in the collection).
print("Entrenando modelo ensamblado (stacking)...")
modelos["stacking"] = stacked_model = train_stacking_model(
    X_train, y_train, preprocessor, modelos
)



Entrenando modelos base y avanzados...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2377
[LightGBM] [Info] Number of data points in the train set: 23174, number of used features: 26
[LightGBM] [Info] Start training from score 6296.056221
Entrenando modelo ensamblado (stacking)...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001034 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2377
[LightGBM] [Info] Number of data points in the train set: 23174, number of used features: 26
[LightGBM] [Info] Start training from score 6296.056221
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001585 secon

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [8]:
# Persist every entry of `modelos` to ../models/.
# The target name is taken from `target` (defined with the data split)
# instead of repeating the string literal, so the saved label always matches
# the column the models were actually trained on.
print("Guardando modelos en carpeta /models...")
save_models(modelos, target_name=target, save_path="../models/")

print(" Entrenamiento completado. Test set disponible para evaluación.")


Guardando modelos en carpeta /models...
Modelos guardados exitosamente en ../models/
 Entrenamiento completado. Test set disponible para evaluación.
