# L1 – Modelado v1 (Regresión logística, Árbol de decisión y Random Forest)

En este notebook se entrenan tres modelos supervisados y se compara su desempeño al predecir la variable **target_asiste** (asistencia a la masterclass) a partir del dataset de features L1.

In [8]:
import os

import joblib
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Location of the L1 feature table, relative to the notebook's working directory
FEATURES_PATH = os.path.join("..", "data", "processed", "l1_features.csv")

# Display settings: show every column, cap printed rows at 20
pd.options.display.max_columns = None
pd.options.display.max_rows = 20

In [9]:
# Load the L1 feature table from FEATURES_PATH
df = pd.read_csv(
    FEATURES_PATH,
    encoding="utf-8",
)

# Quick look: dimensions plus the first rows
(df.shape, df.head())

((224, 20),
    creado_segundos  lead_scoring  f_medium_social  f_medium_referido  \
 0           2086.8             0                0                  0   
 1           1357.6             0                0                  1   
 2            288.2             0                0                  0   
 3           2174.0             0                1                  0   
 4            414.2             0                0                  1   
 
    f_medium_unknown  f_medium_directo  f_tiene_telefono_2  f_gdpr_accept_int  \
 0                 1                 0                   0                  0   
 1                 0                 0                   0                  0   
 2                 1                 0                   0                  0   
 3                 0                 0                   0                  0   
 4                 0                 0                   1                  0   
 
    f_len_empresa    medium utm_source  utm_medium          

In [10]:
# Split the table into predictors and target, then group predictors by dtype

# Predictors: every column except the target
X = df.drop(columns="target_asiste")

# Target: the attendance flag
y = df["target_asiste"]

# object-dtype columns go to the categorical group, every other dtype to the numeric group
cat_cols = list(X.select_dtypes(include="object").columns)
num_cols = list(X.select_dtypes(exclude="object").columns)

(cat_cols, num_cols)

(['medium',
  'utm_source',
  'utm_medium',
  'utm_campaign',
  'utm_content',
  'utm_term',
  'status',
  'etapa_de_oportunidad_activa',
  'propietario'],
 ['creado_segundos',
  'lead_scoring',
  'f_medium_social',
  'f_medium_referido',
  'f_medium_unknown',
  'f_medium_directo',
  'f_tiene_telefono_2',
  'f_gdpr_accept_int',
  'f_len_empresa',
  'origen'])

In [11]:
# Preprocessing: one sub-pipeline per column group, combined in a ColumnTransformer

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
# handle_unknown="ignore" keeps categories unseen during fit from raising at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Route each column group to its own sub-pipeline
preprocessor = ColumnTransformer(
    [
        ("cat", categorical_transformer, cat_cols),
        ("num", numeric_transformer, num_cols),
    ]
)

In [12]:
# Stratified hold-out split: 30% of the rows go to test, stratify=y keeps the class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# Shapes and class counts of both partitions
(
    X_train.shape,
    X_test.shape,
    y_train.value_counts(),
    y_test.value_counts(),
)

((156, 19),
 (68, 19),
 target_asiste
 0    131
 1     25
 Name: count, dtype: int64,
 target_asiste
 0    57
 1    11
 Name: count, dtype: int64)

In [13]:
# Helper to train and evaluate one model

def entrenar_y_evaluar(nombre, modelo_base, X_train, X_test, y_train, y_test, preprocessor):
    """
    Fit a preprocessing + model pipeline and report its metrics on the test split.

    Parameters
    ----------
    nombre : str
        Label printed in the report and stored under the ``"modelo"`` key.
    modelo_base : estimator
        Final ``"model"`` step of the pipeline. It must expose ``predict_proba``
        and is fitted in place by this call.
    X_train, y_train
        Data the pipeline is fitted on.
    X_test, y_test
        Data every metric is computed on.
    preprocessor : transformer
        Template for the ``"preprocess"`` step. A clone is fitted, so the object
        passed in is left untouched and can safely be shared across calls.

    Returns
    -------
    dict
        Keys ``"modelo"``, ``"AUC"``, ``"accuracy"``, ``"precision"``,
        ``"recall"``, ``"f1"`` and ``"pipeline"`` (the fitted pipeline).
    """
    # Pipeline fits its steps in place. Cloning gives each returned pipeline its
    # own preprocessor instead of one instance that every later call re-fits.
    pipe = Pipeline(
        steps=[
            ("preprocess", clone(preprocessor)),
            ("model", modelo_base),
        ]
    )

    pipe.fit(X_train, y_train)

    # Scores and hard predictions on the held-out rows.
    # Column 1 of predict_proba is taken as the positive class; this assumes a
    # binary target whose positive label sorts second (e.g. 0/1).
    y_proba = pipe.predict_proba(X_test)[:, 1]
    y_pred = pipe.predict(X_test)

    # Metrics; zero_division=0 reports 0 instead of warning when no positive is predicted
    auc = roc_auc_score(y_test, y_proba)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    print("=" * 70)
    print(f"Modelo: {nombre}")
    print(f"AUC      : {auc:.3f}")
    print(f"Accuracy : {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Recall   : {rec:.3f}")
    print(f"F1       : {f1:.3f}")
    print("\nClassification report:\n")
    print(classification_report(y_test, y_pred, digits=3, zero_division=0))
    print("Matriz de confusión:")
    print(confusion_matrix(y_test, y_pred))

    # The fitted pipeline travels with the metrics so the caller can persist the winner
    metrics = {
        "modelo": nombre,
        "AUC": auc,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "pipeline": pipe,
    }
    return metrics

In [14]:
# Model 1 of 3 (LR, DT, RF): logistic regression, the main baseline
modelo_lr = LogisticRegression(solver="lbfgs", class_weight="balanced", max_iter=1000)

metrics_lr = entrenar_y_evaluar(
    nombre="Logistic Regression",
    modelo_base=modelo_lr,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    preprocessor=preprocessor,
)



Modelo: Logistic Regression
AUC      : 0.644
Accuracy : 0.647
Precision: 0.190
Recall   : 0.364
F1       : 0.250

Classification report:

              precision    recall  f1-score   support

           0      0.851     0.702     0.769        57
           1      0.190     0.364     0.250        11

    accuracy                          0.647        68
   macro avg      0.521     0.533     0.510        68
weighted avg      0.744     0.647     0.685        68

Matriz de confusión:
[[40 17]
 [ 7  4]]


In [15]:
# Model 2 of 3: decision tree, depth- and leaf-size-limited, with balanced class weights
modelo_dt = DecisionTreeClassifier(
    class_weight="balanced", random_state=42, min_samples_leaf=5, max_depth=5
)

metrics_dt = entrenar_y_evaluar(
    nombre="Decision Tree",
    modelo_base=modelo_dt,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    preprocessor=preprocessor,
)

Modelo: Decision Tree
AUC      : 0.506
Accuracy : 0.603
Precision: 0.167
Recall   : 0.364
F1       : 0.229

Classification report:

              precision    recall  f1-score   support

           0      0.841     0.649     0.733        57
           1      0.167     0.364     0.229        11

    accuracy                          0.603        68
   macro avg      0.504     0.506     0.481        68
weighted avg      0.732     0.603     0.651        68

Matriz de confusión:
[[37 20]
 [ 7  4]]




In [16]:
# Model 3 of 3: random forest with 200 trees, no depth cap, balanced class weights
modelo_rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
    n_estimators=200,
    min_samples_leaf=5,
    max_depth=None,
)

metrics_rf = entrenar_y_evaluar(
    nombre="Random Forest",
    modelo_base=modelo_rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    preprocessor=preprocessor,
)



Modelo: Random Forest
AUC      : 0.561
Accuracy : 0.706
Precision: 0.200
Recall   : 0.273
F1       : 0.231

Classification report:

              precision    recall  f1-score   support

           0      0.849     0.789     0.818        57
           1      0.200     0.273     0.231        11

    accuracy                          0.706        68
   macro avg      0.525     0.531     0.524        68
weighted avg      0.744     0.706     0.723        68

Matriz de confusión:
[[45 12]
 [ 8  3]]




In [17]:
# Side-by-side comparison used to pick the v1 winner

# One row per model; the fitted pipeline object is left out of the table
results = pd.DataFrame(
    [
        {clave: valor for clave, valor in m.items() if clave != "pipeline"}
        for m in (metrics_lr, metrics_dt, metrics_rf)
    ]
)

# Best AUC first
results.sort_values("AUC", ascending=False)

Unnamed: 0,modelo,AUC,accuracy,precision,recall,f1
0,Logistic Regression,0.644338,0.647059,0.190476,0.363636,0.25
2,Random Forest,0.560606,0.705882,0.2,0.272727,0.230769
1,Decision Tree,0.50638,0.602941,0.166667,0.363636,0.228571


In [18]:
# Persist the winning pipeline

MODELS_DIR = os.path.join("..", "models")
os.makedirs(MODELS_DIR, exist_ok=True)

# Pick the winner by test-set AUC instead of hard-coding it, so the saved model
# cannot drift out of sync with the metrics after a re-run. On a tie, max()
# keeps the first candidate, so the logistic-regression baseline wins.
# NOTE(review): the ranking comes from a single small test split; consider
# cross-validation before trusting it.
ganador = max((metrics_lr, metrics_dt, metrics_rf), key=lambda m: m["AUC"])
mejor_modelo = ganador["pipeline"]

# joblib is imported in the notebook's top import cell
MODEL_PATH = os.path.join(MODELS_DIR, "modelo_L1_asistencia.pkl")
joblib.dump(mejor_modelo, MODEL_PATH)

MODEL_PATH

'../models/modelo_L1_asistencia.pkl'