In [1]:
import numpy as np
import pandas as pd
from joblib import load
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
import sys
from pathlib import Path

# Project root = parent of the current working directory. This relies on the
# kernel being started from the notebooks/ folder (the data paths used later
# in this notebook are relative to notebooks/ as well).
ROOT = Path.cwd().parent

# Make the project root importable so `src.*` modules resolve. Only add it
# when missing: re-running this cell must not pile up duplicate entries.
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

In [3]:
from src.preprocessing import (
    load_and_clean_data,
    split_features_target,
    split_data
)

# Path is relative to notebooks/02_modeling.ipynb
raw_csv_path = "../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = load_and_clean_data(raw_csv_path)

# Separate features from target; the helper also returns two column lists
# (presumably categorical and numerical column names — confirm in src.preprocessing).
features_and_target = split_features_target(df)
X, y, cat_cols, num_cols = features_and_target

# Three-way split: train / validation / test for both features and target.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = split_data(X, y)

In [4]:
# Load the persisted preprocessor and transform every data split with it.
# NOTE(review): presumably fitted and saved by an earlier notebook — confirm.
preprocessor = load("../models/preprocessor.joblib")

# Only transform() is called here (no fitting), in train -> val -> test order.
X_train_pre, X_val_pre, X_test_pre = (
    preprocessor.transform(split) for split in (X_train, X_val, X_test)
)

In [5]:
# Model 1: logistic regression
# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression(max_iter=1000).fit(X_train_pre, y_train)

# Validation outputs: hard class labels, plus the second predict_proba column
# as the score used later for ROC AUC.
y_val_pred = logreg.predict(X_val_pre)
y_val_proba = logreg.predict_proba(X_val_pre)[:, 1]



In [6]:
# Model 2: random forest (100 trees, fixed seed)
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_pre, y_train
)

# Validation outputs: hard class labels, plus the second predict_proba column
# as the score used later for ROC AUC.
y_val_pred_rf = rf.predict(X_val_pre)
y_val_proba_rf = rf.predict_proba(X_val_pre)[:, 1]

In [7]:
# Model 3: XGBoost
# The seed is pinned explicitly, matching the random forest cell, so the run
# stays reproducible even if sampling hyper-parameters are enabled later.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train_pre, y_train)

# Validation outputs: hard class labels, plus the second predict_proba column
# as the score used later for ROC AUC.
y_val_pred_xgb = xgb.predict(X_val_pre)
y_val_proba_xgb = xgb.predict_proba(X_val_pre)[:, 1]

In [9]:
def evaluar_modelo(y_true, y_pred, y_proba):
    """Print a classification summary for one model.

    Writes, in order: the F1 score, the ROC AUC, the confusion matrix and the
    scikit-learn classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, compared against ``y_true``.
        y_proba: Scores handed to ``roc_auc_score``; callers in this notebook
            pass the second column of ``predict_proba``.

    Returns:
        None. Everything is written to stdout.
    """
    print("F1:", f1_score(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_proba))
    # Header and matrix are printed separately: passing both to a single
    # print() puts the separator space in front of the first matrix row and
    # shifts it out of alignment with the following rows.
    print("Matriz de confusión:")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))

In [10]:
# Validation metrics for the logistic regression model
print("Regresión logística:")
evaluar_modelo(y_true=y_val, y_pred=y_val_pred, y_proba=y_val_proba)

Regresión logística:
F1: 0.5568862275449101
ROC AUC: 0.8356313835927753
Matriz de confusión:
 [[925 108]
 [188 186]]
              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.63      0.50      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407



In [11]:
# Validation metrics for the random forest model
print("Random Forest:")
evaluar_modelo(y_true=y_val, y_pred=y_val_pred_rf, y_proba=y_val_proba_rf)

Random Forest:
F1: 0.5226917057902973
ROC AUC: 0.8137854543383841
Matriz de confusión:
 [[935  98]
 [207 167]]
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      1033
           1       0.63      0.45      0.52       374

    accuracy                           0.78      1407
   macro avg       0.72      0.68      0.69      1407
weighted avg       0.77      0.78      0.77      1407



In [12]:
# Validation metrics for the XGBoost model
print("XGBoost:")
evaluar_modelo(y_true=y_val, y_pred=y_val_pred_xgb, y_proba=y_val_proba_xgb)

XGBoost:
F1: 0.5369532428355958
ROC AUC: 0.8116435696869613
Matriz de confusión:
 [[922 111]
 [196 178]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.86      1033
           1       0.62      0.48      0.54       374

    accuracy                           0.78      1407
   macro avg       0.72      0.68      0.70      1407
weighted avg       0.77      0.78      0.77      1407



In [13]:
from joblib import dump

# Persist the three trained models, one joblib file per estimator.
for estimator, destination in (
    (logreg, "../models/churn_logreg.joblib"),
    (rf, "../models/churn_rf.joblib"),
    (xgb, "../models/churn_xgb.joblib"),
):
    dump(estimator, destination)

# Reached only if every dump() call above returned without raising.
print("✅ Modelos guardados correctamente en la carpeta 'models/'.")

✅ Modelos guardados correctamente en la carpeta 'models/'.
