In [1]:
# 03-Modeling.ipynb

import os
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# 1) Make sure the models folder exists
os.makedirs("models", exist_ok=True)

# 2) Load your train/test feature CSVs
X_train = pd.read_csv("data/features/X_train.csv")
X_test  = pd.read_csv("data/features/X_test.csv")
y_train = pd.read_csv("data/features/y_train.csv").squeeze()
y_test  = pd.read_csv("data/features/y_test.csv").squeeze()

# 3) Fit a baseline Logistic Regression
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# 4) Evaluate on train & test
y_train_proba = model.predict_proba(X_train)[:,1]
y_test_proba  = model.predict_proba(X_test) [:,1]
y_test_pred   = model.predict(X_test)

print("Train ROC AUC:", roc_auc_score(y_train, y_train_proba).round(3))
print(" Test ROC AUC:", roc_auc_score(y_test,  y_test_proba).round(3))

print("\nTest Classification Report:")
print(classification_report(y_test, y_test_pred))

print("Test Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

# 5) (Optional) Hyperparameter tuning for C
param_grid = {"C": [0.01, 0.1, 1, 10]}
grid = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    param_grid,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1
)
grid.fit(X_train, y_train)
print("Best C:", grid.best_params_)

# Use the best model if tuning
best_model = grid.best_estimator_

# 6) Save your final model & scaler
joblib.dump(best_model, "models/churn_model.pkl")
print("✅ Saved churn_model.pkl")

# If you also want to persist the scaler from your feature notebook,
# be sure to reload or re-create it here. For example:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler().fit(pd.concat([X_train[numeric_feats], X_test[numeric_feats]]))
# joblib.dump(scaler, "models/scaler.pkl")


Train ROC AUC: 0.833
 Test ROC AUC: 0.825

Test Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        92
           1       0.00      0.00      0.00         7

    accuracy                           0.93        99
   macro avg       0.46      0.50      0.48        99
weighted avg       0.86      0.93      0.90        99

Test Confusion Matrix:
[[92  0]
 [ 7  0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best C: {'C': 0.1}
✅ Saved churn_model.pkl
