## Logistic Regression Baseline (60/20/20 split)

A lightweight notebook that trains and evaluates a logistic regression classifier on the churn dataset using a 60% train / 20% validation / 20% test split. There is no SMOTE and no grid search; class imbalance is handled only through `class_weight='balanced'`.

In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score
)

import matplotlib.pyplot as plt
import seaborn as sns

import joblib

# Single seed shared by both train_test_split calls and the LogisticRegression
# estimator below, so the splits and the fit use the same fixed random state.
RANDOM_STATE = 42


### Step 1: Load Dataset

In [None]:
# Location of the churn CSV, given relative to the current working directory.
DATA_PATH = Path('churn_clean.csv')

# Read the full table into memory and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)


### Step 2: Prepare Features and Target

In [None]:
# Separate the label column ('Exited') from the predictor columns.
X = df.drop(columns='Exited')
y = df['Exited']

# Route columns by "is it numeric/boolean?" instead of "is it object?".
# Splitting on `object` alone sends every other non-numeric dtype (e.g.
# `category`, a dedicated string dtype, datetimes) into the numeric list,
# where StandardScaler would fail on it. Using the same dtype list for
# `include` and `exclude` also guarantees each column lands in exactly one list.
NUMERIC_DTYPES = ['number', 'bool']
num_cols = X.select_dtypes(include=NUMERIC_DTYPES).columns.tolist()
cat_cols = X.select_dtypes(exclude=NUMERIC_DTYPES).columns.tolist()

# Show both lists so the routing can be checked by eye before building the pipeline.
cat_cols, num_cols


### Step 3: Train/Validation/Test Split (60/20/20)

In [None]:
# Two-stage stratified split: hold out 40% of the rows first, then cut that
# hold-out in half so validation and test each receive 20% of the full data.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=RANDOM_STATE
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=RANDOM_STATE
)

# Print the shape and the mean of the target for every partition; with
# stratification the three target ratios should be close to one another.
for split_label, features, target in (
    ('Train:', X_train, y_train),
    ('Validation:', X_val, y_val),
    ('Test:', X_test, y_test),
):
    print(split_label, features.shape, 'Target ratio:', target.mean().round(3))


### Step 4: Build Pipeline

In [None]:
# Dense one-hot encoder that ignores categories unseen during fit.
# The dense-output flag is tried as `sparse_output` first; if the installed
# scikit-learn rejects that keyword with a TypeError, fall back to `sparse`.
encoder_options = {'handle_unknown': 'ignore'}
try:
    ohe = OneHotEncoder(sparse_output=False, **encoder_options)
except TypeError:
    ohe = OneHotEncoder(sparse=False, **encoder_options)

# One branch per column family: one-hot for categoricals, standardisation for numerics.
categorical_branch = ('categorical', ohe, cat_cols)
numeric_branch = ('numeric', StandardScaler(), num_cols)
preprocess = ColumnTransformer(transformers=[categorical_branch, numeric_branch])

# Class-weighted logistic regression with the liblinear solver.
classifier = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=1000,
    random_state=RANDOM_STATE,
)

# Chain preprocessing and the classifier so both are fitted and applied together.
log_reg_pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', classifier),
])
log_reg_pipeline


### Step 5: Train Model

In [None]:
# Fit on the training split only. fit() returns the pipeline itself, so the
# fitted estimator is also what this cell displays.
log_reg_pipeline.fit(X_train, y_train)


### Step 6: Evaluate on Validation and Test Sets

In [None]:
# Hard class labels for the validation and test splits.
y_val_pred, y_test_pred = (
    log_reg_pipeline.predict(split) for split in (X_val, X_test)
)

# Predicted probabilities; keep only column 1 of predict_proba's output.
y_val_proba, y_test_proba = (
    log_reg_pipeline.predict_proba(split)[:, 1] for split in (X_val, X_test)
)


In [None]:
# Same two-line report for each held-out split: ROC-AUC computed from the
# probabilities, followed by the classification report for the hard labels.
for auc_label, actual, scores, predicted in (
    ('Validation ROC-AUC:', y_val, y_val_proba, y_val_pred),
    ('Test ROC-AUC:', y_test, y_test_proba, y_test_pred),
):
    print(auc_label, round(roc_auc_score(actual, scores), 3))
    print(classification_report(actual, predicted))


In [None]:
# Confusion matrices (actual vs. predicted labels) for validation and test,
# kept under the names the plotting cell further down expects.
val_cm, test_cm = (
    confusion_matrix(actual, predicted)
    for actual, predicted in ((y_val, y_val_pred), (y_test, y_test_pred))
)
(val_cm, test_cm)


### Step 7: Visual Diagnostics

In [None]:
# Output folder for the diagnostic plots; the savefig calls in the cells below
# write into it, so make sure it exists first.
FIG_DIR = Path('figures')
FIG_DIR.mkdir(parents=True, exist_ok=True)  # no error if the folder is already there


In [None]:
# Validation split: ROC points, ROC-AUC, precision-recall points, average precision.
# The third array returned by each curve function is not used below, so it is
# bound to `_`.
val_fpr, val_tpr, _ = roc_curve(y_val, y_val_proba)
val_auc = roc_auc_score(y_val, y_val_proba)
val_prec, val_rec, _ = precision_recall_curve(y_val, y_val_proba)
val_ap = average_precision_score(y_val, y_val_proba)

# Test split: the same four quantities.
test_fpr, test_tpr, _ = roc_curve(y_test, y_test_proba)
test_auc = roc_auc_score(y_test, y_test_proba)
test_prec, test_rec, _ = precision_recall_curve(y_test, y_test_proba)
test_ap = average_precision_score(y_test, y_test_proba)


In [None]:
# ROC curves for both held-out splits on a single set of axes.
roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
roc_ax.plot(val_fpr, val_tpr, label=f'Validation AUC = {val_auc:.3f}')
roc_ax.plot(test_fpr, test_tpr, linestyle='--', label=f'Test AUC = {test_auc:.3f}')
roc_ax.plot([0, 1], [0, 1], 'k--', linewidth=1)  # diagonal reference line
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve — Logistic Regression Baseline')
roc_ax.legend(loc='lower right')
roc_fig.tight_layout()

# Write the figure to disk before showing it.
roc_fig.savefig(FIG_DIR / 'roc_curve_logreg_baseline.png', dpi=200)
plt.show()


In [None]:
# Precision-recall curves for both held-out splits on a single set of axes.
pr_fig, pr_ax = plt.subplots(figsize=(6, 4))
pr_ax.plot(val_rec, val_prec, label=f'Validation AP = {val_ap:.3f}')
pr_ax.plot(test_rec, test_prec, linestyle='--', label=f'Test AP = {test_ap:.3f}')
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision–Recall Curve — Logistic Regression Baseline')
pr_ax.legend(loc='lower left')
pr_fig.tight_layout()

# Write the figure to disk before showing it.
pr_fig.savefig(FIG_DIR / 'pr_curve_logreg_baseline.png', dpi=200)
plt.show()


In [None]:
# Side-by-side heatmaps: validation on the left, test on the right.
fig, axes = plt.subplots(1, 2, figsize=(9, 4))

# Each panel: target axes, matrix to draw, colour map, and title.
panels = (
    (axes[0], val_cm, 'Blues', 'Validation Confusion Matrix'),
    (axes[1], test_cm, 'Greens', 'Test Confusion Matrix'),
)
for panel_ax, matrix, palette, heading in panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, cbar=False, ax=panel_ax)
    panel_ax.set_title(heading)
    panel_ax.set_xlabel('Predicted')
    panel_ax.set_ylabel('Actual')

fig.tight_layout()
fig.savefig(FIG_DIR / 'confusion_matrices_logreg_baseline.png', dpi=200)
plt.show()


### Step 8: Inspect Coefficients

In [None]:
# Pull the fitted steps out of the pipeline by name.
fitted_steps = log_reg_pipeline.named_steps

# Feature names after preprocessing, in the order the model saw them.
feature_names = fitted_steps['preprocess'].get_feature_names_out()

# First row of the coefficient matrix (presumably the only row, assuming a binary target).
coef = fitted_steps['model'].coef_[0]

# Label each weight with its feature and list the 15 largest by absolute value.
coef_series = pd.Series(data=coef, index=feature_names)
coef_series.sort_values(key=pd.Series.abs, ascending=False).head(15)


### Step 9: Persist Model

In [None]:
# Save the whole fitted pipeline (preprocessing + model) as one artifact on disk.
MODEL_DIR = Path('models')
MODEL_DIR.mkdir(parents=True, exist_ok=True)  # no error if the folder is already there
MODEL_PATH = MODEL_DIR / 'log_reg_baseline.joblib'
# NOTE(review): joblib artifacts are pickle-based, so only load them from trusted
# locations; presumably they should also be loaded with the same scikit-learn
# version that produced them. Confirm before sharing or deploying.
joblib.dump(log_reg_pipeline, MODEL_PATH)
MODEL_PATH  # show where the artifact was written


### Step 10: Notes

- Pipeline keeps preprocessing and model together for easy reuse.
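- Validation metrics are meant for tuning decision thresholds before touching the hold-out test set; Step 6 prints test metrics alongside them for convenience, so any tuning should rely on the validation numbers only.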
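- The saved joblib file can be reloaded with `joblib.load(MODEL_PATH)` to score new data; load it only from a trusted location and in an environment with a compatible scikit-learn version, since joblib files are pickle-based.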