In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# 1. Load data
# Keep the frame exactly as read so the cleaning below can be re-run (and
# inspected) without going back to disk.
df_raw = pd.read_csv('customer_churn.csv')

# Basic cleaning, expressed as one chain from the raw frame:
#   * TotalCharges is parsed as a number; anything unparseable becomes NaN
#     (errors='coerce') and those rows are then dropped,
#   * customerID is removed so it is not used as a feature,
#   * Churn is encoded as 1 ('Yes') / 0 ('No').
df = (
    df_raw
    .assign(TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
    .drop(columns=['customerID'])
    .assign(Churn=lambda frame: frame['Churn'].map({'Yes': 1, 'No': 0}))
)

# Series.map yields NaN for any label missing from the mapping. Fail here with
# a clear message instead of letting NaN targets reach the split / the model.
unmapped = df['Churn'].isna()
if unmapped.any():
    bad_labels = df_raw.loc[unmapped[unmapped].index, 'Churn'].unique().tolist()
    raise ValueError(f"Unexpected Churn labels (expected 'Yes'/'No'): {bad_labels}")

# Features & target
X = df.drop(columns='Churn')
y = df['Churn']

# Train-test split: 20% held out, stratified on the target so both splits keep
# the same churn rate; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Column definitions: these two lists decide which columns the model sees.
# NOTE(review): ColumnTransformer discards every column that is not listed
# below (remainder='drop' is its default). The commonly distributed Telco churn
# CSV also contains a `SeniorCitizen` column; if this file has it too, it is
# currently ignored by the model. Confirm that this is intended.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

categorical_cols = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
]

# Numeric columns are standardised. Categorical columns are one-hot encoded
# with the first level of each feature dropped (it becomes the reference
# level) and a dense array as output. With handle_unknown='ignore', a category
# unseen during fit does not raise at transform time.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
)

# Preprocessor: output columns follow the order of `transformers`
# (numeric block first, then the one-hot block).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

# Pipeline + model: preprocessing and classifier live in one estimator, so the
# scaler and encoder are fitted on the training split only when fit() is called.
log_reg = LogisticRegression(
    class_weight='balanced',   # reweight classes to offset the churn / no-churn imbalance
    max_iter=2000,             # higher iteration cap than the library default
    random_state=42,
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', log_reg),
])

# Train the model; the fitted `model` is what the evaluation code relies on.
print("Training...")
model.fit(X_train, y_train)
print("Model trained.")

# ────────────────────────────────────────────────
# Coefficient extraction (needs the fitted `model`)
# ────────────────────────────────────────────────

fitted_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
fitted_classifier = model.named_steps['classifier']

# Names are assembled in the same order the preprocessor emits its columns:
# the scaled numeric columns first, then the one-hot columns.
feature_names = numerical_cols + list(
    fitted_encoder.get_feature_names_out(categorical_cols)
)

# Binary problem: coef_ has a single row and intercept_ a single entry.
coefs = fitted_classifier.coef_[0]
intercept = fitted_classifier.intercept_[0]

# One row per model input, ordered by magnitude of the (rounded) coefficient.
# Numeric inputs were standardised and one-hot inputs are relative to the
# dropped first level, so the sizes are not on a common raw-unit scale.
df_coefs = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefs.round(4),
}).sort_values('coefficient', key=lambda col: col.abs(), ascending=False)

print("\nIntercept:", round(intercept, 4))
print("\nTop 15 strongest coefficients:")
print(df_coefs.head(15).to_string(index=False))

Training...
Model trained.

Intercept: -0.5232

Top 15 strongest coefficients:
                       feature  coefficient
             Contract_Two year      -1.4805
                        tenure      -1.2475
   InternetService_Fiber optic       1.1420
             Contract_One year      -0.8012
                  TotalCharges       0.6095
                MonthlyCharges      -0.4741
PaymentMethod_Electronic check       0.4198
               StreamingTV_Yes       0.3821
              PhoneService_Yes      -0.3770
           StreamingMovies_Yes       0.3708
            OnlineSecurity_Yes      -0.3555
             MultipleLines_Yes       0.3413
               TechSupport_Yes      -0.3213
                Dependents_Yes      -0.2671
          PaperlessBilling_Yes       0.2612


In [5]:
# 
from sklearn.metrics import roc_auc_score, confusion_matrix

# Evaluate the fitted `model` on the held-out test split
# (requires model.fit(X_train, y_train) to have run).

# Get predictions
y_pred = model.predict(X_test)                  # hard labels: 0 or 1
y_prob = model.predict_proba(X_test)[:, 1]      # probability of churn (class 1)

# 1. ROC AUC, with a rough verbal bucket for quick reading
auc_score = roc_auc_score(y_test, y_prob)
if auc_score < 0.7:
    quality = 'poor'
elif auc_score < 0.8:
    quality = 'okay'
elif auc_score < 0.9:
    quality = 'good'
else:
    quality = 'very good'

print(f"ROC AUC : {auc_score:.4f}")
print(f"   → Model quality: {quality}")

# 2. Confusion Matrix
# labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted,
# index 0 = stay, index 1 = churn) even if one class never shows up in
# y_test / y_pred; without it the 4-way unpacking below would fail.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
total = cm.sum()

print("\nConfusion Matrix:")
print(cm)

# Nicer formatted version with labels
print("\n                  Predicted")
print("                  Stay     Churn")
print("Actual  Stay   ", f"{tn:5d}     {fp:5d}")
print("        Churn  ", f"{fn:5d}     {tp:5d}")

# Quick interpretation in plain English.
# The percentages are shares of the whole test set, not per-class rates
# (i.e. they are not recall / specificity).
print("\nSimple summary:")
print(f"• Correctly predicted stay       : {tn} ({tn/total:.1%})")
print(f"• Correctly caught churners      : {tp} ({tp/total:.1%})")
print(f"• False alarms (warned but stayed): {fp}")
print(f"• Missed leavers                 : {fn}")

ROC AUC : 0.8342
   → Model quality: good

Confusion Matrix:
[[724 309]
 [ 77 297]]

                  Predicted
                  Stay     Churn
Actual  Stay      724       309
        Churn      77       297

Simple summary:
• Correctly predicted stay       : 724 (51.5%)
• Correctly caught churners      : 297 (21.1%)
• False alarms (warned but stayed): 309
• Missed leavers                 : 77
