In [5]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    RocCurveDisplay
)

sns.set(style="whitegrid")
os.makedirs("../images", exist_ok=True)


In [2]:

data = pd.read_csv("../DATA/churn_data.csv")
data["TotalCharges"] = pd.to_numeric(data["TotalCharges"], errors="coerce")
data = data.drop(columns=["customerID"])
print("Shape:", data.shape)
print(data.dtypes)
print(data.isnull().sum())
data.head()


Shape: (7043, 8)
tenure                int64
PhoneService         object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object
tenure               0
PhoneService         0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64


Unnamed: 0,tenure,PhoneService,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,34,Yes,One year,No,Mailed check,56.95,1889.5,No
2,2,Yes,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,45,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,2,Yes,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:

X = data.drop(columns=["Churn"])
y = data["Churn"]      
X_train, X_test, y_train, y_test = train_test_split( X, y,test_size=0.35,random_state=42)



In [4]:

numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

cat_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, numeric_cols),
        ("cat", cat_pipeline, categorical_cols)
    ]
)


In [6]:
models = {
    "LogisticRegression": LogisticRegression(
    
        class_weight="balanced"
    ),
    "DecisionTree": DecisionTreeClassifier(
        max_depth=5,
        random_state=42,
        class_weight="balanced"
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=42,
        class_weight="balanced"
    )
}


In [7]:
results = []

y_test_binary = (y_test == "Yes").astype(int)
for name, model in models.items():
    print("="*60)
    print(f"Training model: {name}")
    
   
    clf = Pipeline(steps=[
        ("preprocess", preprocessor),
        ("model", model)
    ])
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_proba = clf.predict_proba(X_test)[:, 1]
    
    # metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, pos_label="Yes")
    rec = recall_score(y_test, y_pred, pos_label="Yes")
    f1 = f1_score(y_test, y_pred, pos_label="Yes")
    auc = roc_auc_score(y_test_binary, y_proba)
    
    print(f"Accuracy     : {acc:.4f}")
    print(f"Precision(Yes): {prec:.4f}")
    print(f"Recall(Yes)  : {rec:.4f}")
    print(f"F1-score(Yes): {f1:.4f}")
    print(f"ROC-AUC      : {auc:.4f}")
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred, labels=["No", "Yes"])
    print(cm)
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    # Save confusion matrix heatmap
    plt.figure(figsize=(4,4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(f"Confusion Matrix - {name}")
    plt.tight_layout()
    plt.savefig(f"../images/confusion_{name}.png")
    plt.close()
    
    # Save ROC curve
    fig, ax = plt.subplots(figsize=(5,4))
    RocCurveDisplay.from_predictions(y_test_binary, y_proba, ax=ax)
    ax.set_title(f"ROC Curve - {name}")
    plt.tight_layout()
    plt.savefig(f"../images/roc_{name}.png")
    plt.close()
   
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision_Yes": prec,
        "Recall_Yes": rec,
        "F1_Yes": f1,
        "ROC_AUC": auc
    })


results_df = pd.DataFrame(results).sort_values("ROC_AUC", ascending=False)
results_df


Training model: LogisticRegression
Accuracy     : 0.7311
Precision(Yes): 0.5026
Recall(Yes)  : 0.8565
F1-score(Yes): 0.6335
ROC-AUC      : 0.8517

Confusion Matrix:
[[1230  567]
 [  96  573]]

Classification Report:
              precision    recall  f1-score   support

          No       0.93      0.68      0.79      1797
         Yes       0.50      0.86      0.63       669

    accuracy                           0.73      2466
   macro avg       0.72      0.77      0.71      2466
weighted avg       0.81      0.73      0.75      2466

Training model: DecisionTree
Accuracy     : 0.7251
Precision(Yes): 0.4959
Recall(Yes)  : 0.8236
F1-score(Yes): 0.6191
ROC-AUC      : 0.8308

Confusion Matrix:
[[1237  560]
 [ 118  551]]

Classification Report:
              precision    recall  f1-score   support

          No       0.91      0.69      0.78      1797
         Yes       0.50      0.82      0.62       669

    accuracy                           0.73      2466
   macro avg       0.70      

Unnamed: 0,Model,Accuracy,Precision_Yes,Recall_Yes,F1_Yes,ROC_AUC
0,LogisticRegression,0.731144,0.502632,0.856502,0.633499,0.851676
1,DecisionTree,0.725061,0.49595,0.823617,0.619101,0.830761
2,RandomForest,0.774939,0.601786,0.503737,0.548413,0.816568


In [8]:
import joblib

# Pick best model by ROC-AUC
best_row = results_df.iloc[0]
best_model_name = best_row["Model"]
print("Best model based on ROC-AUC:", best_model_name)

best_model = models[best_model_name]

# Rebuild pipeline with best model
best_clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", best_model)
])

# Fit on FULL dataset (X, y)
best_clf.fit(X, y)

# Create models directory and save
os.makedirs("../models", exist_ok=True)
model_path = f"../models/churn_{best_model_name}.pkl"
joblib.dump(best_clf, model_path)

print(f"Best model saved to: {model_path}")


Best model based on ROC-AUC: LogisticRegression
Best model saved to: ../models/churn_LogisticRegression.pkl
