# Train churn prediction models and save the best one.

In [10]:
import pandas as pd
import numpy as np
import os
import joblib

from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

## Load the cleaned dataset

In [3]:
data_path = "C:/projects/Customer-churn-predictor/data/Processed/telco_churn_clean.csv"
data = pd.read_csv(data_path)

## Split into features (x) and target (y)

In [4]:
x = data.drop("Churn", axis=1)
y = data["Churn"]

# Split data into test and train
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

## Scale feature (Important for Logistic Regression)

In [5]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Initialize models

In [11]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced')
}

results = {}

## Train and Evaluate Models

In [13]:
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(x_train_scaled if "Logistic" in name else x_train, y_train)

    # Predict
    y_pred = model.predict(x_test_scaled if "Logistic" in name else x_test)

    # For logistic regression only
    if "Logistic" in name:
        y_probs = model.predict_proba(x_test_scaled)[:, 1]
        precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

        # Choose threshold that gives good balance
        chosen_threshold = 0.4
        y_pred = (y_probs >= chosen_threshold).astype(int)

    # Evaluate
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    print(f"{name} - Accuracy: {acc:.4f}, ROC-AUC: {auc:.4f}")
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))

    results[name] = {"model": model, "accuracy": acc, "auc": auc}


Training Logistic Regression...
Logistic Regression - Accuracy: 0.6991, ROC-AUC: 0.7525
              precision    recall  f1-score   support

           0       0.93      0.64      0.76      1035
           1       0.46      0.87      0.60       374

    accuracy                           0.70      1409
   macro avg       0.70      0.75      0.68      1409
weighted avg       0.81      0.70      0.72      1409

[[661 374]
 [ 50 324]]

Training Random Forest...
Random Forest - Accuracy: 0.7878, ROC-AUC: 0.6933
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.63      0.49      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409

[[926 109]
 [190 184]]


## Choose the best model based on ROC-AUC

In [14]:
best_model_name = max(results, key=lambda x: results[x]["auc"])
best_model = results[best_model_name]["model"]

print(f"\nBest model: {best_model_name}")


Best model: Logistic Regression


## Save model and scaler

In [9]:
os.makedirs("../models", exist_ok=True)
joblib.dump(best_model, f"../models/{best_model_name.replace(' ', '_').lower()}.pkl")
joblib.dump(scaler, "../models/scaler.pkl")


['../models/scaler.pkl']