# Train churn prediction models and save the best one.

In [None]:

import pandas as pd
import numpy as np
import os
import joblib

from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

## Load the cleaned dataset

In [None]:
# Absolute, machine-specific location of the cleaned Telco churn CSV.
data_path = "C:/projects/Customer-churn-predictor/data/Processed/telco_churn_clean.csv"
# Load the full table; the cells below expect it to contain a "Churn" column.
data = pd.read_csv(filepath_or_buffer=data_path)

## Split into features (x) and target (y)

In [None]:
# Separate the target column from the predictor columns.
y = data["Churn"]
x = data.drop(columns="Churn")

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both partitions; the fixed seed makes the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## Scale data (Important for Logistic Regression)

In [None]:
# Learn the scaling statistics from the training split only, then apply that
# same fitted transformation to both splits so the test data never influences
# the scaler.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Train and Evaluate Models

### Logistic Regression Grid Search

In [None]:
# Search space: regularisation parameter C crossed with two solvers, all using
# the L2 penalty.
log_params = {
    'C': [0.01, 0.1, 1, 5, 10],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2'],
}

log_model = LogisticRegression(max_iter=1000, random_state=42)

# Exhaustive 5-fold cross-validated search scored by ROC-AUC, run on all cores.
log_grid = GridSearchCV(log_model, log_params, cv=5, scoring='roc_auc', n_jobs=-1)
# Fit on the scaled training features. The scaling cell names this array
# `x_train_scaled` (lowercase); the capitalised spelling is never defined and
# raised NameError.
log_grid.fit(x_train_scaled, y_train)

print(f"Best Logistic Regression Params: {log_grid.best_params_}")
# Estimator refit on the whole training split with the best parameters.
best_log_model = log_grid.best_estimator_

Best Logistic Regression Params: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}


### Random Forest Grid Search (Base)

In [None]:
# Baseline search space for the random forest: 2 * 4 * 3 * 3 = 72 combinations.
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

rf_model = RandomForestClassifier(random_state=42)
# Exhaustive 5-fold cross-validated search scored by ROC-AUC, run on all cores.
rf_grid = GridSearchCV(rf_model, rf_params, cv=5, scoring='roc_auc', n_jobs=-1)
# The forest is trained on the unscaled training features. The split cell
# names them `x_train` (lowercase); the capitalised spelling is never defined
# and raised NameError.
rf_grid.fit(x_train, y_train)

print(f"Best Random Forest Params (GridSearch): {rf_grid.best_params_}")
# Estimator refit on the whole training split with the best parameters.
best_rf_model = rf_grid.best_estimator_

### Advanced Random Forest Fine-Tuning (RandomizedSearchCV)

In [None]:
print("\nRunning advanced Random Forest fine-tuning...\n")

# Wider search space than the baseline grid; it additionally varies
# max_features. Only a random sample of these combinations is evaluated.
param_dist = {
    'n_estimators': [200, 300, 400, 500],
    'max_depth': [8, 10, 12, 15, 20, None],
    'min_samples_split': [2, 4, 6, 8],
    'min_samples_leaf': [1, 2, 3, 4],
    'max_features': ['sqrt', 'log2', None],
}

# Sample 20 parameter settings, each evaluated with 5-fold cross-validation
# and scored by ROC-AUC. The fixed random_state makes the sampled settings
# repeatable.
rf_random = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=20,
    scoring='roc_auc',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

# Train on the unscaled training features. The split cell names them
# `x_train` (lowercase); the capitalised spelling is never defined and raised
# NameError.
rf_random.fit(x_train, y_train)
print("Best Random Forest Params (RandomizedSearch):", rf_random.best_params_)

# Estimator refit on the whole training split with the best sampled parameters.
best_rf_tuned = rf_random.best_estimator_


## Evaluate Models

In [None]:
# Pair every tuned estimator with the test features in the representation it
# was trained on: scaled for logistic regression, raw for the random forests.
# The split/scaling cells define these arrays in lowercase (`x_test`,
# `x_test_scaled`); the capitalised spellings are never defined and raised
# NameError.
models = {
    "Logistic Regression": (best_log_model, x_test_scaled),
    "Random Forest (GridSearch)": (best_rf_model, x_test),
    "Random Forest (Fine-Tuned)": (best_rf_tuned, x_test),
}

for name, (model, x_eval) in models.items():
    # Hard class predictions feed accuracy and the classification report;
    # the positive-class probability (column 1) feeds ROC-AUC.
    y_pred = model.predict(x_eval)
    y_prob = model.predict_proba(x_eval)[:, 1]
    print(f"\n{name} - Accuracy: {accuracy_score(y_test, y_pred):.4f}, ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")
    print(classification_report(y_test, y_pred))

## Choose the best model based on ROC-AUC

In [None]:
# One entry per candidate: display name, fitted estimator, and the test
# features in the representation that estimator was trained on (scaled for
# logistic regression, raw for the forests). The arrays are named in lowercase
# by the split/scaling cells; the capitalised spellings are never defined and
# raised NameError.
candidates = {
    "log": ("Logistic Regression", best_log_model, x_test_scaled),
    "rf_grid": ("Random Forest (GridSearch)", best_rf_model, x_test),
    "rf_tuned": ("Random Forest (Fine-Tuned)", best_rf_tuned, x_test),
}

# Test-set ROC-AUC per candidate, computed from the positive-class probability.
# NOTE(review): picking the winner on the test set makes the winner's reported
# test ROC-AUC optimistic; consider comparing the searches' cross-validated
# best_score_ values instead and keeping the test set for the final report.
scores = {
    key: roc_auc_score(y_test, estimator.predict_proba(features)[:, 1])
    for key, (_label, estimator, features) in candidates.items()
}

# max() returns the first key with the highest score, so ties resolve in the
# insertion order above.
best_key = max(scores, key=scores.get)
model_name, best_model = candidates[best_key][:2]

print(f"\nBest model after all tuning: {model_name}")


NameError: name 'results' is not defined

## Save model and scaler

In [None]:
# Destination file -> object to persist. The scaler is stored next to the
# model; note that only the logistic regression was trained on scaled
# features, while the random forests were trained on the raw features.
artifacts = {
    "C:/projects/Customer-churn-predictor/models/best_model.pkl": best_model,
    "C:/projects/Customer-churn-predictor/models/scaler.pkl": scaler,
}

# Create the output directory on first run; do nothing if it already exists.
os.makedirs("C:/projects/Customer-churn-predictor/models", exist_ok=True)
for target_path, artifact in artifacts.items():
    joblib.dump(artifact, target_path)

print("Best model and scaler saved successfully!")


['../models/scaler.pkl']