# 03 â€” Model Training

Train multiple models, compare results, and save the best model.


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import joblib
from sklearn.metrics import classification_report, roc_auc_score
import os


In [2]:
PROCESSED = '../data/processed/Telco_Customer_Churn.csv'
MODEL_PATH = '../models/churn_model.pkl'
PREPROC_PATH = '../models/preprocessor.pkl'
df = pd.read_csv(PROCESSED)
df.shape


(7043, 22)

In [3]:
if 'Churn' not in df.columns:
    raise ValueError('Processed CSV must contain Churn column')
X = df.drop(columns=['Churn'])
y = df['Churn'].apply(lambda x: 1 if str(x).lower() in ['yes','y','true','1'] else 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print('Train size:', X_train.shape, 'Test size:', X_test.shape)


Train size: (5634, 21) Test size: (1409, 21)


In [4]:
preproc_obj = joblib.load(PREPROC_PATH)
preprocessor = preproc_obj['preprocessor']
Xtr = preprocessor.transform(X_train)
Xte = preprocessor.transform(X_test)


In [5]:
models = {
    'logistic': LogisticRegression(max_iter=1000),
    'random_forest': RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42),
    'xgboost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}
results = {}
for name, m in models.items():
    print('Training', name)
    m.fit(Xtr, y_train)
    preds = m.predict(Xte)
    probs = m.predict_proba(Xte)[:,1]
    results[name] = {
        'report': classification_report(y_test, preds, output_dict=True),
        'roc_auc': roc_auc_score(y_test, probs),
        'model': m
    }
    print(name, 'ROC AUC:', results[name]['roc_auc'])


Training logistic
logistic ROC AUC: 0.8420289855072465
Training random_forest
random_forest ROC AUC: 0.818242527577566
Training xgboost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgboost ROC AUC: 0.8226962721847633


In [6]:
for name, r in results.items():
    print(name, 'ROC AUC:', r['roc_auc'])
best_name = max(results.keys(), key=lambda k: results[k]['roc_auc'])
best_model = results[best_name]['model']
print('Best model:', best_name)


logistic ROC AUC: 0.8420289855072465
random_forest ROC AUC: 0.818242527577566
xgboost ROC AUC: 0.8226962721847633
Best model: logistic


In [7]:
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump({'preprocessor': preprocessor, 'model': best_model}, MODEL_PATH)
print('Saved model to', MODEL_PATH)


Saved model to ../models/churn_model.pkl


## Next steps
- Run `04_model_evaluation.ipynb` for detailed metrics and plots
- Optionally perform hyperparameter tuning in `05_hyperparameter_tuning.ipynb`
