# 05 — Hyperparameter Tuning

Grid / randomized search to improve model performance, plus a short section on cross-validation best practices.


In [4]:
import pandas as pd
import joblib
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, f1_score
import numpy as np
import os


In [5]:
# Input artefacts: the processed churn table and the pickled preprocessing bundle.
PROCESSED = '../data/processed/Telco_Customer_Churn.csv'
PREPROC_PATH = '../models/preprocessor.pkl'

df = pd.read_csv(PROCESSED)

# Every column except the target is treated as a feature.
X = df.drop(columns=['Churn'])

# Encode the target as 0/1: the stringified, lower-cased value counts as positive
# only when it is one of the listed spellings; anything else becomes 0.
y = df['Churn'].map(lambda label: int(str(label).lower() in ('yes', 'y', 'true', '1')))

# The loaded object is indexed by the 'preprocessor' key; that entry is only used
# to transform the features here (no fitting happens in this cell).
preproc_obj = joblib.load(PREPROC_PATH)
preprocessor = preproc_obj['preprocessor']
X_t = preprocessor.transform(X)


In [None]:
# Base estimator for the search; eval_metric='logloss' selects log-loss as
# XGBoost's evaluation metric.
xgb = XGBClassifier(eval_metric='logloss')

# Candidate values per hyperparameter (4 * 4 * 4 * 3 = 192 possible combinations).
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
}

# Sample 20 of those combinations and score each with 3-fold cross-validation
# on F1; random_state fixes which combinations get sampled.
rs = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
rs.fit(X_t, y)

# Report the winning configuration and its mean cross-validated score.
print('Best params:', rs.best_params_)
print('Best score:', rs.best_score_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits


## Notes
- Use a `scoring` metric that reflects the business objective (e.g., `recall` when catching churners matters most).
- Keep a held-out validation set separate from the data used for tuning to avoid data leakage.
- Save the best estimator (`rs.best_estimator_`) and update `models/churn_model.pkl`.
