# Notebook 03 – Decision Tree Model & Hyperparameter Tuning

In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

Load raw train-test data

In [2]:
# Feature matrices for the two splits, read as-is from the results directory.
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ("results/X_train_raw.csv", "results/X_test_raw.csv")
)
# Target files: only the 'Churn' column is kept, giving one Series per split.
y_train, y_test = (
    pd.read_csv(csv_path)['Churn']
    for csv_path in ("results/y_train.csv", "results/y_test.csv")
)

Load preprocessing pipeline

In [3]:
# Restore the serialized preprocessing object from the results directory
# (presumably written by an earlier notebook — confirm it is up to date).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts produced by this project.
preprocessor = joblib.load("results/preprocessor.pkl")

Create Decision Tree pipeline

In [4]:
# Chain the loaded preprocessor with a seeded decision tree so that each
# cross-validation fold fits the preprocessing and the classifier together.
dt_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('clf', DecisionTreeClassifier(random_state=42)),
    ]
)

Define hyperparameter grid

In [5]:
# Search space for the tree; the 'clf__' prefix routes each setting to the
# pipeline step named 'clf'. 5 x 3 x 3 = 45 candidate combinations.
param_grid = dict(
    clf__max_depth=[3, 5, 7, 10, None],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
)

Stratified 5-fold cross-validation

In [6]:
# Five stratified folds, shuffled with a fixed seed so the splits (and therefore
# the cross-validation scores) are repeatable across runs.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

Grid search

In [7]:
# Exhaustive search over param_grid, ranking candidates by ROC AUC on the
# stratified folds; n_jobs=-1 uses every available core.
grid_dt = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

Fit Decision Tree

In [9]:
# Run the grid search on the training split.
grid_dt.fit(X_train, y_train)

# Report the winning configuration and its cross-validated ROC AUC.
for caption, result in (
    ("Best Parameters:", grid_dt.best_params_),
    ("Best CV ROC AUC:", grid_dt.best_score_),
):
    print(caption, result)

# Persist the best pipeline found by the search for later use.
joblib.dump(grid_dt.best_estimator_, "results/best_decision_tree.pkl")

Fitting 5 folds for each of 45 candidates, totalling 225 fits
Best Parameters: {'clf__max_depth': 5, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 10}
Best CV ROC AUC: 0.8290997129050837


['results/best_decision_tree.pkl']

Evaluate on test set

In [11]:
# Score the held-out test split with the best pipeline from the grid search.
best_dt = grid_dt.best_estimator_
y_pred = best_dt.predict(X_test)
# Column 1 of predict_proba is used as the score for ROC AUC.
y_proba = best_dt.predict_proba(X_test)[:, 1]

# Metrics computed from hard predictions, printed in a fixed order.
label_based_metrics = (
    ("Decision Tree Test Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for heading, metric in label_based_metrics:
    print(heading, metric(y_test, y_pred))

# ROC AUC is computed from the probability scores instead of the hard labels.
print("ROC AUC:", roc_auc_score(y_test, y_proba))

Decision Tree Test Accuracy: 0.7984386089425124
Precision: 0.6347305389221557
Recall: 0.5668449197860963
F1 Score: 0.5988700564971752
ROC AUC: 0.829725645198791


Feature importance

In [14]:
# Rank the tree's features by importance and show the top ten.
# Best-effort: any failure is reported as a message instead of stopping the notebook.
try:
    clf = best_dt.named_steps['clf']
    pre = best_dt.named_steps['preprocessor']
    try:
        # Preferred: ask the fitted preprocessor for its own output column names so
        # each label lines up with the column the tree actually saw, whatever
        # order the transformers were declared in.
        raw_names = [str(name) for name in pre.get_feature_names_out()]
        # Names may come back as "<transformer>__<column>"; drop that prefix so the
        # labels read like the original column / one-hot names.
        prefixes = tuple(f"{name}__" for name in pre.named_transformers_)
        feature_names = [
            next((name[len(prefix):] for prefix in prefixes if name.startswith(prefix)), name)
            for name in raw_names
        ]
    except Exception:
        # Fallback for preprocessors that cannot report output names: rebuild them
        # by hand. NOTE(review): this assumes numeric columns come first, followed
        # by the 'cat' transformer's encoded columns — confirm against the
        # preprocessor definition.
        cat_cols = pre.named_transformers_['cat'].named_steps['encoder'].get_feature_names_out()
        feature_names = X_train.select_dtypes(include=['int64','float64']).columns.tolist() + cat_cols.tolist()
    importances = clf.feature_importances_
    # Fail loudly (into the handler below) rather than mislabel importances.
    if len(feature_names) != len(importances):
        raise ValueError(
            f"{len(feature_names)} feature names for {len(importances)} importances"
        )
    feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    print(feat_imp.head(10))
except Exception as e:
    print("Could not compute feature importances automatically:", e)

Contract_Month-to-month           0.515311
InternetService_Fiber optic       0.163687
tenure                            0.157125
MonthlyCharges                    0.036015
TotalCharges                      0.035884
PaymentMethod_Electronic check    0.028004
TechSupport_No                    0.021433
Contract_One year                 0.009242
OnlineBackup_No                   0.009222
OnlineSecurity_No                 0.008642
dtype: float64
