In [5]:
# --- Imports (standard library first, then third-party) ---------------------
import json
import os
import pickle
import time
import warnings

import joblib
# Bind `plt` to the pyplot submodule, not the top-level matplotlib package:
# the conventional plotting entry points (plt.subplots, plt.figure, plt.show,
# plt.savefig) are defined in matplotlib.pyplot and do not exist as callables
# on the bare `matplotlib` package.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    roc_curve,
    precision_recall_curve,
    RocCurveDisplay,
)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

# --- Session configuration ---------------------------------------------------
# Installs an "ignore" filter that matches every warning category.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries above; consider narrowing it with `category=`/`module=`.
warnings.filterwarnings('ignore')

# Create the directories this notebook works with. 'models' is where the
# preprocessed-data pickle is read from; 'results/figures' is presumably the
# destination for saved plots (TODO confirm). exist_ok=True keeps the cell
# safe to re-run.
os.makedirs('models', exist_ok=True)
os.makedirs('results/figures', exist_ok=True)

In [6]:
# Step 1: read the pickled train/test split into the notebook namespace.

print("\nDecision Tree Classifier")
print("=" * 60)

print("\nStep 1: Load preprocessed data")
print("=" * 60)

# The loaded object is indexed by string key below; only these five entries
# are read in this cell.
with open('models/preprocessed_data.pkl', mode='rb') as pkl_file:
    preprocessed_data = pickle.load(pkl_file)

# Pull the entries out in a fixed order and bind them to same-named variables.
feature_names, X_train, X_test, y_train, y_test = (
    preprocessed_data[key]
    for key in ('feature_names', 'X_train', 'X_test', 'y_train', 'y_test')
)

# Quick sanity summary: row counts, feature count, per-class label counts.
print("Data loaded successfully.")
print(f"Training data samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")
print(f"Features: {X_train.shape[1]}")
# np.bincount counts occurrences of each non-negative integer label.
for split_name, split_labels in (("train", y_train), ("test", y_test)):
    print(f"Class distribution ({split_name}): {np.bincount(split_labels)}")



Decision Tree Classifier

Step 1: Load preprocessed data
Data loaded successfully.
Training data samples: 8278
Test samples: 1409
Features: 19
Class distribution (train): [4139 4139]
Class distribution (test): [1035  374]


In [7]:
# Step 2: fit an untuned decision tree as a reference point and score it on
# the held-out test split.
print("Step 2: Baseline decision tree model")
print("=" * 60)

print("\nTraining baseline decision tree with default parameters.")

# Every hyperparameter is left at its default; random_state is pinned so the
# fitted tree is the same on every run. fit() returns the estimator itself,
# so construction and training are chained into one statement.
bs_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hard class labels (0/1), used by the threshold-based metrics.
y_pred_baseline = bs_model.predict(X_test)
# Second column of predict_proba: probability of class 1 (churn), which is
# what ROC-AUC needs.
y_pred_prob_baseline = bs_model.predict_proba(X_test)[:, 1]

print("Baseline model performance")
print("=" * 60)

# Label-based metrics, computed in order:
#   accuracy  - share of all predictions that are correct
#   precision - of the customers predicted to churn, how many actually did
#   recall    - of the customers who actually churned, how many were caught
#   f1        - harmonic mean of precision and recall
bs_accuracy, bs_precision, bs_recall, bs_f1 = (
    scorer(y_test, y_pred_baseline)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Ranking quality from the predicted probabilities:
# 0.5 is random guessing, 1.0 is perfect separation.
bs_roc_auc = roc_auc_score(y_test, y_pred_prob_baseline)

print(f"Accuracy: {bs_accuracy:.4f}")
print(f"Precision: {bs_precision:.4f}")
print(f"Recall: {bs_recall:.4f}")
print(f"F1-Score: {bs_f1:.4f}")
print(f"ROC-AUC: {bs_roc_auc:.4f}")

print("\nClassification report:")
print(
    classification_report(
        y_test,
        y_pred_baseline,
        target_names=['No Churn', 'Churn'],
    )
)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred_baseline)
print("\nConfusion Matrix")
print(cm)




Step 2: Baseline decision tree model

Training baseline decision tree with default parameters.
Baseline model performance
Accuracy: 0.7175
Precision: 0.4729
Recall: 0.5588
F1-Score: 0.5123
ROC-AUC: 0.6666

Classification report:
              precision    recall  f1-score   support

    No Churn       0.83      0.77      0.80      1035
       Churn       0.47      0.56      0.51       374

    accuracy                           0.72      1409
   macro avg       0.65      0.67      0.66      1409
weighted avg       0.73      0.72      0.72      1409


Confusion Matrix
[[802 233]
 [165 209]]


In [None]:
#hyperparameter tuning
print("")