In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

## 1. Load & Preview Data

In [5]:
# Read the feature-engineered Telco churn dataset from the data directory
csv_path = "../data/Telco_Customer_Churn_feature_engineered_v1.csv"
df = pd.read_csv(csv_path)

# Display the first rows as a quick sanity check of columns and values
df.head()

Unnamed: 0,customerID,IsSeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,HasPartnerOrDependents,ExpectedTotalCharges,BillingDiffPct,Churn_num,...,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,TenureGroup_Early,TenureGroup_Loyal,PackageChangeBehavior_Downgrade,PackageChangeBehavior_Upgrade
0,7590-VHVEG,0,1,29.85,29.85,No,1,29.85,0.0,0,...,False,False,True,False,True,False,False,False,False,False
1,5575-GNVDE,0,34,56.95,1889.5,No,0,1936.3,-0.02417,0,...,True,False,False,False,False,True,False,True,True,False
2,3668-QPYBK,0,2,53.85,108.15,Yes,0,107.7,0.004178,1,...,False,False,True,False,False,True,False,False,False,False
3,7795-CFOCW,0,45,42.3,1840.75,No,0,1903.5,-0.032965,0,...,True,False,False,False,False,False,False,True,True,False
4,9237-HQITU,0,2,70.7,151.65,Yes,0,141.4,0.072484,1,...,False,False,True,False,True,False,False,False,False,True


## 2. Model Training

In [6]:
# Features: everything in `df` except the customer identifier, the raw label
# ('Churn') and its numeric encoding ('Churn_num'); leaving either label column
# in X would leak the target into the features.
X = df.drop(columns=['customerID', 'Churn', 'Churn_num'])

# 'Unnamed: 0' shows up in the preview with values 0, 1, 2, ... and appears to be
# a leftover row index from the CSV export. It is not a customer attribute, so it
# must not be used as a feature. errors='ignore' keeps this cell working if the
# column is absent (e.g. the CSV is later read with index_col=0).
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

# Target: numeric churn flag
y = df['Churn_num']

# Split the data (80% train, 20% test), stratified on y to maintain the churn ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
# Baseline 1: logistic regression with a generous iteration cap and a fixed seed
logreg = LogisticRegression(max_iter=10000, random_state=42).fit(X_train, y_train)

# Hard class predictions on the held-out test set
y_pred_logreg = logreg.predict(X_test)

In [8]:
# Baseline 2: decision tree with default hyperparameters and a fixed seed
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions on the held-out test set
y_pred_tree = tree.predict(X_test)

In [9]:
# Baseline 3: random forest of 100 trees with a fixed seed
forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions on the held-out test set
y_pred_forest = forest.predict(X_test)

## 3. Model Evaluation

In [17]:
# Shared evaluation helper, called once per model in the cells that follow
def model_evalulation_metrics(y_pred, y_true=None):
    """Print classification metrics for one set of predictions.

    Parameters
    ----------
    y_pred : array-like
        Predicted class labels to score.
    y_true : array-like, optional
        Ground-truth labels. When omitted, the notebook-level ``y_test`` is
        used, so existing calls that pass only ``y_pred`` behave as before.

    Returns
    -------
    None
        Accuracy, precision, recall, F1 score, the confusion matrix and the
        classification report are printed rather than returned.
    """
    # Resolve the fallback at call time so the helper follows the current split
    if y_true is None:
        y_true = y_test

    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    print(f"Precision: {precision_score(y_true, y_pred):.4f}")
    print(f"Recall: {recall_score(y_true, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_true, y_pred):.4f}")
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))

In [18]:
# Score the logistic regression predictions against the held-out labels
print("Logistic Regression Metrics:")
model_evalulation_metrics(y_pred=y_pred_logreg)

Logistic Regression Metrics:
Accuracy: 0.8077
Precision: 0.6746
Recall: 0.5321
F1 Score: 0.5949

Confusion Matrix:
 [[939  96]
 [175 199]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.67      0.53      0.59       374

    accuracy                           0.81      1409
   macro avg       0.76      0.72      0.73      1409
weighted avg       0.80      0.81      0.80      1409



In [19]:
# Score the baseline (untuned) decision tree predictions against the held-out labels
print("Decision Tree Metrics:")
model_evalulation_metrics(y_pred=y_pred_tree)

Decision Tree Metrics:
Accuracy: 0.7360
Precision: 0.5026
Recall: 0.5134
F1 Score: 0.5079

Confusion Matrix:
 [[845 190]
 [182 192]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82      1035
           1       0.50      0.51      0.51       374

    accuracy                           0.74      1409
   macro avg       0.66      0.66      0.66      1409
weighted avg       0.74      0.74      0.74      1409



In [20]:
# Score the random forest predictions against the held-out labels
print("Random Forest Metrics:")
model_evalulation_metrics(y_pred=y_pred_forest)

Random Forest Metrics:
Accuracy: 0.7892
Precision: 0.6351
Recall: 0.4840
F1 Score: 0.5493

Confusion Matrix:
 [[931 104]
 [193 181]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.86      1035
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1409
   macro avg       0.73      0.69      0.71      1409
weighted avg       0.78      0.79      0.78      1409



### 📌 Conclusion

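Logistic Regression performed best overall, reaching 81% accuracy and the highest F1 score for the churn class (0.59), ahead of Random Forest (79% accuracy, F1 0.55) and the Decision Tree (74% accuracy, F1 0.51).
However, all three models struggle to catch churners (recall of roughly 48–53% on the churn class), suggesting a need for further feature engineering or class-balancing techniques such as class weights or resampling.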

## 4. Insights & Next Steps

### Fine Tuning

In [None]:
# Candidate settings for the decision tree: 5 * 4 * 4 * 2 = 160 combinations
param_grid = dict(
    max_depth=[3, 5, 10, 15, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
    criterion=['gini', 'entropy'],  # Gini impurity or information gain
)

# Exhaustive 5-fold cross-validated search that optimises recall,
# running on all available cores
grid_search = GridSearchCV(
    estimator=tree,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)

# Run the search on the training split only; the test split stays untouched
grid_search.fit(X_train, y_train)

# Report the winning combination
print(f"Best parameters: {grid_search.best_params_}")

# Tree with the winning hyperparameters
best_tree = grid_search.best_estimator_

# Predictions of the tuned tree on the held-out test set
y_pred_best_tree = best_tree.predict(X_test)


Best parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [22]:
# Score the tuned decision tree (best grid-search estimator) against the held-out labels
print("Decision Tree Metrics:")
model_evalulation_metrics(y_pred=y_pred_best_tree)

Decision Tree Metrics:
Accuracy: 0.7509
Precision: 0.5273
Recall: 0.5936
F1 Score: 0.5585

Confusion Matrix:
 [[836 199]
 [152 222]]

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.81      0.83      1035
           1       0.53      0.59      0.56       374

    accuracy                           0.75      1409
   macro avg       0.69      0.70      0.69      1409
weighted avg       0.76      0.75      0.76      1409

