In [12]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix
)


In [13]:
# Read the raw churn export; the path is relative, so it resolves against the
# notebook's current working directory.
raw_csv_path = "../data/raw/customer_churn.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows as a quick sanity check of the load.
df.head()


Unnamed: 0,Customer ID,Gender,Age,Under 30,Senior Citizen,Married,Dependents,Number of Dependents,Country,State,...,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Satisfaction Score,Customer Status,Churn Label,Churn Score,CLTV,Churn Category,Churn Reason
0,8779-QRDMV,Male,78,No,Yes,No,No,0,United States,California,...,20,0.0,59.65,3,Churned,Yes,91,5433,Competitor,Competitor offered more data
1,7495-OOKFY,Female,74,No,Yes,Yes,Yes,1,United States,California,...,0,390.8,1024.1,3,Churned,Yes,69,5302,Competitor,Competitor made better offer
2,1658-BYGOY,Male,71,No,Yes,No,Yes,3,United States,California,...,0,203.94,1910.88,2,Churned,Yes,81,3179,Competitor,Competitor made better offer
3,4598-XLKNJ,Female,78,No,Yes,Yes,Yes,1,United States,California,...,0,494.0,2995.07,2,Churned,Yes,88,5337,Dissatisfaction,Limited range of services
4,4846-WHAFZ,Female,80,No,Yes,Yes,Yes,1,United States,California,...,0,234.21,3102.36,2,Churned,Yes,67,2793,Price,Extra data charges


In [14]:
# Normalise the column labels step by step:
#   1. trim surrounding whitespace
#   2. lower-case
#   3. replace spaces with underscores (e.g. "Churn Label" -> "churn_label")
trimmed_labels = df.columns.str.strip()
lowered_labels = trimmed_labels.str.lower()
df.columns = lowered_labels.str.replace(" ", "_")


In [15]:
# Separate the prediction target from the candidate predictors.
target_col = "churn_label"
y = df[target_col]
X = df.drop(columns=[target_col])


In [16]:
# Identifier and churn-outcome columns that must not be used as predictors.
# NOTE(review): presumably customer_status / churn_score / churn_category /
# churn_reason are only known once the outcome is known -- confirm against the
# data dictionary. satisfaction_score is deliberately left in; verify that it
# is collected before churn, otherwise it belongs in this list as well.
leakage_cols = [
    "customer_id",
    "customer_status",
    "churn_score",
    "churn_category",
    "churn_reason"
]

# Rebuild X from `df` rather than from the previous X. The former
# `X = X.drop(columns=leakage_cols)` raised KeyError when this cell was run a
# second time (the columns were already gone); deriving X from `df` yields the
# same frame on a top-to-bottom run and keeps the cell idempotent.
X = df.drop(columns=["churn_label", *leakage_cols])


In [17]:
# One-hot encode the non-numeric (object/string/category) columns.
# drop_first=True removes the first level of every encoded column, leaving
# k-1 indicator columns for a k-level categorical.
X = pd.get_dummies(data=X, drop_first=True)


In [18]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed seed so that it is reproducible.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [19]:
# Random-forest hyper-parameters, gathered in one place for easy tuning.
# class_weight="balanced" re-weights the classes inversely to their frequency.
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "random_state": 42,
    "class_weight": "balanced",
}
rf = RandomForestClassifier(**rf_params)

# fit() returns the estimator itself, so the cell still displays the model repr.
rf.fit(X_train, y_train)


In [20]:
# Score the held-out rows and compute both summaries before printing them.
y_pred = rf.predict(X_test)
report_text = classification_report(y_test, y_pred)
# Rows of the matrix are true classes, columns are predicted classes
# (scikit-learn convention).
conf_matrix = confusion_matrix(y_test, y_pred)

print("Classification Report:\n")
print(report_text)

print("\nConfusion Matrix:\n")
print(conf_matrix)


Classification Report:

              precision    recall  f1-score   support

          No       0.96      0.91      0.93      1035
         Yes       0.78      0.89      0.83       374

    accuracy                           0.90      1409
   macro avg       0.87      0.90      0.88      1409
weighted avg       0.91      0.90      0.91      1409


Confusion Matrix:

[[941  94]
 [ 41 333]]


In [21]:
# Pair every fitted importance with its feature name, then rank from the most
# to the least important.
importances = pd.Series(data=rf.feature_importances_, index=X.columns)
feature_importance = importances.sort_values(ascending=False)

# Show the ten highest-ranked features.
feature_importance.head(10)


satisfaction_score           0.270172
number_of_referrals          0.059113
tenure_in_months             0.055497
contract_Two Year            0.049757
total_revenue                0.036188
avg_monthly_gb_download      0.034668
internet_type_Fiber Optic    0.034449
monthly_charge               0.034147
total_charges                0.032828
dependents_Yes               0.027192
dtype: float64

### Business Insights from Random Forest Model

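- Satisfaction score is by far the strongest churn predictor (importance ≈ 0.27); number of referrals, tenure, and contract type follow at roughly 0.05–0.06 each
- Monthly charge is a moderately important feature (≈ 0.03); higher monthly charges are expected to increase churn risk, but impurity-based importances do not show direction, so confirm this against churn rates by charge band
- Customers with short tenure are expected to be the most vulnerable (tenure is the third most important feature; the direction should likewise be confirmed against the data)
- Long-term contracts appear to reduce churn probability (`contract_Two Year` is the highest-ranked contract feature)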

**Business Action:**
Target high-risk customers with discounts and loyalty programs.


In [None]:
import os

import joblib

# Ensure the output directory exists before writing; exist_ok=True makes the
# call a no-op when the directory is already there.
model_dir = "../models"
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted forest with joblib.
model_path = f"{model_dir}/random_forest_churn_model.pkl"
joblib.dump(rf, model_path)

print("✅ Random Forest model saved successfully")
