In [104]:
## Importing libraries

import pandas as pd0
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [105]:
## Reading the file

df = pd.read_csv('/content/Customer-Churn.csv')

In [106]:
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [107]:
df.Churn.value_counts()/len(df)*100

Unnamed: 0_level_0,count
Churn,Unnamed: 1_level_1
No,73.463013
Yes,26.536987


## **Churn Rate**: 26.53%

- Which means, 26.53% of the customers churn out of this telecom company

In [108]:
## Divide data into X and y - X (Independent features), y(Dependent variable)

# Define y (target variable)
y = df['Churn']

# Define X (features) by dropping 'customerID' and 'Churn'
X = df.drop(columns=['customerID', 'Churn'])

# Optional: Verify the shapes
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (7043, 19)
Shape of y: (7043,)


## Train Test Split

In [109]:
X = pd.get_dummies(X, drop_first=True)
y = df['Churn'].map({'No': 0, 'Yes': 1})

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Model Building

In [111]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)

In [112]:
y_pred_dt = model_dt.predict(X_test)

In [113]:
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       0.85      0.84      0.85      1054
           1       0.54      0.55      0.55       355

    accuracy                           0.77      1409
   macro avg       0.70      0.70      0.70      1409
weighted avg       0.77      0.77      0.77      1409



In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Initial Insights

- Base model has a accuracy of 76% which is not reliable because of the imbalanced dataset
- TotalCharges needs to be a float/int type (Data Cleaning)
- Need to perform Feature Scaling

## Data Cleaning

In [115]:
telco_data = df.copy()

In [116]:
telco_data.TotalCharges = pd.to_numeric(telco_data.TotalCharges, errors='coerce')

In [117]:
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [118]:
telco_data.loc[telco_data['TotalCharges'].isnull() == True]

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No phone service,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [119]:
telco_data.dropna(how='any', inplace=True)

In [120]:
telco_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 
 17  

In [121]:
telco_data.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [122]:
telco_data.tenure.max()

72

In [123]:
import pandas as pd

# Define the bin edges (0 to 72, stepping by 12 months)
bins = [0, 12, 24, 36, 48, 60, 72]

# Define the labels for each bin
labels = ['1-12', '13-24', '25-36', '37-48', '49-60', '61-72']

# Create the new binned column
telco_data['tenure_bin'] = pd.cut(telco_data['tenure'], bins=bins, labels=labels, include_lowest=True)

# Optional: Check the result
print(telco_data[['tenure', 'tenure_bin']].head(10))
print(telco_data['tenure_bin'].value_counts().sort_index())

   tenure tenure_bin
0       1       1-12
1      34      25-36
2       2       1-12
3      45      37-48
4       2       1-12
5       8       1-12
6      22      13-24
7      10       1-12
8      28      25-36
9      62      61-72
tenure_bin
1-12     2175
13-24    1024
25-36     832
37-48     762
49-60     832
61-72    1407
Name: count, dtype: int64


In [124]:
telco_data.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_bin
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,1-12
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,25-36
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1-12
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,37-48
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1-12


In [125]:
## Divide data into X and y - X (Independent features), y(Dependent variable)

# Define y (target variable)
y = telco_data['Churn']

# Define X (features) by dropping 'customerID' and 'Churn'
X = telco_data.drop(columns=['customerID', 'Churn', 'tenure'])

# Optional: Verify the shapes
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (7032, 19)
Shape of y: (7032,)


In [90]:
X

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,tenure_bin
0,Female,0,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,1-12
1,Male,0,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,25-36
2,Male,0,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1-12
3,Male,0,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,37-48
4,Female,0,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,1-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,13-24
7039,Female,0,Yes,Yes,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,61-72
7040,Female,0,Yes,Yes,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,1-12
7041,Male,1,Yes,No,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,1-12


In [126]:
X = pd.get_dummies(X, drop_first=True)
y = telco_data['Churn'].map({'No': 0, 'Yes': 1})

In [127]:
X

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,InternetService_Fiber optic,...,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,tenure_bin_13-24,tenure_bin_25-36,tenure_bin_37-48,tenure_bin_49-60,tenure_bin_61-72
0,0,29.85,29.85,False,True,False,False,True,False,False,...,False,True,False,True,False,False,False,False,False,False
1,0,56.95,1889.50,True,False,False,True,False,False,False,...,False,False,False,False,True,False,True,False,False,False
2,0,53.85,108.15,True,False,False,True,False,False,False,...,False,True,False,False,True,False,False,False,False,False
3,0,42.30,1840.75,True,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
4,0,70.70,151.65,False,False,False,True,False,False,True,...,False,True,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,84.80,1990.50,True,True,True,True,False,True,False,...,False,True,False,False,True,True,False,False,False,False
7039,0,103.20,7362.90,False,True,True,True,False,True,True,...,False,True,True,False,False,False,False,False,False,True
7040,0,29.60,346.45,False,True,True,False,True,False,False,...,False,True,False,True,False,False,False,False,False,False
7041,1,74.40,306.60,True,True,False,True,False,True,True,...,False,True,False,False,True,False,False,False,False,False


In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Feature Scaling

In [129]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [130]:
model_dt2 = DecisionTreeClassifier()
model_dt2.fit(X_train, y_train)

y_pred_dt2 = model_dt2.predict(X_test)

In [132]:
print(classification_report(y_test, y_pred_dt2))

              precision    recall  f1-score   support

           0       0.82      0.79      0.81      1025
           1       0.49      0.54      0.52       382

    accuracy                           0.72      1407
   macro avg       0.66      0.67      0.66      1407
weighted avg       0.73      0.72      0.73      1407



## Feature Scaling - MinMaxScaler

In [133]:
mms = MinMaxScaler()
X_train_mms = mms.fit_transform(X_train)
X_test_mms = mms.transform(X_test)

In [134]:
model_dt3 = DecisionTreeClassifier()
model_dt3.fit(X_train_mms, y_train)

y_pred_dt3 = model_dt3.predict(X_test_mms)

In [135]:
print(classification_report(y_test, y_pred_dt3))

              precision    recall  f1-score   support

           0       0.82      0.79      0.81      1025
           1       0.49      0.54      0.52       382

    accuracy                           0.72      1407
   macro avg       0.66      0.67      0.66      1407
weighted avg       0.73      0.72      0.73      1407



## SMOTEENN () [UpSampling + ENN)

In [136]:
from imblearn.combine import SMOTEENN

In [137]:
sm = SMOTEENN()
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

In [138]:
model_dt_smoteenn = DecisionTreeClassifier()
model_dt_smoteenn.fit(X_train_resampled, y_train_resampled)

y_pred_dt_smoteenn = model_dt_smoteenn.predict(X_test)

In [139]:
print(classification_report(y_test, y_pred_dt_smoteenn))

              precision    recall  f1-score   support

           0       0.89      0.69      0.78      1025
           1       0.49      0.77      0.60       382

    accuracy                           0.72      1407
   macro avg       0.69      0.73      0.69      1407
weighted avg       0.78      0.72      0.73      1407



In [140]:
model_rf_smoteenn = RandomForestClassifier(n_estimators=500)
model_rf_smoteenn.fit(X_train_resampled, y_train_resampled)

y_pred_rf_smoteenn = model_rf_smoteenn.predict(X_test)

In [141]:
print(classification_report(y_test, y_pred_rf_smoteenn))

              precision    recall  f1-score   support

           0       0.93      0.69      0.79      1025
           1       0.51      0.85      0.63       382

    accuracy                           0.73      1407
   macro avg       0.72      0.77      0.71      1407
weighted avg       0.81      0.73      0.75      1407



## XGBoost without SMOTEENN

In [142]:
from xgboost import XGBClassifier

# Initialize and train the XGBoost model
model_xgb = XGBClassifier(random_state=42)  # random_state for reproducibility
model_xgb.fit(X_train, y_train)

# Make predictions on the test set
y_pred_xgb = model_xgb.predict(X_test)

In [143]:
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      1025
           1       0.62      0.53      0.57       382

    accuracy                           0.78      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.77      0.78      0.78      1407



## XGBoost with SMOTEENN

In [144]:
from xgboost import XGBClassifier

# Initialize and train the XGBoost model
model_xgb_smoteenn = XGBClassifier(random_state=42)  # random_state for reproducibility
model_xgb_smoteenn.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred_xgb_smoteenn = model_xgb_smoteenn.predict(X_test)

In [145]:
print(classification_report(y_test, y_pred_xgb_smoteenn))

              precision    recall  f1-score   support

           0       0.92      0.69      0.79      1025
           1       0.50      0.84      0.63       382

    accuracy                           0.73      1407
   macro avg       0.71      0.76      0.71      1407
weighted avg       0.81      0.73      0.75      1407



## XGBoost with SMOTE

In [146]:
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from collections import Counter

# Check class distribution before SMOTE
print("Before SMOTE:", Counter(y_train))

# Apply SMOTE to the training data only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Check class distribution after SMOTE
print("After SMOTE:", Counter(y_train_smote))

# Initialize and train the XGBoost model on resampled data
model_xgb_smote = XGBClassifier(random_state=42)
model_xgb_smote.fit(X_train_smote, y_train_smote)

# Make predictions on the original test set (unchanged)
y_pred_xgb_smote = model_xgb_smote.predict(X_test)

Before SMOTE: Counter({0: 4138, 1: 1487})
After SMOTE: Counter({0: 4138, 1: 4138})


In [147]:
print(classification_report(y_test, y_pred_xgb_smote))

              precision    recall  f1-score   support

           0       0.85      0.84      0.84      1025
           1       0.58      0.60      0.59       382

    accuracy                           0.77      1407
   macro avg       0.72      0.72      0.72      1407
weighted avg       0.78      0.77      0.78      1407



In [150]:
from imblearn.over_sampling import ADASYN
from xgboost import XGBClassifier
from collections import Counter

# Check original distribution
print("Before ADASYN:", Counter(y_train))

# Apply ADASYN to training data only
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

# Check new distribution
print("After ADASYN:", Counter(y_train_adasyn))

# Train XGBoost on ADASYN-resampled data
model_xgb_adasyn = XGBClassifier(random_state=42)
model_xgb_adasyn.fit(X_train_adasyn, y_train_adasyn)

# Predict on the original test set
y_pred_xgb_adasyn = model_xgb_adasyn.predict(X_test)

# Evaluate
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_test, y_pred_xgb_adasyn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb_adasyn))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb_adasyn))

Before ADASYN: Counter({0: 4138, 1: 1487})
After ADASYN: Counter({0: 4138, 1: 4054})
Accuracy: 0.7846481876332623

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      1025
           1       0.60      0.60      0.60       382

    accuracy                           0.78      1407
   macro avg       0.73      0.73      0.73      1407
weighted avg       0.78      0.78      0.78      1407


Confusion Matrix:
 [[873 152]
 [151 231]]


In [149]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Calculate scale_pos_weight (ratio of negative to positive class)
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

# Train XGBoost with scale_pos_weight
model_xgb_weighted = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='logloss'  # avoids warnings
)

model_xgb_weighted.fit(X_train, y_train)
y_pred_weighted = model_xgb_weighted.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred_weighted))
print("\nClassification Report:\n", classification_report(y_test, y_pred_weighted))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_weighted))

scale_pos_weight: 2.78
Accuracy: 0.7683013503909026

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.79      0.83      1025
           1       0.56      0.72      0.63       382

    accuracy                           0.77      1407
   macro avg       0.72      0.75      0.73      1407
weighted avg       0.79      0.77      0.78      1407


Confusion Matrix:
 [[807 218]
 [108 274]]


In [151]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Calculate the weight for the positive (minority) class
# Common approach: weight_positive = negative_count / positive_count
negative_count = (y_train == 0).sum()
positive_count = (y_train == 1).sum()
weight_positive = negative_count / positive_count if positive_count > 0 else 1
print(f"Negative class count: {negative_count}")
print(f"Positive class count: {positive_count}")
print(f"Weight for positive class: {weight_positive:.2f}")

# Create sample weights: higher weight for minority class
sample_weight = np.where(y_train == 1, weight_positive, 1.0)

# Train AdaBoost with sample weights
model_ada_weighted = AdaBoostClassifier(
    n_estimators=50,  # default, you can tune
    random_state=42
)

model_ada_weighted.fit(X_train, y_train, sample_weight=sample_weight)

# Predict and evaluate
y_pred_ada_weighted = model_ada_weighted.predict(X_test)

print("AdaBoost with sample weighting - Accuracy:", accuracy_score(y_test, y_pred_ada_weighted))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ada_weighted))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_ada_weighted))

Negative class count: 4138
Positive class count: 1487
Weight for positive class: 2.78
AdaBoost with sample weighting - Accuracy: 0.7569296375266524

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.73      0.81      1025
           1       0.53      0.83      0.65       382

    accuracy                           0.76      1407
   macro avg       0.73      0.78      0.73      1407
weighted avg       0.82      0.76      0.77      1407


Confusion Matrix:
 [[748 277]
 [ 65 317]]


## **Hyper Parameter Optimization**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

param_grid = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 400],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2],
    'scale_pos_weight': [1, scale_pos_weight]  # try both balanced and weighted
}

xgb = XGBClassifier(random_state=42, eval_metric='logloss')

# Use 'f1' or 'recall' scoring since churn is imbalanced
search = RandomizedSearchCV(
    xgb,
    param_grid,
    n_iter=50,
    cv=5,
    scoring='f1',  # or 'recall' if catching churn is priority
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best parameters:", search.best_params_)
print("Best CV F1 score:", search.best_score_)

# Use best model
best_model = search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("\nTest Classification Report:\n", classification_report(y_test, y_pred_best))

Best parameters: {'subsample': 0.9, 'scale_pos_weight': np.float64(2.741035856573705), 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.9}
Best CV F1 score: 0.6204890020765312

Test Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.72      0.79      1046
           1       0.48      0.75      0.58       363

    accuracy                           0.72      1409
   macro avg       0.68      0.73      0.69      1409
weighted avg       0.78      0.72      0.74      1409



In [None]:
# Save this model

import joblib

# Save the best model to a .pkl file
joblib.dump(best_model, 'best_xgboost_churn_model.pkl')

print("Best model successfully saved as 'best_xgboost_churn_model.pkl'")

Best model successfully saved as 'best_xgboost_churn_model.pkl'


## Trying out few more techniques

In [152]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
    n_estimators=300,
    max_depth=6
)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.93      0.70      0.80      1025
           1       0.52      0.85      0.64       382

    accuracy                           0.74      1407
   macro avg       0.72      0.78      0.72      1407
weighted avg       0.82      0.74      0.76      1407



In [155]:
from catboost import CatBoostClassifier

model_cat = CatBoostClassifier(
    auto_class_weights='Balanced',  # or 'SqrtBalanced'
    verbose=0,
    random_state=42
)
model_cat.fit(X_train, y_train)
y_pred_cat = model_cat.predict(X_test)
print(classification_report(y_test, y_pred_cat))

              precision    recall  f1-score   support

           0       0.90      0.76      0.82      1025
           1       0.54      0.77      0.63       382

    accuracy                           0.76      1407
   macro avg       0.72      0.76      0.73      1407
weighted avg       0.80      0.76      0.77      1407



In [156]:
from lightgbm import LGBMClassifier

model_lgb = LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42
)
model_lgb.fit(X_train, y_train)
y_pred_lgb = model_lgb.predict(X_test)
print(classification_report(y_test, y_pred_lgb))

[LightGBM] [Info] Number of positive: 1487, number of negative: 4138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 606
[LightGBM] [Info] Number of data points in the train set: 5625, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.264356 -> initscore=-1.023452
[LightGBM] [Info] Start training from score -1.023452
              precision    recall  f1-score   support

           0       0.89      0.75      0.82      1025
           1       0.53      0.76      0.63       382

    accuracy                           0.75      1407
   macro avg       0.71      0.76      0.72      1407
weighted avg       0.80      0.75      0.77      1407





## **Optuna fine tuning**

In [None]:
import optuna
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),  # L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 2), # L2 regularization
        'scale_pos_weight': trial.suggest_categorical('scale_pos_weight', [1, scale_pos_weight]),
        'random_state': 42,
        'eval_metric': 'logloss'
    }

    model = XGBClassifier(**params)

    # Use F1 for minority class as scorer
    f1_scorer = make_scorer(f1_score, average='binary', pos_label=1)
    score = cross_val_score(model, X_train, y_train, cv=5, scoring=f1_scorer, n_jobs=-1).mean()

    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)  # Increase trials for better results

print("Best parameters:", study.best_params)
print("Best CV F1 score:", study.best_value)

# Train final model with best params
best_model_optuna = XGBClassifier(**study.best_params)
best_model_optuna.fit(X_train, y_train)
y_pred_optuna = best_model_optuna.predict(X_test)
print(classification_report(y_test, y_pred_optuna))

[I 2025-12-21 16:35:30,565] A new study created in memory with name: no-name-45a55a50-f7df-4e59-a25b-3259e76eacf6
[I 2025-12-21 16:37:56,719] Trial 0 finished with value: 0.5581046601993382 and parameters: {'max_depth': 6, 'learning_rate': 0.06156258438536952, 'n_estimators': 472, 'subsample': 0.986280382402016, 'colsample_bytree': 0.8956355388458284, 'min_child_weight': 1, 'gamma': 0.3753371403995946, 'reg_alpha': 0.7461990345185435, 'reg_lambda': 0.00813717640675038, 'scale_pos_weight': 1}. Best is trial 0 with value: 0.5581046601993382.
[I 2025-12-21 16:40:10,333] Trial 1 finished with value: 0.569977711890309 and parameters: {'max_depth': 9, 'learning_rate': 0.1822381249614022, 'n_estimators': 259, 'subsample': 0.6839068809756238, 'colsample_bytree': 0.6122032010394644, 'min_child_weight': 9, 'gamma': 0.3713919685290119, 'reg_alpha': 0.11290793255215215, 'reg_lambda': 1.6299418206912184, 'scale_pos_weight': np.float64(2.741035856573705)}. Best is trial 1 with value: 0.5699777118903

Best parameters: {'max_depth': 4, 'learning_rate': 0.016716972456610215, 'n_estimators': 352, 'subsample': 0.7227508919173684, 'colsample_bytree': 0.9234643702808608, 'min_child_weight': 4, 'gamma': 0.29393186125942017, 'reg_alpha': 0.8486093776172495, 'reg_lambda': 1.3422131664548662, 'scale_pos_weight': np.float64(2.741035856573705)}
Best CV F1 score: 0.6253376485050377
              precision    recall  f1-score   support

           0       0.89      0.72      0.79      1046
           1       0.48      0.75      0.59       363

    accuracy                           0.73      1409
   macro avg       0.69      0.73      0.69      1409
weighted avg       0.79      0.73      0.74      1409



In [None]:
# Save this model

import joblib

# Save the best model to a .pkl file
joblib.dump(best_model_optuna, 'best_optuna_churn_model.pkl')

print("Best model successfully saved as 'best_optuna_churn_model.pkl'")

Best model successfully saved as 'best_optuna_churn_model.pkl'


In [157]:
# Save this model

import joblib

# Save the best model to a .pkl file
joblib.dump(model_ada_weighted, 'ada_boost_churn_model.pkl')

print("Best model successfully saved as 'ada_boost_churn_model.pkl'")

Best model successfully saved as 'ada_boost_churn_model.pkl'
