In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix ,classification_report
from sklearn.compose import ColumnTransformer
from imblearn.combine import SMOTEENN
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [150]:
tel_churn =pd.read_csv('tel_churn.csv')

In [151]:
tel_churn.head(5)

Unnamed: 0.1,Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_group
0,0,Female,No,Yes,No,No,No phone service,DSL,No,Yes,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,1-12
1,1,Male,No,No,No,Yes,No,DSL,Yes,No,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,25-36
2,2,Male,No,No,No,Yes,No,DSL,Yes,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1-12
3,3,Male,No,No,No,No,No phone service,DSL,Yes,No,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,37-48
4,4,Female,No,No,No,Yes,No,Fiber optic,No,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1-12


In [152]:
tel_churn.drop(columns=['Unnamed: 0'],inplace=True)

In [153]:

# Define custom order for tenure_group
tenure_order = [['1-12', '13-24', '25-36', '37-48', '49-60', '61-72']]

# Define categorical columns to one-hot encode
categorical_columns = ['gender','SeniorCitizen','Partner','Dependents','PhoneService',
                       'MultipleLines','InternetService','OnlineSecurity','OnlineBackup',
                       'DeviceProtection','TechSupport','StreamingTV','StreamingMovies',
                       'Contract','PaperlessBilling','PaymentMethod']

# Build ColumnTransformer
transformer = ColumnTransformer(
    transformers=[
        ('tnf1', OrdinalEncoder(categories=tenure_order), ['tenure_group']),
        ('trf2', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), categorical_columns)
    ],
    remainder='passthrough'
)


In [154]:
x=tel_churn.drop(columns=['Churn'])
y=tel_churn["Churn"]

In [155]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42,stratify=y)

In [156]:
x_train

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,tenure_group
1408,Male,No,Yes,Yes,Yes,Yes,Fiber optic,Yes,Yes,Yes,Yes,No,No,Two year,No,Credit card (automatic),94.55,6078.75,61-72
6992,Male,No,No,No,No,No phone service,DSL,No,No,Yes,Yes,No,No,Month-to-month,No,Electronic check,35.75,1022.50,25-36
3349,Female,No,Yes,No,Yes,Yes,Fiber optic,No,Yes,Yes,Yes,No,No,Two year,No,Credit card (automatic),90.20,6297.65,61-72
4486,Male,No,No,No,Yes,No,Fiber optic,No,Yes,No,No,No,Yes,Month-to-month,No,Electronic check,84.30,235.05,1-12
3535,Female,No,Yes,No,No,No phone service,DSL,Yes,No,No,No,Yes,No,Month-to-month,No,Bank transfer (automatic),40.65,2070.75,49-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3445,Male,Yes,Yes,No,Yes,No,Fiber optic,No,No,No,No,No,No,One year,Yes,Bank transfer (automatic),70.95,4555.20,61-72
5115,Female,Yes,No,No,Yes,No,Fiber optic,No,Yes,No,No,No,No,Month-to-month,Yes,Credit card (automatic),75.30,1147.45,13-24
4128,Female,No,Yes,Yes,Yes,Yes,DSL,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),92.90,3379.25,25-36
4242,Female,No,Yes,Yes,Yes,No,DSL,No,Yes,Yes,No,No,Yes,One year,Yes,Mailed check,65.90,660.05,1-12


In [157]:
from sklearn.tree import DecisionTreeClassifier 

# Decision Tree Classifier

In [158]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [159]:
x_train=transformer.fit_transform(x_train)
x_test=transformer.transform(x_test)

In [160]:
x_train

array([[5.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        9.45500e+01, 6.07875e+03],
       [2.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.57500e+01, 1.02250e+03],
       [5.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        9.02000e+01, 6.29765e+03],
       ...,
       [2.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        9.29000e+01, 3.37925e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        6.59000e+01, 6.60050e+02],
       [0.00000e+00, 1.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        7.47000e+01, 7.47000e+01]])

In [161]:
le=LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [162]:
model_dt.fit(x_train,y_train)
y_pred1=model_dt.predict(x_test)
model_dt.score(x_test,y_test)

0.7896233120113717

In [163]:
print(classification_report(y_test, y_pred1, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.

Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.

Hence, moving ahead to call SMOTEENN (UpSampling + ENN)

SMOTEENN

In [164]:
x_transformed=transformer.fit_transform(x)


In [165]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x_transformed,y)

In [166]:
x_train1,x_test1,y_train1,y_test1=train_test_split(X_resampled,y_resampled,test_size=0.2,random_state=42)

In [167]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [168]:
le=LabelEncoder()
y_train1 = le.fit_transform(y_train1)
y_test1= le.transform(y_test1)

In [169]:
model_dt.fit(x_train1,y_train1)
y_pred=model_dt.predict(x_test1)
model_dt.score(x_test1,y_test1)

0.9349871685201027

In [170]:
print(classification_report(y_test1, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.93       552
           1       0.94      0.93      0.94       617

    accuracy                           0.93      1169
   macro avg       0.93      0.94      0.93      1169
weighted avg       0.94      0.93      0.94      1169



Now we can see quite better results, i.e. Accuracy: 92 %, and a very good recall, precision & f1 score for minority class but **problem is that we cannot apply smoteenn on whole x, whole y**. *we have to apply smoteenn on x_train,y_train  only*.

Let's try with some other classifier. 

# SMOTEENN ON X_TRAIN ,Y_TRAIN only

In [171]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x_train,y_train)

In [172]:
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [173]:
model_dt.fit(x_train,y_train)
y_pred=model_dt.predict(x_test)
model_dt.score(x_test,y_test)

0.7896233120113717

In [174]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86      1033
           1       0.64      0.48      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407



Bad performance in recall score as well as accuracy  because a lot of categorical column is present in data.so thats why SMOTEENN not working well here

# Random Forest Classifier

In [175]:
from sklearn.ensemble import RandomForestClassifier

In [176]:
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)
model_rf.fit(x_train,y_train)

In [177]:
y_pred=model_rf.predict(x_test)
model_rf.score(x_test,y_test)

0.7882018479033405

In [178]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.92      0.86      1033
           1       0.66      0.42      0.51       374

    accuracy                           0.79      1407
   macro avg       0.74      0.67      0.69      1407
weighted avg       0.77      0.79      0.77      1407



In [179]:
model_rf.fit(x_train1,y_train1)
y_pred=model_rf.predict(x_test1)
print(model_rf.score(x_test1,y_test1))
print(classification_report(y_test1, y_pred, labels=[0,1]))

0.951240376390077
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       552
           1       0.95      0.95      0.95       617

    accuracy                           0.95      1169
   macro avg       0.95      0.95      0.95      1169
weighted avg       0.95      0.95      0.95      1169



# BalancedRandomForestClassifier

In [180]:
from imblearn.ensemble import BalancedRandomForestClassifier

In [None]:
balanced_rfc_model=BalancedRandomForestClassifier(random_state=42)
balanced_rfc_model.fit(x_train,y_train)
y_pred=balanced_rfc_model.predict(x_test)

In [182]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.89      0.78      0.83      1033
           1       0.54      0.72      0.62       374

    accuracy                           0.76      1407
   macro avg       0.71      0.75      0.72      1407
weighted avg       0.79      0.76      0.77      1407



In [183]:
classifier.score(x_test,y_test)

0.7640369580668088

# Logistic Regression

In [184]:
sc=StandardScaler()
x_train_standarised=sc.fit_transform(x_train)
x_test_standarised=sc.transform(x_test)

In [185]:
lr=LogisticRegression(max_iter=1000, class_weight='balanced')
lr.fit(x_train_standarised,y_train)
y_pred=lr.predict(x_test_standarised)

In [198]:
lr.score(x_test_standarised,y_test)

0.7242359630419332

In [187]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.90      0.70      0.79      1033
           1       0.49      0.78      0.60       374

    accuracy                           0.72      1407
   macro avg       0.69      0.74      0.70      1407
weighted avg       0.79      0.72      0.74      1407



Better than Decision tree and Random forest classifier as well as balanced random forest in term of recall of positive class but less in term f1 score as well as precision of than Balanced randomforest classifier

# xgboost

In [194]:

# XGBoost Classifier
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model_xgb.fit(x_train, y_train)

# Predictions
y_pred = model_xgb.predict(x_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.7732764747690121
              precision    recall  f1-score   support

           0       0.83      0.86      0.85      1033
           1       0.58      0.53      0.55       374

    accuracy                           0.77      1407
   macro avg       0.71      0.69      0.70      1407
weighted avg       0.77      0.77      0.77      1407



Using Smoteenn

In [195]:
sm=SMOTEENN()
x_smot_train,y_smot_train=sm.fit_resample(x_train,y_train)
model_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
model_xgb.fit(x_smot_train, y_smot_train)

# Predictions
y_pred = model_xgb.predict(x_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.759772565742715
              precision    recall  f1-score   support

           0       0.87      0.79      0.83      1033
           1       0.54      0.69      0.60       374

    accuracy                           0.76      1407
   macro avg       0.71      0.74      0.72      1407
weighted avg       0.78      0.76      0.77      1407



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## Overall Conclusion

The best result i have obtained using **BalancedRandomForestclassifier** with recall of positive class as 72% and with f1 score as 62%

**Pickling the model**


In [196]:
import pickle

In [197]:
filename = 'model.sav'

In [None]:
pickle.dump(balanced_rfc_model, open(filename, 'wb'))