In [8]:
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTEENN

In [13]:
df=pd.read_csv("tel_churn.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,0,29.85,29.85,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,1,0,56.95,1889.5,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,2,0,53.85,108.15,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,3,0,42.3,1840.75,0,0,1,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,4,0,70.7,151.65,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


In [14]:
df=df.drop('Unnamed: 0',axis=1)

In [15]:

x=df.drop('Churn',axis=1)
x

Unnamed: 0,SeniorCitizen,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,29.85,29.85,1,0,0,1,1,0,1,...,0,1,0,0,0,1,0,0,1,0
1,0,56.95,1889.50,0,1,1,0,1,0,0,...,0,0,1,0,1,0,0,0,0,1
2,0,53.85,108.15,0,1,1,0,1,0,0,...,0,1,0,0,0,1,0,0,0,1
3,0,42.30,1840.75,0,1,1,0,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,0,70.70,151.65,1,0,1,0,1,0,0,...,0,1,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,0,84.80,1990.50,0,1,0,1,0,1,0,...,1,0,1,0,0,1,0,0,0,1
7028,0,103.20,7362.90,1,0,0,1,0,1,0,...,1,0,1,0,0,1,0,1,0,0
7029,0,29.60,346.45,1,0,0,1,0,1,1,...,0,1,0,0,0,1,0,0,1,0
7030,1,74.40,306.60,0,1,0,1,1,0,0,...,0,1,0,0,0,1,0,0,0,1


In [None]:

y=df['Churn']
y

# Model Building

# Train Test Split
spliting the data into train data and test data.

In [17]:

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

# Decision Tree Classifier

In [18]:
model_dt=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)
model_dt.fit(x_train,y_train)

DecisionTreeClassifier(max_depth=6, min_samples_leaf=8, random_state=100)

In [19]:
y_pred=model_dt.predict(x_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [20]:
model_dt.score(x_test,y_test)

0.7661691542288557

In [21]:
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84      1016
           1       0.60      0.48      0.53       391

    accuracy                           0.77      1407
   macro avg       0.71      0.68      0.69      1407
weighted avg       0.75      0.77      0.76      1407



As you can see that the accuracy is quite low, and as it's an imbalanced dataset, we shouldn't consider Accuracy as our metrics to measure the model, as Accuracy is cursed in imbalanced datasets.
Hence, we need to check recall, precision & f1 score for the minority class, and it's quite evident that the precision, recall & f1 score is too low for Class 1, i.e. churned customers.


# SMOTEENN (UpSampling + ENN)

In [27]:
sm = SMOTEENN()
X_resampled, y_resampled = sm.fit_resample(x, y)

In [28]:
xr_train,xr_test,yr_train,yr_test=train_test_split(X_resampled, y_resampled,test_size=0.2)
model_dt_smote=DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

In [29]:
model_dt_smote.fit(xr_train,yr_train)
yr_predict = model_dt_smote.predict(xr_test)
model_score_r = model_dt_smote.score(xr_test, yr_test)
print(model_score_r)
print(metrics.classification_report(yr_test, yr_predict))

0.9432989690721649
              precision    recall  f1-score   support

           0       0.95      0.92      0.94       530
           1       0.94      0.96      0.95       634

    accuracy                           0.94      1164
   macro avg       0.94      0.94      0.94      1164
weighted avg       0.94      0.94      0.94      1164



# Print the confussion matrix

In [30]:
print(metrics.confusion_matrix(yr_test, yr_predict))

[[490  40]
 [ 26 608]]


using decision tree classifier,Accuracy: 92 %, and a very good recall, precision & f1 score for minority class.

# Random Forest Classifier

In [32]:
from sklearn.ensemble import RandomForestClassifier
model_rf=RandomForestClassifier(n_estimators=100, criterion='gini', random_state = 100,max_depth=6, min_samples_leaf=8)
model_rf.fit(x_train,y_train)
RandomForestClassifier(max_depth=6, min_samples_leaf=8, random_state=100)
y_pred=model_rf.predict(x_test)
model_rf.score(x_test,y_test)
0.7953091684434968
print(classification_report(y_test, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      1016
           1       0.67      0.50      0.57       391

    accuracy                           0.79      1407
   macro avg       0.75      0.70      0.72      1407
weighted avg       0.78      0.79      0.78      1407



# SMOTEENN (UpSampling + ENN)

In [35]:
sm = SMOTEENN()

# Fit and resample the data
X_resampled1, y_resampled1 = sm.fit_resample(x, y)

# Split resampled data into training and testing sets
xr_train1, xr_test1, yr_train1, yr_test1 = train_test_split(X_resampled1, y_resampled1, test_size=0.2, random_state=42)

# Initialize RandomForestClassifier
model_rf_smote = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=100, max_depth=6, min_samples_leaf=8)

# Train the model
model_rf_smote.fit(xr_train1, yr_train1)

# Make predictions on the test set
yr_predict1 = model_rf_smote.predict(xr_test1)

# Calculate model accuracy
model_score_r1 = model_rf_smote.score(xr_test1, yr_test1)

# Print model accuracy and classification report
print("Model Accuracy:", model_score_r1)
print("Classification Report:")
print(metrics.classification_report(yr_test1, yr_predict1))

Model Accuracy: 0.9311224489795918
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.89      0.92       548
           1       0.91      0.96      0.94       628

    accuracy                           0.93      1176
   macro avg       0.93      0.93      0.93      1176
weighted avg       0.93      0.93      0.93      1176



In [36]:
print(metrics.confusion_matrix(yr_test1, yr_predict1))

[[489  59]
 [ 22 606]]
