In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

In [18]:
df = pd.read_csv(r'C:\Users\NIDHAL\Desktop\lab-imbalanced-data-master\files_for_lab\customer_churn.csv')
df


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [20]:
# Separation X and y 
x = df.loc[:, ["tenure", "SeniorCitizen", "MonthlyCharges"]]
y = df["Churn"]                    


In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
scaler = StandardScaler()
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, random_state = 0)
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# Step 2 - Build the model 🏋️‍♂️

In [25]:
model = LogisticRegression(multi_class = "ovr", max_iter=1000 )
model.fit(x_train, y_train)


In [26]:
pred_train = model.predict(x_train)
pred_test = model.predict(x_test)

print(classification_report(y_train, pred_train))
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

          No       0.82      0.91      0.87      4133
         Yes       0.65      0.47      0.54      1501

    accuracy                           0.79      5634
   macro avg       0.74      0.69      0.70      5634
weighted avg       0.78      0.79      0.78      5634

              precision    recall  f1-score   support

          No       0.82      0.90      0.86      1041
         Yes       0.61      0.46      0.52       368

    accuracy                           0.78      1409
   macro avg       0.72      0.68      0.69      1409
weighted avg       0.77      0.78      0.77      1409



# Synthetic Minority Oversampling TEchnique (SMOTE)

In [29]:
sm = SMOTE(k_neighbors = 3, random_state = 42)

x_train_SMOTE, y_train_SMOTE = sm.fit_resample(x_train, y_train)
y_train_SMOTE.value_counts(normalize = True)


No     0.5
Yes    0.5
Name: Churn, dtype: float64

In [31]:
print(x_train.shape)
print(x_train_SMOTE.shape)

(5634, 3)
(8266, 3)


In [32]:
model.fit(x_train_SMOTE, y_train_SMOTE)

pred_train_SMOTE = model.predict(x_train_SMOTE)
pred_test_SMOTE = model.predict(x_test)

print(classification_report(y_train_SMOTE, pred_train_SMOTE))
print(classification_report(y_test, pred_test_SMOTE))


              precision    recall  f1-score   support

          No       0.74      0.73      0.74      4133
         Yes       0.74      0.74      0.74      4133

    accuracy                           0.74      8266
   macro avg       0.74      0.74      0.74      8266
weighted avg       0.74      0.74      0.74      8266

              precision    recall  f1-score   support

          No       0.88      0.72      0.79      1041
         Yes       0.47      0.71      0.57       368

    accuracy                           0.72      1409
   macro avg       0.68      0.72      0.68      1409
weighted avg       0.77      0.72      0.73      1409



# Tomek links

In [34]:
TL = TomekLinks(sampling_strategy='majority') # play around with sampling_strategy_
x_train_TL, y_train_TL = TL.fit_resample(x_train, y_train)

In [36]:
print('x_train:', x_train.shape)
print('x_train_TL:',x_train_TL.shape)

x_train: (5634, 3)
x_train_TL: (5256, 3)


In [40]:
model.fit(x_train_TL, y_train_TL)

pred_train_TL = model.predict(x_train_TL)
pred_test_TL = model.predict(x_test)

print(classification_report(y_test, pred_test_TL))
print(classification_report(y_train_TL, pred_train_TL))


              precision    recall  f1-score   support

          No       0.83      0.85      0.84      1041
         Yes       0.55      0.51      0.53       368

    accuracy                           0.76      1409
   macro avg       0.69      0.68      0.68      1409
weighted avg       0.76      0.76      0.76      1409

              precision    recall  f1-score   support

          No       0.83      0.90      0.86      3755
         Yes       0.68      0.53      0.60      1501

    accuracy                           0.79      5256
   macro avg       0.75      0.72      0.73      5256
weighted avg       0.78      0.79      0.79      5256



SMOTE (synthetic minority oversampling technique) has helped to solve the imbalance problem.