In [1]:
# Import libraries

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
import os
pd.options.display.float_format = '{:,.3f}'.format

For this lab, we will build a model on customer churn binary classification problem. You will be using files_for_lab/**Customer-Churn.csv** file.

In [35]:
os.chdir(r'C:\Users\TrendingPC\Desktop\IronHAck\LABS\LABS-unit-7\7.Cross-validation\files_for_lab')
data = pd.read_csv('Customer-Churn.csv')
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

# Instructions

**1. Apply SMOTE for upsampling the data**

In [37]:
# Creating the the target value and the explanatives values 
y = data.iloc[:, 15]
X = data.iloc[:, [0,2,3,5,6,7,8,9,10,11,12]]

In [38]:
y.value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [39]:
#Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Preprocessing the data
scaler_X = OrdinalEncoder()
scaler_y = LabelEncoder()

X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

y_train = scaler_y.fit_transform(y_train)
y_test = scaler_y.transform(y_test)

In [41]:
# first correction imbalance: oversampling --> SMOTE 

sm = SMOTE(k_neighbors=3)

X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [42]:
# Looking to the shape before and after the 

print(X_train.shape)
print(X_train_SMOTE.shape)

(4930, 11)
(7228, 11)


* **Use logistic regression to fit the model and compute the accuracy of the model.**

In [27]:
# Modeling

model1 = LogisticRegression()

In [53]:
def Building_model_SMOTE(model):

    model.fit(X_train_SMOTE, y_train_SMOTE)
        
    # Evaluation of the model
    pred_train_SMOTE = model.predict(X_train_SMOTE)
    pred_test_SMOTE = model.predict(X_test)
    
    # Accuracy of the model
    print('The accuracy of the model in the train set (SMOTE) is: %.3f' % accuracy_score(y_train_SMOTE, pred_train_SMOTE))
    print('The accuracy of the model in the test set (SMOTE) is: %.3f' % accuracy_score(y_test, pred_test_SMOTE))

In [54]:
Building_model_SMOTE(model1)

The accuracy of the model in the train set (SMOTE) is: 0.748
The accuracy of the model in the test set (SMOTE) is: 0.689


* **Use decision tree classifier to fit the model and compute the accuracy of the model.**

In [55]:
model2 = DecisionTreeClassifier()

Building_model_SMOTE(model2)

The accuracy of the model in the train set (SMOTE) is: 0.845
The accuracy of the model in the test set (SMOTE) is: 0.721


* **Compare the accuracies of the two models.**

The accuracy of the model, on the train set and on the test set, using Decision Tree is better than the Logistic Regression model

**2. Apply TomekLinks for downsampling**

* **It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.**

In [81]:
tl = TomekLinks()

X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

In [94]:
imbalance = pd.DataFrame(y_train_tl)
print(y.value_counts())
print(imbalance.value_counts())

No     5174
Yes    1869
Name: Churn, dtype: int64
0    3605
1    1316
dtype: int64


* **Use logistic regression to fit the model and compute the accuracy of the model.**

In [82]:
def Building_model_TL(model):

    model.fit(X_train_tl, y_train_tl)
        
    # Evaluation of the model
    pred_train_tl = model.predict(X_train_tl)
    pred_test_tl = model.predict(X_test)
    
    # Accuracy of the model
    print('The accuracy of the model in the train set (TL) is: %.3f' % accuracy_score(y_train_tl, pred_train_tl))
    print('The accuracy of the model in the test set (TL) is: %.3f' % accuracy_score(y_test, pred_test_tl))

In [83]:
Building_model_TL(model1)

The accuracy of the model in the train set (TL) is: 0.776
The accuracy of the model in the test set (TL) is: 0.770


* **Use decision tree classifier to fit the model and compute the accuracy of the model.**

In [84]:
Building_model_TL(model2)

The accuracy of the model in the train set (TL) is: 0.813
The accuracy of the model in the test set (TL) is: 0.736


* **Compare the accuracies of the two models.**

The accuracy of the model, on the train set and on the test set, using Decision Tree is better than the Logistic Regression model

* **You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.**

In [85]:
X_train_tl1, y_train_tl1 = tl.fit_resample(X_train_tl, y_train_tl)

In [95]:
imbalance1 = pd.DataFrame(y_train_tl1)
print(imbalance.value_counts())
print(imbalance1.value_counts())

0    3605
1    1316
dtype: int64
0    3599
1    1316
dtype: int64
