In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## <span style = 'background :lightblue'>STEP 1 : Import Transformed Dataset</span>

In [2]:
filepath_train = r"S2a_Part2_FE_Pipeline_Train_DS.csv"
filepath_test = r"S2a_Part2_FE_Pipeline_Test_DS.csv"

In [3]:
d_train = pd.read_csv(filepath_train  ,index_col= 0 , header= 0)
d_test = pd.read_csv(filepath_test  ,index_col= 0 , header= 0)

In [4]:
d_train.head(3)

Unnamed: 0,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_NO,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,-0.338701,0.049061,0,1,1,0,0,1,1,0,...,0,0,1,1,0,1,0,0,0,0
1,1.030607,0.139714,1,0,1,0,1,0,1,0,...,1,0,0,0,1,1,0,0,0,0
2,0.147022,-0.956056,1,0,1,0,1,0,1,0,...,1,0,0,1,0,0,0,1,0,1


In [5]:
d_test.head(3)

Unnamed: 0,MonthlyCharges,TotalCharges,gender_Female,gender_Male,SeniorCitizen_NO,SeniorCitizen_Yes,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,...,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,-1.457688,-0.536472,0,1,1,0,0,1,1,0,...,0,0,1,1,0,0,0,0,1,0
1,-1.480896,-0.374264,0,1,1,0,1,0,1,0,...,0,0,1,1,0,1,0,0,0,0
2,0.142049,0.81081,0,1,1,0,1,0,1,0,...,0,1,0,1,0,1,0,0,0,0


## <span style = 'background :lightblue'>STEP 2 : Train - Test Split</span>

In [6]:
X_train = d_train.drop(labels= ['Churn'] , axis = 1)
y_train = d_train['Churn']

X_test = d_test.drop(labels= ['Churn'] , axis = 1)
y_test = d_test['Churn']

In [7]:
X_train.shape , y_train.shape

((5634, 51), (5634,))

In [8]:
X_test.shape , y_test.shape

((1409, 51), (1409,))

In [9]:
y_train.value_counts()

0    4149
1    1485
Name: Churn, dtype: int64

In [10]:
y_test.value_counts()

0    1025
1     384
Name: Churn, dtype: int64

## <span style = 'background :lightblue'>STEP 3 : Up-sampling of Train and Test DataSet</span>

In [11]:
import warnings
warnings.filterwarnings("ignore")

In [12]:
import copy

In [13]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, KMeansSMOTE 

In [14]:
over_sampl_models = [RandomOverSampler(), 
                      SMOTE(), 
                      KMeansSMOTE()]

In [15]:
def oversampling(os_model = None ,sampl_stra_train = 1, sampl_stra_test = 1,  
                  Xtrain = None , ytrain = None, 
                  Xtest = None, ytest = None ):

    # On Training Datatset
    model_train = copy.deepcopy(os_model)

    model_train = model_train.set_params(**{"sampling_strategy" : sampl_stra_train 
                                            })
  
    # fit and apply the transform
    X_train_res, Y_train_res = model_train.fit_resample(Xtrain, ytrain)


    # On Test Dataset
    model_test = copy.deepcopy(os_model)

    model_test = model_test.set_params(**{"sampling_strategy" : sampl_stra_test , 
                                          }) 
                                           
    # fit and apply the transform
    X_test_res, Y_test_res = model_test.fit_resample(Xtest, ytest)

    
    #\33[1mTraining Accuracy :\33[0m \n
    print("\n-------------------------------------------------------")
    print(f"\33[1mOver Sampling Model : {str(os_model)}\33[0m")
    print("-------------------------------------------------------")
    print("\n")
    print("X_train_res shape : ",X_train_res.shape)
    print("y_train_res shape : ",Y_train_res.shape)
    print("y_train_res class count : \n" ,Y_train_res.value_counts(),"\n")

    print("X_test_res shape : " , X_test_res.shape)
    print("y_test_res shape : " , Y_test_res.shape)
    print("y_test_res class count : \n", Y_test_res.value_counts(),"\n")


    return (X_train_res, Y_train_res , X_test_res , Y_test_res)


## <span style = 'background :lightblue'>STEP 4 : Performance of ML Models on Different Under-Sampled Train and Test DataSets</span>

In [16]:
from sklearn.metrics import classification_report , accuracy_score

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [18]:
models = [LogisticRegression(max_iter=1000), SVC(), GaussianNB(),DecisionTreeClassifier(), RandomForestClassifier()]

In [19]:
for samp_model in over_sampl_models:
    
    
    X_train_res, y_train_res , X_test_res , y_test_res = oversampling(os_model = samp_model ,
                                                                        sampl_stra_train = 0.928, 
                                                                        sampl_stra_test = 0.854,  
                                                                        Xtrain = X_train , 
                                                                        ytrain = y_train , 
                                                                        Xtest = X_test,
                                                                        ytest = y_test ,
                                                                     )

    for model in models :

        model.fit(X_train_res.values, y_train_res)
        y_hat_train = model.predict(X_train_res.values)
        y_hat_test =  model.predict(X_test_res.values)

        print(f"******************* \33[1m{str(model)}\33[0m *******************\n")

        print("\33[1mTraining Accuracy :\33[0m \n")
        print(accuracy_score(y_train_res,y_hat_train ))

        print() 

        print("\33[1mTesting Accuracy :\33[0m \n")
        print(accuracy_score(y_test_res,y_hat_test ))

        print()


-------------------------------------------------------
[1mOver Sampling Model : RandomOverSampler()[0m
-------------------------------------------------------


X_train_res shape :  (7999, 51)
y_train_res shape :  (7999,)
y_train_res class count : 
 0    4149
1    3850
Name: Churn, dtype: int64 

X_test_res shape :  (1900, 51)
y_test_res shape :  (1900,)
y_test_res class count : 
 0    1025
1     875
Name: Churn, dtype: int64 

******************* [1mLogisticRegression(max_iter=1000)[0m *******************

[1mTraining Accuracy :[0m 

0.7727215901987748

[1mTesting Accuracy :[0m 

0.741578947368421

******************* [1mSVC()[0m *******************

[1mTraining Accuracy :[0m 

0.812726590823853

[1mTesting Accuracy :[0m 

0.7336842105263158

******************* [1mGaussianNB()[0m *******************

[1mTraining Accuracy :[0m 

0.7468433554194275

[1mTesting Accuracy :[0m 

0.7121052631578947

******************* [1mDecisionTreeClassifier()[0m ****************

## Conclusion:

- Out of all the over sampling algorithm **KMeansSMOTE()** is the best performing Algorithm