**AGENDA**

1) Using SMOTE oversampling method to address class imbalance <br>
2) Comparison of results with and without SMOTE. <br>
3) Applying LDA transform for dimensionality reduction to enhance separability. <br>
4) Training with different classifiers.

In [1]:
import pandas as pd,numpy as np
import matplotlib.pyplot as plt, seaborn as sns

%matplotlib inline

In [2]:
# Read the attrition data once; keep the freshly loaded frame as the untouched
# reference and do all further work on an independent copy of it.
ibm_df_original = pd.read_csv('IBMAttrition.csv')
ibm_df = ibm_df_original.copy()

In [3]:
## Basic checks on data

# info() prints column dtypes and non-null counts straight to stdout.
ibm_df.info() ## No null values on first look.
# NOTE(review): this count is computed but never shown -- a cell only displays
# its last expression -- so the "no duplicates" remark is not backed by the
# recorded output. Wrap it in print()/display() to make the check visible.
sum(ibm_df.duplicated()) # No duplicate values
# Last expression of the cell, so this summary table is what gets displayed.
ibm_df.describe() ## Descriptive stats - numeric columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
Age                         1470 non-null int64
Attrition                   1470 non-null object
BusinessTravel              1470 non-null object
DailyRate                   1470 non-null int64
Department                  1470 non-null object
DistanceFromHome            1470 non-null int64
Education                   1470 non-null int64
EducationField              1470 non-null object
EmployeeCount               1470 non-null int64
EmployeeNumber              1470 non-null int64
EnvironmentSatisfaction     1470 non-null int64
Gender                      1470 non-null object
HourlyRate                  1470 non-null int64
JobInvolvement              1470 non-null int64
JobLevel                    1470 non-null int64
JobRole                     1470 non-null object
JobSatisfaction             1470 non-null int64
MaritalStatus               1470 non-null object
MonthlyIncome         

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [4]:
# Inspect the columns that look uninformative in the describe() table.
# NOTE(review): only the last expression (StandardHours) is displayed; the
# first two results are computed and discarded.
ibm_df.EmployeeCount.value_counts() ## Only one value (1), can be omitted
ibm_df.EmployeeNumber.nunique() ## Number can be omitted as it provides no information.
ibm_df.StandardHours.value_counts() ## Again, single value class, can be omitted.

80    1470
Name: StandardHours, dtype: int64

In [5]:
## Checking rest of the numeric columns as well.

# NOTE(review): the bare select_dtypes() call below is not the cell's last
# expression, so its result is discarded and nothing is shown for it.
ibm_df.select_dtypes([np.number])
# Summary statistics for the numeric columns listed here by name.
ibm_df.select_dtypes([np.number])[['JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating']].describe()

Unnamed: 0,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,2.728571,6502.931293,14313.103401,2.693197,15.209524,3.153741
std,1.102846,4707.956783,7117.786044,2.498009,3.659938,0.360824
min,1.0,1009.0,2094.0,0.0,11.0,3.0
25%,2.0,2911.0,8047.0,1.0,12.0,3.0
50%,3.0,4919.0,14235.5,2.0,14.0,3.0
75%,4.0,8379.0,20461.5,4.0,18.0,3.0
max,4.0,19999.0,26999.0,9.0,25.0,4.0


In [6]:
# Checking object columns.

# exclude=[np.number] restricts describe() to the non-numeric columns, giving
# count / unique / top / freq per column instead of mean and quantiles.
ibm_df.describe(exclude=[np.number]) ## Descriptive stats - non-numeric columns

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
count,1470,1470,1470,1470,1470,1470,1470,1470,1470
unique,2,3,3,6,2,9,3,1,2
top,No,Travel_Rarely,Research & Development,Life Sciences,Male,Sales Executive,Married,Y,No
freq,1233,1043,961,606,882,326,673,1470,1054


In [7]:
# NOTE(review): the BusinessTravel counts are computed but not displayed (only
# the last expression of a cell is shown), so the evidence for merging the two
# categories is not visible in the recorded output.
ibm_df.BusinessTravel.value_counts() ## Travel Rarely and Non-Travel can be combined.
# Over18 holds a single value for every row, so it carries no information.
ibm_df.Over18.value_counts() ## Can be omitted.

Y    1470
Name: Over18, dtype: int64

In [8]:
# Data cleaning: remove the columns flagged as uninformative in the checks
# above (constant columns and the employee identifier).

uninformative_cols = ['EmployeeCount','EmployeeNumber','StandardHours','Over18']
ibm_df.drop(uninformative_cols, axis=1, inplace=True)
ibm_df.shape

(1470, 31)

In [9]:
# Collapse BusinessTravel into a binary flag: 1 for frequent travellers, 0 for
# everyone else, then rename the column to reflect its new meaning.
travel_flag_by_category = {
    'Travel_Rarely':0,
    'Non-Travel':0,
    'Travel_Frequently':1,
}
ibm_df['BusinessTravel'] = ibm_df['BusinessTravel'].map(travel_flag_by_category)
ibm_df.rename(columns={'BusinessTravel':'Freqnt_BusinessTravel'}, inplace=True)

In [10]:
# Print "<column>:<number of distinct values>" for every numeric column.
numeric_cardinality = ibm_df.select_dtypes([np.number]).nunique()
for col_name, distinct_count in numeric_cardinality.items():
    print('{}:{}'.format(col_name, distinct_count))

Age:43
Freqnt_BusinessTravel:2
DailyRate:886
DistanceFromHome:29
Education:5
EnvironmentSatisfaction:4
HourlyRate:71
JobInvolvement:4
JobLevel:5
JobSatisfaction:4
MonthlyIncome:1349
MonthlyRate:1427
NumCompaniesWorked:10
PercentSalaryHike:15
PerformanceRating:2
RelationshipSatisfaction:4
StockOptionLevel:4
TotalWorkingYears:40
TrainingTimesLastYear:7
WorkLifeBalance:4
YearsAtCompany:37
YearsInCurrentRole:19
YearsSinceLastPromotion:16
YearsWithCurrManager:18


In [11]:
def dummy_encode(df,col_list):
    """One-hot encode the given columns, dropping each column's first level.

    Each column in ``col_list`` is replaced by its indicator columns, which are
    appended at the right-hand side of the frame in ``col_list`` order. The
    indicator names are ``<column>__<level>`` (double underscore: the prefix
    already ends in ``_`` and pandas adds its own separator).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns to encode.
    col_list : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The encoded frame. When ``col_list`` is empty, ``df`` itself is returned.
    """
    encoded = df
    for column in col_list:
        indicators = pd.get_dummies(encoded[column], prefix=column + '_', drop_first=True)
        # Remove the source column first, then append its indicators on the right.
        remaining = encoded.drop(column, axis=1)
        encoded = pd.concat([remaining, indicators], axis=1)
    return encoded

In [12]:
# One-hot encode every remaining non-numeric column except the target.
# The target is excluded by name rather than by slicing off the first entry, so
# the cell no longer depends on 'Attrition' happening to be the first
# non-numeric column in the frame.
col_list = [col for col in ibm_df.select_dtypes(exclude=[np.number]).columns
            if col != 'Attrition']

ibm_df = dummy_encode(ibm_df,col_list)

In [13]:
# Spot-check three random rows of the encoded frame. No random_state is passed,
# so a different set of rows is shown on every run.
ibm_df.sample(3)

Unnamed: 0,Age,Attrition,Freqnt_BusinessTravel,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole__Laboratory Technician,JobRole__Manager,JobRole__Manufacturing Director,JobRole__Research Director,JobRole__Research Scientist,JobRole__Sales Executive,JobRole__Sales Representative,MaritalStatus__Married,MaritalStatus__Single,OverTime__Yes
363,33,Yes,0,350,5,3,4,34,3,1,...,0,0,0,0,0,0,1,0,1,1
915,21,Yes,1,251,10,2,1,45,2,1,...,1,0,0,0,0,0,0,0,1,0
241,32,No,0,976,26,4,3,100,3,2,...,0,0,0,0,0,1,0,1,0,0


In [14]:
## Target vector, raw feature matrix, and a standardised copy of the features

y = ibm_df['Attrition']
X = ibm_df.drop('Attrition', axis=1)

from sklearn.preprocessing import StandardScaler

# fit_transform returns a plain array, so wrap it back into a DataFrame to keep
# the feature names attached to the scaled values.
X_scaled = pd.DataFrame(data=StandardScaler().fit_transform(X), columns=X.columns)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [15]:
## Applying Dimensionality reduction techniques

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis

## Class imbalance handling techniques

from imblearn.over_sampling import SMOTE

In [16]:
from sklearn.model_selection import train_test_split,cross_val_score,RandomizedSearchCV,GridSearchCV

In [17]:
# Class counts in the original (imbalanced) data.
print('Before SMOTE Oversampling no of class instance Yes',sum(y=='Yes'))
print('Before SMOTE Oversampling no of class instance No',sum(y=='No'))

# Seed SMOTE so the synthetic minority rows -- and every score computed from
# them further down -- are reproducible across kernel restarts.
sm = SMOTE(random_state=42)
# fit_resample is the supported method name; the older fit_sample alias was
# deprecated and is no longer available in recent imbalanced-learn releases.
X_sm, y_sm = sm.fit_resample(X,y)
print()

# Class counts after oversampling (these lines were previously mislabelled "Before").
print('After SMOTE Oversampling no of class instance Yes',sum(y_sm=='Yes'))
print('After SMOTE Oversampling no of class instance No',sum(y_sm=='No'))

# NOTE(review): oversampling and scaling are both fitted on the full data set,
# before any train/test split. Test rows therefore influence the synthetic
# samples and the scaler statistics, which can inflate downstream scores.
# Consider splitting first and resampling/scaling the training part only.
X_sm_scaled = StandardScaler().fit_transform(X_sm)

Before SMOTE Oversampling no of class instance Yes 237
Before SMOTE Oversampling no of class instance No 1233

Before SMOTE Oversampling no of class instance Yes 1233
Before SMOTE Oversampling no of class instance No 1233


In [18]:
 # Hold out 33% of the scaled, non-resampled data; the fixed seed makes the split repeatable.
 # NOTE(review): the split is not stratified on y -- consider stratify=y given the class imbalance.
 X_train1, X_test1, y_train1, y_test1 = train_test_split(X_scaled, y, test_size=0.33, random_state=42)

In [19]:
 # Same 67/33 split, applied to the SMOTE-resampled and scaled data.
 # NOTE(review): the data was oversampled before this split, so the test part also contains synthetic rows.
 X_train2, X_test2, y_train2, y_test2 = train_test_split(X_sm_scaled, y_sm, test_size=0.33, random_state=42)

In [20]:
## Applying LDA classifier

lda1 = LinearDiscriminantAnalysis()
lda1.fit(X_train1,y_train1)

lda2 = LinearDiscriminantAnalysis()
lda2.fit(X_train2,y_train2)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [29]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,precision_recall_curve,cohen_kappa_score

In [31]:
# Evaluate the LDA trained without SMOTE on its held-out split. Predict once
# and print each metric, separated by a 25-dash rule.
lda1_test_pred = lda1.predict(X_test1)
lda1_metrics = (classification_report, accuracy_score, confusion_matrix, cohen_kappa_score)
for position, metric_fn in enumerate(lda1_metrics):
    if position > 0:
        print('-'*25)
    print(metric_fn(y_test1, lda1_test_pred))

              precision    recall  f1-score   support

          No       0.90      0.95      0.93       416
         Yes       0.60      0.40      0.48        70

   micro avg       0.87      0.87      0.87       486
   macro avg       0.75      0.68      0.70       486
weighted avg       0.86      0.87      0.86       486

-------------------------
0.8744855967078189
-------------------------
[[397  19]
 [ 42  28]]
-------------------------
0.41040531402887714


In [32]:
# Evaluate the LDA trained on SMOTE-resampled data on its held-out split.
# Predict once and print each metric, separated by a 25-dash rule.
lda2_test_pred = lda2.predict(X_test2)
lda2_metrics = (classification_report, accuracy_score, confusion_matrix, cohen_kappa_score)
for position, metric_fn in enumerate(lda2_metrics):
    if position > 0:
        print('-'*25)
    print(metric_fn(y_test2, lda2_test_pred))

              precision    recall  f1-score   support

          No       0.82      0.74      0.78       417
         Yes       0.75      0.83      0.79       397

   micro avg       0.78      0.78      0.78       814
   macro avg       0.79      0.78      0.78       814
weighted avg       0.79      0.78      0.78       814

-------------------------
0.7825552825552825
-------------------------
[[308 109]
 [ 68 329]]
-------------------------
0.5659229086702138


In [34]:
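# CONCLUSION: AFTER SMOTE OVERSAMPLING, RECALL ON THE MINORITY CLASS ('Yes') ROSE FROM 0.40 TO 0.83 AND COHEN'S KAPPA FROM 0.41 TO 0.57.
# ALTHOUGH THE ACCURACY DROPPED (0.87 -> 0.78), ACCURACY CAN BE MISLEADING ON IMBALANCED DATA. FOR EXAMPLE, A CLASSIFIER THAT
# PREDICTED 'No' FOR EVERY ROW WOULD STILL HAVE REACHED 1233/(237+1233) = 83.88% ACCURACY.
# CAVEAT: THE SECOND TEST SET WAS SPLIT OFF AFTER OVERSAMPLING, SO IT CONTAINS SYNTHETIC ROWS AND IS NOT DIRECTLY COMPARABLE TO THE FIRST.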

0.8387755102040816

In [36]:
## USING LDA AS A Dimensionality reduction transformer TO OBTAIN BETTER SEPARABILITY.

# Shape of the resampled, scaled feature matrix before any LDA projection.
X_sm_scaled.shape

(2466, 43)

In [52]:
# Project both partitions of the resampled data with the LDA that was fitted on
# the training partition only. The projected training set feeds the classifiers below.
X_sm_scaled_lda_train, X_sm_scaled_lda_test = (
    lda2.transform(partition) for partition in (X_train2, X_test2)
)

In [56]:
# Importing Linear and Non Linear Models

from sklearn.linear_model import LogisticRegression,SGDClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import xgboost
from xgboost import XGBClassifier

In [68]:
def train_test_results(estimator,X,y,metric):
    """Fit a fresh estimator on a fixed 67/33 split of (X, y) and print one metric.

    Parameters
    ----------
    estimator : callable
        Zero-argument factory (e.g. an estimator class); it is called here to
        build the model with its default settings.
    X, y : array-like
        Features and labels; they are split internally with ``test_size=0.33``
        and ``random_state=42``.
    metric : callable
        Called as ``metric(y_true, y_pred)`` on the held-out part; its result
        is printed.

    Returns
    -------
    None
    """
    parts = train_test_split(X, y, test_size=0.33, random_state=42)
    features_fit, features_eval, labels_fit, labels_eval = parts

    model = estimator()
    model.fit(features_fit, labels_fit)
    predicted = model.predict(features_eval)

    print(metric(labels_eval, predicted))

In [110]:
# AdaBoost on the LDA-projected training partition. train_test_results splits
# its input again, so the scores are for a held-out 33% of that partition.
# Each call builds and fits a new model, so the two metrics below come from
# two separately fitted classifiers.
#train_test_results(LogisticRegression,X_sm_scaled_lda_train,y_train2,classification_report)
train_test_results(AdaBoostClassifier,X_sm_scaled_lda_train,y_train2,classification_report)
train_test_results(AdaBoostClassifier,X_sm_scaled_lda_train,y_train2,cohen_kappa_score)

              precision    recall  f1-score   support

          No       0.78      0.76      0.77       254
         Yes       0.80      0.82      0.81       292

   micro avg       0.79      0.79      0.79       546
   macro avg       0.79      0.79      0.79       546
weighted avg       0.79      0.79      0.79       546

              precision    recall  f1-score   support

          No       0.82      0.70      0.76       254
         Yes       0.77      0.86      0.81       292

   micro avg       0.79      0.79      0.79       546
   macro avg       0.79      0.78      0.79       546
weighted avg       0.79      0.79      0.79       546

0.572874576547896


In [112]:
## XGBClassifier results on a held-out 33% of the LDA-projected training partition
## (train_test_results splits the data it is given; these are not training-set scores).

train_test_results(XGBClassifier,X_sm_scaled_lda_train,y_train2,classification_report)
train_test_results(XGBClassifier,X_sm_scaled_lda_train,y_train2,cohen_kappa_score)

              precision    recall  f1-score   support

          No       0.78      0.75      0.76       254
         Yes       0.79      0.82      0.80       292

   micro avg       0.79      0.79      0.79       546
   macro avg       0.79      0.78      0.78       546
weighted avg       0.79      0.79      0.79       546

0.5681255830933354


In [104]:
def cross_validation(estimator,X,y,metric=None):
    """Print the mean 5-fold cross-validated score of a freshly built estimator.

    Parameters
    ----------
    estimator : callable
        Zero-argument factory (e.g. an estimator class); it is called here to
        build the model with its default settings.
    X, y : array-like
        Features and labels handed to ``cross_val_score``.
    metric : None, str or callable, optional
        How to score each fold. ``None`` (default) uses the estimator's own
        ``score`` method and the result is labelled "Mean Accuracy", exactly as
        before. A string is passed through as the scikit-learn ``scoring``
        name. A callable is treated as a metric function
        ``metric(y_true, y_pred)`` returning a number and is wrapped with
        ``make_scorer``. Previously this argument was accepted but ignored.

    Returns
    -------
    None
        The mean score is printed, not returned.
    """
    if metric is None or isinstance(metric, str):
        scoring = metric
    else:
        # Local import: make_scorer is not among the names imported by the notebook.
        from sklearn.metrics import make_scorer
        scoring = make_scorer(metric)

    model = estimator()
    scores = cross_val_score(model, X, y, cv=5, scoring=scoring)

    if metric is None:
        label = "Mean Accuracy"
    else:
        label = "Mean {}".format(getattr(metric, '__name__', metric))
    print(label, scores.mean())


def classification_linear__model_results(X,y):
    """Print a cross-validation summary for each linear classifier.

    For LogisticRegression and SGDClassifier this prints the model name, a
    dashed underline of the same length, the output of ``cross_validation``
    for that model on (X, y), and a blank line. Nothing is returned.
    """
    import warnings

    # Installs a process-wide "ignore" filter for FutureWarning; it stays in
    # effect after this function returns.
    warnings.simplefilter(action='ignore', category=FutureWarning)

    candidates = {
        'Logistic Regression': LogisticRegression,
        'SGDClassifier': SGDClassifier,
    }
    for label in candidates:
        underline = '-' * len(label)
        print(label)
        print(underline)
        cross_validation(candidates[label], X, y)
        print()
        
# Cross validation results with LDA: run on the LDA-projected training
# partition only (X_train2 after lda2.transform), not on the full resampled set.
classification_linear__model_results(X_sm_scaled_lda_train,y_train2) # Cross validation results with LDA

Logistic Regression
-------------------
Mean Accuracy 0.8129390288426434

SGDClassifier
-------------
Mean Accuracy 0.6862723621759766



In [105]:
# NOTE(review): this run uses the full resampled set (X_sm_scaled, y_sm), while
# the "with LDA" run uses only the training partition, so the two sets of
# scores are computed on different samples.
classification_linear__model_results(X_sm_scaled,y_sm) ## Cross validation results without LDA

Logistic Regression
-------------------
Mean Accuracy 0.801310029294625

SGDClassifier
-------------
Mean Accuracy 0.7250600704387611



In [106]:
def classification_nonlinear__model_results(X,y):
    """Print a cross-validation summary for each non-linear classifier.

    For every model in the table below (tree-based, boosting, nearest
    neighbours, SVC and XGBoost) this prints the model name, a dashed underline
    of the same length, the output of ``cross_validation`` for that model on
    (X, y), and a blank line. Nothing is returned.
    """
    import warnings

    # Installs a process-wide "ignore" filter for FutureWarning; it stays in
    # effect after this function returns.
    warnings.simplefilter(action='ignore', category=FutureWarning)

    candidates = {
        'DecisionTreeClassifier': DecisionTreeClassifier,
        'RandomForestClassifier': RandomForestClassifier,
        'ExtraTreesClassifier': ExtraTreesClassifier,
        'AdaBoostClassifier': AdaBoostClassifier,
        'KNeighborsClassifier': KNeighborsClassifier,
        'SVC': SVC,
        'XGBClassifier': XGBClassifier,
    }
    for label in candidates:
        underline = '-' * len(label)
        print(label)
        print(underline)
        cross_validation(candidates[label], X, y)
        print()
        
# Cross validation results with LDA: run on the LDA-projected training
# partition only (X_train2 after lda2.transform), not on the full resampled set.
classification_nonlinear__model_results(X_sm_scaled_lda_train,y_train2) ## Cross validation results  With LDA

DecisionTreeClassifier
----------------------
Mean Accuracy 0.7342533771449433

RandomForestClassifier
----------------------
Mean Accuracy 0.7463490324936108

ExtraTreesClassifier
--------------------
Mean Accuracy 0.7354581964220518

AdaBoostClassifier
------------------
Mean Accuracy 0.8117415115005476

KNeighborsClassifier
--------------------
Mean Accuracy 0.7941767068273093

SVC
---
Mean Accuracy 0.8093026652062797

XGBClassifier
-------------
Mean Accuracy 0.8129499817451625



In [107]:
# NOTE(review): this run uses the full resampled set (X_sm_scaled, y_sm), while
# the "with LDA" run uses only the training partition. SMOTE was also applied
# before cross-validation, so synthetic rows can land in both the training and
# validation folds -- treat these scores as optimistic.
classification_nonlinear__model_results(X_sm_scaled,y_sm) ## Cross validation results Without LDA

DecisionTreeClassifier
----------------------
Mean Accuracy 0.8496478061946611

RandomForestClassifier
----------------------
Mean Accuracy 0.9007669266976069

ExtraTreesClassifier
--------------------
Mean Accuracy 0.9388268983904414

AdaBoostClassifier
------------------
Mean Accuracy 0.8699697179158026

KNeighborsClassifier
--------------------
Mean Accuracy 0.8041209966755538

SVC
---
Mean Accuracy 0.9314884302689181

XGBClassifier
-------------
Mean Accuracy 0.889434185839834

