# **Prepare Data**

In [24]:
import pandas as pd
import numpy as np


In [25]:
df = pd.read_csv("employee_churn_data_clearned.csv")
df

Unnamed: 0,department_0,department_1,department_2,department_3,department_4,department_5,department_6,department_7,department_8,department_9,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.0,3,0,5.0,3.0,0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,4.0,3,1,6.0,1.0,0,1.0,0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,3,1,6.0,1.0,0,1.0,0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,4,2,8.0,1.0,0,2.0,0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,3,2,5.0,3.0,1,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9535,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1.0,4,1,8.0,2.0,0,2.0,1
9536,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3,1,8.0,3.0,0,2.0,1
9537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.0,3,0,7.0,4.0,0,1.0,1
9538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1.0,4,1,8.0,3.0,1,2.0,1


In [26]:
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score,KFold

In [27]:
X = df.loc[:,df.columns != 'left']
y = df['left']
X_train ,X_test,y_train,y_test  = train_test_split(X,y,test_size=0.3,random_state=0)


In [28]:
from imblearn.under_sampling import TomekLinks,EditedNearestNeighbours
from imblearn.over_sampling import SMOTE

print(f'Before')
print(f'{y_train.value_counts()}')
print(f'{y_train.value_counts(normalize=True)}')
print('=-=' * 10 )
tmk = TomekLinks(n_jobs=-1)
X_train,y_train = tmk.fit_resample(X_train,y_train)
enn = EditedNearestNeighbours()
X_train,y_train = enn.fit_resample(X_train,y_train)
smote = SMOTE(random_state=0)
X_train,y_train = smote.fit_resample(X_train,y_train)

print(f'After')
print(f'{y_train.value_counts()}')
print(f'{y_train.value_counts(normalize=True)}')

Before
left
0    4766
1    1912
Name: count, dtype: int64
left
0    0.713687
1    0.286313
Name: proportion, dtype: float64
=-==-==-==-==-==-==-==-==-==-=
After
left
0    3243
1    3243
Name: count, dtype: int64
left
0    0.5
1    0.5
Name: proportion, dtype: float64


# **Voting Classifier**

In [29]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

## **Default parameters**

In [30]:
clf1 = KNeighborsClassifier() 
clf2 = SVC()  


voting_clf = VotingClassifier(
    estimators=[('SVC', clf2), ('KNeighborsClassifier', clf1)],
    voting='hard'
    )

score = cross_val_score(voting_clf, X_train, y_train, cv=5)
print("train score : ", score.mean())

voting_clf.fit(X_train, y_train)
print("test score : ", voting_clf.score(X_test, y_test))

train score :  0.8826711636311366
test score :  0.8109713487071978


## **With hyper parameters**

In [31]:
import warnings
warnings.filterwarnings('ignore')

In [32]:
grid = {
    "n_neighbors": [20],
    "algorithm": ['auto'],
    "leaf_size": [10],
    "p": [0.01, 0.1],
}
gcv_knb = GridSearchCV(KNeighborsClassifier(), grid)

grid = {
    "gamma" : ["scale","auto"],
}

gcv_svc = GridSearchCV(SVC(),grid)

In [33]:
voting_clf = VotingClassifier(
    estimators=[('gcv_svc', gcv_svc), ('gcv_knb', gcv_knb)],
    voting='hard'
    )

score = cross_val_score(voting_clf, X_train, y_train, cv=5)

print("train score : ", score.mean())
voting_clf.fit(X_train, y_train)
print("test score : ", voting_clf.score(X_test, y_test))

train score :  0.883750340064128
test score :  0.8050314465408805


# **Stacking Classifier**

In [34]:
from sklearn.ensemble import StackingClassifier

## **default parameters**

In [35]:
stack_clf = StackingClassifier(
    estimators=[('SVC', SVC()), ('KNeighborsClassifier', KNeighborsClassifier())],
    stack_method='predict',
    cv=10
    )

score = cross_val_score(stack_clf, X_train, y_train, cv=5)

print("train score : ", score.mean())
stack_clf.fit(X_train, y_train)
print("test score : ", stack_clf.score(X_test, y_test))

train score :  0.9085725860198895
test score :  0.7788259958071279


## **With hyper parameter**

In [36]:
grid = {
    "n_neighbors": [20],
    "algorithm": ['auto'],
    "leaf_size": [10],
    "p": [0.01, 0.1],
}
gcv_knb = GridSearchCV(KNeighborsClassifier(), grid,n_jobs=-1)

grid = {
    "gamma" : ["scale","auto"],
}

gcv_svc = GridSearchCV(SVC(),grid)

In [37]:
stack_clf = StackingClassifier(
    estimators=[('gcv_svc', gcv_svc), ('gcv_knb', gcv_knb)],
    stack_method='auto'
    )

score = cross_val_score(stack_clf, X_train, y_train, cv=5)

print("train score : ", score.mean())
stack_clf.fit(X_train, y_train)
print("test score : ", stack_clf.score(X_test, y_test))

train score :  0.8903793630673131
test score :  0.7816212438853948


# **Baseline** 

In [48]:
from sklearn.metrics import classification_report

In [49]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train, y_train)
print(classification_report(y_test,dummy_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.70      1.00      0.82      1990
           1       0.00      0.00      0.00       872

    accuracy                           0.70      2862
   macro avg       0.35      0.50      0.41      2862
weighted avg       0.48      0.70      0.57      2862



In [50]:
dummy_clf = DummyClassifier(strategy='prior')
dummy_clf.fit(X_train, y_train)
print(classification_report(y_test,dummy_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.70      1.00      0.82      1990
           1       0.00      0.00      0.00       872

    accuracy                           0.70      2862
   macro avg       0.35      0.50      0.41      2862
weighted avg       0.48      0.70      0.57      2862



In [51]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X_train, y_train)
print(classification_report(y_test,dummy_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.69      0.49      0.57      1990
           1       0.30      0.49      0.37       872

    accuracy                           0.49      2862
   macro avg       0.49      0.49      0.47      2862
weighted avg       0.57      0.49      0.51      2862



In [52]:
dummy_clf = DummyClassifier(strategy='uniform')
dummy_clf.fit(X_train, y_train)
print(classification_report(y_test,dummy_clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.67      0.48      0.56      1990
           1       0.28      0.46      0.35       872

    accuracy                           0.47      2862
   macro avg       0.47      0.47      0.45      2862
weighted avg       0.55      0.47      0.49      2862

