# **Prepare Data**

In [7]:
import pandas as pd
import numpy as np


In [8]:
df = pd.read_csv("employee_churn_data_clearned.csv")
df

Unnamed: 0,department_0,department_1,department_2,department_3,department_4,department_5,department_6,department_7,department_8,department_9,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month,left
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0.0,3,0,5.0,3.0,0,0.0,0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,4,2,8.0,1.0,0,2.0,0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,3,2,5.0,3.0,1,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,3.0,2,1,5.0,3.0,1,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,1.0,4,2,5.0,4.0,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10489,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,3,1,6.0,0.0,0,0.0,1
10490,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,3,1,6.0,0.0,0,1.0,1
10491,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,4,1,8.0,4.0,0,2.0,1
10492,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,3,1,6.0,0.0,0,1.0,1


In [9]:
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score,KFold

In [10]:
X = df.loc[:,df.columns != 'left']
y = df['left']
X_train ,X_test,y_train,y_test  = train_test_split(X,y,test_size=0.3,random_state=0)


In [11]:
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

tmk = TomekLinks(n_jobs=-1)
X_train,y_train = tmk.fit_resample(X_train,y_train)
smote = SMOTE(random_state=0)
X_train,y_train = smote.fit_resample(X_train,y_train)

# **Voting Classifier**

In [12]:
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

## **Default parameters**

In [13]:
clf1 = KNeighborsClassifier() 
clf2 = SVC()  


voting_clf = VotingClassifier(
    estimators=[('SVC', clf2), ('KNeighborsClassifier', clf1)],
    voting='hard'
    )

score = cross_val_score(voting_clf, X_train, y_train, cv=5)
print("train score : ", score.mean())

voting_clf.fit(X_train, y_train)
print("test score : ", voting_clf.score(X_test, y_test))

train score :  0.8606872570889772
test score :  0.8751984757065735


## **With hyper parameters**

In [14]:
import warnings
warnings.filterwarnings('ignore')

In [15]:
grid = {
    "n_neighbors": [20],
    "algorithm": ['auto'],
    "leaf_size": [10],
    "p": [0.01, 0.1],
}
gcv_knb = GridSearchCV(KNeighborsClassifier(), grid)

grid = {
    "gamma" : ["scale","auto"],
}

gcv_svc = GridSearchCV(SVC(),grid)

In [16]:
voting_clf = VotingClassifier(
    estimators=[('gcv_svc', gcv_svc), ('gcv_knb', gcv_knb)],
    voting='hard'
    )

score = cross_val_score(voting_clf, X_train, y_train, cv=5)

print("train score : ", score.mean())
voting_clf.fit(X_train, y_train)
print("test score : ", voting_clf.score(X_test, y_test))

train score :  0.8666551985833545
test score :  0.8755160368370911


# **Stacking Classifier**

In [17]:
from sklearn.ensemble import StackingClassifier

## **default parameters**

In [18]:
stack_clf = StackingClassifier(
    estimators=[('SVC', SVC()), ('KNeighborsClassifier', KNeighborsClassifier())],
    stack_method='predict',
    cv=10
    )

score = cross_val_score(stack_clf, X_train, y_train, cv=5)

print("train score : ", score.mean())
stack_clf.fit(X_train, y_train)
print("test score : ", stack_clf.score(X_test, y_test))

train score :  0.8758809649748176
test score :  0.8932994601460781


## **With hyper parameter**

In [19]:
grid = {
    "n_neighbors": [20],
    "algorithm": ['auto'],
    "leaf_size": [10],
    "p": [0.01, 0.1],
}
gcv_knb = GridSearchCV(KNeighborsClassifier(), grid,n_jobs=-1)

grid = {
    "gamma" : ["scale","auto"],
}

gcv_svc = GridSearchCV(SVC(),grid)

In [20]:
stack_clf = StackingClassifier(
    estimators=[('gcv_svc', gcv_svc), ('gcv_knb', gcv_knb)],
    stack_method='auto'
    )

score = cross_val_score(stack_clf, X_train, y_train, cv=5)

print("train score : ", score.mean())
stack_clf.fit(X_train, y_train)
print("test score : ", stack_clf.score(X_test, y_test))

train score :  0.8670618862543982
test score :  0.8771038424896792


# **Baseline** 

In [21]:
from sklearn.dummy import DummyClassifier
dummy_clf = DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_test,y_test)


0.49190219117180056

In [22]:
dummy_clf = DummyClassifier(strategy='prior')
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_test,y_test)

0.49190219117180056

In [23]:
dummy_clf = DummyClassifier(strategy='stratified')
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_test,y_test)

0.5084153699587171

In [24]:
dummy_clf = DummyClassifier(strategy='uniform')
dummy_clf.fit(X_train, y_train)
dummy_clf.score(X_test,y_test)

0.4922197523023182

In [25]:
%pip install nbconvert

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
