<h1>MODEL DEVELOPMENT AND EVALUATION</h1>

In [17]:
import pandas as pd
import numpy as np
from pandas import DataFrame
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib

In [18]:
data = pd.read_csv('./datasets/WA_Fn-UseC_-HR-Employee-Attrition.csv')
data = data.drop(columns=['StandardHours','EmployeeCount','Over18','EmployeeNumber','StockOptionLevel'])

le = preprocessing.LabelEncoder()
categorial_variables = ['Attrition','BusinessTravel','Department','EducationField',
                        'Gender','JobRole','MaritalStatus','OverTime']
for i in categorial_variables:
    data[i] = le.fit_transform(data[i])
data.head(5)
data.to_csv('./datasets/LabelEncoded_CleanData.csv')

In [19]:
target = data['Attrition']
train = data.drop('Attrition',axis = 1)
train.shape

(1470, 29)

<h3>Implementation of all the popular classifiers in scikit-learn</h3>

1. *Logistic Regression*
1. *SVM*
1. *KNN*
1. *Decision Tree*
1. *K Means Clustering*

In [20]:
train_accuracy = []
test_accuracy = []
models = ['Logistic Regression','SVM','KNN','Decision Tree','K Means Clustering']

In [21]:
#Defining a function which will give us train and test accuracy for each classifier.
def train_test_error(y_train,y_test):
    train_error = ((y_train==Y_train).sum())/len(y_train)*100
    test_error = ((y_test==Y_test).sum())/len(Y_test)*100
    train_accuracy.append(train_error)
    test_accuracy.append(test_error)
    print('{}'.format(train_error) + " is the train accuracy")
    print('{}'.format(test_error) + " is the test accuracy")

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(train, target, test_size=0.33, random_state=42)

## Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X_train,Y_train)
train_predict = log_reg.predict(X_train)
test_predict = log_reg.predict(X_test)
y_prob = log_reg.predict(train)
y_pred = np.where(y_prob > 0.5, 1, 0)
train_test_error(train_predict , test_predict)

86.99186991869918 is the train accuracy
86.0082304526749 is the test accuracy


In [25]:
filename = "trained_model"
joblib.dump(log_reg,filename)

['trained_model']

## SVM

In [8]:
from sklearn import svm
SVM = svm.SVC(probability=True)
SVM.fit(X_train,Y_train)
train_predict = SVM.predict(X_train)
test_predict = SVM.predict(X_test)
train_test_error(train_predict , test_predict)

83.02845528455285 is the train accuracy
85.59670781893004 is the test accuracy


## KNN

In [9]:
from sklearn import neighbors
n_neighbors = 15
knn = neighbors.KNeighborsClassifier(n_neighbors, weights='distance')
knn.fit(X_train,Y_train)
train_predict = knn.predict(X_train)
test_predict = knn.predict(X_test)
train_test_error(train_predict , test_predict)

100.0 is the train accuracy
84.5679012345679 is the test accuracy


## Decision Tree

In [10]:
from sklearn import tree
dec = tree.DecisionTreeClassifier()
dec.fit(X_train,Y_train)
train_predict = dec.predict(X_train)
test_predict = dec.predict(X_test)
train_test_error(train_predict , test_predict)

100.0 is the train accuracy
79.21810699588477 is the test accuracy


## K-MEANS CLUSTERING

In [11]:
from sklearn.cluster import KMeans
kms = KMeans(n_clusters=2, random_state=1)
kms.fit(X_train,Y_train)
train_predict = kms.predict(X_train)
test_predict = kms.predict(X_test)
train_test_error(train_predict,test_predict)

50.0 is the train accuracy
49.17695473251029 is the test accuracy


In [12]:
results = DataFrame({"Test Accuracy" : test_accuracy , "Train Accuracy" : train_accuracy} , index = models)

In [13]:
results

Unnamed: 0,Test Accuracy,Train Accuracy
Logistic Regression,85.390947,83.739837
SVM,85.596708,83.028455
KNN,84.567901,100.0
Decision Tree,79.218107,100.0
K Means Clustering,49.176955,50.0


**Since Logistic Regression has the highest test accuracy, Logistic regression is the winner**
