In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

# Classification Task:

In this section, we find the best classifier that can be used to predict the variable - 'Attrition' based on the various features present in the dataset.

Because correctly identifying attrition is critical in our case, we evaluate each model on the recall of the attrition class (we want the model to produce few false negatives) together with its overall accuracy.

In [None]:
import pandas as pd
import numpy as np

import  matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the HR attrition CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv')
# Preview the first rows as the cell output.
df.head()

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

### Removing the unwanted EmployeeNumber, Over18, EmployeeCount and StandardHours columns

In [None]:
# Remove the four columns that are not used as features, in a single call.
columns_to_remove = ['EmployeeNumber', 'Over18', 'EmployeeCount', 'StandardHours']
df = df.drop(columns=columns_to_remove)
df.columns

### Visualizing Target Variable

In [None]:
# Bar chart of the class counts of the target variable.
attrition_counts = df['Attrition'].value_counts()
ax = attrition_counts.plot(kind='bar', color="blue", alpha=.65)
ax.set_title("Attrition Breakdown")

### OBSERVATION
The dataset is imbalanced, so a resampling technique such as SMOTE should be applied.

### Heatmap

In [None]:
# Correlation heatmap of the numeric columns.
# At this point df still holds string columns (e.g. Attrition, Gender), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the frame
# is restricted to numeric dtypes explicitly before computing correlations.
plt.figure(figsize=(18,15))
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=False)

### How is Gender Related to Attrition?

In [None]:
# Grouped bar chart: attrition counts per gender.
gender_table = pd.crosstab(df['Gender'], df['Attrition'])
ax = gender_table.plot(kind='bar')
ax.set_title('Attrition with respect to Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Frequency of Attrition')

Based on the above chart, Gender seems to have some effect on Attrition: more male employees than female employees have left the company. We will cross-check this using a Chi-square test.

How is Gender Related to Attrition?

### Is Business Travel the reason for Attrition?

In [None]:
# Grouped bar chart: attrition counts per business-travel category.
travel_table = pd.crosstab(df['BusinessTravel'], df['Attrition'])
ax = travel_table.plot(kind='bar')
ax.set_title('Attrition with respect to BusinessTravel')
ax.set_xlabel('BusinessTravel')
ax.set_ylabel('Frequency of Attrition')
plt.xticks(rotation=40)

### OBSERVATION
The attrition count increases from Non-Travel to Travel_Frequently to Travel_Rarely, but the number of employees in each group increases as well. We will perform a statistical analysis to confirm whether business travel is statistically related to attrition.

### Is there higher Attrition for a specific department?

In [None]:
# Stacked bar chart: attrition counts per department.
department_table = pd.crosstab(df['Department'], df['Attrition'])
ax = department_table.plot(kind='bar', stacked=True)
ax.set_title('Attrition with respect to Department')
ax.set_xlabel('Department')
ax.set_ylabel('Frequency of Attrition')
plt.xticks(rotation=40)

### Attrition with respect to Education Field

In [None]:

# Grouped bar chart: attrition counts per education field.
education_table = pd.crosstab(df['EducationField'], df['Attrition'])
ax = education_table.plot(kind='bar', stacked=False)
ax.set_title('Attrition with respect to EducationField')
ax.set_xlabel('EducationField')
ax.set_ylabel('Frequency of Attrition')

### Attrition with respect to Job Role

In [None]:

# Grouped bar chart: attrition counts per job role.
jobrole_table = pd.crosstab(df['JobRole'], df['Attrition'])
ax = jobrole_table.plot(kind='bar', stacked=False)
ax.set_title('Attrition with respect to JobRole')
ax.set_xlabel('JobRole')
ax.set_ylabel('Frequency of Attrition')

### Attrition with respect to marital status

In [None]:
# Grouped bar chart: attrition counts per marital status.
marital_table = pd.crosstab(df['MaritalStatus'], df['Attrition'])
ax = marital_table.plot(kind='bar', stacked=False)
ax.set_title('Attrition with respect to MaritalStatus')
ax.set_xlabel('MaritalStatus')
ax.set_ylabel('Frequency of Attrition')

In [None]:
# Replace the string labels of three columns with integer codes.
# Note: this cell is not idempotent - re-running it maps the already-encoded
# integers to NaN and the integer cast then fails.
label_encodings = {
    'Gender': {'Female': 0, 'Male': 1},
    'BusinessTravel': {'Travel_Rarely': 2, 'Travel_Frequently': 1, 'Non-Travel': 0},
    'OverTime': {'Yes': 0, 'No': 1},
}
for column, encoding in label_encodings.items():
    df[column] = df[column].map(encoding).astype(int)

### Using dummy variable to convert categorical variables to numerical variables

In [None]:
# One-hot encode the four remaining categorical columns.
# A prefix is added to every dummy column: without it, a category label that
# occurs in more than one source column would yield several columns with the
# same name once the dummy frames are concatenated, making them
# indistinguishable by name.
dummy1 = pd.get_dummies(df['EducationField'], prefix='EducationField')
dummy2 = pd.get_dummies(df['JobRole'], prefix='JobRole')
dummy3 = pd.get_dummies(df['MaritalStatus'], prefix='MaritalStatus')
dummy4 = pd.get_dummies(df['Department'], prefix='Department')

### Joining the original dataframe and all the dummy variables produced


In [None]:
# Append the dummy columns to the right of the existing frame.
dummy_frames = [dummy1, dummy2, dummy3, dummy4]
df = pd.concat([df, *dummy_frames], axis=1)

### Dropping the original variables from the dataframe

In [None]:
# The original categorical columns are now represented by their dummies.
df = df.drop(columns=['EducationField', 'JobRole', 'MaritalStatus', 'Department'])

### Mapping Attrition variable

In [None]:
# Encode the target: 'Yes' (employee left) -> 0, 'No' -> 1.
# Class 0 is therefore the attrition class; the later cells report recall[0] / fscore[0] for it.
df['Attrition'] = df['Attrition'].map({'Yes':0, 'No':1}).astype(int)

### Attrition column will be assigned to Y and all other variables are assigned to X

In [None]:
# Target column goes to Y, every remaining column to the feature matrix X.
Y = df['Attrition']
X = df.drop(columns=['Attrition'])

In [None]:
# (number of rows, number of feature columns)
X.shape

From the shape function on X, we know that the dataframe has *1470 data points* and *48 features* that can be used to predict 'Attrition' stored in y. 

The different classification models that can be used for this task are:
- KNN Classification
- Logistic Regression
- Support Vector Machine - Linear SVC
- Support Vector Machine with Kernel trick – Rbf, Poly, Linear
- Decision Tree

But before we start applying these classification models, a three-fold split is performed on the entire dataset

Thus, we fit the above models using train and validation set and after finding the best classifier, we check for the accuracy of the best classifier using test set.

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# First split: hold out the test set.
X_trainval_org, X_test_org, y_trainval, y_test = train_test_split(X,Y, random_state = 2)

# Second split: divide train+validation into training and validation sets.
X_train_org, X_valid_org, y_train, y_valid = train_test_split(X_trainval_org, y_trainval, random_state=1)

# Scaler for the naive grid searches: fitted on the training rows only and then
# applied unchanged to the validation rows. (Previously the scaler was re-fitted
# on the validation set, so train and validation features were scaled with
# different min/max values.)
train_scaler = MinMaxScaler()
X_train = train_scaler.fit_transform(X_train_org)
X_valid = train_scaler.transform(X_valid_org)

# Scaler for cross-validation / final evaluation: fitted on train+validation
# and applied unchanged to the test rows.
scaler = MinMaxScaler()
X_trainval = scaler.fit_transform(X_trainval_org)
X_test = scaler.transform(X_test_org)

print("Size of training set: {}   size of validation set: {}   size of test set:"
      " {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

From the results, it can be said that the training set has 826 data points, validation set has 276 data points and testing set has 368 data points.

Let us fit model - KNN Classifier using train and validation set and find the best parameter - *'K'* using naive grid search.

## KNN Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Naive grid search over k: fit on the training set, score (accuracy) on the
# validation set, and remember the first k that reaches the highest score.
train_score_array = []
valid_score_array = []

best_score = 0

x_axis = range(1,10)
for k in x_axis:
    knn_clf = KNeighborsClassifier(n_neighbors=k)
    knn_clf.fit(X_train, y_train)
    train_score_array.append(knn_clf.score(X_train, y_train))
    score = knn_clf.score(X_valid, y_valid)
    valid_score_array.append(score)
    if score > best_score:
        best_score = score
        best_parameters = {'K': k}
        best_K = k

plt.plot(x_axis, train_score_array, c = 'g', label = 'Train Score')
plt.plot(x_axis, valid_score_array, c = 'b', label = 'Validation Score')
plt.legend()
plt.xlabel('k')
# KNeighborsClassifier.score returns mean accuracy, not MSE.
plt.ylabel('Accuracy')

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

The best parameter value of K for this model is 8 which gives an accuracy of 0.84 on the validation dataset.

We review this parameter using cross validation:

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# 10-fold cross-validated accuracy of KNN with the k chosen by the naive search.
knn_grid = KNeighborsClassifier(n_neighbors=best_K)
scores = cross_val_score(knn_grid, X_trainval, y_trainval, scoring='accuracy', cv=10)

print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))

The average cross-validation score for the parameter K = 8 is 0.85 

Let us find the best parameter for this model using GridSearchCV:

In [None]:
# Cross-validated grid search over k = 1..10 on the train+validation data.
k_range = list(range(1, 11))

param_grid = dict(n_neighbors=k_range)

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, return_train_score=True)

grid_search.fit(X_trainval, y_trainval)

# Keep the CV results in their own variable: assigning them to `df` would
# overwrite the dataset frame and break any re-run of the earlier cells.
knn_cv_results = pd.DataFrame(grid_search.cv_results_)
x_axis = k_range
plt.plot(x_axis, knn_cv_results.mean_train_score, c = 'g', label = 'Train Score')
plt.plot(x_axis, knn_cv_results.mean_test_score, c = 'b', label = 'Validation Score')
plt.legend()
plt.xlabel('k')
plt.ylabel('CV Score')

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

The best parameter for this model using GridSearchCV is 8. The naive grid search also selected K = 8, but the average cross-validation score for K = 8 is 0.85, which is better than the score obtained by the naive grid search.

In [None]:
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, classification_report

 
# Evaluate on the held-out test set. `grid_search` is whichever search was fitted
# most recently (the KNN search when cells are run in order); predict() uses the
# estimator refitted with the best parameters (GridSearchCV default refit=True).
pred_knn = grid_search.predict(X_test)
print(metrics.accuracy_score(y_test,pred_knn))

# Rows are true classes, columns are predicted classes (0 = attrition 'Yes', 1 = 'No').
confusion = confusion_matrix(y_test, pred_knn)
print("Confusion matrix:\n{}".format(confusion))

print(classification_report(y_test,pred_knn))

In [None]:
# NOTE: this alias rebinds the global name `score`, which the search loops also
# use for a float; it must be re-imported before each use, as done here.
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class arrays; index 0 is the class encoded as 0 (Attrition == 'Yes').
precision,recall,fscore,support=score(y_test,pred_knn)

print ('Recall    : {}'.format(recall[0]))
print ('F1-Score    : {}'.format(fscore[0]))

Thus, the accuracy,recall and f1-score of this model for the best parameter - K are 0.85,0.15 and 0.23 respectively

In [None]:
# Collects [accuracy, recall of class 0, F1 of class 0] per classifier.
Classification_Scores = {}

knn_accuracy = metrics.accuracy_score(y_test, pred_knn)
Classification_Scores['KNN Classification'] = [knn_accuracy, recall[0], fscore[0]]

In [None]:
# Summary table of the tuned models, one row per classifier.
# DataFrame.append was removed in pandas 2.0, so the table is created directly
# from its first record instead of appending to an empty frame.
columns = ['Classifier','Best Parameters','Accuracy_Score','Recall of 0']
clf_model_para = pd.DataFrame(
    [{'Classifier':'KNN Classification',
      'Best Parameters':grid_search.best_params_,
      'Accuracy_Score':metrics.accuracy_score(y_test,pred_knn),
      'Recall of 0':recall[0]}],
    columns=columns)

## Logistic Regression

In [None]:
from warnings import simplefilter

simplefilter(action='ignore', category=FutureWarning)

from sklearn.linear_model import LogisticRegression

# Naive grid search over C for l1- and l2-penalised logistic regression:
# fit on the training set, score (accuracy) on the validation set.
c_range = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
train_score_l1 = []
train_score_l2 = []
valid_score_l1 = []
valid_score_l2 = []

best_score = 0
l1 = 'l1'
l2 = 'l2'

for c in c_range:
    # liblinear supports the l1 penalty; lbfgs supports only l2.
    log_l1 = LogisticRegression(penalty = 'l1', C = c,solver='liblinear')
    log_l2 = LogisticRegression(penalty = 'l2', C = c,solver='lbfgs')

    log_l1.fit(X_train, y_train)
    log_l2.fit(X_train, y_train)

    train_score_l1.append(log_l1.score(X_train, y_train))
    train_score_l2.append(log_l2.score(X_train, y_train))

    # Strict '>' keeps the first configuration that reaches the best score.
    score = log_l1.score(X_valid, y_valid)
    valid_score_l1.append(score)
    if score > best_score:
        best_score = score
        best_parameters = {'C': c , 'penalty': l1}
        best_C = c
        best_Penalty = 'l1'

    score = log_l2.score(X_valid, y_valid)
    valid_score_l2.append(score)
    if score > best_score:
        best_score = score
        best_parameters = {'C': c , 'penalty' : l2}
        best_C = c
        best_Penalty = 'l2'

# The second curve in each panel is computed on the validation set, so it is
# labelled as such (it was previously labelled "Test score").
plt.subplot(1,2,1)
plt.plot(c_range, train_score_l1, label = 'Train score, penalty = l1')
plt.plot(c_range, valid_score_l1, label = 'Validation score, penalty = l1')
plt.xscale('log')
plt.legend()
plt.subplot(1,2,2)
plt.plot(c_range, train_score_l2, label = 'Train score, penalty = l2')
plt.plot(c_range, valid_score_l2, label = 'Validation score, penalty = l2')
plt.xscale('log')
plt.legend()

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

The best parameter value of C and Penalty for this model are 1 and `l2` respectively which gives a score of 0.88 on the validation dataset.

We review this parameter using cross validation.

In [None]:
# 10-fold cross-validated accuracy for the configuration chosen by the naive search.
# The solver must match the penalty: the default lbfgs solver raises for 'l1',
# so the same solver pairing as in the naive search is used.
best_solver = 'liblinear' if best_Penalty == 'l1' else 'lbfgs'
log_grid = LogisticRegression(penalty = best_Penalty, C = best_C, solver = best_solver)

scores = cross_val_score(log_grid, X_trainval, y_trainval, cv =10, scoring = 'accuracy')
print("Cross-validation scores: {}".format(scores))

print("Average cross-validation score: {:.2f}".format(scores.mean()))

The average cross-validation score for the best parameters (1 and l2) is 0.88.

Let us find the best parameter for this model using GridSearchCV:

In [None]:
# Cross-validated grid search over penalty and C.
# Each penalty is paired with a solver that supports it; with the default lbfgs
# solver every 'l1' candidate fails to fit and is silently scored as NaN.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [{'penalty': ['l1'], 'solver': ['liblinear'], 'C': c_values},
              {'penalty': ['l2'], 'solver': ['lbfgs'], 'C': c_values}]

grid_search = GridSearchCV(LogisticRegression(max_iter=10000), param_grid, cv=10, return_train_score=True)

grid_search.fit(X_trainval, y_trainval)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

The best parameter for this model using GridSearchCV is `C=1` and `penalty = l2` giving average cross-validation score same as that of naive grid search.

Thus, we will choose the best parameter for Logistic Regression as `C=1` and `penalty = l2`.

In [None]:
# Test-set evaluation of the most recently fitted grid search
# (the logistic-regression search when cells are run in order).
pred_log = grid_search.predict(X_test)
print(metrics.accuracy_score(y_test,pred_log))

confusion = confusion_matrix(y_test, pred_log)
print("Confusion matrix:\n{}".format(confusion))

print(classification_report(y_test,pred_log))

In [None]:
# Re-import is required: the search loop above rebound `score` to a float.
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class arrays; index 0 is the class encoded as 0 (Attrition == 'Yes').
precision,recall,fscore,support=score(y_test,pred_log)

print ('Recall    : {}'.format(recall[0]))
print ('F1Score    : {}'.format(fscore[0]))

Thus, the accuracy,recall and f1 score of this model for the best parameter  C=1 and penalty = l2 are 0.875,0.32 and 0.47

In [None]:
# Record [accuracy, recall of class 0, F1 of class 0] for logistic regression.
log_accuracy = metrics.accuracy_score(y_test, pred_log)
Classification_Scores['Logistic Classification'] = [log_accuracy, recall[0], fscore[0]]

In [None]:
# Add the logistic-regression row to the summary table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
clf_model_para = pd.concat(
    [clf_model_para,
     pd.DataFrame([{'Classifier':'Logistic Classification',
                    'Best Parameters':grid_search.best_params_,
                    'Accuracy_Score':metrics.accuracy_score(y_test,pred_log),
                    'Recall of 0':recall[0]}])],
    ignore_index=True)

## Support-Vector Classification

Let us apply some SVC models on this dataset.

### LinearSVC

In [None]:
from sklearn.svm import LinearSVC

# Naive search over C for LinearSVC: train on the training set, score on the
# validation set, keep the first C that attains the highest validation score.
x_range = [0.001, 0.01, 0.1, 1, 10, 100]
train_score_list = []
valid_score_list = []
best_score = 0

for C in x_range:
    linear_svc = LinearSVC(C=C, max_iter=10000)
    linear_svc.fit(X_train, y_train)
    train_score_list.append(linear_svc.score(X_train, y_train))
    score = linear_svc.score(X_valid, y_valid)
    valid_score_list.append(score)
    if score <= best_score:
        continue
    best_score = score
    best_parameters = {'C' : C}
    best_C = C

plt.plot(x_range, train_score_list, c='g', label='Train Score')
plt.plot(x_range, valid_score_list, c='b', label='Validation Score')
plt.xscale('log')
plt.legend(loc=3)
plt.xlabel('Regularization parameter')

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

Using LinearSVC, we get a score of 0.89 for the regularization parameter C = 1.

Lets check the average cross-validation score for this parameter C = 1.

In [None]:
# 10-fold cross-validated accuracy of LinearSVC with the C chosen above.
linear_svc_grid = LinearSVC(max_iter=10000, C=best_C)
scores = cross_val_score(linear_svc_grid, X_trainval, y_trainval, scoring='accuracy', cv=10)

print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))

We get an average cross-validation score of 0.88 for C = 1. 

The average cross validation score using GridSearchCV is given by:

In [None]:
# Cross-validated grid search over C for LinearSVC.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(LinearSVC(max_iter=10000), param_grid, cv=10, return_train_score=True)

grid_search.fit(X_trainval, y_trainval)

# Keep the CV results in their own variable instead of overwriting `df`, and
# take the x-axis values from this cell's own grid rather than from a variable
# left behind by the naive-search cell.
linear_svc_cv_results = pd.DataFrame(grid_search.cv_results_)
c_values = param_grid['C']
plt.plot(c_values, linear_svc_cv_results.mean_train_score, c = 'g', label = 'Train Score')
plt.plot(c_values, linear_svc_cv_results.mean_test_score, c = 'b', label = 'Validation Score')
plt.xscale('log')
plt.legend(loc = 3)
plt.xlabel('Regularization Parameter')

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

From the results of both the grid search methods - naive grid search and GridSearchCV, we get the same average cross-validation score of 0.88 for regularization parameter `c=10` compared to naive grid search. 

Also, from the above graph we see that for this model, with the increase in the value of regularization parameter i.e. when less regularization is done, the model has more features and performs better compared to when more regularization is done.

In [None]:
# Test-set evaluation of the most recently fitted grid search
# (the LinearSVC search when cells are run in order).
pred_linear_svc = grid_search.predict(X_test)
print(metrics.accuracy_score(y_test,pred_linear_svc))

confusion = confusion_matrix(y_test, pred_linear_svc)
print("Confusion matrix:\n{}".format(confusion))

print(classification_report(y_test,pred_linear_svc))

In [None]:
# Re-import is required: the search loop above rebound `score` to a float.
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class arrays; index 0 is the class encoded as 0 (Attrition == 'Yes').
precision,recall,fscore,support=score(y_test,pred_linear_svc)

print ('Recall    : {}'.format(recall[0]))
print ('F1Score    : {}'.format(fscore[0]))

Thus, the accuracy,recall and F1Score of this model for the best parameter - C=1 are 0.8699,0.34 and 0.47 respectively

In [None]:
# Record [accuracy, recall of class 0, F1 of class 0] for LinearSVC.
linear_svc_accuracy = metrics.accuracy_score(y_test, pred_linear_svc)
Classification_Scores['Linear_SVC'] = [linear_svc_accuracy, recall[0], fscore[0]]

In [None]:
# Add the LinearSVC row to the summary table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
clf_model_para = pd.concat(
    [clf_model_para,
     pd.DataFrame([{'Classifier':'Linear_SVC',
                    'Best Parameters':grid_search.best_params_,
                    'Accuracy_Score':metrics.accuracy_score(y_test,pred_linear_svc),
                    'Recall of 0':recall[0]}])],
    ignore_index=True)

### SVC - rbf

The hyper-parameters for this model are `gamma` and the regularization term `C`.

In [None]:
from sklearn.svm import SVC

# Naive search over (gamma, C) for the RBF kernel: train on the training set,
# score on the validation set, keep the first pair with the highest score.
gamma_values = [0.001, 0.01, 0.1, 1, 10, 100]
c_values = [0.001, 0.01, 0.1, 1, 10, 100]

train_score_list = []
valid_score_list = []
best_score = 0

for gamma in gamma_values:
    for C in c_values:
        svc_rbf = SVC(kernel='rbf', gamma=gamma, C=C)
        svc_rbf.fit(X_train, y_train)
        train_score_list.append(svc_rbf.score(X_train, y_train))
        score = svc_rbf.score(X_valid, y_valid)
        valid_score_list.append(score)
        if score <= best_score:
            continue
        best_score = score
        best_parameters = {'gamma': gamma , 'C' : C}
        best_Gamma = gamma
        best_C = C

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

The best parameters for this kernel are `gamma = 0.01` and `C = 100`, which give a score of 0.87 on the validation set.
The average cross-validation score for these parameters is:

In [None]:
# 10-fold cross-validated accuracy of the RBF SVC with the parameters chosen above.
svc_rbf_grid = SVC(kernel='rbf', C=best_C, gamma=best_Gamma)
scores = cross_val_score(svc_rbf_grid, X_trainval, y_trainval, scoring='accuracy', cv=10)

print("Cross-validation scores: {}".format(scores))
print("Average cross-validation score: {:.2f}".format(scores.mean()))

From the results of both the grid search methods - we got the average cross validation score as 0.87 and 0.88 

Let us check for the average cross-validation score using GridSearchCV.

In [None]:
# Cross-validated grid search over (gamma, C) for the RBF kernel.
log_grid_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(gamma=list(log_grid_values), C=list(log_grid_values))

rbf_estimator = SVC(kernel='rbf')
grid_search = GridSearchCV(rbf_estimator, param_grid, cv=10, return_train_score=True)
grid_search.fit(X_trainval, y_trainval)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

The best parameters that we got from GridSearchCV are C = 100 and gamma = 0.01. Compared to the best parameters we got from naive grid search (C = 100 and gamma = 0.01), the average cross-validation score increased from 0.87 to 0.88.

Thus the best parameters for this model are C = 100 and gamma = 0.01.

In [None]:
# Test-set evaluation of the most recently fitted grid search
# (the RBF SVC search when cells are run in order).
pred_rbf = grid_search.predict(X_test)
print(metrics.accuracy_score(y_test,pred_rbf))

confusion = confusion_matrix(y_test, pred_rbf)
print("Confusion matrix:\n{}".format(confusion))

print(classification_report(y_test,pred_rbf))

In [None]:
# Re-import is required: the search loop above rebound `score` to a float.
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class arrays; index 0 is the class encoded as 0 (Attrition == 'Yes').
precision,recall,fscore,support=score(y_test,pred_rbf)

print ('Recall    : {}'.format(recall[0]))
print ('F1-Score    : {}'.format(fscore[0]))

Thus, the accuracy,recall and f1-score of this model for the best parameter - C=100 and gamma = 0.01 are 0.875,0.34 and 0.48  respectively.

In [None]:
# Record [accuracy, recall of class 0, F1 of class 0] for the RBF SVC.
rbf_accuracy = metrics.accuracy_score(y_test, pred_rbf)
Classification_Scores['SVC RBF Kernel'] = [rbf_accuracy, recall[0], fscore[0]]

In [None]:
# Add the RBF SVC row to the summary table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
clf_model_para = pd.concat(
    [clf_model_para,
     pd.DataFrame([{'Classifier':'SVC RBF Kernel',
                    'Best Parameters':grid_search.best_params_,
                    'Accuracy_Score':metrics.accuracy_score(y_test,pred_rbf),
                    'Recall of 0':recall[0]}])],
    ignore_index=True)

### SVC - Poly

In [None]:
# Naive search over (degree, C, gamma) for the polynomial kernel: train on the
# training set, score on the validation set, keep the first combination that
# attains the highest validation score.
c_values = [0.001, 0.01, 0.1, 1, 10, 100]
gamma_values = [0.001, 0.01, 0.1, 1, 10, 100]

train_score_list = []
valid_score_list = []
best_score = 0

for degree in range(1, 5):
    for C in c_values:
        for gamma in gamma_values:
            svc_poly = SVC(kernel='poly', degree=degree, C=C, gamma=gamma)
            svc_poly.fit(X_train, y_train)
            train_score_list.append(svc_poly.score(X_train, y_train))
            score = svc_poly.score(X_valid, y_valid)
            valid_score_list.append(score)
            if score <= best_score:
                continue
            best_score = score
            best_parameters = {'degree': degree , 'C' : C, 'gamma' : gamma}
            best_Degree = degree
            best_C = C
            best_gamma = gamma

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

The best parameters for this kernel are `degree = 1`, `gamma = 100` and `C = 0.01`, which give a score of 0.88 on the validation set.
The average cross-validation score for these parameters is:

In [None]:
# 10-fold cross-validated accuracy of the polynomial SVC with the parameters
# chosen by the naive polynomial search.
# The gamma must be `best_gamma` (set by the polynomial search); `best_Gamma`
# (capital G) is the value left over from the RBF search and was used here by mistake.
svc_poly_grid = SVC(kernel='poly',degree = best_Degree, C=best_C, gamma = best_gamma)

scores = cross_val_score(svc_poly_grid, X_trainval, y_trainval, cv =10, scoring = 'accuracy')
print("Cross-validation scores: {}".format(scores))

print("Average cross-validation score: {:.2f}".format(scores.mean()))

The average cross-validation score for the best parameters is 0.84.

Let's check for the best parameter using GridSearchCV:

In [None]:
# Cross-validated grid search over (gamma, C, degree) for the polynomial kernel.
log_grid_values = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = dict(
    gamma=list(log_grid_values),
    C=list(log_grid_values),
    degree=[1, 2, 3, 4, 5],
)

poly_estimator = SVC(kernel='poly')
grid_search = GridSearchCV(poly_estimator, param_grid, cv=10, return_train_score=True)
grid_search.fit(X_trainval, y_trainval)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

The best parameters that we got from GridSearchCV are degree = 1, C = 1 and gamma = 100.

Compared to the best parameters we got from naive grid search (degree = 1, C = 1 and gamma = 100), the average cross-validation score remained same (0.88).

Thus the best parameters for this model are degree = 1, C = 1 and gamma = 100. 

In [None]:
# Test-set evaluation of the most recently fitted grid search
# (the polynomial SVC search when cells are run in order).
pred_poly = grid_search.predict(X_test)
print(metrics.accuracy_score(y_test,pred_poly))

confusion = confusion_matrix(y_test, pred_poly)
print("Confusion matrix:\n{}".format(confusion))

print(classification_report(y_test,pred_poly))

In [None]:
# Re-import is required: the search loop above rebound `score` to a float.
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class arrays; index 0 is the class encoded as 0 (Attrition == 'Yes').
precision,recall,fscore,support=score(y_test,pred_poly)

print ('Recall    : {}'.format(recall[0]))
print ('FScore    : {}'.format(fscore[0]))

Thus, the accuracy,recall and f1score of this model for the best parameter - degree = 1, C = 1 and gamma = 100 are 0.88,0.32 and 0.45 respectively.

In [None]:
# Record [accuracy, recall of class 0, F1 of class 0] for the polynomial SVC.
poly_accuracy = metrics.accuracy_score(y_test, pred_poly)
Classification_Scores['SVC Poly Kernel'] = [poly_accuracy, recall[0], fscore[0]]

In [None]:
# Add the polynomial SVC row to the summary table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
clf_model_para = pd.concat(
    [clf_model_para,
     pd.DataFrame([{'Classifier':'SVC Poly Kernel',
                    'Best Parameters':grid_search.best_params_,
                    'Accuracy_Score':metrics.accuracy_score(y_test,pred_poly),
                    'Recall of 0':recall[0]}])],
    ignore_index=True)

### SVM-linear

In [None]:
from sklearn.svm import SVC

# Naive search over C for the linear-kernel SVC.
# `degree` and `gamma` are ignored by the linear kernel, so looping over them
# only repeated every fit 24 times with identical scores. Because the best
# score is updated only on a strict improvement, searching C alone selects the
# same best_C as before.
train_score_list = []
valid_score_list = []

best_score = 0

for C in [0.001, 0.01, 0.1, 1, 10, 100]:
    svc_linear = SVC(kernel='linear', C=C)
    svc_linear.fit(X_train,y_train)
    train_score_list.append(svc_linear.score(X_train,y_train))
    score = svc_linear.score(X_valid, y_valid)
    valid_score_list.append(score)
    if score > best_score:
        best_score = score
        best_parameters = {'C' : C}
        best_C = C

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))

The best parameters for this kernel are `degree = 1`, `gamma = 0.001` and `C = 1`, which give a score of 0.87 on the validation set.
The average cross-validation score for these parameters is:

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

svc_poly_grid = SVC(kernel='linear',degree = best_Degree, C=best_C, gamma = best_gamma)

scores = cross_val_score(svc_poly_grid, X_trainval, y_trainval, cv =10, scoring = 'accuracy')
print("Cross-validation scores: {}".format(scores))

print("Average cross-validation score: {:.2f}".format(scores.mean()))

The average cross-validation score for the best parameters is 0.88.

Let's check for the best parameter using GridSearchCV:

In [None]:
# Cross-validated grid search over C for the linear-kernel SVC.
# `gamma` and `degree` have no effect on a linear kernel, so including them
# multiplied the number of fits by 30 without changing any score. Ties are
# resolved in favour of the first candidate, so the selected C is unchanged.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(SVC(kernel='linear'), param_grid, cv=10, return_train_score=True)

grid_search.fit(X_trainval, y_trainval)

print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

The best parameters that we got from GridSearchCV are degree = 1, C = 100 and gamma = 0.001.

Compared to the best parameters we got from naive grid search (degree = 1, C = 1 and gamma = 0.001), the average cross-validation score remained same (0.88).

Thus the best parameters for this model are degree = 1, C = 100 and gamma = 0.001.

In [None]:
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, classification_report 
# Test-set evaluation of the most recently fitted grid search
# (the linear-kernel SVC search when cells are run in order).
pred_linear = grid_search.predict(X_test)
print(metrics.accuracy_score(y_test,pred_linear))

confusion = confusion_matrix(y_test, pred_linear)
print("Confusion matrix:\n{}".format(confusion))

print(classification_report(y_test,pred_linear))

In [None]:
# Recall/F1 must be computed from the linear-kernel predictions: no cell in this
# section did so, which left `recall`/`fscore` holding the polynomial model's values.
precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, pred_linear)
Classification_Scores.update({'SVC Poly Linear':[metrics.accuracy_score(y_test,pred_linear),recall[0],fscore[0]]})

In [None]:
# Add the linear-kernel SVC row to the summary table.
# The recall is computed here from pred_linear so the row cannot pick up a
# stale `recall` from a different model; pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, pred_linear)
clf_model_para = pd.concat(
    [clf_model_para,
     pd.DataFrame([{'Classifier':'SVC Poly Linear',
                    'Best Parameters':grid_search.best_params_,
                    'Accuracy_Score':metrics.accuracy_score(y_test,pred_linear),
                    'Recall of 0':recall[0]}])],
    ignore_index=True)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Unpruned decision tree fitted on train+validation.
# A fixed random_state makes the tree (and every score derived from it)
# reproducible across Restart & Run All.
dtree = DecisionTreeClassifier(random_state=0)

dtree.fit(X_trainval, y_trainval)
print("Accuracy on training set: {:.3f}".format(dtree.score(X_trainval, y_trainval)))
print("Accuracy on test set: {:.3f}".format(dtree.score(X_test, y_test)))

We are getting an accuracy of 1.00 on the training set as it goes till highest depth. We are getting an accuracy of 0.799 on the testing set.

Let us check for the average cross-validation score for this model using cross_val_score. 

In [None]:
# 10-fold cross-validated accuracy of an unpruned decision tree.
# A fixed random_state keeps the reported scores reproducible between runs.
dtree_cv = DecisionTreeClassifier(random_state=0)
scores = cross_val_score(dtree_cv, X_trainval, y_trainval, cv = 10, scoring = 'accuracy' )
print("Cross-validation scores: {}".format(scores))

print("Average cross-validation score: {:.2f}".format(scores.mean()))

In [None]:
# Test-set evaluation of the decision tree fitted on train+validation.
pred_tree = dtree.predict(X_test)
print(metrics.accuracy_score(y_test,pred_tree))

confusion = confusion_matrix(y_test, pred_tree)
print("Confusion matrix:\n{}".format(confusion))

print(classification_report(y_test,pred_tree))

In [None]:
# Re-import is required: earlier search loops rebound `score` to a float.
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class arrays; index 0 is the class encoded as 0 (Attrition == 'Yes').
precision,recall,fscore,support=score(y_test,pred_tree)

print ('Recall    : {}'.format(recall[0]))
print ('F1Score    : {}'.format(fscore[0]))

In [None]:
# Record [accuracy, recall of class 0, F1 of class 0] for the decision tree.
# The key is spelled 'Decision Tree' to match the summary-table row (it was 'Decison Tree').
Classification_Scores.update({'Decision Tree':[metrics.accuracy_score(y_test,pred_tree),recall[0],fscore[0]]})

In [None]:
# Add the decision-tree row to the summary table (no tuned parameters).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
clf_model_para = pd.concat(
    [clf_model_para,
     pd.DataFrame([{'Classifier':'Decision Tree',
                    'Best Parameters':' ',
                    'Accuracy_Score':metrics.accuracy_score(y_test,pred_tree),
                    'Recall of 0':recall[0]}])],
    ignore_index=True)

### Best Classifier

Let's check for the accuracy score and recall for all the above model: 

Note: the entry labelled "SVC Poly Linear" refers to the SVC with a linear kernel.

In [None]:
# Turn the score dict into a frame (one column per classifier) and label the
# three rows. rename() returns a new frame, so the result must be assigned;
# previously it was discarded and the rows kept the labels 0, 1, 2.
Classification_Scores = pd.DataFrame(Classification_Scores).rename(
    index={0:'Accuracy_Score',1:'Recall',2:'F1-Score'})
Classification_Scores

In [None]:
# Grouped bar chart of the collected scores, one bar per classifier in each group.
# DataFrame.plot.bar creates its own figure, so the separate plt.figure(...)
# call only produced an extra empty 56x5 figure and has been removed.
Classification_Scores.plot.bar(figsize=(20,10))

Our evaluation strategy is based on having the best recall and the best F1-score, because the dataset is imbalanced and we do not wish to misclassify the employees who are likely to leave (i.e. we want few false negatives in our confusion matrix).

From the above graph, it can be said that we are getting an accuracy of 0.88, recall score of 0.34 and f1-score of 0.48 for SVC - RBF

Thus, it is the best classifier that can be used for predicting the attrition rate.