In [1]:
import numpy as np 
import pandas as pd 


import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
#import xgboost as xgb
from sklearn.model_selection import train_test_split


from sklearn.preprocessing import StandardScaler

# Import and suppress warnings
import warnings
warnings.filterwarnings('ignore')

## Step 1: Exploratory Data Analysis

In [2]:
# Load the HR employee-attrition CSV (path is relative to the kernel's working
# directory) and preview the first five rows.
df = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [3]:
# Looking for NaN
df.isnull().any()

Age                         False
Attrition                   False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesL

In [4]:
# Fraction of missing values per column (null count divided by the number of rows).
df.isnull().sum()/len(df)

Age                         0.0
Attrition                   0.0
BusinessTravel              0.0
DailyRate                   0.0
Department                  0.0
DistanceFromHome            0.0
Education                   0.0
EducationField              0.0
EmployeeCount               0.0
EmployeeNumber              0.0
EnvironmentSatisfaction     0.0
Gender                      0.0
HourlyRate                  0.0
JobInvolvement              0.0
JobLevel                    0.0
JobRole                     0.0
JobSatisfaction             0.0
MaritalStatus               0.0
MonthlyIncome               0.0
MonthlyRate                 0.0
NumCompaniesWorked          0.0
Over18                      0.0
OverTime                    0.0
PercentSalaryHike           0.0
PerformanceRating           0.0
RelationshipSatisfaction    0.0
StandardHours               0.0
StockOptionLevel            0.0
TotalWorkingYears           0.0
TrainingTimesLastYear       0.0
WorkLifeBalance             0.0
YearsAtC

In [5]:
# Check pairwise (Pearson) correlation between the numeric columns.
# The frame also contains string columns (Attrition, BusinessTravel, ...). Older pandas
# dropped them silently, but pandas >= 2.0 raises when it meets them, so the numeric
# columns are selected explicitly; this form works on every pandas version.
# Constant columns (EmployeeCount, StandardHours) have zero variance and show up as NaN.
df.select_dtypes(include='number').corr()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
Age,1.0,0.010661,-0.001686,0.208034,,-0.010145,0.010146,0.024287,0.02982,0.509604,...,0.053535,,0.03751,0.680381,-0.019621,-0.02149,0.311309,0.212901,0.216513,0.202089
DailyRate,0.010661,1.0,-0.004985,-0.016806,,-0.05099,0.018355,0.023381,0.046135,0.002966,...,0.007846,,0.042143,0.014515,0.002453,-0.037848,-0.034055,0.009932,-0.033229,-0.026363
DistanceFromHome,-0.001686,-0.004985,1.0,0.021042,,0.032916,-0.016075,0.031131,0.008783,0.005303,...,0.006557,,0.044872,0.004628,-0.036942,-0.026556,0.009508,0.018845,0.010029,0.014406
Education,0.208034,-0.016806,0.021042,1.0,,0.04207,-0.027128,0.016775,0.042438,0.101589,...,-0.009118,,0.018422,0.14828,-0.0251,0.009819,0.069114,0.060236,0.054254,0.069065
EmployeeCount,,,,,,,,,,,...,,,,,,,,,,
EmployeeNumber,-0.010145,-0.05099,0.032916,0.04207,,1.0,0.017621,0.035179,-0.006888,-0.018519,...,-0.069861,,0.062227,-0.014365,0.023603,0.010309,-0.01124,-0.008416,-0.009019,-0.009197
EnvironmentSatisfaction,0.010146,0.018355,-0.016075,-0.027128,,0.017621,1.0,-0.049857,-0.008278,0.001212,...,0.007665,,0.003432,-0.002693,-0.019359,0.027627,0.001458,0.018007,0.016194,-0.004999
HourlyRate,0.024287,0.023381,0.031131,0.016775,,0.035179,-0.049857,1.0,0.042861,-0.027853,...,0.00133,,0.050263,-0.002334,-0.008548,-0.004607,-0.019582,-0.024106,-0.026716,-0.020123
JobInvolvement,0.02982,0.046135,0.008783,0.042438,,-0.006888,-0.008278,0.042861,1.0,-0.01263,...,0.034297,,0.021523,-0.005533,-0.015338,-0.014617,-0.021355,0.008717,-0.024184,0.025976
JobLevel,0.509604,0.002966,0.005303,0.101589,,-0.018519,0.001212,-0.027853,-0.01263,1.0,...,0.021642,,0.013984,0.782208,-0.018191,0.037818,0.534739,0.389447,0.353885,0.375281


## Step 2: Data Preprocessing

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

### Feature Engineering

In [7]:
# Split the columns by dtype: string-typed (object) columns are one-hot encoded in a
# later cell, numeric columns are standardised.
# 'number' (rather than 'int') also keeps float columns, so a numeric feature can never
# be silently left out of both frames. select_dtypes already returns a new DataFrame,
# so no extra pd.DataFrame(...) wrapper is needed.
df_categorical = df.select_dtypes(include='object')
df_numerical = df.select_dtypes(include='number')


In [8]:
df_categorical.head()

Unnamed: 0,Attrition,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Yes,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,No,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Yes,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,No,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,No,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No


In [9]:
df_numerical.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,1,1,2,94,3,2,...,1,80,0,8,0,1,6,4,0,5
1,49,279,8,1,1,2,3,61,2,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1373,2,2,1,4,4,92,2,1,...,2,80,0,7,3,3,0,0,0,0
3,33,1392,3,4,1,5,4,56,3,1,...,3,80,0,8,3,3,8,7,3,0
4,27,591,2,1,1,7,1,40,3,1,...,4,80,1,6,3,3,2,2,2,2


In [10]:
print("Numerical dataframe shape: ", df_numerical.shape)
print('*'*40)
print("Categorical dataframe shape: ", df_categorical.shape)

Numerical dataframe shape:  (1470, 26)
****************************************
Categorical dataframe shape:  (1470, 9)


In [11]:
df_categorical.columns

Index(['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender',
       'JobRole', 'MaritalStatus', 'Over18', 'OverTime'],
      dtype='object')

In [12]:
df_numerical.columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [13]:
# Drop the target variable from the categorical dataframe: Attrition is the label,
# not a feature, and must not be one-hot encoded together with the inputs.
# errors='ignore' keeps this cell idempotent: it reassigns df_categorical, so running
# it a second time would otherwise raise KeyError because the column is already gone.
df_categorical = df_categorical.drop(columns=['Attrition'], errors='ignore')
df_categorical.head()

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No


In [14]:
# Convert the categorical features to numeric indicator columns:
# pd.get_dummies creates one column per category level of every input column.
# NOTE(review): the encoded preview shows a single Over18 level (Over18_Y), so that
# column looks constant and carries no information -- confirm and consider dropping it.

df_categorical_encoded= pd.get_dummies(df_categorical)
df_categorical_encoded.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0,0,1,0,0,1,0,1,0,0,...,0,0,1,0,0,0,1,1,0,1
1,0,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
3,0,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,1
4,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,1,1,0


In [15]:
df_categorical_encoded.shape


(1470, 29)

In [16]:
# Scaler used to standardise the numerical features in the next cell.
standard_scaler= StandardScaler()

In [17]:
# Standardise every numeric column and put the result back into a DataFrame that keeps
# the original column names and row index (fit_transform itself returns a bare array).
# NOTE(review): the scaler is fitted on ALL rows, while the train/test split only
# happens in a later cell, so test-set statistics leak into the training features.
# Fitting on the training rows only would need the split to move before this step.
df_numerical_scaled = pd.DataFrame(
    data=standard_scaler.fit_transform(df_numerical),
    columns=df_numerical.columns,
    index=df_numerical.index,
)

In [18]:
df_numerical_scaled.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0.44635,0.742527,-1.010909,-0.891688,0.0,-1.701283,-0.660531,1.383138,0.379672,-0.057788,...,-1.584178,0.0,-0.932014,-0.421642,-2.171982,-2.49382,-0.164613,-0.063296,-0.679146,0.245834
1,1.322365,-1.297775,-0.14715,-1.868426,0.0,-1.699621,0.254625,-0.240677,-1.026167,-0.057788,...,1.191438,0.0,0.241988,-0.164511,0.155707,0.338096,0.488508,0.764998,-0.368715,0.806541
2,0.008343,1.414363,-0.887515,-0.891688,0.0,-1.696298,1.169781,1.284725,-1.026167,-0.961486,...,-0.658973,0.0,-0.932014,-0.550208,0.155707,0.338096,-1.144294,-1.167687,-0.679146,-1.155935
3,-0.429664,1.461466,-0.764121,1.061787,0.0,-1.694636,1.169781,-0.486709,0.379672,-0.961486,...,0.266233,0.0,-0.932014,-0.421642,0.155707,0.338096,0.161947,0.764998,0.252146,-1.155935
4,-1.086676,-0.524295,-0.887515,-1.868426,0.0,-1.691313,-1.575686,-1.274014,0.379672,-0.961486,...,1.191438,0.0,0.241988,-0.678774,0.155707,0.338096,-0.817734,-0.615492,-0.058285,-0.595227


In [19]:
df_numerical_scaled.shape

(1470, 26)

In [20]:
# Concat the two dataframes together columnwise
df_concat = pd.concat([df_numerical_scaled, df_categorical_encoded], axis=1)

In [21]:
df_concat.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0.44635,0.742527,-1.010909,-0.891688,0.0,-1.701283,-0.660531,1.383138,0.379672,-0.057788,...,0,0,1,0,0,0,1,1,0,1
1,1.322365,-1.297775,-0.14715,-1.868426,0.0,-1.699621,0.254625,-0.240677,-1.026167,-0.057788,...,0,1,0,0,0,1,0,1,1,0
2,0.008343,1.414363,-0.887515,-0.891688,0.0,-1.696298,1.169781,1.284725,-1.026167,-0.961486,...,0,0,0,0,0,0,1,1,0,1
3,-0.429664,1.461466,-0.764121,1.061787,0.0,-1.694636,1.169781,-0.486709,0.379672,-0.961486,...,0,1,0,0,0,1,0,1,0,1
4,-1.086676,-0.524295,-0.887515,-1.868426,0.0,-1.691313,-1.575686,-1.274014,0.379672,-0.961486,...,0,0,0,0,0,1,0,1,1,0


In [22]:
df_concat.shape

(1470, 55)

#### Mapping Yes/No for target variable

In [23]:
# Extract the target variable - 'Attrition' - and encode 'Yes' as 1 and 'No' as 0.
# The lookup table has its own name so that the built-in map() is not shadowed for
# the rest of the notebook (the dict used to be called `map`).
attrition_codes = {'Yes': 1, 'No': 0}
target = df['Attrition'].map(attrition_codes)

# Series.map turns unknown labels into NaN instead of raising, so fail loudly here
# to keep the strictness of a plain dictionary lookup.
assert target.notna().all(), "unexpected Attrition label (expected 'Yes' or 'No')"

print("Shape of target: ",target.shape)
print('*'*40)
X = df_concat #Features
y = target #Target
target.head()

Shape of target:  (1470,)
****************************************


0    1
1    0
2    1
3    0
4    0
Name: Attrition, dtype: int64

### Train test split

In [24]:
# 80/20 train/test split with a fixed seed; stratify=y is passed so the split is
# stratified on the attrition labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
    stratify=y,
)
# Sanity check: report the shape of each of the four pieces.
print("Shape of X Train: ",X_train.shape)
print("Shape of X Test: ",X_test.shape)
print("Shape of y Train: ",y_train.shape)
print("Shape of y Test: ",y_test.shape)

Shape of X Train:  (1176, 55)
Shape of X Test:  (294, 55)
Shape of y Train:  (1176,)
Shape of y Test:  (294,)


In [25]:
from sklearn.metrics import classification_report

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [27]:
def evaluate_model(dt_classifier):
    """Print accuracy and confusion matrix of a fitted classifier on both splits.

    The function reads the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``, so the train/test split cell must be run first.

    Parameters
    ----------
    dt_classifier : fitted estimator
        Only its ``predict`` method is called, so despite the parameter name
        any fitted classifier can be passed.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Predict once per split and reuse the result for both metrics; the
    # predictions used to be recomputed for every metric.
    train_predictions = dt_classifier.predict(X_train)
    print("Train Accuracy :", accuracy_score(y_train, train_predictions))
    print("Train Confusion Matrix:")
    print(confusion_matrix(y_train, train_predictions))
    print("-"*50)
    test_predictions = dt_classifier.predict(X_test)
    print("Test Accuracy :", accuracy_score(y_test, test_predictions))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, test_predictions))

## Step 3: Implementing Machine Learning Models

## Model 1 - Default model

In [28]:
# Baseline model: every hyperparameter is left at the library default; only the seed
# is fixed. get_params() displays the resulting settings.
gb = GradientBoostingClassifier(random_state=100) # default 
gb.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': 100,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [29]:
# Fit the model to our train and target
gb.fit(X_train, y_train)
# Get our predictions
gb_predictions = gb.predict(X_test)

In [30]:
gb_predictions_prob = gb.predict_proba(X_test)
gb_predictions_prob

array([[0.97214732, 0.02785268],
       [0.96271524, 0.03728476],
       [0.90770467, 0.09229533],
       [0.81698697, 0.18301303],
       [0.97313882, 0.02686118],
       [0.85983124, 0.14016876],
       [0.62301887, 0.37698113],
       [0.91303016, 0.08696984],
       [0.94741334, 0.05258666],
       [0.98857917, 0.01142083],
       [0.62366462, 0.37633538],
       [0.98450006, 0.01549994],
       [0.97520107, 0.02479893],
       [0.87316736, 0.12683264],
       [0.25580597, 0.74419403],
       [0.81400261, 0.18599739],
       [0.94330564, 0.05669436],
       [0.82136009, 0.17863991],
       [0.98305586, 0.01694414],
       [0.97076228, 0.02923772],
       [0.69477524, 0.30522476],
       [0.98288542, 0.01711458],
       [0.97540646, 0.02459354],
       [0.98831724, 0.01168276],
       [0.98310889, 0.01689111],
       [0.43115137, 0.56884863],
       [0.45507658, 0.54492342],
       [0.67817715, 0.32182285],
       [0.90262863, 0.09737137],
       [0.98118542, 0.01881458],
       [0.

In [31]:
accuracy_score(y_test, gb_predictions)

0.8775510204081632

In [32]:
evaluate_model(gb)

Train Accuracy : 0.9634353741496599
Train Confusion Matrix:
[[985   1]
 [ 42 148]]
--------------------------------------------------
Test Accuracy : 0.8775510204081632
Test Confusion Matrix:
[[246   1]
 [ 35  12]]


## Model 2  - with random parameters


In [33]:
# Second model: hyperparameters set by hand to fixed values (no tuning involved),
# with the same seed as the baseline. get_params() displays the resulting settings.
gb_2 = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.2,
    max_depth=11,
    min_samples_leaf=2,
    subsample=1,
    max_features='sqrt',
    verbose=0,
    random_state=100,
)
gb_2.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.2,
 'loss': 'deviance',
 'max_depth': 11,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_iter_no_change': None,
 'random_state': 100,
 'subsample': 1,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [34]:
# Fit the model to our train and target
gb_2.fit(X_train, y_train)
# Get our predictions
gb_2_predictions = gb_2.predict(X_test)

In [35]:
gb_2_predictions_prob = gb_2.predict_proba(X_test)
gb_2_predictions_prob

array([[1.00000000e+00, 3.28150982e-31],
       [1.00000000e+00, 1.05528886e-34],
       [1.00000000e+00, 1.45227742e-28],
       [1.00000000e+00, 3.50733688e-23],
       [1.00000000e+00, 2.48516049e-32],
       [1.00000000e+00, 9.67768999e-25],
       [9.99999996e-01, 3.52759338e-09],
       [1.00000000e+00, 5.02733428e-29],
       [1.00000000e+00, 3.67865540e-27],
       [1.00000000e+00, 5.21239460e-31],
       [1.00000000e+00, 1.30328741e-17],
       [1.00000000e+00, 5.06807475e-32],
       [1.00000000e+00, 1.11285202e-31],
       [1.00000000e+00, 3.64164478e-24],
       [9.99975414e-01, 2.45857346e-05],
       [1.00000000e+00, 7.80036268e-22],
       [1.00000000e+00, 7.83263049e-27],
       [1.00000000e+00, 2.33917526e-23],
       [1.00000000e+00, 3.75790292e-29],
       [1.00000000e+00, 1.77007922e-29],
       [1.00000000e+00, 3.29807395e-25],
       [1.00000000e+00, 3.68749678e-29],
       [1.00000000e+00, 4.38512646e-28],
       [1.00000000e+00, 2.18149613e-35],
       [1.000000

In [36]:
accuracy_score(y_test, gb_2_predictions)

0.8503401360544217

In [37]:
evaluate_model(gb_2)

Train Accuracy : 1.0
Train Confusion Matrix:
[[986   0]
 [  0 190]]
--------------------------------------------------
Test Accuracy : 0.8503401360544217
Test Confusion Matrix:
[[247   0]
 [ 44   3]]


## Hyperparameter Tuning

In [38]:
from sklearn.model_selection import GridSearchCV

In [None]:
# DecisionTreeClassifier was referenced here without ever being imported, which raises
# NameError on Restart & Run All -- hence the import below.
# NOTE(review): this cell has no execution count and `base_estimator` is not used by any
# other cell shown in this notebook; consider deleting the cell if it is a leftover.
from sklearn.tree import DecisionTreeClassifier

base_estimator = DecisionTreeClassifier()

In [57]:
# Hyperparameter grid for the grid search: 5 x 5 x 5 = 125 candidate combinations.
# NOTE(review): an earlier comment said the grid was based on a random search, but no
# random search appears in the cells shown here -- confirm where the values come from.
params = dict(
    max_depth=[2, 3, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100],
    n_estimators=[100, 300, 500, 800, 1000],
)

In [58]:
# Set up an exhaustive search over `params` with 4-fold cross-validation, scored on
# accuracy. n_jobs=-1 runs the fits in parallel; verbose=1 prints a progress summary.
# NOTE(review): the target is imbalanced (190 of the 1176 training rows are positive
# according to the confusion matrices above), so accuracy rewards predicting the
# majority class -- consider a recall/F1-oriented scorer if attrition cases matter most.
grid_search = GridSearchCV(
    estimator=gb,
    param_grid=params,
    scoring="accuracy",
    cv=4,
    n_jobs=-1,
    verbose=1,
)

In [59]:
%%time
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 125 candidates, totalling 500 fits
CPU times: user 1.75 s, sys: 129 ms, total: 1.88 s
Wall time: 7min 39s


GridSearchCV(cv=4, estimator=GradientBoostingClassifier(random_state=100),
             n_jobs=-1,
             param_grid={'max_depth': [2, 3, 5, 10, 20],
                         'min_samples_leaf': [5, 10, 20, 50, 100],
                         'n_estimators': [100, 300, 500, 800, 1000]},
             scoring='accuracy', verbose=1)

In [60]:
score_df = pd.DataFrame(grid_search.cv_results_)
score_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.243805,0.005736,0.004329,0.00157,2,5,100,"{'max_depth': 2, 'min_samples_leaf': 5, 'n_est...",0.87415,0.867347,0.867347,0.867347,0.869048,0.002946,17
1,0.673379,0.009565,0.003639,0.000779,2,5,300,"{'max_depth': 2, 'min_samples_leaf': 5, 'n_est...",0.867347,0.880952,0.857143,0.867347,0.868197,0.008461,22
2,1.117052,0.012183,0.003521,0.000811,2,5,500,"{'max_depth': 2, 'min_samples_leaf': 5, 'n_est...",0.860544,0.863946,0.863946,0.867347,0.863946,0.002405,56
3,1.791458,0.010747,0.004514,0.001103,2,5,800,"{'max_depth': 2, 'min_samples_leaf': 5, 'n_est...",0.860544,0.863946,0.846939,0.863946,0.858844,0.007012,106
4,2.198804,0.012079,0.004337,0.000268,2,5,1000,"{'max_depth': 2, 'min_samples_leaf': 5, 'n_est...",0.85034,0.867347,0.85034,0.867347,0.858844,0.008503,109


In [61]:
score_df.nlargest(5,"mean_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
16,0.650761,0.006202,0.003256,0.00131,2,50,300,"{'max_depth': 2, 'min_samples_leaf': 50, 'n_es...",0.867347,0.887755,0.887755,0.870748,0.878401,0.009431,1
46,0.859194,0.007752,0.003985,0.001263,3,100,300,"{'max_depth': 3, 'min_samples_leaf': 100, 'n_e...",0.877551,0.880952,0.867347,0.880952,0.876701,0.005576,2
21,0.613691,0.019083,0.003569,0.001335,2,100,300,"{'max_depth': 2, 'min_samples_leaf': 100, 'n_e...",0.87415,0.880952,0.87415,0.867347,0.87415,0.00481,3
31,0.980479,0.003794,0.003011,0.000339,3,10,300,"{'max_depth': 3, 'min_samples_leaf': 10, 'n_es...",0.884354,0.870748,0.870748,0.870748,0.87415,0.005891,3
15,0.22152,0.002733,0.003383,0.001948,2,50,100,"{'max_depth': 2, 'min_samples_leaf': 50, 'n_es...",0.867347,0.87415,0.887755,0.863946,0.873299,0.009119,5


In [62]:
grid_search.best_estimator_

GradientBoostingClassifier(max_depth=2, min_samples_leaf=50, n_estimators=300,
                           random_state=100)

In [63]:
gb_best = grid_search.best_estimator_

In [64]:
evaluate_model(gb_best)

Train Accuracy : 0.9498299319727891
Train Confusion Matrix:
[[983   3]
 [ 56 134]]
--------------------------------------------------
Test Accuracy : 0.8673469387755102
Test Confusion Matrix:
[[244   3]
 [ 36  11]]


In [65]:
print(classification_report(y_test, gb_best.predict(X_test)))

              precision    recall  f1-score   support

           0       0.87      0.99      0.93       247
           1       0.79      0.23      0.36        47

    accuracy                           0.87       294
   macro avg       0.83      0.61      0.64       294
weighted avg       0.86      0.87      0.84       294



### Feature Importance Gradient Boosting Model

In [66]:
gb_best.feature_importances_

array([0.06919946, 0.03272816, 0.0336035 , 0.        , 0.        ,
       0.02658319, 0.03094794, 0.01139175, 0.0221449 , 0.00383882,
       0.03697979, 0.1417963 , 0.02214617, 0.03780413, 0.00550924,
       0.        , 0.02581744, 0.        , 0.07681257, 0.04283835,
       0.01076511, 0.0170156 , 0.02642502, 0.00106266, 0.01355291,
       0.03458941, 0.0057403 , 0.02502999, 0.        , 0.        ,
       0.01439055, 0.00332814, 0.        , 0.00504118, 0.00424131,
       0.00145122, 0.        , 0.0103443 , 0.00053487, 0.00154046,
       0.00027665, 0.        , 0.01256143, 0.        , 0.        ,
       0.00035434, 0.00298851, 0.00181064, 0.01370745, 0.00095404,
       0.00069905, 0.00708104, 0.        , 0.12985889, 0.03451324])

In [67]:
# Interactive scatter plot of the tuned model's feature importances:
# one marker per feature column, coloured by its importance score.
trace = go.Scatter(
    x=df_concat.columns.values,
    y=gb_best.feature_importances_,
    text=df_concat.columns.values,  # hover label: the feature name
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1.3,
        'size': 12,
        'color': gb_best.feature_importances_,
        'colorscale': 'Portland',
        'showscale': True,
    },
)
data = [trace]

# Minimal layout: no grid or zero lines on either axis, no legend (single trace).
layout = go.Layout(
    autosize=True,
    title='GBM Model Feature Importance',
    hovermode='closest',
    xaxis={
        'ticklen': 5,
        'showgrid': False,
        'zeroline': False,
        'showline': False,
    },
    yaxis={
        'title': 'Feature Importance',
        'showgrid': False,
        'zeroline': False,
        'ticklen': 5,
        'gridwidth': 2,
    },
    showlegend=False,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter')

## Observation:

### Model 1 - Default Model
- Train Accuracy : 0.9634353741496599
- Train Confusion Matrix:<br>
[[985   1]<br>
 [ 42 148]]
--------------------------------------------------
- Test Accuracy : 0.8775510204081632
- Test Confusion Matrix:<br>
[[246   1]<br>
 [ 35  12]]

### Model 2 - Random value 
- Train Accuracy : 1.0
- Train Confusion Matrix:<br>
[[986   0]<br>
 [  0 190]]
--------------------------------------------------
- Test Accuracy : 0.8503401360544217
- Test Confusion Matrix:<br>
[[247   0]<br>
 [ 44   3]]

### Model 3 - Hyperparameter Tuning using Grid Search CV

- Train Accuracy : 0.9498299319727891
- Train Confusion Matrix: <br>
[[983   3]<br>
 [ 56 134]]
--------------------------------------------------
- Test Accuracy : 0.8673469387755102
- Test Confusion Matrix:<br>
[[244   3]<br>
 [ 36  11]]