In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split    
from sklearn.metrics import accuracy_score,classification_report
import matplotlib.pyplot as plt

In [2]:
hrattr_data = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv")

print (hrattr_data.head())

   Age Attrition     BusinessTravel  DailyRate              Department  \
0   41       Yes      Travel_Rarely       1102                   Sales   
1   49        No  Travel_Frequently        279  Research & Development   
2   37       Yes      Travel_Rarely       1373  Research & Development   
3   33        No  Travel_Frequently       1392  Research & Development   
4   27        No      Travel_Rarely        591  Research & Development   

   DistanceFromHome  Education EducationField  EmployeeCount  EmployeeNumber  \
0                 1          2  Life Sciences              1               1   
1                 8          1  Life Sciences              1               2   
2                 2          2          Other              1               4   
3                 3          4  Life Sciences              1               5   
4                 2          1        Medical              1               7   

   ...  RelationshipSatisfaction StandardHours  StockOptionLevel  \
0  ...

In [3]:
hrattr_data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [4]:
hrattr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [5]:
hrattr_data.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [6]:
hrattr_data['Attrition_ind'] = 0
hrattr_data.loc[hrattr_data['Attrition']=='Yes','Attrition_ind'] = 1

In [7]:
hrattr_data['Attrition_ind'].value_counts()

0    1233
1     237
Name: Attrition_ind, dtype: int64

In [8]:
dummy_busnstrvl = pd.get_dummies(hrattr_data['BusinessTravel'], prefix='busns_trvl')
dummy_dept = pd.get_dummies(hrattr_data['Department'], prefix='dept')
dummy_edufield = pd.get_dummies(hrattr_data['EducationField'], prefix='edufield')
dummy_gender = pd.get_dummies(hrattr_data['Gender'], prefix='gend')
dummy_jobrole = pd.get_dummies(hrattr_data['JobRole'], prefix='jobrole')
dummy_maritstat = pd.get_dummies(hrattr_data['MaritalStatus'], prefix='maritalstat') 
dummy_overtime = pd.get_dummies(hrattr_data['OverTime'], prefix='overtime') 

In [9]:
continuous_columns = ['Age','DailyRate','DistanceFromHome','Education','EnvironmentSatisfaction',
'HourlyRate', 'JobInvolvement', 'JobLevel','JobSatisfaction','MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 
'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction','StockOptionLevel', 'TotalWorkingYears', 
'TrainingTimesLastYear','WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
'YearsWithCurrManager']

hrattr_continuous = hrattr_data[continuous_columns]

In [10]:
hrattr_continuous['Age'].describe()

count    1470.000000
mean       36.923810
std         9.135373
min        18.000000
25%        30.000000
50%        36.000000
75%        43.000000
max        60.000000
Name: Age, dtype: float64

In [11]:
hrattr_data['BusinessTravel'].value_counts()

Travel_Rarely        1043
Travel_Frequently     277
Non-Travel            150
Name: BusinessTravel, dtype: int64

In [12]:



hrattr_data_new = pd.concat([dummy_busnstrvl,dummy_dept,dummy_edufield,dummy_gender,dummy_jobrole,
  dummy_maritstat,dummy_overtime,hrattr_continuous,hrattr_data['Attrition_ind']],axis=1)


In [13]:
# Train & Test split
x_train,x_test,y_train,y_test = train_test_split(hrattr_data_new.drop(['Attrition_ind'],axis=1),
                                                 hrattr_data_new['Attrition_ind'],train_size = 0.7,random_state=42)

In [14]:
# Adaboost Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
dtree = DecisionTreeClassifier(criterion='gini',max_depth=1)

adabst_fit = AdaBoostClassifier(base_estimator= dtree,
        n_estimators=5000,random_state=42)

adabst_fit.fit(x_train, y_train)

print ("\nAdaBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nAdaBoost  - Train accuracy",round(accuracy_score(y_train,adabst_fit.predict(x_train)),3))
print ("\nAdaBoost  - Train Classification Report\n",classification_report(y_train,adabst_fit.predict(x_train)))

print ("\n\nAdaBoost  - Test Confusion Matrix\n\n",pd.crosstab(y_test,adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nAdaBoost  - Test accuracy",round(accuracy_score(y_test,adabst_fit.predict(x_test)),3))
print ("\nAdaBoost - Test Classification Report\n",classification_report(y_test,adabst_fit.predict(x_test)))



AdaBoost - Train Confusion Matrix

 Predicted    0    1
Actuall            
0          853    0
1            0  176

AdaBoost  - Train accuracy 1.0

AdaBoost  - Train Classification Report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       853
           1       1.00      1.00      1.00       176

    accuracy                           1.00      1029
   macro avg       1.00      1.00      1.00      1029
weighted avg       1.00      1.00      1.00      1029



AdaBoost  - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          338  42
1           40  21

AdaBoost  - Test accuracy 0.814

AdaBoost - Test Classification Report
               precision    recall  f1-score   support

           0       0.89      0.89      0.89       380
           1       0.33      0.34      0.34        61

    accuracy                           0.81       441
   macro avg       0.61      0.62      0.62       441
weighted avg       0.82

In [15]:
# Gradientboost Classifier
from sklearn.ensemble import GradientBoostingClassifier

gbc_fit = GradientBoostingClassifier(n_estimators=5000,
                                     min_samples_split=2,min_samples_leaf=1,max_depth=1,random_state=42 )
gbc_fit.fit(x_train,y_train)

print ("\nGradient Boost - Train Confusion Matrix\n\n",pd.crosstab(y_train,gbc_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nGradient Boost - Train accuracy",round(accuracy_score(y_train,gbc_fit.predict(x_train)),3))
print ("\nGradient Boost  - Train Classification Report\n",classification_report(y_train,gbc_fit.predict(x_train)))

print ("\n\nGradient Boost - Test Confusion Matrix\n\n",pd.crosstab(y_test,gbc_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nGradient Boost - Test accuracy",round(accuracy_score(y_test,gbc_fit.predict(x_test)),3))
print ("\nGradient Boost - Test Classification Report\n",classification_report(y_test,gbc_fit.predict(x_test)))



Gradient Boost - Train Confusion Matrix

 Predicted    0    1
Actuall            
0          850    3
1           41  135

Gradient Boost - Train accuracy 0.957

Gradient Boost  - Train Classification Report
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       853
           1       0.98      0.77      0.86       176

    accuracy                           0.96      1029
   macro avg       0.97      0.88      0.92      1029
weighted avg       0.96      0.96      0.96      1029



Gradient Boost - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          354  26
1           36  25

Gradient Boost - Test accuracy 0.859

Gradient Boost - Test Classification Report
               precision    recall  f1-score   support

           0       0.91      0.93      0.92       380
           1       0.49      0.41      0.45        61

    accuracy                           0.86       441
   macro avg       0.70      0.67      0.6

In [None]:
AdaBoostClassifier()

In [39]:
# Xgboost Classifier
import xgboost as xgb

xgb_fit = xgb.XGBClassifier(max_depth=2, n_estimators=5000)
xgb_fit.fit(x_train, y_train)

print ("\nXGBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,xgb_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nXGBoost - Train accuracy",round(accuracy_score(y_train,xgb_fit.predict(x_train)),3))
print ("\nXGBoost  - Train Classification Report\n",classification_report(y_train,xgb_fit.predict(x_train)))

print ("\n\nXGBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,xgb_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nXGBoost - Test accuracy",round(accuracy_score(y_test,xgb_fit.predict(x_test)),3))
print ("\nXGBoost - Test Classification Report\n",classification_report(y_test,xgb_fit.predict(x_test)))



XGBoost - Train Confusion Matrix

 Predicted    0    1
Actuall            
0          853    0
1            0  176

XGBoost - Train accuracy 1.0

XGBoost  - Train Classification Report
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       853
           1       1.00      1.00      1.00       176

    accuracy                           1.00      1029
   macro avg       1.00      1.00      1.00      1029
weighted avg       1.00      1.00      1.00      1029



XGBoost - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          357  23
1           40  21

XGBoost - Test accuracy 0.857

XGBoost - Test Classification Report
               precision    recall  f1-score   support

           0       0.90      0.94      0.92       380
           1       0.48      0.34      0.40        61

    accuracy                           0.86       441
   macro avg       0.69      0.64      0.66       441
weighted avg       0.84      0.8

In [40]:
#Ensemble of Ensembles - by fitting various classifiers
clwght = {0:0.3,1:0.7}

# Classifier 1
from sklearn.linear_model import LogisticRegression
clf1_logreg_fit = LogisticRegression(fit_intercept=True,class_weight=clwght)
clf1_logreg_fit.fit(x_train,y_train)

print ("\nLogistic Regression for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf1_logreg_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nLogistic Regression for Ensemble - Train accuracy",round(accuracy_score(y_train,clf1_logreg_fit.predict(x_train)),3))
print ("\nLogistic Regression for Ensemble - Train Classification Report\n",classification_report(y_train,clf1_logreg_fit.predict(x_train)))

print ("\n\nLogistic Regression for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf1_logreg_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nLogistic Regression for Ensemble - Test accuracy",round(accuracy_score(y_test,clf1_logreg_fit.predict(x_test)),3))
print ("\nLogistic Regression for Ensemble - Test Classification Report\n",classification_report(y_test,clf1_logreg_fit.predict(x_test)))



Logistic Regression for Ensemble - Train Confusion Matrix

 Predicted    0   1
Actuall           
0          784  69
1          101  75

Logistic Regression for Ensemble - Train accuracy 0.835

Logistic Regression for Ensemble - Train Classification Report
               precision    recall  f1-score   support

           0       0.89      0.92      0.90       853
           1       0.52      0.43      0.47       176

    accuracy                           0.83      1029
   macro avg       0.70      0.67      0.69      1029
weighted avg       0.82      0.83      0.83      1029



Logistic Regression for Ensemble - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          334  46
1           42  19

Logistic Regression for Ensemble - Test accuracy 0.8

Logistic Regression for Ensemble - Test Classification Report
               precision    recall  f1-score   support

           0       0.89      0.88      0.88       380
           1       0.29      0.31      0.30       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [41]:
# Classifier 2
from sklearn.tree import DecisionTreeClassifier
clf2_dt_fit = DecisionTreeClassifier(criterion="gini",max_depth=5,min_samples_split=2,
                                     min_samples_leaf=1,random_state=42,class_weight=clwght)
clf2_dt_fit.fit(x_train,y_train)

print ("\nDecision Tree for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf2_dt_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nDecision Tree for Ensemble - Train accuracy",round(accuracy_score(y_train,clf2_dt_fit.predict(x_train)),3))
print ("\nDecision Tree for Ensemble - Train Classification Report\n",classification_report(y_train,clf2_dt_fit.predict(x_train)))

print ("\n\nDecision Tree for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf2_dt_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nDecision Tree for Ensemble - Test accuracy",round(accuracy_score(y_test,clf2_dt_fit.predict(x_test)),3))
print ("\nDecision Tree for Ensemble - Test Classification Report\n",classification_report(y_test,clf2_dt_fit.predict(x_test)))



Decision Tree for Ensemble - Train Confusion Matrix

 Predicted    0    1
Actuall            
0          787   66
1           56  120

Decision Tree for Ensemble - Train accuracy 0.881

Decision Tree for Ensemble - Train Classification Report
               precision    recall  f1-score   support

           0       0.93      0.92      0.93       853
           1       0.65      0.68      0.66       176

    accuracy                           0.88      1029
   macro avg       0.79      0.80      0.80      1029
weighted avg       0.88      0.88      0.88      1029



Decision Tree for Ensemble - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          345  35
1           37  24

Decision Tree for Ensemble - Test accuracy 0.837

Decision Tree for Ensemble - Test Classification Report
               precision    recall  f1-score   support

           0       0.90      0.91      0.91       380
           1       0.41      0.39      0.40        61

    accuracy             

In [42]:
# Classifier 3
from sklearn.ensemble import RandomForestClassifier
clf3_rf_fit = RandomForestClassifier(n_estimators=10000,criterion="gini",max_depth=6,
                                min_samples_split=2,min_samples_leaf=1,class_weight = clwght)
clf3_rf_fit.fit(x_train,y_train)       

print ("\nRandom Forest for Ensemble - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf3_rf_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nRandom Forest for Ensemble - Train accuracy",round(accuracy_score(y_train,clf3_rf_fit.predict(x_train)),3))
print ("\nRandom Forest for Ensemble - Train Classification Report\n",classification_report(y_train,clf3_rf_fit.predict(x_train)))

print ("\n\nRandom Forest for Ensemble - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf3_rf_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nRandom Forest for Ensemble - Test accuracy",round(accuracy_score(y_test,clf3_rf_fit.predict(x_test)),3))
print ("\nRandom Forest for Ensemble - Test Classification Report\n",classification_report(y_test,clf3_rf_fit.predict(x_test)))




Random Forest for Ensemble - Train Confusion Matrix

 Predicted    0    1
Actuall            
0          846    7
1           62  114

Random Forest for Ensemble - Train accuracy 0.933

Random Forest for Ensemble - Train Classification Report
               precision    recall  f1-score   support

           0       0.93      0.99      0.96       853
           1       0.94      0.65      0.77       176

    accuracy                           0.93      1029
   macro avg       0.94      0.82      0.86      1029
weighted avg       0.93      0.93      0.93      1029



Random Forest for Ensemble - Test Confusion Matrix

 Predicted    0  1
Actuall          
0          373  7
1           52  9

Random Forest for Ensemble - Test accuracy 0.866

Random Forest for Ensemble - Test Classification Report
               precision    recall  f1-score   support

           0       0.88      0.98      0.93       380
           1       0.56      0.15      0.23        61

    accuracy                 

In [43]:
# Classifier 4
from sklearn.ensemble import AdaBoostClassifier
clf4_dtree = DecisionTreeClassifier(criterion='gini',max_depth=1,class_weight = clwght)
clf4_adabst_fit = AdaBoostClassifier(base_estimator= clf4_dtree,
        n_estimators=5000,learning_rate=0.05,random_state=42)

clf4_adabst_fit.fit(x_train, y_train)

print ("\nAdaBoost for Ensemble  - Train Confusion Matrix\n\n",pd.crosstab(y_train,clf4_adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nAdaBoost for Ensemble   - Train accuracy",round(accuracy_score(y_train,clf4_adabst_fit.predict(x_train)),3))
print ("\nAdaBoost for Ensemble   - Train Classification Report\n",classification_report(y_train,clf4_adabst_fit.predict(x_train)))

print ("\n\nAdaBoost for Ensemble   - Test Confusion Matrix\n\n",pd.crosstab(y_test,clf4_adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nAdaBoost for Ensemble   - Test accuracy",round(accuracy_score(y_test,clf4_adabst_fit.predict(x_test)),3))
print ("\nAdaBoost for Ensemble  - Test Classification Report\n",classification_report(y_test,clf4_adabst_fit.predict(x_test)))



AdaBoost for Ensemble  - Train Confusion Matrix

 Predicted    0    1
Actuall            
0          809   44
1           25  151

AdaBoost for Ensemble   - Train accuracy 0.933

AdaBoost for Ensemble   - Train Classification Report
               precision    recall  f1-score   support

           0       0.97      0.95      0.96       853
           1       0.77      0.86      0.81       176

    accuracy                           0.93      1029
   macro avg       0.87      0.90      0.89      1029
weighted avg       0.94      0.93      0.93      1029



AdaBoost for Ensemble   - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          342  38
1           33  28

AdaBoost for Ensemble   - Test accuracy 0.839

AdaBoost for Ensemble  - Test Classification Report
               precision    recall  f1-score   support

           0       0.91      0.90      0.91       380
           1       0.42      0.46      0.44        61

    accuracy                           0.84  

In [44]:
ensemble = pd.DataFrame()

ensemble["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_train))[1]
ensemble["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_train))[1]
ensemble["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_train))[1]
ensemble["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_train))[1]

ensemble = pd.concat([ensemble,pd.DataFrame(y_train).reset_index(drop = True )],axis=1)


In [45]:
#Fitting meta-classifier
meta_logit_fit =  LogisticRegression(fit_intercept=False)
meta_logit_fit.fit(ensemble[['log_output_one','dtr_output_one','rf_output_one','adb_output_one']],ensemble['Attrition_ind'])

coefs =  meta_logit_fit.coef_
print ("Co-efficients for LR, DT, RF & AB are:",coefs)


Co-efficients for LR, DT, RF & AB are: [[-1.22807582  1.27026857  7.68232083 -7.61508392]]


In [46]:
ensemble_test = pd.DataFrame()
ensemble_test["log_output_one"] = pd.DataFrame(clf1_logreg_fit.predict_proba(x_test))[1]
ensemble_test["dtr_output_one"] = pd.DataFrame(clf2_dt_fit.predict_proba(x_test))[1]
ensemble_test["rf_output_one"] = pd.DataFrame(clf3_rf_fit.predict_proba(x_test))[1]
ensemble_test["adb_output_one"] = pd.DataFrame(clf4_adabst_fit.predict_proba(x_test))[1]

ensemble_test["all_one"] = meta_logit_fit.predict(ensemble_test[['log_output_one','dtr_output_one','rf_output_one','adb_output_one']])

ensemble_test = pd.concat([ensemble_test,pd.DataFrame(y_test).reset_index(drop = True )],axis=1)

print ("\n\nEnsemble of Models - Test Confusion Matrix\n\n",pd.crosstab(ensemble_test['Attrition_ind'],ensemble_test['all_one'],rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nEnsemble of Models - Test accuracy",round(accuracy_score(ensemble_test['Attrition_ind'],ensemble_test['all_one']),3))
print ("\nEnsemble of Models - Test Classification Report\n",classification_report(ensemble_test['Attrition_ind'],ensemble_test['all_one']))





Ensemble of Models - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          369  11
1           43  18

Ensemble of Models - Test accuracy 0.878

Ensemble of Models - Test Classification Report
               precision    recall  f1-score   support

           0       0.90      0.97      0.93       380
           1       0.62      0.30      0.40        61

    accuracy                           0.88       441
   macro avg       0.76      0.63      0.67       441
weighted avg       0.86      0.88      0.86       441



In [47]:
# Ensemble of Ensembles - by applying bagging on simple classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

clwght = {0:0.3,1:0.7}

eoe_dtree = DecisionTreeClassifier(criterion='gini',max_depth=1,class_weight = clwght)
eoe_adabst_fit = AdaBoostClassifier(base_estimator= eoe_dtree,
        n_estimators=500,random_state=42)
eoe_adabst_fit.fit(x_train, y_train)

print ("\nAdaBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,eoe_adabst_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nAdaBoost - Train accuracy",round(accuracy_score(y_train,eoe_adabst_fit.predict(x_train)),3))
print ("\nAdaBoost  - Train Classification Report\n",classification_report(y_train,eoe_adabst_fit.predict(x_train)))

print ("\n\nAdaBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,eoe_adabst_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nAdaBoost - Test accuracy",round(accuracy_score(y_test,eoe_adabst_fit.predict(x_test)),3))
print ("\nAdaBoost - Test Classification Report\n",classification_report(y_test,eoe_adabst_fit.predict(x_test)))



AdaBoost - Train Confusion Matrix

 Predicted    0    1
Actuall            
0          834   19
1            9  167

AdaBoost - Train accuracy 0.973

AdaBoost  - Train Classification Report
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       853
           1       0.90      0.95      0.92       176

    accuracy                           0.97      1029
   macro avg       0.94      0.96      0.95      1029
weighted avg       0.97      0.97      0.97      1029



AdaBoost - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          338  42
1           34  27

AdaBoost - Test accuracy 0.828

AdaBoost - Test Classification Report
               precision    recall  f1-score   support

           0       0.91      0.89      0.90       380
           1       0.39      0.44      0.42        61

    accuracy                           0.83       441
   macro avg       0.65      0.67      0.66       441
weighted avg       0.84 

In [48]:


bag_fit = BaggingClassifier(base_estimator= eoe_adabst_fit,n_estimators=50,
                            max_samples=1.0,max_features=1.0,
                            bootstrap=True,
                            bootstrap_features=False,
                            n_jobs=-1,
                            random_state=42)

bag_fit.fit(x_train, y_train)

print ("\nEnsemble of AdaBoost - Train Confusion Matrix\n\n",pd.crosstab(y_train,bag_fit.predict(x_train),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nEnsemble of AdaBoost - Train accuracy",round(accuracy_score(y_train,bag_fit.predict(x_train)),3))
print ("\nEnsemble of AdaBoost  - Train Classification Report\n",classification_report(y_train,bag_fit.predict(x_train)))

print ("\n\nEnsemble of AdaBoost - Test Confusion Matrix\n\n",pd.crosstab(y_test,bag_fit.predict(x_test),rownames = ["Actuall"],colnames = ["Predicted"]))      
print ("\nEnsemble of AdaBoost - Test accuracy",round(accuracy_score(y_test,bag_fit.predict(x_test)),3))
print ("\nEnsemble of AdaBoost - Test Classification Report\n",classification_report(y_test,bag_fit.predict(x_test)))





Ensemble of AdaBoost - Train Confusion Matrix

 Predicted    0    1
Actuall            
0          847    6
1           45  131

Ensemble of AdaBoost - Train accuracy 0.95

Ensemble of AdaBoost  - Train Classification Report
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       853
           1       0.96      0.74      0.84       176

    accuracy                           0.95      1029
   macro avg       0.95      0.87      0.90      1029
weighted avg       0.95      0.95      0.95      1029



Ensemble of AdaBoost - Test Confusion Matrix

 Predicted    0   1
Actuall           
0          354  26
1           42  19

Ensemble of AdaBoost - Test accuracy 0.846

Ensemble of AdaBoost - Test Classification Report
               precision    recall  f1-score   support

           0       0.89      0.93      0.91       380
           1       0.42      0.31      0.36        61

    accuracy                           0.85       441
   macr