# Ensemble of Ensembles

## Pre-Processing Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv("data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [3]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
df.pop('EmployeeCount')
df.pop('EmployeeNumber')
df.pop('Over18')
df.pop('StandardHours')

0       80
1       80
2       80
3       80
4       80
5       80
6       80
7       80
8       80
9       80
10      80
11      80
12      80
13      80
14      80
15      80
16      80
17      80
18      80
19      80
20      80
21      80
22      80
23      80
24      80
25      80
26      80
27      80
28      80
29      80
        ..
1440    80
1441    80
1442    80
1443    80
1444    80
1445    80
1446    80
1447    80
1448    80
1449    80
1450    80
1451    80
1452    80
1453    80
1454    80
1455    80
1456    80
1457    80
1458    80
1459    80
1460    80
1461    80
1462    80
1463    80
1464    80
1465    80
1466    80
1467    80
1468    80
1469    80
Name: StandardHours, Length: 1470, dtype: int64

In [5]:
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
       'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
       'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [6]:
y = df['Attrition']
X = df
X.pop('Attrition')

0       Yes
1        No
2       Yes
3        No
4        No
5        No
6        No
7        No
8        No
9        No
10       No
11       No
12       No
13       No
14      Yes
15       No
16       No
17       No
18       No
19       No
20       No
21      Yes
22       No
23       No
24      Yes
25       No
26      Yes
27       No
28       No
29       No
       ... 
1440     No
1441     No
1442    Yes
1443     No
1444    Yes
1445     No
1446     No
1447     No
1448     No
1449     No
1450     No
1451     No
1452    Yes
1453     No
1454     No
1455     No
1456     No
1457     No
1458     No
1459     No
1460     No
1461    Yes
1462     No
1463     No
1464     No
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1470, dtype: object

In [7]:
from sklearn import preprocessing
le = preprocessing.LabelBinarizer()
y = le.fit_transform(y)

In [8]:
df.select_dtypes(['object']).head()

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,No


In [9]:
ind_BusinessTravel = pd.get_dummies(df['BusinessTravel'], prefix='BusinessTravel')
ind_Department = pd.get_dummies(df['Department'], prefix='Department')
ind_EducationField = pd.get_dummies(df['EducationField'], prefix='EducationField')
ind_Gender = pd.get_dummies(df['Gender'], prefix='Gender')
ind_JobRole = pd.get_dummies(df['JobRole'], prefix='JobRole')
ind_MaritalStatus = pd.get_dummies(df['MaritalStatus'], prefix='MaritalStatus')
ind_OverTime = pd.get_dummies(df['OverTime'], prefix='OverTime')

In [10]:
ind_BusinessTravel.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely
0,0,0,1
1,0,1,0
2,0,0,1
3,0,1,0
4,0,0,1


In [11]:
df1 = pd.concat([ind_BusinessTravel, ind_Department, 
                 ind_EducationField, ind_Gender, 
                 ind_JobRole, ind_MaritalStatus, 
                 ind_OverTime, df.select_dtypes(['int64'])], axis=1)

In [12]:
df1.shape

(1470, 51)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df1, y)

***

## Print Function

In [13]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [14]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        print("Train Results:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {} \n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {} \n".format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        print("Test Results:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {} \n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {} \n".format(confusion_matrix(y_test, clf.predict(X_test))))

***

## Model 1: Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

In [18]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [20]:
print_score(tree_clf, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       927
           1       1.00      1.00      1.00       175

   micro avg       1.00      1.00      1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102
 

Confusion Matrix: 
 [[927   0]
 [  0 175]] 

Average Accuracy: 	 0.7867
Accuracy SD: 		 0.0370


In [21]:
print_score(tree_clf, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.7690

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.85      0.86       306
           1       0.33      0.35      0.34        62

   micro avg       0.77      0.77      0.77       368
   macro avg       0.60      0.60      0.60       368
weighted avg       0.78      0.77      0.77       368
 

Confusion Matrix: 
 [[261  45]
 [ 40  22]] 



***

## Model 2: Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [26]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train.ravel())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [27]:
print_score(rf_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=True)

Train Results:

Accuracy Score: 0.9809

Classification Report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       927
           1       1.00      0.88      0.94       175

   micro avg       0.98      0.98      0.98      1102
   macro avg       0.99      0.94      0.96      1102
weighted avg       0.98      0.98      0.98      1102
 

Confusion Matrix: 
 [[927   0]
 [ 21 154]] 

Average Accuracy: 	 0.8512
Accuracy SD: 		 0.0105


In [28]:
print_score(rf_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=False)

Test Results:

Accuracy Score: 0.8641

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.99      0.92       306
           1       0.80      0.26      0.39        62

   micro avg       0.86      0.86      0.86       368
   macro avg       0.83      0.62      0.66       368
weighted avg       0.86      0.86      0.83       368
 

Confusion Matrix: 
 [[302   4]
 [ 46  16]] 



***

## Combined Models

In [56]:
en_en = pd.DataFrame()

In [57]:
en_en['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_train))[1]

In [58]:
en_en['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_train))[1]

In [59]:
en_en.head()

Unnamed: 0,tree_clf,rf_clf
0,1.0,0.5
1,0.0,0.1
2,0.0,0.2
3,0.0,0.1
4,0.0,0.0


In [60]:
col_name = en_en.columns

In [61]:
en_en = pd.concat([en_en, pd.DataFrame(y_train).reset_index(drop=True)], axis=1)

In [62]:
en_en.head()

Unnamed: 0,tree_clf,rf_clf,0
0,1.0,0.5,1
1,0.0,0.1,0
2,0.0,0.2,0
3,0.0,0.1,0
4,0.0,0.0,0


In [63]:
tmp = list(col_name)
tmp.append('ind')
en_en.columns = tmp

In [64]:
en_en.head()

Unnamed: 0,tree_clf,rf_clf,ind
0,1.0,0.5,1
1,0.0,0.1,0
2,0.0,0.2,0
3,0.0,0.1,0
4,0.0,0.0,0


## Meta Classifier

In [65]:
from sklearn.linear_model import LogisticRegression

In [66]:
m_clf = LogisticRegression(fit_intercept=False)

In [67]:
m_clf.fit(en_en[['tree_clf', 'rf_clf']], en_en['ind'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [68]:
en_test = pd.DataFrame()

In [70]:
en_test['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_test))[1]
en_test['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_test))[1]
col_name = en_test.columns
en_test['combined'] = m_clf.predict(en_test[['tree_clf', 'rf_clf']])

In [71]:
col_name = en_test.columns

In [73]:
tmp = list(col_name)
tmp.append('ind')

In [74]:
tmp

['tree_clf', 'rf_clf', 'combined', 'ind']

In [75]:
en_test = pd.concat([en_test, pd.DataFrame(y_test).reset_index(drop=True)], axis=1)

In [76]:
en_test.columns = tmp

In [77]:
print(pd.crosstab(en_test['ind'], en_test['combined']))

combined    0   1
ind              
0         261  45
1          40  22


In [80]:
print(round(accuracy_score(en_test['ind'], en_test['combined']), 4))

0.769


In [81]:
print(classification_report(en_test['ind'], en_test['combined']))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86       306
           1       0.33      0.35      0.34        62

   micro avg       0.77      0.77      0.77       368
   macro avg       0.60      0.60      0.60       368
weighted avg       0.78      0.77      0.77       368



***

## Model 3: AdaBoost

In [82]:
from sklearn.ensemble import AdaBoostClassifier

In [95]:
ada_clf = xgb.XGBClassifier()

In [96]:
ada_clf.fit(X_train, y_train.ravel())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [97]:
print_score(ada_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=True)

Train Results:

Accuracy Score: 0.9483

Classification Report: 
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       927
           1       0.99      0.68      0.81       175

   micro avg       0.95      0.95      0.95      1102
   macro avg       0.97      0.84      0.89      1102
weighted avg       0.95      0.95      0.94      1102
 

Confusion Matrix: 
 [[926   1]
 [ 56 119]] 

Average Accuracy: 	 0.8702
Accuracy SD: 		 0.0285


In [98]:
print_score(ada_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=False)

Test Results:

Accuracy Score: 0.8533

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.97      0.92       306
           1       0.67      0.26      0.37        62

   micro avg       0.85      0.85      0.85       368
   macro avg       0.77      0.62      0.64       368
weighted avg       0.83      0.85      0.83       368
 

Confusion Matrix: 
 [[298   8]
 [ 46  16]] 



***

## Combined 2

In [99]:
en_en = pd.DataFrame()

In [100]:
en_en['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_train))[1]

In [101]:
en_en['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_train))[1]

In [102]:
en_en['ada_clf'] = pd.DataFrame(ada_clf.predict_proba(X_train))[1]

In [103]:
en_en.head()

Unnamed: 0,tree_clf,rf_clf,ada_clf
0,1.0,0.5,0.11026
1,0.0,0.1,0.05004
2,0.0,0.2,0.130274
3,0.0,0.1,0.060902
4,0.0,0.0,0.169468


In [104]:
col_name = en_en.columns

In [105]:
en_en = pd.concat([en_en, pd.DataFrame(y_train).reset_index(drop=True)], axis=1)

In [110]:
tmp = list(col_name)
tmp.append('ind')
en_en.columns = tmp

In [111]:
en_en.head()

Unnamed: 0,tree_clf,rf_clf,ada_clf,ind
0,1.0,0.5,0.11026,1
1,0.0,0.1,0.05004,0
2,0.0,0.2,0.130274,0
3,0.0,0.1,0.060902,0
4,0.0,0.0,0.169468,0


## Meta Classifier 2

In [112]:
from sklearn.linear_model import LogisticRegression

In [113]:
m_clf = LogisticRegression(fit_intercept=False)

In [114]:
m_clf.fit(en_en[['tree_clf', 'rf_clf', 'ada_clf']], en_en['ind'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [115]:
en_test = pd.DataFrame()

In [116]:
en_test['tree_clf'] = pd.DataFrame(tree_clf.predict_proba(X_test))[1]
en_test['rf_clf'] = pd.DataFrame(rf_clf.predict_proba(X_test))[1]
en_test['ada_clf'] = pd.DataFrame(ada_clf.predict_proba(X_test))[1]
col_name = en_test.columns
en_test['combined'] = m_clf.predict(en_test[['tree_clf', 'rf_clf', 'ada_clf']])

In [117]:
col_name = en_test.columns
tmp = list(col_name)
tmp.append('ind')

In [118]:
tmp

['tree_clf', 'rf_clf', 'ada_clf', 'combined', 'ind']

In [119]:
en_test = pd.concat([en_test, pd.DataFrame(y_test).reset_index(drop=True)], axis=1)

In [120]:
en_test.columns = tmp

In [121]:
print(pd.crosstab(en_test['ind'], en_test['combined']))

combined    0   1
ind              
0         261  45
1          40  22


In [122]:
print(round(accuracy_score(en_test['ind'], en_test['combined']), 4))

0.769


In [123]:
print(classification_report(en_test['ind'], en_test['combined']))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86       306
           1       0.33      0.35      0.34        62

   micro avg       0.77      0.77      0.77       368
   macro avg       0.60      0.60      0.60       368
weighted avg       0.78      0.77      0.77       368

