# Ensemble of Ensembles 2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as ply
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
df = pd.read_csv("data/WA_Fn-UseC_-HR-Employee-Attrition.csv")

In [3]:
df.pop('EmployeeCount')
df.pop('EmployeeNumber')
df.pop('Over18')
df.pop('StandardHours')
y = df['Attrition']
X = df
X.pop('Attrition')
from sklearn import preprocessing
le = preprocessing.LabelBinarizer()
y = le.fit_transform(y)

In [4]:
df.select_dtypes(['object']).head()

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,No


In [5]:
ind_BusinessTravel = pd.get_dummies(df['BusinessTravel'], prefix='BusinessTravel')
ind_Department = pd.get_dummies(df['Department'], prefix='Department')
ind_EducationField = pd.get_dummies(df['EducationField'], prefix='EducationField')
ind_Gender = pd.get_dummies(df['Gender'], prefix='Gender')
ind_JobRole = pd.get_dummies(df['JobRole'], prefix='JobRole')
ind_MaritalStatus = pd.get_dummies(df['MaritalStatus'], prefix='MaritalStatus')
ind_OverTime = pd.get_dummies(df['OverTime'], prefix='OverTime')

In [6]:
df1 = pd.concat([ind_BusinessTravel, ind_Department, 
                 ind_EducationField, ind_Gender, 
                 ind_JobRole, ind_MaritalStatus, 
                 ind_OverTime, df.select_dtypes(['int64'])], axis=1)

In [7]:
df1.dropna(inplace=True)
df1.shape

(1470, 51)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df1, y)

***

## Single Classifier

In [9]:
df1.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,0,0,1,0,0,1,0,1,0,0,...,3,1,0,8,0,1,6,4,0,5
1,0,1,0,0,1,0,0,1,0,0,...,4,4,1,10,3,3,10,7,1,7
2,0,0,1,0,1,0,0,0,0,0,...,3,2,0,7,3,3,0,0,0,0
3,0,1,0,0,1,0,0,1,0,0,...,3,3,0,8,3,3,8,7,3,0
4,0,0,1,0,1,0,0,0,0,1,...,3,4,1,6,3,3,2,2,2,2


In [10]:
unique, counts = np.unique(y, return_counts=True)

In [11]:
dict(zip(unique, counts))

{0: 1233, 1: 237}

In [12]:
total = sum(counts)

In [13]:
for count in counts:
    print(count / total)

0.8387755102040816
0.16122448979591836


In [14]:
from sklearn.ensemble import RandomForestClassifier

In [15]:
from sklearn.ensemble import BaggingClassifier

In [16]:
from sklearn.ensemble import AdaBoostClassifier

In [17]:
class_weight = {0:0.84, 1:0.16}

In [18]:
pd.Series(list(y_train)).value_counts() / pd.Series(list(y_train)).count()

[0]    0.843013
[1]    0.156987
dtype: float64

In [19]:
forest = RandomForestClassifier(class_weight=class_weight)

In [20]:
ada = AdaBoostClassifier(base_estimator=forest, n_estimators=100,
                         learning_rate=0.5, random_state=42)

In [21]:
ada.fit(X_train, y_train.ravel())

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight={0: 0.84, 1: 0.16},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators='warn', n_jobs=None, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
          learning_rate=0.5, n_estimators=100, random_state=42)

***

## Print Scores

In [22]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [23]:
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        print("Train Results:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_train, clf.predict(X_train))))
        print("Classification Report: \n {} \n".format(classification_report(y_train, clf.predict(X_train))))
        print("Confusion Matrix: \n {} \n".format(confusion_matrix(y_train, clf.predict(X_train))))
        
        res = cross_val_score(clf, X_train, y_train, cv=10, scoring='accuracy')
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        print("Test Results:\n")
        print("Accuracy Score: {0:.4f}\n".format(accuracy_score(y_test, clf.predict(X_test))))
        print("Classification Report: \n {} \n".format(classification_report(y_test, clf.predict(X_test))))
        print("Confusion Matrix: \n {} \n".format(confusion_matrix(y_test, clf.predict(X_test))))

In [24]:
print_score(ada, X_train, y_train, X_test, y_test, train=True)

Train Results:

Accuracy Score: 1.0000

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       929
           1       1.00      1.00      1.00       173

   micro avg       1.00      1.00      1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102
 

Confusion Matrix: 
 [[929   0]
 [  0 173]] 

Average Accuracy: 	 0.8594
Accuracy SD: 		 0.0133


In [25]:
print_score(ada, X_train, y_train, X_test, y_test, train=False)

Test Results:

Accuracy Score: 0.8560

Classification Report: 
               precision    recall  f1-score   support

           0       0.86      0.99      0.92       304
           1       0.82      0.22      0.35        64

   micro avg       0.86      0.86      0.86       368
   macro avg       0.84      0.60      0.63       368
weighted avg       0.85      0.86      0.82       368
 

Confusion Matrix: 
 [[301   3]
 [ 50  14]] 



***

## Bagging

In [26]:
bag_clf = BaggingClassifier(base_estimator=ada, n_estimators=50,
                            max_samples=1.0, max_features=1.0, bootstrap=True,
                            bootstrap_features=False, n_jobs=-1,
                            random_state=42)

In [27]:
bag_clf.fit(X_train, y_train.ravel())

BaggingClassifier(base_estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight={0: 0.84, 1: 0.16},
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impur...=None, verbose=0, warm_start=False),
          learning_rate=0.5, n_estimators=100, random_state=42),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=50, n_jobs=-1, oob_score=False,
         random_state=42, verbose=0, warm_start=False)

In [28]:
print_score(bag_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=True)

Train Results:

Accuracy Score: 0.9991

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       929
           1       1.00      0.99      1.00       173

   micro avg       1.00      1.00      1.00      1102
   macro avg       1.00      1.00      1.00      1102
weighted avg       1.00      1.00      1.00      1102
 

Confusion Matrix: 
 [[929   0]
 [  1 172]] 

Average Accuracy: 	 0.8566
Accuracy SD: 		 0.0110


In [29]:
print_score(bag_clf, X_train, y_train.ravel(), X_test, y_test.ravel(), train=False)

Test Results:

Accuracy Score: 0.8451

Classification Report: 
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       304
           1       1.00      0.11      0.20        64

   micro avg       0.85      0.85      0.85       368
   macro avg       0.92      0.55      0.56       368
weighted avg       0.87      0.85      0.79       368
 

Confusion Matrix: 
 [[304   0]
 [ 57   7]] 



***