In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer



import warnings
warnings.filterwarnings('ignore')

In [14]:
# Read in csv data
attrition_df = pd.read_csv("data/IBM_attrition_data.csv")
data = attrition_df.copy()
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [15]:
# Drop unnecessary columns
data = data.drop(["Over18", "EmployeeCount", "StandardHours", "EmployeeNumber"], axis=1)
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,...,3,3,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,...,3,1,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,...,4,2,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,...,3,4,0,17,3,2,9,6,0,8


In [16]:
# Create train and test set from data
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

In [84]:
# Segregating target and feature variables 
data = train_set.drop("Attrition", axis=1)
data_labels = train_set["Attrition"].copy()
# data_labels = pd.get_dummies(data_labels, drop_first=True)
data

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1097,24,Travel_Rarely,350,Research & Development,21,2,Technical Degree,3,Male,57,...,3,2,3,2,3,3,1,1,0,0
727,18,Non-Travel,287,Research & Development,5,2,Life Sciences,2,Male,73,...,3,4,0,0,2,3,0,0,0,0
254,29,Travel_Rarely,1247,Sales,20,2,Marketing,4,Male,45,...,3,4,1,10,2,3,3,2,0,2
1175,39,Travel_Rarely,492,Research & Development,12,3,Medical,4,Male,66,...,4,3,0,7,3,3,5,4,1,0
1341,31,Travel_Rarely,311,Research & Development,20,3,Life Sciences,2,Male,89,...,3,1,1,10,2,3,10,8,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,35,Travel_Rarely,750,Research & Development,28,3,Life Sciences,2,Male,46,...,3,4,2,10,3,2,10,9,6,8
1294,41,Travel_Rarely,447,Research & Development,5,3,Life Sciences,2,Male,85,...,3,1,0,11,3,1,3,2,1,2
860,22,Travel_Frequently,1256,Research & Development,3,4,Life Sciences,3,Male,48,...,3,2,1,1,5,3,0,0,0,0
1459,29,Travel_Rarely,1378,Research & Development,13,2,Other,4,Male,46,...,3,1,1,10,2,3,4,3,0,3


In [85]:
data_labels

1097     No
727      No
254      No
1175     No
1341     No
       ... 
1130     No
1294     No
860     Yes
1459     No
1126     No
Name: Attrition, Length: 1176, dtype: object

In [86]:
# Preprocessing categorical columns 
def preprocess_cat_columns(data):
    data["Education"] = data["Education"].map({1:"Below College", 2:"College", 3:"Bachelor", 4:"Master",5:"Doctor"}) 
    data["EnvironmentSatisfaction"] = data["EnvironmentSatisfaction"].map({1:"Low", 2:"Medium", 3:"High", 4:"Very High"})
    data["JobInvolvement"] = data["JobInvolvement"].map({1:"Low", 2:"Medium", 3:"High", 4:"Very High"})
    data["JobSatisfaction"] = data["JobSatisfaction"].map({1:"Low", 2:"Medium", 3:"High", 4:"Very High"})
    data["PerformanceRating"] = data["PerformanceRating"].map({1:"Low", 2:"Medium", 3:"High", 4:"Very High"})
    data["RelationshipSatisfaction"] = data["RelationshipSatisfaction"].map({1:"Low", 2:"Medium", 3:"High", 4:"Very High"})
    data["WorkLifeBalance"] = data["WorkLifeBalance"].map({1:"Bad", 2:"Good", 3:"Better", 4:"Best"})
    return data


data_tr = preprocess_cat_columns(data)
data_tr.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1097,24,Travel_Rarely,350,Research & Development,21,College,Technical Degree,High,Male,57,...,High,Medium,3,2,3,Better,1,1,0,0
727,18,Non-Travel,287,Research & Development,5,College,Life Sciences,Medium,Male,73,...,High,Very High,0,0,2,Better,0,0,0,0
254,29,Travel_Rarely,1247,Sales,20,College,Marketing,Very High,Male,45,...,High,Very High,1,10,2,Better,3,2,0,2
1175,39,Travel_Rarely,492,Research & Development,12,Bachelor,Medical,Very High,Male,66,...,Very High,High,0,7,3,Better,5,4,1,0
1341,31,Travel_Rarely,311,Research & Development,20,Bachelor,Life Sciences,Medium,Male,89,...,High,Low,1,10,2,Better,10,8,0,2


In [87]:
# Number pipeline transformer 
def num_pipeline_transformer(data):
    '''
    Function to process numerical transformations
    Argument:
        data: original dataframe 
    Returns:
        num_attrs: numerical dataframe
        num_pipeline: numerical pipeline object
        
    '''
    numerics = ['int64']

    num_attrs = data.select_dtypes(include=numerics)

    num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
        ])
    return num_attrs, num_pipeline

# Complete transformation for categorical and numeric data
def pipeline_transformer(data):
    '''
    Complete transformation pipeline for both
    nuerical and categorical data.
    
    Argument:
        data: original dataframe 
    Returns:
        prepared_data: transformed data, ready to use
    '''
    cat_attrs = ["BusinessTravel", "Department", "Education", 
                    "EducationField", "EnvironmentSatisfaction", "Gender",
                    "JobInvolvement", "JobRole", "JobSatisfaction", 
                    "MaritalStatus", "OverTime", "PerformanceRating", 
                    "RelationshipSatisfaction", "WorkLifeBalance"]
    num_attrs, num_pipeline = num_pipeline_transformer(data)
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, list(num_attrs)),
        ("cat", OneHotEncoder(), cat_attrs),
        ])
    prepared_data = full_pipeline.fit_transform(data)
    return prepared_data

In [88]:
##from raw data to processed data in 2 steps
preprocessed_df = preprocess_cat_columns(data)
prepared_data = pipeline_transformer(preprocessed_df)
prepared_data

array([[-1.38855944, -1.10813858,  1.44039645, ...,  1.        ,
         1.        ,  1.        ],
       [-2.04073779, -1.2634814 , -0.52269928, ...,  1.        ,
         1.        ,  1.        ],
       [-0.84507748,  1.10364737,  1.31770296, ...,  1.        ,
         1.        ,  1.        ],
       ...,
       [-1.60595222,  1.1258392 , -0.76808624, ...,  1.        ,
         1.        ,  1.        ],
       [-0.84507748,  1.42666182,  0.45884859, ...,  1.        ,
         1.        ,  1.        ],
       [ 1.43754676, -1.32019386, -0.03192534, ...,  1.        ,
         1.        ,  1.        ]])

In [89]:
prepared_data[0]

array([-1.38855944, -1.10813858,  1.44039645, -0.47283217, -0.9322736 ,
       -0.86827746, -0.60189535, -1.05916816, -0.33924934,  2.54747106,
       -1.1673683 ,  0.15731946, -0.97426331, -0.88820751, -0.67610953,
       -1.14244794,  0.        ,  0.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  1.        ,
        0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  1.        ,  1.        ,
        1.        ])

In [90]:
## Selecting and Training Models 
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(prepared_data, data_labels)

LogisticRegression()

In [91]:
print(f"Logisitc Regression Training Data Score: {log_reg.score(prepared_data, data_labels)}")

Training Data Score: 0.8673469387755102


In [93]:
## Selecting and Training Models 
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier()
decision_tree.fit(prepared_data, data_labels)

DecisionTreeClassifier()

In [94]:
print(f"Decision Tree Training Data Score: {decision_tree.score(prepared_data, data_labels)}")

Decision Tree Training Data Score: 1.0


In [98]:
# Predicitons using Decision Tree 

attrition_predict = decision_tree.predict(prepared_data)
print(f"First 10 Predictions: {attrition_predict[:10]}")

First 10 Predictions: ['No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No']


In [99]:
# Random Forest Model 
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier()
forest_clf.fit(prepared_data, data_labels)

RandomForestClassifier()

In [100]:
print(f"Random Forest Training Data Score: {forest_clf.score(prepared_data, data_labels)}")

Random Forest Training Data Score: 1.0


In [104]:
from sklearn.svm import SVC
svc_model = SVC(kernel='linear')
svc_model.fit(prepared_data, data_labels)

SVC(kernel='linear')

In [151]:
print(f"SVC Training Data Score: {svc_model.score(prepared_data, data_labels)}")

SVC Training Data Score: 0.8715986394557823


In [174]:
# Hyperparameter Runing w/ GridSearchCV
from sklearn.model_selection import GridSearchCV

forest_clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [180]:
criterion = ['gini', 'entropy']
param_grid = {'n_estimators': [5, 10, 15, 20], 'criterion' : criterion, 'max_features': [5, 10, 15, 20]}
grid_search = GridSearchCV(forest_clf, param_grid, verbose=0)

In [181]:
grid_search.fit(prepared_data, data_labels)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': [5, 10, 15, 20],
                         'n_estimators': [5, 10, 15, 20]})

In [182]:
# Best Params
print(grid_search.best_params_)

{'criterion': 'gini', 'max_features': 5, 'n_estimators': 15}


In [183]:
# Checking feature importance 
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.0596979 , 0.06110592, 0.04422266, 0.04411606, 0.02062812,
       0.08115781, 0.06273491, 0.04338343, 0.04579424, 0.03492548,
       0.04815707, 0.02882235, 0.04928885, 0.02810755, 0.02948317,
       0.03370934, 0.00564923, 0.01180501, 0.01050428, 0.00205798,
       0.00795144, 0.01048576, 0.        , 0.004482  , 0.00983852,
       0.00593484, 0.00925745, 0.00432537, 0.01210261, 0.        ,
       0.01305943, 0.01064529, 0.        , 0.0057728 , 0.0038569 ,
       0.00743394, 0.00207516, 0.00394213, 0.00169164, 0.00860946,
       0.00947072, 0.01079301, 0.        , 0.00801695, 0.00980297,
       0.01857884, 0.03821515, 0.03830624, 0.        , 0.        ,
       0.        ])

In [184]:
numerics = ['int64']
num_attrs = list(data.select_dtypes(include=numerics))

attrs = num_attrs 
sorted(zip(attrs, feature_importances), reverse=True)

[('YearsWithCurrManager', 0.03370934362724043),
 ('YearsSinceLastPromotion', 0.029483165944279088),
 ('YearsInCurrentRole', 0.028107553479082784),
 ('YearsAtCompany', 0.049288854307184625),
 ('TrainingTimesLastYear', 0.028822350371100265),
 ('TotalWorkingYears', 0.04815706536084066),
 ('StockOptionLevel', 0.034925480447509344),
 ('PercentSalaryHike', 0.04579424438831266),
 ('NumCompaniesWorked', 0.04338342735468422),
 ('MonthlyRate', 0.06273490740405246),
 ('MonthlyIncome', 0.08115780592833947),
 ('JobLevel', 0.020628124165388704),
 ('HourlyRate', 0.04411606219574934),
 ('DistanceFromHome', 0.044222662603980126),
 ('DailyRate', 0.06110592336439846),
 ('Age', 0.059697896187623424)]

In [185]:
# Evaluating entire system on Test Data
final_model = grid_search.best_estimator_

X_test = test_set.drop("Attrition", axis=1)
y_test = test_set["Attrition"].copy()

X_test_preprocessed = preprocess_cat_columns(X_test)
X_test_prepared = pipeline_transformer(X_test_preprocessed)

predictions = final_model.predict(X_test_prepared)

ValueError: X has 71 features, but DecisionTreeClassifier is expecting 51 features as input.

In [149]:
print('Test Acc: %.3f' % grid_search.score(X_test_prepared, y_test))

ValueError: X has 71 features, but DecisionTreeClassifier is expecting 51 features as input.