In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw attrition data from the CSV file under the relative data/ folder
attrition_df = pd.read_csv("data/IBM_attrition_data.csv")
# Bare last expression: the notebook renders a truncated view of the frame
attrition_df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,23,2,Medical,1,2061,...,3,80,1,17,3,3,5,2,0,3
1466,39,No,Travel_Rarely,613,Research & Development,6,1,Medical,1,2062,...,1,80,1,9,5,3,7,7,1,7
1467,27,No,Travel_Rarely,155,Research & Development,4,3,Life Sciences,1,2064,...,2,80,1,6,0,3,6,2,0,3
1468,49,No,Travel_Frequently,1023,Sales,2,3,Medical,1,2065,...,4,80,0,17,3,2,9,6,0,8


In [7]:
# Summarise column names, non-null counts and dtypes of the raw frame
attrition_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [8]:
# Drop the columns the analysis treats as unnecessary.
# This cell reassigns `attrition_df` from itself, so running it a second time
# without reloading the CSV would raise KeyError (the columns are already gone).
# errors="ignore" makes the cell idempotent: labels that are absent are skipped.
attrition_df = attrition_df.drop(
    columns=["Over18", "EmployeeCount", "StandardHours", "EmployeeNumber"],
    errors="ignore",
)

In [9]:
# Create train and test sets: 20% of the rows are held out for testing,
# and the fixed random_state makes the split reproducible across runs
train_set, test_set = train_test_split(attrition_df, test_size=0.2, random_state=42)

In [11]:
# Separate the target column ("Attrition") from the feature columns
# of the training split
data_labels = train_set["Attrition"].copy()
data = train_set.drop(columns=["Attrition"])
data

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1097,24,Travel_Rarely,350,Research & Development,21,2,Technical Degree,3,Male,57,...,3,2,3,2,3,3,1,1,0,0
727,18,Non-Travel,287,Research & Development,5,2,Life Sciences,2,Male,73,...,3,4,0,0,2,3,0,0,0,0
254,29,Travel_Rarely,1247,Sales,20,2,Marketing,4,Male,45,...,3,4,1,10,2,3,3,2,0,2
1175,39,Travel_Rarely,492,Research & Development,12,3,Medical,4,Male,66,...,4,3,0,7,3,3,5,4,1,0
1341,31,Travel_Rarely,311,Research & Development,20,3,Life Sciences,2,Male,89,...,3,1,1,10,2,3,10,8,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,35,Travel_Rarely,750,Research & Development,28,3,Life Sciences,2,Male,46,...,3,4,2,10,3,2,10,9,6,8
1294,41,Travel_Rarely,447,Research & Development,5,3,Life Sciences,2,Male,85,...,3,1,0,11,3,1,3,2,1,2
860,22,Travel_Frequently,1256,Research & Development,3,4,Life Sciences,3,Male,48,...,3,2,1,1,5,3,0,0,0,0
1459,29,Travel_Rarely,1378,Research & Development,13,2,Other,4,Male,46,...,3,1,1,10,2,3,4,3,0,3


In [12]:
# Summarise the training feature frame (target column already removed)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1176 entries, 1097 to 1126
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1176 non-null   int64 
 1   BusinessTravel            1176 non-null   object
 2   DailyRate                 1176 non-null   int64 
 3   Department                1176 non-null   object
 4   DistanceFromHome          1176 non-null   int64 
 5   Education                 1176 non-null   int64 
 6   EducationField            1176 non-null   object
 7   EnvironmentSatisfaction   1176 non-null   int64 
 8   Gender                    1176 non-null   object
 9   HourlyRate                1176 non-null   int64 
 10  JobInvolvement            1176 non-null   int64 
 11  JobLevel                  1176 non-null   int64 
 12  JobRole                   1176 non-null   object
 13  JobSatisfaction           1176 non-null   int64 
 14  MaritalStatus        

In [19]:
# Preprocessing categorical columns 
def preprocess_cat_columns(attrition_df):
    """Replace the integer codes of the coded survey columns with text labels.

    The mapping is applied to a copy, so the caller's frame is left untouched
    and calling the function again on the same input gives the same result.
    Mapping in place would turn every already-labelled value into NaN on a
    second call, because ``Series.map`` yields NaN for values that are not
    keys of the mapping dictionary.

    Parameters
    ----------
    attrition_df : pandas.DataFrame
        Frame containing the columns "Education", "EnvironmentSatisfaction",
        "JobInvolvement", "JobSatisfaction", "PerformanceRating",
        "RelationshipSatisfaction" and "WorkLifeBalance" as integer codes.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order, where the seven
        columns above hold text labels. Codes that are not in a column's
        mapping become NaN. All other columns are unchanged.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``attrition_df``.
    """
    # Five of the columns share the same four-level scale.
    four_level_scale = {1: "Low", 2: "Medium", 3: "High", 4: "Very High"}
    label_maps = {
        "Education": {1: "Below College", 2: "College", 3: "Bachelor",
                      4: "Master", 5: "Doctor"},
        "EnvironmentSatisfaction": four_level_scale,
        "JobInvolvement": four_level_scale,
        "JobSatisfaction": four_level_scale,
        "PerformanceRating": four_level_scale,
        "RelationshipSatisfaction": four_level_scale,
        "WorkLifeBalance": {1: "Bad", 2: "Good", 3: "Better", 4: "Best"},
    }

    # Copy first so the input frame keeps its integer codes.
    labelled = attrition_df.copy()
    for column, labels in label_maps.items():
        labelled[column] = labelled[column].map(labels)
    return labelled


# Apply the code-to-label mapping to the training features and preview the first rows
data_tr = preprocess_cat_columns(data)
data_tr.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1097,24,Travel_Rarely,350,Research & Development,21,College,Technical Degree,High,Male,57,...,High,Medium,3,2,3,Better,1,1,0,0
727,18,Non-Travel,287,Research & Development,5,College,Life Sciences,Medium,Male,73,...,High,Very High,0,0,2,Better,0,0,0,0
254,29,Travel_Rarely,1247,Sales,20,College,Marketing,Very High,Male,45,...,High,Very High,1,10,2,Better,3,2,0,2
1175,39,Travel_Rarely,492,Research & Development,12,Bachelor,Medical,Very High,Male,66,...,Very High,High,0,7,3,Better,5,4,1,0
1341,31,Travel_Rarely,311,Research & Development,20,Bachelor,Life Sciences,Medium,Male,89,...,High,Low,1,10,2,Better,10,8,0,2


In [20]:
# Inspect dtypes after the label mapping: the mapped columns now show as `object`.
# (No encoding happens in this cell; the one-hot encoding is done in a later cell.)
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1176 entries, 1097 to 1126
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1176 non-null   int64 
 1   BusinessTravel            1176 non-null   object
 2   DailyRate                 1176 non-null   int64 
 3   Department                1176 non-null   object
 4   DistanceFromHome          1176 non-null   int64 
 5   Education                 1176 non-null   object
 6   EducationField            1176 non-null   object
 7   EnvironmentSatisfaction   1176 non-null   object
 8   Gender                    1176 non-null   object
 9   HourlyRate                1176 non-null   int64 
 10  JobInvolvement            1176 non-null   object
 11  JobLevel                  1176 non-null   int64 
 12  JobRole                   1176 non-null   object
 13  JobSatisfaction           1176 non-null   object
 14  MaritalStatus        

In [21]:
# Isolate the categorical columns of the label-mapped training frame
data_cat = data_tr.loc[
    :,
    [
        "BusinessTravel", "Department", "Education",
        "EducationField", "EnvironmentSatisfaction", "Gender",
        "JobInvolvement", "JobRole", "JobSatisfaction",
        "MaritalStatus", "OverTime", "PerformanceRating",
        "RelationshipSatisfaction", "WorkLifeBalance",
    ],
]
data_cat.head()

Unnamed: 0,BusinessTravel,Department,Education,EducationField,EnvironmentSatisfaction,Gender,JobInvolvement,JobRole,JobSatisfaction,MaritalStatus,OverTime,PerformanceRating,RelationshipSatisfaction,WorkLifeBalance
1097,Travel_Rarely,Research & Development,College,Technical Degree,High,Male,Medium,Laboratory Technician,Low,Divorced,No,High,Medium,Better
727,Non-Travel,Research & Development,College,Life Sciences,Medium,Male,High,Research Scientist,Very High,Single,No,High,Very High,Better
254,Travel_Rarely,Sales,College,Marketing,Very High,Male,High,Sales Executive,Very High,Divorced,No,High,Very High,Better
1175,Travel_Rarely,Research & Development,Bachelor,Medical,Very High,Male,High,Manufacturing Director,Medium,Married,No,Very High,High,Better
1341,Travel_Rarely,Research & Development,Bachelor,Life Sciences,Medium,Male,High,Laboratory Technician,High,Divorced,No,High,Low,Better


In [22]:
## One-hot encode the categorical columns
from sklearn.preprocessing import OneHotEncoder

# Learn the categories of every column first, then encode the same frame
cat_encoder = OneHotEncoder()
cat_encoder.fit(data_cat)
data_cat_1hot = cat_encoder.transform(data_cat)
data_cat_1hot   # the result is a sparse matrix, not a dense array

<1176x55 sparse matrix of type '<class 'numpy.float64'>'
	with 16464 stored elements in Compressed Sparse Row format>

In [25]:
# Preview the first five encoded rows as a dense array
# (slice the sparse rows first, then densify only that slice)
data_cat_1hot[:5].toarray()

array([[0., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0.,
        0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0.,
        0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 0.,
        0., 0., 1., 0., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1., 1.,
        0., 0., 0., 0., 0

In [26]:
# Categories learned during fit: one array per encoded column, in column order
cat_encoder.categories_

[array(['Non-Travel', 'Travel_Frequently', 'Travel_Rarely'], dtype=object),
 array(['Human Resources', 'Research & Development', 'Sales'], dtype=object),
 array(['Bachelor', 'Below College', 'College', 'Doctor', 'Master'],
       dtype=object),
 array(['Human Resources', 'Life Sciences', 'Marketing', 'Medical',
        'Other', 'Technical Degree'], dtype=object),
 array(['High', 'Low', 'Medium', 'Very High'], dtype=object),
 array(['Female', 'Male'], dtype=object),
 array(['High', 'Low', 'Medium', 'Very High'], dtype=object),
 array(['Healthcare Representative', 'Human Resources',
        'Laboratory Technician', 'Manager', 'Manufacturing Director',
        'Research Director', 'Research Scientist', 'Sales Executive',
        'Sales Representative'], dtype=object),
 array(['High', 'Low', 'Medium', 'Very High'], dtype=object),
 array(['Divorced', 'Married', 'Single'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['High', 'Very High'], dtype=object),
 array(['High', 'Low', 

In [28]:
# Pick out the numerical columns of the training features by name.
# Note: `num_data` is reassigned in the scaling-pipeline cell further down.
num_data = data.loc[
    :,
    [
        "Age", "DailyRate", "DistanceFromHome",
        "HourlyRate", "JobLevel", "MonthlyIncome",
        "MonthlyRate", "NumCompaniesWorked", "PercentSalaryHike",
        "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear",
        "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion",
        "YearsWithCurrManager",
    ],
]

num_data.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,HourlyRate,JobLevel,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
1097,24,350,21,57,1,2296,10036,0,14,3,2,3,1,1,0,0
727,18,287,5,73,1,1051,13493,1,15,0,0,2,0,0,0,0
254,29,1247,20,45,2,6931,10732,2,14,1,10,2,3,2,0,2
1175,39,492,12,66,2,5295,7693,4,21,0,7,3,5,4,1,0
1341,31,311,20,89,2,4197,18624,1,11,1,10,2,10,8,0,2


In [30]:
# Build the preprocessing pipeline for the numerical attributes

## Pipeline chains preprocessing steps into a single estimator
from sklearn.pipeline import Pipeline
## StandardScaler standardises each numerical attribute
from sklearn.preprocessing import StandardScaler

numerics = ['int64']

## Numerical attributes: the int64 columns of the label-mapped frame
num_data = data_tr.select_dtypes(include=numerics)

## Pipeline for numerical attributes.
## It has a single step (scaling); no imputation or
## attribute-adding step is configured here.
num_pipeline = Pipeline(steps=[('std_scaler', StandardScaler())])

num_data_tr = num_pipeline.fit_transform(num_data)
num_data_tr[0]

array([-1.38855944, -1.10813858,  1.44039645, -0.47283217, -0.9322736 ,
       -0.86827746, -0.60189535, -1.05916816, -0.33924934,  2.54747106,
       -1.1673683 ,  0.15731946, -0.97426331, -0.88820751, -0.67610953,
       -1.14244794])

In [33]:
# Transform numerical and categorical attributes in one step
## ColumnTransformer applies a different transformer to each subset of columns
from sklearn.compose import ColumnTransformer

num_attrs = list(num_data)
cat_attrs = ["BusinessTravel", "Department", "Education",
             "EducationField", "EnvironmentSatisfaction", "Gender",
             "JobInvolvement", "JobRole", "JobSatisfaction",
             "MaritalStatus", "OverTime", "PerformanceRating",
             "RelationshipSatisfaction", "WorkLifeBalance"]

## Complete pipeline for both numerical and categorical attributes.
## handle_unknown="ignore": a category that was not seen while fitting is
## encoded as all zeros instead of raising at transform time. The training
## split does not contain every level of every column (PerformanceRating
## shows only "High" and "Very High" in the fitted categories above), so
## other data may contain values the encoder has not seen.
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attrs),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attrs),
    ])

## Fit on `data_tr`, the label-mapped frame that `num_attrs` and `cat_attrs`
## were derived from, rather than on `data`. This way the cell does not depend
## on whether preprocess_cat_columns modified `data` in place.
prepared_data = full_pipeline.fit_transform(data_tr)
prepared_data[0]

array([-1.38855944, -1.10813858,  1.44039645, -0.47283217, -0.9322736 ,
       -0.86827746, -0.60189535, -1.05916816, -0.33924934,  2.54747106,
       -1.1673683 ,  0.15731946, -0.97426331, -0.88820751, -0.67610953,
       -1.14244794,  0.        ,  0.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  1.  