### Import

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Load Data

In [3]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [4]:
# Remove the 'Unnamed: 0' column that came in with the CSV
# (NOTE(review): presumably a saved row index - confirm against the preprocessing notebook).
# errors = 'ignore' keeps this cell re-runnable: without it a second run raises KeyError
# because the column has already been dropped.
data_preprocessed = data_preprocessed.drop(columns = 'Unnamed: 0', errors = 'ignore')

In [5]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,2015-07-14,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,2015-07-15,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,2015-07-16,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,2015-07-23,289,36,33,239.554,30,0,2,1,2,7,3


In [6]:
data_preprocessed.columns

Index(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Date',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children',
       'Pets', 'Absenteeism Time in Hours', 'Month Value', 'Day of the Week'],
      dtype='object')

In [7]:
# Desired column order: reason dummies first, then the calendar fields,
# then the remaining attributes, with the raw hours column last.
columns = [
    'Reason_1',
    'Reason_2',
    'Reason_3',
    'Reason_4',
    'Month Value',
    'Day of the Week',
    'Date',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
    'Absenteeism Time in Hours',
]

In [8]:
data_preprocessed = data_preprocessed[columns]

In [9]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,2015-07-07,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,2015-07-14,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,2015-07-15,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,2015-07-16,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,2015-07-23,289,36,33,239.554,30,0,2,1,2


### Create Target

In [10]:
median = data_preprocessed['Absenteeism Time in Hours'].median()

In [11]:
# Binary target: 1 where the hours are strictly above the median, 0 otherwise
# (rows exactly at the median fall into class 0).
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > median, 1, 0)

In [12]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [13]:
# Append the binary target as a new last column, producing a new frame
# rather than writing into the existing one.
data_preprocessed = data_preprocessed.assign(**{'Excessive Absenteeism': targets})

In [14]:
data_preprocessed

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Date,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,2015-07-07,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,2015-07-14,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,2015-07-15,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,2015-07-16,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,2015-07-23,289,36,33,239.554,30,0,2,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,2018-05-23,179,22,40,237.656,22,1,2,0,8,1
696,1,0,0,0,5,2,2018-05-23,225,26,28,237.656,24,0,1,2,3,0
697,1,0,0,0,5,3,2018-05-24,330,16,28,237.656,25,1,0,0,8,1
698,0,0,0,1,5,3,2018-05-24,235,16,32,237.656,25,1,0,0,2,0


In [15]:
# Share of rows labelled 1; the mean of a 0/1 array is the fraction of ones.
targets.mean()

0.45571428571428574

In [16]:
# Columns excluded from the modelling frame. 'Absenteeism Time in Hours' is the raw
# value the binary target was derived from, so it must not remain as an input.
dropped_columns = [
    'Absenteeism Time in Hours',
    'Date',
    'Day of the Week',
    'Daily Work Load Average',
    'Distance to Work',
]
data_with_targets = data_preprocessed.drop(columns = dropped_columns)

In [17]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


### Select Inputs for Regression

In [18]:
data_with_targets.shape

(700, 12)

In [19]:
# Inputs are every column except the target 'Excessive Absenteeism'.
# Selecting by name instead of a hard-coded positional slice keeps this correct
# when the set of dropped columns changes: a fixed 0:11 slice would then silently
# either lose a feature or let the target leak into the inputs.
unscaled_inputs = data_with_targets.drop(columns = 'Excessive Absenteeism')

In [20]:
unscaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


### Standardize Data

In [21]:
# absenteeism_scaler = StandardScaler()

In [22]:
# absenteeism_scaler.fit(unscaled_inputs)

In [23]:
# scaled_input = absenteeism_scaler.transform(unscaled_inputs) # Scale the inputs

In [24]:
# scaled_input.shape

### Custom Scaler

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps a StandardScaler that is fitted only on ``columns``. ``transform``
    returns a DataFrame with the same column order and the same row index as
    its input; columns not listed in ``columns`` are returned unchanged.

    Parameters
    ----------
    columns : list of column labels
        The columns of the input DataFrame to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted on ``X[columns]``.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof = 0) of the fitted
        columns; None until ``fit`` has been called.
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y = None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        subset = X[self.columns]
        self.scaler.fit(subset, y)
        # Reduce explicitly along axis 0. np.mean / np.var on a DataFrame reduce
        # over both axes on pandas >= 2 and would return one scalar instead of
        # per-column statistics. ddof = 0 matches the population variance np.var uses.
        self.mean_ = subset.mean(axis = 0)
        self.var_ = subset.var(axis = 0, ddof = 0)
        return self

    def transform(self, X, y = None, copy = None):
        """Return a copy of ``X`` with ``self.columns`` standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are not used.
        """
        init_col_order = X.columns
        # Reuse X's index for the scaled frame. With a default RangeIndex the
        # column-wise concat below aligns on labels, so any X whose index is not
        # 0..n-1 (for example a shuffled or filtered subset) would come back with
        # NaNs and extra rows.
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]),
                                columns = self.columns, index = X.index)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis = 1)[init_col_order]

In [26]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [27]:
# Columns that are left unscaled. In the frames displayed above they hold only 0/1 values
# (NOTE(review): presumably dummy / binary indicators - confirm against the preprocessing step).
# The explicit list of columns that do get scaled, kept for reference:
# columns_to_scale = ['Month Value','Transportation Expense',
#        'Age', 'Body Mass Index', 'Children', 'Pets']
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [28]:
# Every input column that is not explicitly omitted gets standardized.
columns_to_scale = [name for name in unscaled_inputs.columns if name not in columns_to_omit]

In [29]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [30]:
# NOTE(review): the scaler is fitted on every row, before the train/test split further down,
# so the held-out rows contribute to the scaling statistics. Consider fitting on the
# training split only and applying transform to the test split.
absenteeism_scaler.fit(unscaled_inputs)

In [31]:
scaled_input = absenteeism_scaler.transform(unscaled_inputs)

In [32]:
scaled_input

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


### Split Data

In [33]:
# Preview only: the returned splits are not assigned to anything, and with no
# random_state the shuffle differs on every run.
train_test_split(scaled_input, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 662         0         0         0         1    -0.673803   
 641         0         0         0         1    -0.959313   
 486         0         0         0         1     0.468236   
 598         0         0         0         1    -1.244823   
 540         0         0         0         1     1.324766   
 ..        ...       ...       ...       ...          ...   
 322         1         0         0         0     1.324766   
 239         0         0         0         1     0.182726   
 605         0         0         0         1    -1.244823   
 416         0         0         0         1    -0.673803   
 458         1         0         0         0    -0.102784   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 662               -1.574681  2.130803         1.002633          0 -0.019280   
 641               -0.654143  0.248310         1.002633          0 -0.919030   
 486                1.0360

In [34]:
# 80% of the rows go to training, the rest to testing; random_state fixes the shuffle
# so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(scaled_input, targets, train_size = 0.8, random_state = 9)
# x_train / y_train: training inputs and training targets
# x_test  / y_test : testing inputs and testing targets

In [35]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [36]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


## Logistic Regression

### Train the Model

In [37]:
reg = LogisticRegression()

In [38]:
reg.fit(x_train, y_train)

In [39]:
reg.score(x_train, y_train) # The model has an accuracy of 78%

0.7785714285714286

### Manually Check Accuracy

In [40]:
model_outputs = reg.predict(x_train) # A prediction of y_train

In [41]:
model_outputs == y_train # Shows which outputs were predicted correctly

array([ True,  True, False,  True,  True,  True,  True, False, False,
       False,  True,  True, False,  True, False,  True, False,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True, False, False,
        True,  True,  True, False,  True, False,  True,  True, False,
        True, False, False,  True,  True,  True, False,  True, False,
        True,  True,  True, False,  True,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True, False, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True,

In [42]:
np.sum((model_outputs == y_train))

436

In [43]:
model_outputs.shape[0]

560

In [44]:
# Manual accuracy: correct predictions / number of predictions.
# Matches the value returned by reg.score(x_train, y_train) above.
np.sum((model_outputs == y_train)) / model_outputs.shape[0]

0.7785714285714286

### Find the Intercept and Coefficients

In [45]:
intercept = reg.intercept_

In [46]:
intercept

array([-1.75795903])

In [47]:
coeff = reg.coef_

In [48]:
coeff

array([[ 2.88674636,  1.00566759,  2.9425064 ,  0.87098156,  0.04582771,
         0.6628962 , -0.19447076,  0.20290829,  0.01483685,  0.37151105,
        -0.26756813]])

In [49]:
feature_name = unscaled_inputs.columns.values

In [50]:
feature_name

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [51]:
# One row per input feature, paired with its fitted coefficient.
# reg.coef_ is 2-D with a single row, so row 0 holds the per-feature weights.
summary_table = pd.DataFrame({'Feature Name': feature_name, 'Coefficient': reg.coef_[0]})

In [52]:
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Reason_1,2.886746
1,Reason_2,1.005668
2,Reason_3,2.942506
3,Reason_4,0.870982
4,Month Value,0.045828
5,Transportation Expense,0.662896
6,Age,-0.194471
7,Body Mass Index,0.202908
8,Education,0.014837
9,Children,0.371511


In [53]:
# Put the intercept in row 0, ahead of the feature coefficients (features end up at 1..n).
# Any 'Intercept' row left by an earlier run of this cell is filtered out first, so
# re-running the cell no longer shifts the index again and stacks duplicate rows.
intercept_row = pd.DataFrame({'Feature Name': ['Intercept'], 'Coefficient': [intercept[0]]})
feature_rows = summary_table[summary_table['Feature Name'] != 'Intercept']
summary_table = pd.concat([intercept_row, feature_rows], ignore_index = True)

In [54]:
summary_table

Unnamed: 0,Feature Name,Coefficient
0,Intercept,-1.757959
1,Reason_1,2.886746
2,Reason_2,1.005668
3,Reason_3,2.942506
4,Reason_4,0.870982
5,Month Value,0.045828
6,Transportation Expense,0.662896
7,Age,-0.194471
8,Body Mass Index,0.202908
9,Education,0.014837


### Interpret Coefficients

In [55]:
# exp(coefficient) is the factor by which the odds of class 1 are multiplied when the
# feature increases by one unit (for the standardized columns, one unit is one standard deviation).
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [56]:
summary_table

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
0,Intercept,-1.757959,0.172396
1,Reason_1,2.886746,17.934861
2,Reason_2,1.005668,2.733732
3,Reason_3,2.942506,18.963316
4,Reason_4,0.870982,2.389255
5,Month Value,0.045828,1.046894
6,Transportation Expense,0.662896,1.940404
7,Age,-0.194471,0.82327
8,Body Mass Index,0.202908,1.22496
9,Education,0.014837,1.014947


In [57]:
summary_table.sort_values('Odds_ratio', ascending = False)
# Descending sort: the largest odds ratios are at the top, the smallest at the bottom.
# A feature carries little weight when its coefficient is close to 0, i.e. its odds ratio is close to 1.
# NOTE(review): 'Daily Work Load Average', 'Distance to Work' and 'Day of the Week' were already
# dropped when data_with_targets was built, so they are not inputs of this model and do not appear here.

Unnamed: 0,Feature Name,Coefficient,Odds_ratio
3,Reason_3,2.942506,18.963316
1,Reason_1,2.886746,17.934861
2,Reason_2,1.005668,2.733732
4,Reason_4,0.870982,2.389255
6,Transportation Expense,0.662896,1.940404
10,Children,0.371511,1.449924
8,Body Mass Index,0.202908,1.22496
5,Month Value,0.045828,1.046894
9,Education,0.014837,1.014947
7,Age,-0.194471,0.82327


## Testing

In [58]:
reg.score(x_test, y_test) # Finding the accuracy of the testing data, only a slight difference from the training data

0.75

In [60]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba # Left: P(Output = 0) Right: P(Output = 1)

array([[0.83950143, 0.16049857],
       [0.40119355, 0.59880645],
       [0.8031576 , 0.1968424 ],
       [0.27264686, 0.72735314],
       [0.8031576 , 0.1968424 ],
       [0.38706031, 0.61293969],
       [0.80108083, 0.19891917],
       [0.40119355, 0.59880645],
       [0.13985872, 0.86014128],
       [0.28142892, 0.71857108],
       [0.75597122, 0.24402878],
       [0.12138746, 0.87861254],
       [0.89493004, 0.10506996],
       [0.78827482, 0.21172518],
       [0.80316619, 0.19683381],
       [0.38396076, 0.61603924],
       [0.80108083, 0.19891917],
       [0.5984841 , 0.4015159 ],
       [0.09920698, 0.90079302],
       [0.73872907, 0.26127093],
       [0.09142003, 0.90857997],
       [0.26239223, 0.73760777],
       [0.89493004, 0.10506996],
       [0.68248905, 0.31751095],
       [0.30117571, 0.69882429],
       [0.87473809, 0.12526191],
       [0.32215565, 0.67784435],
       [0.50483555, 0.49516445],
       [0.60438804, 0.39561196],
       [0.5593572 , 0.4406428 ],
       [0.

In [61]:
predicted_proba[:,1] # Get only the probability of getting 1 (Probability of excessive absenteeism)

array([0.16049857, 0.59880645, 0.1968424 , 0.72735314, 0.1968424 ,
       0.61293969, 0.19891917, 0.59880645, 0.86014128, 0.71857108,
       0.24402878, 0.87861254, 0.10506996, 0.21172518, 0.19683381,
       0.61603924, 0.19891917, 0.4015159 , 0.90079302, 0.26127093,
       0.90857997, 0.73760777, 0.10506996, 0.31751095, 0.69882429,
       0.12526191, 0.67784435, 0.49516445, 0.39561196, 0.4406428 ,
       0.35394558, 0.24678204, 0.16585818, 0.19891917, 0.27245685,
       0.24795754, 0.60358691, 0.8218925 , 0.21391715, 0.21172518,
       0.25468435, 0.72630806, 0.62220992, 0.15411626, 0.50497733,
       0.11532716, 0.72938519, 0.50112209, 0.92425527, 0.20101237,
       0.77052497, 0.26987097, 0.29837398, 0.25914296, 0.58483455,
       0.23216287, 0.19478204, 0.328959  , 0.3708769 , 0.22030605,
       0.05585169, 0.11666884, 0.57021687, 0.22607151, 0.69328813,
       0.10630665, 0.69882429, 0.28550951, 0.83432735, 0.48235876,
       0.23216287, 0.51832496, 0.24645062, 0.27505818, 0.58483

### Save the Model

In [62]:
import pickle

In [63]:
# Serialize the fitted logistic regression to the file 'model'.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [65]:
# Serialize the fitted scaler to the file 'scaler'.
# pickle stores classes by reference, so the CustomScaler class must be importable
# in whatever process later loads this file.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)