In [1]:
# Statistical modelling for absenteeism

In [2]:
# Import libraries.

import pandas as pd
import numpy as np

In [3]:
# Load data we preprocessed earlier.

data_0 = pd.read_csv('Absenteeism_preprocess.csv')

In [4]:
# Check if data loaded correctly.

data_0.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [5]:
data_0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   absence_reason_1           700 non-null    int64  
 1   absence_reason_2           700 non-null    int64  
 2   absence_reason_3           700 non-null    int64  
 3   absence_reason_4           700 non-null    int64  
 4   Month value                700 non-null    int64  
 5   Day of the week            700 non-null    int64  
 6   Transportation Expense     700 non-null    int64  
 7   Distance to Work           700 non-null    int64  
 8   Age                        700 non-null    int64  
 9   Daily Work Load Average    700 non-null    float64
 10  Body Mass Index            700 non-null    int64  
 11  Education                  700 non-null    int64  
 12  Children                   700 non-null    int64  
 13  Pets                       700 non-null    int64  

In [6]:
data_0.describe(include = 'all')

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.597143,6.36,2.011429,222.347143,29.892857,36.417143,271.801774,26.737143,0.167143,1.021429,0.687143,6.761429
std,0.433322,0.09225,0.286386,0.490823,3.50501,1.480396,66.31296,14.804446,6.379083,40.021804,4.254701,0.37337,1.112215,1.166095,12.670082
min,0.0,0.0,0.0,0.0,1.0,0.0,118.0,5.0,27.0,205.917,19.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,3.0,1.0,179.0,16.0,31.0,241.476,24.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,0.0,1.0,6.0,2.0,225.0,26.0,37.0,264.249,25.0,0.0,1.0,0.0,3.0
75%,0.25,0.0,0.0,1.0,10.0,3.0,260.0,50.0,40.0,294.217,31.0,0.0,2.0,1.0,8.0
max,1.0,1.0,1.0,1.0,12.0,6.0,388.0,52.0,58.0,378.884,38.0,1.0,4.0,8.0,120.0


In [7]:
# Create targets for logistic regression.

# People whose absenteeism hours are above the median are labelled as excessively absent.
# Splitting at the median keeps the two classes roughly balanced, which helps the model train properly.

# Check median absenteeism hours.

data_0['Absenteeism Time in Hours'].median()

3.0

In [8]:
# Label a row 1 when its absence hours are strictly above the sample median, else 0.
absence_hours = data_0['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [9]:
targets[0:10]

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1])

In [10]:
# Add new column to data.

data_0['Excessive Absenteeism'] = targets

In [11]:
# Check if column appended correctly.

data_0.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Month value,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [12]:
# Share of rows labelled 1 (the mean of a 0/1 array is the fraction of ones).
targets.mean()

0.45571428571428574

In [13]:
# Drop variables that will not be used further.

# Build the modelling frame: same rows as data_0, minus the raw hours column
# (the targets were derived from it) and three features that are not used further.
data_targets = data_0.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Day of the week',
        'Daily Work Load Average',
        'Distance to Work',
    ]
)

In [15]:
# Check new data.

data_targets.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Month value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


In [16]:
# Select inputs for regression.

data_targets.shape

(700, 12)

In [17]:
# Select every column except the target by name, so the inputs stay correct
# even if the column order of data_targets changes.
inputs = data_targets.drop(columns=['Excessive Absenteeism'])

In [18]:
# Check if inputs extracted correctly.

inputs.head()

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Month value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


In [19]:
# Standardize data.

In [21]:
# Import modules for standardization.

from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

In [22]:
# create the Custom Scaler class

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps a ``StandardScaler`` that is fitted and applied only on ``columns``;
    all other columns of the input frame are returned unchanged.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, fitted on ``X[columns]``.
    mean_ : pandas.Series or None
        Column-wise mean of ``X[columns]`` seen in ``fit`` (None before fitting).
    var_ : pandas.Series or None
        Column-wise population variance (ddof=0) of ``X[columns]`` seen in
        ``fit`` (None before fitting).
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and record mean/variance.

        Returns ``self`` so calls can be chained.
        """
        to_scale = X[self.columns]
        self.scaler.fit(to_scale, y)
        # Explicit column-wise reductions: calling np.mean / np.var directly on a
        # DataFrame is ambiguous about the axis and emitted a warning here.
        self.mean_ = to_scale.mean(axis=0)
        # ddof=0 matches what np.var requested (population variance).
        self.var_ = to_scale.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a DataFrame like ``X`` with ``self.columns`` standardized.

        The original column order and the original row index are preserved.
        ``y`` and ``copy`` are accepted for interface compatibility and unused.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh 0..n-1 index
        # and pd.concat would misalign rows (NaNs) whenever X is not 0..n-1 indexed.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [23]:
inputs_to_omit = ['absence_reason_1', 'absence_reason_2',
                      'absence_reason_3', 'absence_reason_4','Education']

In [24]:
# Every input column that is not listed in inputs_to_omit gets standardized.
inputs_to_scale = [col for col in inputs.columns if col not in inputs_to_omit]

In [25]:
absenteeism_scaler = CustomScaler(inputs_to_scale)

In [26]:
absenteeism_scaler.fit(inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [27]:
inputs_scaled = absenteeism_scaler.transform(inputs)

In [28]:
pd.DataFrame(inputs_scaled)

Unnamed: 0,absence_reason_1,absence_reason_2,absence_reason_3,absence_reason_4,Month value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [29]:
inputs_scaled.shape

(700, 11)

In [30]:
# Split data into train and test samples.

In [31]:
# Import module for splitting.

from sklearn.model_selection import train_test_split

In [32]:
# Split the data.

# Keep 80% of the rows for training and the rest for testing; random_state
# fixes the shuffle so the split is reproducible.
# NOTE(review): inputs_scaled was produced by a scaler fitted on all rows
# before this split, so the test rows contributed to the scaling statistics.
# Consider splitting first and fitting the scaler on the training rows only.
x_train, x_test, y_train, y_test = train_test_split(inputs_scaled, 
                                                    targets, 
                                                    train_size = 0.8,
                                                    shuffle = True,
                                                    random_state = 20)

In [33]:
# Import modules for logistic regression.

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [34]:
# Train the model.

reg = LogisticRegression()

In [35]:
reg.fit(x_train, y_train)

In [36]:
# Check accuracy of model.

reg.score(x_train, y_train)

0.7732142857142857

In [37]:
model_output = reg.predict(x_train)

In [38]:
# Manual accuracy check: fraction of training predictions that equal the labels.
np.mean(model_output == y_train)

0.7732142857142857

In [39]:
# Get intercept and coefficients.

reg.intercept_

array([-1.6474549])

In [40]:
feature_name = inputs.columns.values

In [41]:
# Start the summary with one row per input feature name.
summary_table = pd.DataFrame({'feature_name': feature_name})

In [42]:
summary_table['coefficient'] = np.transpose(reg.coef_)

In [43]:
summary_table

Unnamed: 0,feature_name,coefficient
0,absence_reason_1,2.800197
1,absence_reason_2,0.951884
2,absence_reason_3,3.115553
3,absence_reason_4,0.839001
4,Month value,0.15893
5,Transportation Expense,0.605284
6,Age,-0.169891
7,Body Mass Index,0.279811
8,Education,-0.210533
9,Children,0.348262


In [44]:
# Shift every index label up by one so that label 0 is free for the intercept row.
# NOTE(review): this cell is not idempotent; re-running it shifts the labels again.
summary_table.index = summary_table.index + 1

In [45]:
# Add the intercept as a row under index label 0 (reg.intercept_ is a
# one-element array, hence the [0]).
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

In [46]:
# Sort by index label so the row stored under label 0 comes first.
summary_table = summary_table.sort_index()

In [47]:
summary_table

Unnamed: 0,feature_name,coefficient
0,Intercept,-1.647455
1,absence_reason_1,2.800197
2,absence_reason_2,0.951884
3,absence_reason_3,3.115553
4,absence_reason_4,0.839001
5,Month value,0.15893
6,Transportation Expense,0.605284
7,Age,-0.169891
8,Body Mass Index,0.279811
9,Education,-0.210533


In [48]:
# Interpreting the coefficients.

# Exponentiate each coefficient to obtain the corresponding odds ratio.
summary_table['odds_ratio'] = np.exp(summary_table['coefficient'])

In [49]:
summary_table

Unnamed: 0,feature_name,coefficient,odds_ratio
0,Intercept,-1.647455,0.192539
1,absence_reason_1,2.800197,16.447892
2,absence_reason_2,0.951884,2.590585
3,absence_reason_3,3.115553,22.545903
4,absence_reason_4,0.839001,2.314054
5,Month value,0.15893,1.172256
6,Transportation Expense,0.605284,1.831773
7,Age,-0.169891,0.843757
8,Body Mass Index,0.279811,1.32288
9,Education,-0.210533,0.810152


In [50]:
summary_table.sort_values('odds_ratio', ascending = False)

Unnamed: 0,feature_name,coefficient,odds_ratio
3,absence_reason_3,3.115553,22.545903
1,absence_reason_1,2.800197,16.447892
2,absence_reason_2,0.951884,2.590585
4,absence_reason_4,0.839001,2.314054
6,Transportation Expense,0.605284,1.831773
10,Children,0.348262,1.416604
8,Body Mass Index,0.279811,1.32288
5,Month value,0.15893,1.172256
7,Age,-0.169891,0.843757
9,Education,-0.210533,0.810152


In [51]:
# A quick summary of the meaning of reasons.
# Reason 0, the base, means no justification for absence was given.
# Reason 1 comprises various diseases.
# Reason 2 is related to pregnancy and giving birth.
# Reason 3 is some kind of poisoning.
# Reason 4 covers light diseases.

In [52]:
# A way to interpret the coefficients for the reasons is:
# the odds of excessive absenteeism for someone with some kind of poisoning (reason 3)
# are about 22 times the odds for someone who didn't specify a reason.

In [53]:
# Testing the model.

In [54]:
reg.score(x_test, y_test)

0.75

In [55]:
# Test accuracy is around 75%, lower than train at 77% but still good.

In [56]:
# Get predicted probability.

pred_prob = reg.predict_proba(x_test)

In [57]:
pred_prob[0:10]

array([[0.71340413, 0.28659587],
       [0.58724228, 0.41275772],
       [0.44020821, 0.55979179],
       [0.78159464, 0.21840536],
       [0.08410854, 0.91589146],
       [0.33487603, 0.66512397],
       [0.29984576, 0.70015424],
       [0.13103971, 0.86896029],
       [0.78625404, 0.21374596],
       [0.74903632, 0.25096368]])

In [58]:
# The first column is the probability of class 0; the second column is the probability of class 1.

In [59]:
# Getting the probabilities of falling in class 1.

pred_prob[0:10, 1]

array([0.28659587, 0.41275772, 0.55979179, 0.21840536, 0.91589146,
       0.66512397, 0.70015424, 0.86896029, 0.21374596, 0.25096368])

In [60]:
# Save the model for later use.

In [61]:
# Import module for saving.

import pickle

In [62]:
# Save model as pickle object.

# Serialize the fitted logistic regression to the binary file 'model'.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [63]:
# Save scaler.

# Serialize the fitted scaler to the binary file 'scaler'.
# CustomScaler is defined in this notebook, so code that unpickles this file
# needs the same class definition available.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)