# Creating a logistic regression to predict absenteeism

## Import the relevant libraries

In [62]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

## Load the data

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the targets

In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

**Create targets for our logistic regression**
<br>

They have to be categories and we must find a way to say if someone is 'being absent too much' or not.
What we've decided to do is to take the median of the dataset as a cut-off line.
In this way the dataset will be balanced (there will be a roughly equal number of 0s and 1s for the logistic regression).

Note that the line below assigns 1 to anyone who has been absent for more than 3 hours (i.e. 4 hours or more),
which is the equivalent of taking half a day off.

In [5]:
# Binary target: 1 when the hours of absence exceed the dataset median, otherwise 0.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [6]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [7]:
# Store the binary targets as a new 'Excessive Absenteeism' column.
# Note: this modifies data_preprocessed in place; later cells rely on the added column.
data_preprocessed['Excessive Absenteeism'] = targets

In [8]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pet,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


## A comment on the targets

Check if dataset is balanced (what % of targets are 1s).

In [9]:
targets.sum() / targets.shape[0]

0.45571428571428574

Create a checkpoint by dropping the unnecessary variables. Also drop the variables we 'eliminated' after exploring the weights.

In [10]:
# Checkpoint: remove the raw hours column (now encoded in the target) together with
# the features that were discarded after inspecting the model weights.
data_with_targets = data_preprocessed.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Day of the Week',
        'Daily Work Load Average',
        'Distance to Work',
    ]
)

In [11]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


## Select the inputs for the regression

In [12]:
data_with_targets.shape

(700, 12)

In [13]:
# NOTE: the frame has only 12 columns, so a stop of 14 is past the end; the slice
# therefore returns every column, including the 'Excessive Absenteeism' target.
data_with_targets.iloc[:, :14]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0,1
696,1,0,0,0,5,225,28,24,0,1,2,0
697,1,0,0,0,5,330,28,25,1,0,0,1
698,0,0,0,1,5,235,32,25,1,0,0,0


In [14]:
data_with_targets.iloc[:, :-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [15]:
# The inputs are every column except the target, which is the last column of the checkpoint.
unscaled_inputs = data_with_targets.drop(columns=['Excessive Absenteeism'])

## Standardize the data

In [18]:
# NOTE(review): this plain StandardScaler is never fitted or used; the name is rebound
# to a CustomScaler instance further down, before any scaling takes place.
absenteeism_scaler = StandardScaler()

In [19]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps ``StandardScaler`` so that only ``columns`` are scaled, while every
    other column (for example dummy variables) is returned unchanged. The
    column order and the row index of the input are preserved.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy : bool, default=True
        Forwarded to ``StandardScaler``.
    with_mean : bool, default=True
        Forwarded to ``StandardScaler``.
    with_std : bool, default=True
        Forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The underlying scaler, created and fitted by ``fit``.
    mean_ : pandas.Series
        Per-column mean of the scaled columns seen during ``fit``.
    var_ : pandas.Series
        Per-column population variance (``ddof=0``) of the scaled columns
        seen during ``fit``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the underlying ``StandardScaler`` on ``X[self.columns]``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data; must contain every name in ``self.columns``.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        columns_to_fit = X[self.columns]
        self.scaler.fit(columns_to_fit, y)
        # Use explicit per-column reductions: np.mean / np.var on a DataFrame
        # emit a warning on older pandas and return a single scalar on newer ones.
        self.mean_ = columns_to_fit.mean(axis=0)
        self.var_ = columns_to_fit.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized and all other columns unchanged.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to transform; must contain every name in ``self.columns``.
        y : ignored
            Accepted for scikit-learn API compatibility.
        copy : bool or None, default=None
            Forwarded to ``StandardScaler.transform``.

        Returns
        -------
        pandas.DataFrame
            Same columns, column order and index as ``X``.
        """
        init_col_order = X.columns
        # Reuse X's index: without it the scaled frame gets a fresh 0..n-1 index
        # and the concat below misaligns rows (producing NaNs) whenever X has
        # any other index, e.g. a shuffled or filtered frame.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [20]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

In [21]:
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4','Education']

In [22]:
# Scale every input column except those explicitly listed in columns_to_omit.
columns_to_scale = list(
    filter(lambda column_name: column_name not in columns_to_omit,
           unscaled_inputs.columns.values)
)

In [23]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [24]:
# Learn the scaling statistics from the selected columns of unscaled_inputs.
# NOTE(review): this fit uses every row, and the train/test split only happens later,
# so test rows contribute to the scaling statistics. Consider fitting on the
# training rows only.
absenteeism_scaler.fit(unscaled_inputs)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [25]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [26]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pet
0,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.030796,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.030796,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.568019,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.568019,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.568019,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.568019,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [27]:
scaled_inputs.shape

(700, 11)

## Split the data into train & test and shuffle

In [30]:
# Hold out 20% of the rows for testing; rows are shuffled (the default) with a fixed
# seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    shuffle=True,
    random_state=101,
)

In [31]:
print(x_train.shape, y_train.shape)

(560, 11) (560,)


In [32]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


## Logistic regression with sklearn

### Training the model

In [34]:
reg = LogisticRegression()

In [35]:
reg.fit(x_train,y_train)

In [36]:
reg.score(x_train,y_train)

0.7696428571428572

### Manually check the accuracy

Find the model outputs according to our model.

In [41]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,

Compare them with the targets.

In [42]:
y_train

array([1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0,

Now actually compare the two arrays, element by element.

In [40]:
model_outputs == y_train

array([False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False,  True,  True,
        True,  True,  True,  True,  True, False, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True, False, False,
        True,  True, False, False,  True,  True,  True, False,  True,
       False, False,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True,  True,  True,  True,  True, False, False,  True,  True,
        True,  True,

Find out in how many instances we predicted correctly.

In [46]:
np.sum((model_outputs==y_train))

431

Get the total number of instances.

In [47]:
model_outputs.shape[0]

560

Calculate the accuracy of the model.

In [48]:
np.sum((model_outputs==y_train)) / model_outputs.shape[0]

0.7696428571428572

### Finding the intercept and coefficients

Get the intercept (bias) of our model.

In [49]:
reg.intercept_

array([-1.7655981])

Get the coefficients (weights) of our model.

In [50]:
reg.coef_

array([[ 2.80268587,  0.70410324,  3.33743605,  0.95319529, -0.03077714,
         0.58570195, -0.29049725,  0.23661321, -0.03132294,  0.45507779,
        -0.31495453]])

Check what were the names of our columns.

In [51]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pet'], dtype=object)

Save the names of the columns in an ad-hoc variable.

In [52]:
feature_name = unscaled_inputs.columns.values

Collect the coefficients into a summary table.
Transpose the model coefficients (`reg.coef_`) and put them into a DataFrame (a vertical layout, so that they can be
multiplied by certain matrices later).

In [53]:
# Start the summary with one row per input feature.
summary_table = pd.DataFrame({'Feature name': feature_name})

# reg.coef_ is a single row of weights; transpose it so each weight lines up with its feature.
summary_table['Coefficient'] = reg.coef_.T

# Show the resulting table.
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.802686
1,Reason_2,0.704103
2,Reason_3,3.337436
3,Reason_4,0.953195
4,Month Value,-0.030777
5,Transportation Expense,0.585702
6,Age,-0.290497
7,Body Mass Index,0.236613
8,Education,-0.031323
9,Children,0.455078


Do a little Python trick to move the intercept to the top of the summary table.

In [54]:
# Put the intercept in the first row of the summary table.
# The table is rebuilt rather than shifted in place so that this cell is
# idempotent: re-running it does not move the index again or add a second
# 'Intercept' row.
feature_rows = summary_table[summary_table['Feature name'] != 'Intercept']
intercept_row = pd.DataFrame({'Feature name': ['Intercept'],
                              'Coefficient': [reg.intercept_[0]]})

# ignore_index renumbers the rows 0..n with the intercept at index 0
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.765598
1,Reason_1,2.802686
2,Reason_2,0.704103
3,Reason_3,3.337436
4,Reason_4,0.953195
5,Month Value,-0.030777
6,Transportation Expense,0.585702
7,Age,-0.290497
8,Body Mass Index,0.236613
9,Education,-0.031323


## Interpreting the coefficients

Create a new column called 'Odds_ratio', which will show the odds ratio of each feature.

In [55]:
# The odds ratio of each term is the exponential of its coefficient.
coefficient_values = summary_table['Coefficient']
summary_table['Odds_ratio'] = np.exp(coefficient_values)

In [56]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-1.765598,0.171084
1,Reason_1,2.802686,16.488874
2,Reason_2,0.704103,2.022033
3,Reason_3,3.337436,28.146867
4,Reason_4,0.953195,2.593985
5,Month Value,-0.030777,0.969692
6,Transportation Expense,0.585702,1.796251
7,Age,-0.290497,0.747892
8,Body Mass Index,0.236613,1.266951
9,Education,-0.031323,0.969163


Sort the table according to odds ratio.

In [57]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
3,Reason_3,3.337436,28.146867
1,Reason_1,2.802686,16.488874
4,Reason_4,0.953195,2.593985
2,Reason_2,0.704103,2.022033
6,Transportation Expense,0.585702,1.796251
10,Children,0.455078,1.576296
8,Body Mass Index,0.236613,1.266951
5,Month Value,-0.030777,0.969692
9,Education,-0.031323,0.969163
7,Age,-0.290497,0.747892


## Testing the model

Assess the test accuracy of the model.

In [58]:
reg.score(x_test,y_test)

0.7214285714285714

Find the predicted probabilities of each class.
The first column shows the probability that a particular observation is 0, while the second shows the probability that it is 1.

In [59]:
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.20392115, 0.79607885],
       [0.64489978, 0.35510022],
       [0.64489978, 0.35510022],
       [0.04342191, 0.95657809],
       [0.78750638, 0.21249362],
       [0.78750638, 0.21249362],
       [0.2891375 , 0.7108625 ],
       [0.77329752, 0.22670248],
       [0.68214843, 0.31785157],
       [0.71362476, 0.28637524],
       [0.06968383, 0.93031617],
       [0.42074405, 0.57925595],
       [0.32392279, 0.67607721],
       [0.78750638, 0.21249362],
       [0.70794196, 0.29205804],
       [0.54927844, 0.45072156],
       [0.52792075, 0.47207925],
       [0.82356287, 0.17643713],
       [0.44438425, 0.55561575],
       [0.1484951 , 0.8515049 ],
       [0.71362476, 0.28637524],
       [0.22381331, 0.77618669],
       [0.05466557, 0.94533443],
       [0.78284355, 0.21715645],
       [0.86925351, 0.13074649],
       [0.54485431, 0.45514569],
       [0.61516976, 0.38483024],
       [0.33204673, 0.66795327],
       [0.77472604, 0.22527396],
       [0.19099748, 0.80900252],
       [0.

In [60]:
predicted_proba.shape

(140, 2)

Select ONLY the probabilities referring to 1s.

In [61]:
predicted_proba[:, 1]

array([0.79607885, 0.35510022, 0.35510022, 0.95657809, 0.21249362,
       0.21249362, 0.7108625 , 0.22670248, 0.31785157, 0.28637524,
       0.93031617, 0.57925595, 0.67607721, 0.21249362, 0.29205804,
       0.45072156, 0.47207925, 0.17643713, 0.55561575, 0.8515049 ,
       0.28637524, 0.77618669, 0.94533443, 0.21715645, 0.13074649,
       0.45514569, 0.38483024, 0.66795327, 0.22527396, 0.80900252,
       0.17607505, 0.72577596, 0.228322  , 0.22189277, 0.82192424,
       0.66795327, 0.21403973, 0.21872707, 0.36335067, 0.77778344,
       0.22994965, 0.43934267, 0.73182544, 0.88095622, 0.47667457,
       0.59906231, 0.6359806 , 0.83090675, 0.66590632, 0.39799974,
       0.55106076, 0.64447043, 0.44388726, 0.31408666, 0.73247141,
       0.29015642, 0.61950676, 0.3238755 , 0.30424657, 0.32995837,
       0.61226641, 0.21249362, 0.63811125, 0.74253636, 0.228322  ,
       0.70896483, 0.94290399, 0.61007658, 0.228322  , 0.78095247,
       0.48417683, 0.63769005, 0.8386507 , 0.5714799 , 0.75424

## Save the model

Pickle the model file.

In [63]:
# Serialize the trained logistic regression to a binary file named 'model'.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

Pickle the scaler file.

In [65]:
# Serialize the fitted scaler to a binary file named 'scaler', so new data can be
# scaled with the same statistics.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)