In [1]:
import pandas as pd
import numpy as np

In [19]:
# Read the preprocessed absenteeism CSV into a DataFrame.
preprocessed_csv_path = '../database/Absenteeism_preprocessed.csv'
data_preprocessed = pd.read_csv(preprocessed_csv_path)

In [20]:
# Median absence duration; the target cell below uses it as the cut-off.
absence_hours = data_preprocessed['Absenteeism Time in Hours']
absence_hours.median()

3.0

In [21]:
# Label a row 1 when its absence hours are strictly above the median, else 0.
hours_absent = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(hours_absent > hours_absent.median(), 1, 0)

# Store the labels next to the features and preview the first rows.
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [22]:
# Share of rows labelled 1 (the mean of a 0/1 array) as a class-balance check.
targets.mean()

0.45571428571428574

In [23]:
# Remove the raw hours column that the binary label was derived from.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])
data_with_targets.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


## Select and standardize the inputs for the regression

In [24]:
# Keep every column except the last one, which holds the target labels.
n_feature_columns = data_with_targets.shape[1] - 1
unscaled_inputs = data_with_targets.iloc[:, :n_feature_columns]

In [25]:
from sklearn.preprocessing import StandardScaler
# Scaler that centres each column on zero and rescales it to unit variance
# (both switches are the library defaults, spelled out here for clarity).
absenteeism_scaler = StandardScaler(with_mean=True, with_std=True)

In [27]:
# Learn the per-column scaling statistics and standardize the inputs in one call.
# NOTE(review): the scaler is fitted on every row of `unscaled_inputs`, and the
# train/test split only happens further down, so the held-out rows contribute
# to the scaling statistics. Consider fitting on the training rows only.
scaled_inputs = absenteeism_scaler.fit_transform(unscaled_inputs)
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

## Split the data into train & test and shuffle

In [28]:
from sklearn.model_selection import train_test_split

In [32]:
# Shuffle the rows and hold out 20% for testing; the fixed seed makes the
# partition reproducible across runs.
split_parts = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
    shuffle=True,  # library default, stated explicitly
)
x_train, x_test, y_train, y_test = split_parts

In [33]:
# Shapes of the training inputs and training targets.
print(f"{x_train.shape} {y_train.shape}")

(560, 14) (560,)


In [34]:
# Shapes of the test inputs and test targets.
print(f"{x_test.shape} {y_test.shape}")

(140, 14) (140,)


## Logistic regression with Sklearn

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [36]:
# Fit a logistic regression with default hyper-parameters on the training split.
# `fit` returns the estimator itself, so the chained form binds the fitted model.
reg = LogisticRegression().fit(x_train, y_train)
reg

In [37]:
# Mean accuracy of the fitted model on the rows it was trained on.
train_accuracy = reg.score(x_train, y_train)
train_accuracy

0.7839285714285714

### Manually check the accuracy

In [38]:
# Predicted class label for every training row.
model_outputs = reg.predict(X=x_train)

In [41]:
# Accuracy by hand: the fraction of predictions that equal the true labels.
np.mean(model_outputs == y_train)

0.7839285714285714

### Finding the intercept and coefficients

In [42]:
# Show the fitted intercept followed by the coefficient matrix.
print(f"{reg.intercept_} {reg.coef_}")

[-0.22206736] [[ 2.07601767  0.33504757  1.56162303  1.32927434  0.18793677 -0.07062253
   0.70639316 -0.03986811 -0.20089491 -0.00456366  0.31933564 -0.135508
   0.38172443 -0.3332426 ]]


In [47]:
# Column labels of the model inputs, as a NumPy array.
feature_name = unscaled_inputs.columns.to_numpy()

In [48]:
# Start the summary with one row per input feature.
summary_table = pd.DataFrame({'Feature name': feature_name})

In [49]:
# `reg.coef_` holds a single row of weights; flatten it into one value per feature.
summary_table['Coefficient'] = reg.coef_.ravel()

In [50]:
# Build the final summary (intercept first, then one row per feature) directly
# from the fitted model. The previous version shifted the existing index and
# inserted row 0 in place, so running the cell a second time produced a
# duplicate 'Intercept' row; rebuilding from `feature_name` and `reg` yields
# the same table no matter how often the cell is executed.
summary_table = pd.DataFrame({
    'Feature name': ['Intercept', *feature_name],
    'Coefficient': [reg.intercept_[0], *reg.coef_[0]],
})
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.222067
1,Reason_1,2.076018
2,Reason_2,0.335048
3,Reason_3,1.561623
4,Reason_4,1.329274
5,Month Value,0.187937
6,Day of the Week,-0.070623
7,Transportation Expense,0.706393
8,Distance to Work,-0.039868
9,Age,-0.200895
