In [1]:
import pandas as pd
import numpy as np

## Load the Data

In [3]:
# Read the preprocessed absenteeism dataset from a CSV file (path is relative to the working directory).
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [4]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create Targets

In [5]:
# At or below the median: normal absenteeism; above the median: excessive absenteeism

In [6]:
# Median of the raw hours column; the targets below are defined relative to this value.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [8]:
# Binary targets: 1 when the hours of absence are strictly above the median, otherwise 0.
# The median is computed once up front (the previous list comprehension recomputed it
# for every row) and the comparison is vectorized. The result is still a plain list of ints.
hours_median = data_preprocessed['Absenteeism Time in Hours'].median()
targets = (data_preprocessed['Absenteeism Time in Hours'] > hours_median).astype(int).tolist()

In [9]:
# Attach the binary targets as a new column (this modifies data_preprocessed in place).
data_preprocessed['Excessive Absenteeism'] = targets

In [10]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


## Comments on Targets

In [15]:
# Fraction of observations labelled 1 — a quick check of how balanced the two classes are.
sum(targets)/len(targets)

0.45571428571428574

In [17]:
# Checkpoint: continue with a new frame that no longer carries the raw hours column;
# data_preprocessed itself is left unchanged by this call.
data_with_targets = data_preprocessed.drop('Absenteeism Time in Hours', axis=1)

## Select the inputs for the Regression

In [19]:
data_with_targets.shape

(700, 15)

In [123]:
# Inputs are every column except the target. Dropping the target by name, rather than
# slicing the first 14 columns by position, keeps the selection correct if columns are
# added or reordered upstream, and yields a new frame instead of a positional slice.
unscaled_inputs = data_with_targets.drop(columns='Excessive Absenteeism')
unscaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


## Standardize the Data

In [124]:
from sklearn.preprocessing import StandardScaler

In [219]:
# Scaler object used below to standardize the selected input columns.
absenteeism_scaler = StandardScaler()

In [220]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [223]:
# Fit the scaler on the non-dummy columns only: the Reason_* columns and 'Education'
# are not in the list below and therefore stay unscaled.
# NOTE(review): the scaler is fitted on all rows of unscaled_inputs, i.e. before the
# train/test split, so test rows contribute to the fitted statistics — consider fitting
# on the training portion only.

absenteeism_scaler.fit(unscaled_inputs[['Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']])

In [224]:
# Apply the fitted scaler to the same non-dummy columns. The result is a NumPy array
# with one column per listed feature, in the order listed here.

scaled_inputs_val = absenteeism_scaler.transform(unscaled_inputs[['Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']])

In [225]:
scaled_inputs_val

array([[ 0.18272635, -0.68370352,  1.00584437, ...,  0.76743118,
         0.88046927,  0.26848661],
       [ 0.18272635, -0.68370352, -1.57468098, ...,  1.00263338,
        -0.01928035, -0.58968976],
       [ 0.18272635, -0.00772546, -0.6541427 , ...,  1.00263338,
        -0.91902997, -0.58968976],
       ...,
       [-0.3882935 ,  0.66825259,  1.62456682, ..., -0.40857982,
        -0.91902997, -0.58968976],
       [-0.3882935 ,  0.66825259,  0.19094163, ..., -0.40857982,
        -0.91902997, -0.58968976],
       [-0.3882935 ,  0.66825259,  1.03602595, ..., -0.40857982,
        -0.01928035,  0.26848661]])

In [188]:
# Sanity-check the expected (rows, columns) shape on unscaled_inputs. scaled_inputs is only
# created in the following cell, so referencing it here raises NameError on a fresh
# Restart & Run All; the copy made there has the same shape.
unscaled_inputs.shape

(700, 14)

In [189]:
# Work on a copy so that unscaled_inputs is not modified when scaled values are written in.
scaled_inputs = unscaled_inputs.copy()

In [190]:
# Overwrite the non-dummy columns of the copy with their standardized values. The column
# order here must match the order used when scaled_inputs_val was produced.
scaled_inputs[['Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets']] = scaled_inputs_val

In [191]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


# Split the data into train & test and shuffle

### Import the relevant module

In [29]:
from sklearn.model_selection import train_test_split

### Split

In [192]:
# Exploratory call: the returned list of train/test arrays is only displayed, not stored.
# No random_state is passed here, so the shuffle differs from run to run.
train_test_split(scaled_inputs.values, targets)

[array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         -0.91902997,  1.12666297],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         -0.91902997, -0.58968976],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.88046927, -0.58968976],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  1.        ,
          0.88046927, -0.58968976],
        [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         -0.91902997, -0.58968976],
        [ 1.        ,  0.        ,  0.        , ...,  0.        ,
          0.88046927,  0.26848661]]),
 array([[ 0.        ,  0.        ,  0.        , ...,  1.        ,
         -0.91902997, -0.58968976],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         -0.01928035,  1.12666297],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         -0.91902997, -0.58968976],
        ...,
        [ 1.        ,  0.        ,  0.        , ...,  

In [193]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the shuffle reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

## Logistic Regression with sklearn

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training model

In [194]:
# Logistic regression classifier created with scikit-learn's default settings.
reg = LogisticRegression()

In [195]:
# Fit the classifier on the training split.
reg.fit(x_train,y_train)

In [196]:
# Mean accuracy on the training split (the same rows the model was fitted on).
reg.score(x_train,y_train)

0.775

### Manually check the accuracy

In [197]:
# Predicted class labels for the training rows, used below to recompute the accuracy by hand.
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [198]:
np.array(y_train)

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [199]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [200]:
# Manual accuracy: the share of training rows where the prediction equals the true label.
(model_outputs == y_train).mean()

0.775

### Finding intercept and coefficient

In [201]:
reg.intercept_

array([-1.65662792])

In [202]:
reg.coef_

array([[ 2.80136327e+00,  9.33540824e-01,  3.09673857e+00,
         8.57183147e-01,  1.66403124e-01, -8.43159241e-02,
         6.13215559e-01, -7.77871894e-03, -1.65545282e-01,
        -7.68487792e-05,  2.71154773e-01, -2.06026920e-01,
         3.61897667e-01, -2.85728905e-01]])

In [203]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [204]:
# Input column names, in the same order as the columns the model was trained on.
feature_name = unscaled_inputs.columns.values

In [205]:
# One row per input feature; further columns are added in the cells that follow.
summary_table = pd.DataFrame({'Features': feature_name})

In [206]:
# reg.coef_ is two-dimensional (a single row of weights); flatten it to one value per feature.
summary_table['Coefficient'] = reg.coef_.ravel()

In [207]:
summary_table

Unnamed: 0,Features,Coefficient
0,Reason_1,2.801363
1,Reason_2,0.933541
2,Reason_3,3.096739
3,Reason_4,0.857183
4,Month Value,0.166403
5,Day of the Week,-0.084316
6,Transportation Expense,0.613216
7,Distance to Work,-0.007779
8,Age,-0.165545
9,Daily Work Load Average,-7.7e-05


In [208]:
# Build the summary with the intercept as row 0, followed by one row per feature.
# The table is rebuilt from feature_name and the fitted model rather than shifting the
# existing index and inserting a row in place: the in-place version is not idempotent
# (re-running the cell shifts the index again and adds a second 'Intercept' row, or
# fails once extra columns such as 'odds_ratio' have been added).
summary_table = pd.DataFrame({
    'Features': ['Intercept', *feature_name],
    'Coefficient': [reg.intercept_[0], *reg.coef_.ravel()],
})
summary_table

Unnamed: 0,Features,Coefficient
0,Intercept,-1.656628
1,Reason_1,2.801363
2,Reason_2,0.933541
3,Reason_3,3.096739
4,Reason_4,0.857183
5,Month Value,0.166403
6,Day of the Week,-0.084316
7,Transportation Expense,0.613216
8,Distance to Work,-0.007779
9,Age,-0.165545


### Interpreting the coefficients

In [209]:
# Exponentiate each coefficient (a log-odds weight) to express it as an odds ratio.
summary_table['odds_ratio'] = np.exp(summary_table['Coefficient'])

In [210]:
summary_table

Unnamed: 0,Features,Coefficient,odds_ratio
0,Intercept,-1.656628,0.190781
1,Reason_1,2.801363,16.467081
2,Reason_2,0.933541,2.543499
3,Reason_3,3.096739,22.125672
4,Reason_4,0.857183,2.356513
5,Month Value,0.166403,1.181049
6,Day of the Week,-0.084316,0.919141
7,Transportation Expense,0.613216,1.846359
8,Distance to Work,-0.007779,0.992251
9,Age,-0.165545,0.847431


In [86]:
# Rank the rows by odds ratio, largest first. sort_values returns a new frame, so
# summary_table itself keeps its original order.
# NOTE(review): the saved output of this cell carries an older execution count and shows
# different coefficients than the tables above — re-run the notebook before relying on it.
summary_table.sort_values('odds_ratio', ascending=False)

Unnamed: 0,Features,Coefficient,odds_ratio
1,Reason_1,2.074582,7.96122
3,Reason_3,1.560732,4.762305
4,Reason_4,1.327762,3.77259
7,Transportation Expense,0.706407,2.026696
13,Children,0.381855,1.465
2,Reason_2,0.334541,1.397299
11,Body Mass Index,0.319048,1.375817
5,Month Value,0.188296,1.20719
10,Daily Work Load Average,-0.004278,0.995731
8,Distance to Work,-0.039371,0.961394


## Testing the Model

In [214]:
# Mean accuracy on the held-out test split.
reg.score(x_test,y_test)

0.7428571428571429

In [216]:
# Class-probability estimates for the test rows: one row per observation, one column per class.
predicted_proba = reg.predict_proba(x_test)
predicted_proba

array([[0.73844522, 0.26155478],
       [0.60830752, 0.39169248],
       [0.40932165, 0.59067835],
       [0.80500775, 0.19499225],
       [0.07319825, 0.92680175],
       [0.31989141, 0.68010859],
       [0.31315484, 0.68684516],
       [0.1333815 , 0.8666185 ],
       [0.797184  , 0.202816  ],
       [0.75285455, 0.24714545],
       [0.48221987, 0.51778013],
       [0.19628597, 0.80371403],
       [0.07847395, 0.92152605],
       [0.7063866 , 0.2936134 ],
       [0.30657701, 0.69342299],
       [0.57044278, 0.42955722],
       [0.54149644, 0.45850356],
       [0.57202077, 0.42797923],
       [0.38158691, 0.61841309],
       [0.04856239, 0.95143761],
       [0.6977674 , 0.3022326 ],
       [0.79590534, 0.20409466],
       [0.39509601, 0.60490399],
       [0.42263896, 0.57736104],
       [0.26631148, 0.73368852],
       [0.75609167, 0.24390833],
       [0.51061432, 0.48938568],
       [0.86799512, 0.13200488],
       [0.20219374, 0.79780626],
       [0.78649073, 0.21350927],
       [0.

In [217]:
# Second column only — with targets encoded as 0/1 this should be the estimated probability
# of class 1 (excessive absenteeism); confirm against reg.classes_ if in doubt.
predicted_proba[:,1]

array([0.26155478, 0.39169248, 0.59067835, 0.19499225, 0.92680175,
       0.68010859, 0.68684516, 0.8666185 , 0.202816  , 0.24714545,
       0.51778013, 0.80371403, 0.92152605, 0.2936134 , 0.69342299,
       0.42955722, 0.45850356, 0.42797923, 0.61841309, 0.95143761,
       0.3022326 , 0.20409466, 0.60490399, 0.57736104, 0.73368852,
       0.24390833, 0.48938568, 0.13200488, 0.79780626, 0.21350927,
       0.37375472, 0.68661431, 0.68825923, 0.54144395, 0.20409466,
       0.5080899 , 0.21062014, 0.74428002, 0.43679853, 0.59059004,
       0.22487149, 0.43478641, 0.21699119, 0.39320712, 0.81427639,
       0.5706047 , 0.69235426, 0.27269735, 0.20223705, 0.18048174,
       0.59250079, 0.34584097, 0.66753175, 0.28570041, 0.84967583,
       0.47073808, 0.8892122 , 0.25604441, 0.31941989, 0.31737343,
       0.72172164, 0.65693939, 0.31194258, 0.78717355, 0.19838182,
       0.26524577, 0.08189923, 0.2301838 , 0.72734148, 0.33451142,
       0.21060789, 0.29495458, 0.90900579, 0.43914262, 0.61975

## Save the Model

In [218]:
import pickle

In [230]:
# Serialize the fitted classifier to a file named 'model' (relative to the working directory).
with open('model', mode='wb') as model_file:
    pickle.dump(reg, model_file)

In [231]:
# Serialize the fitted scaler to a file named 'scaler' so the same scaling can be reapplied later.
with open('scaler', mode='wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)