# Creating a Logistic Regression model to predict absenteeism.

### Import the relevant libraries.

In [1]:
import pandas as pd
import numpy as np

### Load the data

In [2]:
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Create the targets
#### We will take the median value of 'Absenteeism Time in Hours' and use it as the cut-off.

In [4]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

#### Any value strictly above the median (i.e. >= 4 hours) will be labelled Excessively absent (1), and any value <= 3 will be labelled Moderately absent (0).

In [5]:
# Binary target: 1 when the hours are strictly above the median, 0 otherwise.
absent_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absent_hours > absent_hours.median(), 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
data_preprocessed['Excessive_Absenteeism'] = targets

In [7]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive_Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


#### Using the median as a cut-off is numerically stable and rigid.
#### By using the median we have implicitly balanced the dataset (roughly equal numbers of 1s and 0s).

In [8]:
np.mean(targets)    # fraction of 1s among the targets (close to 0.5)

0.45571428571428574

In [9]:
# Remove the raw hours column now that the binary target has been derived from it.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [10]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive_Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


### Selecting the inputs for our regression model.

In [11]:
data_with_targets.shape

(700, 15)

In [12]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardize the data
#### We will not standardize all the columns/variables, because we also have dummy variables, which would lose their dummy meaning when standardized.
#### We could either standardize the data before creating the dummy variables, or exclude the dummy variables from standardization (we will do the latter).

In [39]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and leave the rest as is.

    The selected columns are passed through a ``StandardScaler``; every other
    column is returned untouched. The output keeps the column order and the
    row index of the input DataFrame.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy, with_mean, with_std : bool, default True
        Forwarded to the underlying ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted by ``fit``.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of the selected
        columns, set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep every constructor argument under its own name: BaseEstimator
        # reads them back via getattr for get_params()/repr, and would report
        # None for any argument that is not stored.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Pass the options by keyword; StandardScaler does not accept them
        # positionally in current scikit-learn releases.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # np.mean/np.var on a DataFrame forward axis=None to pandas, and the
        # meaning of axis=None differs between pandas versions. An explicit
        # axis=0 (and ddof=0, numpy's default) keeps the per-column result.
        self.mean_ = selected.mean(axis=0)
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for signature compatibility and are
        not used.
        """
        init_col_order = X.columns
        # Reuse X's index: with a default RangeIndex the concat below would
        # align on labels and produce NaNs whenever X is not indexed 0..n-1
        # (for example a shuffled or filtered subset of rows).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Put the two halves back together in the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [40]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month',
       'Day of the week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [41]:
# Columns to standardize. Reason_1..Reason_4 and Education are deliberately
# left out; they are the columns this notebook keeps unscaled as dummies.
columns_to_scale = [
    'Month',
    'Day of the week',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Children',
    'Pets',
]

In [42]:
absenteeism_scaler = CustomScaler(columns_to_scale)

In [43]:
absenteeism_scaler.fit(unscaled_inputs)
# after this the absenteeism_scaler is not empty
# Standardization information is contained in it now (mean, std).



CustomScaler(columns=['Month', 'Day of the week', 'Transportation Expense',
                      'Distance to Work', 'Age', 'Daily Work Load Average',
                      'Body Mass Index', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [44]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
# .transform does the actual scaling.

In [45]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


In [46]:
scaled_inputs.shape

(700, 14)

### Train test split.
### and shuffling the data (to avoid any dependency that comes from the order of the dataset).

In [47]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)
# shuffle is True by default in train_test_split.
# NOTE(review): absenteeism_scaler was fitted on all rows of unscaled_inputs before this
# split, so the held-out rows also contributed to the scaling statistics.

### Implementing Logistic Regression using sklearn.

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

#### Training the model

In [49]:
reg = LogisticRegression()
reg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [50]:
reg.score(X_train, y_train)

0.775

#### To get a better idea of score:

In [51]:
model_outputs = reg.predict(X_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [52]:
# element-wise check of which training predictions match the true targets
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False, False, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [53]:
# Recompute the training accuracy by hand to confirm the value from reg.score().
n_correct = np.sum(model_outputs == y_train)
n_total = y_train.shape[0]
print('Number of correctly predicted outputs:', n_correct)
print('Number of total outputs:', n_total)
print('Accuracy or score:', n_correct / n_total)

Number of correctly predicted outputs: 434
Number of total outputs: 560
Accuracy or score: 0.775


### Creating the Summary table:

In [54]:
# Intercepts and coefficient
print(reg.intercept_)
print(reg.coef_)

[-1.6561092]
[[ 2.80096498e+00  9.34857518e-01  3.09561645e+00  8.56587468e-01
   1.66248119e-01 -8.43703301e-02  6.12732578e-01 -7.79685996e-03
  -1.65922708e-01 -1.47005123e-04  2.71811477e-01 -2.05738037e-01
   3.61989880e-01 -2.85510745e-01]]


In [55]:
feature_name = unscaled_inputs.columns.values

In [56]:
# One row per input feature, paired with its fitted coefficient.
summary_table = pd.DataFrame({'Feature name': feature_name})
# reg.coef_ has one row per fitted class boundary; transpose it into a column.
summary_table['Coefficient'] = reg.coef_.T
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.800965
1,Reason_2,0.934858
2,Reason_3,3.095616
3,Reason_4,0.856587
4,Month,0.166248
5,Day of the week,-0.08437
6,Transportation Expense,0.612733
7,Distance to Work,-0.007797
8,Age,-0.165923
9,Daily Work Load Average,-0.000147


In [57]:
# Adding intercept row in the summary table
# Shift the existing rows up by one so that index 0 is free for the intercept.
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
# The new row is added at the end; sorting by index moves it to the top.
summary_table = summary_table.sort_index()
# NOTE: re-running this cell shifts the index again and adds another 'Intercept' row.
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.656109
1,Reason_1,2.800965
2,Reason_2,0.934858
3,Reason_3,3.095616
4,Reason_4,0.856587
5,Month,0.166248
6,Day of the week,-0.08437
7,Transportation Expense,0.612733
8,Distance to Work,-0.007797
9,Age,-0.165923


### Interpreting the coefficients
#### In logistic regression the fitted coefficients are expressed in log(odds). A logistic regression is a linear function that predicts the log(odds); these log(odds) are later transformed into 0s and 1s.
$$\log(\text{odds}) = \text{intercept} + b_1 x_1 + b_2 x_2 + \dots + b_{14} x_{14}$$

In [58]:
summary_table['Odds_Ratio'] = np.exp(summary_table.Coefficient)

In [59]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_Ratio
0,Intercept,-1.656109,0.19088
1,Reason_1,2.800965,16.460523
2,Reason_2,0.934858,2.546851
3,Reason_3,3.095616,22.100858
4,Reason_4,0.856587,2.35511
5,Month,0.166248,1.180866
6,Day of the week,-0.08437,0.919091
7,Transportation Expense,0.612733,1.845467
8,Distance to Work,-0.007797,0.992233
9,Age,-0.165923,0.847112


In [60]:
summary_table.sort_values('Odds_Ratio', ascending=False)

Unnamed: 0,Feature name,Coefficient,Odds_Ratio
3,Reason_3,3.095616,22.100858
1,Reason_1,2.800965,16.460523
2,Reason_2,0.934858,2.546851
4,Reason_4,0.856587,2.35511
7,Transportation Expense,0.612733,1.845467
13,Children,0.36199,1.436184
11,Body Mass Index,0.271811,1.31234
5,Month,0.166248,1.180866
10,Daily Work Load Average,-0.000147,0.999853
8,Distance to Work,-0.007797,0.992233


#### A feature is not particularly important if:
#### - its coefficient is around 0,
#### - its odds ratio is around 1.

## Backward Elimination<br>
#### As the coefficients of the columns 'Daily Work Load Average', 'Distance to Work' and 'Day of the week' are nearly zero, dropping them will not affect the model much. We could go back and drop them, but we will not do that here (although a simpler model is generally preferable).

## Testing the data
#### Conceptually, once we use the test data to check the accuracy we are not allowed to tweak the model anymore, because it has already seen the test data. If we repeat this testing process, we are effectively training the model on the test data, only manually.

In [61]:
reg.score(X_test, y_test)

0.7428571428571429

In [62]:
predicted_probability = reg.predict_proba(X_test)
predicted_probability

array([[0.73838887, 0.26161113],
       [0.60860095, 0.39139905],
       [0.40910176, 0.59089824],
       [0.80489361, 0.19510639],
       [0.0732329 , 0.9267671 ],
       [0.31965834, 0.68034166],
       [0.31302205, 0.68697795],
       [0.13341719, 0.86658281],
       [0.79712508, 0.20287492],
       [0.75274419, 0.24725581],
       [0.48222467, 0.51777533],
       [0.1964133 , 0.8035867 ],
       [0.07857533, 0.92142467],
       [0.70622367, 0.29377633],
       [0.30708515, 0.69291485],
       [0.57055326, 0.42944674],
       [0.54143955, 0.45856045],
       [0.57205946, 0.42794054],
       [0.38194051, 0.61805949],
       [0.04857923, 0.95142077],
       [0.6977753 , 0.3022247 ],
       [0.79578125, 0.20421875],
       [0.3949288 , 0.6050712 ],
       [0.42248618, 0.57751382],
       [0.26634773, 0.73365227],
       [0.75608758, 0.24391242],
       [0.51088279, 0.48911721],
       [0.86807166, 0.13192834],
       [0.20221381, 0.79778619],
       [0.78635626, 0.21364374],
       [0.

In [63]:
predicted_probability.shape

(140, 2)

Column 1 gives the probability of being zero (low absenteeism), and column 2 gives the probability of being one (excessive absenteeism); we are interested in the latter.

In [64]:
predicted_probability[:,1]

array([0.26161113, 0.39139905, 0.59089824, 0.19510639, 0.9267671 ,
       0.68034166, 0.68697795, 0.86658281, 0.20287492, 0.24725581,
       0.51777533, 0.8035867 , 0.92142467, 0.29377633, 0.69291485,
       0.42944674, 0.45856045, 0.42794054, 0.61805949, 0.95142077,
       0.3022247 , 0.20421875, 0.6050712 , 0.57751382, 0.73365227,
       0.24391242, 0.48911721, 0.13192834, 0.79778619, 0.21364374,
       0.37354833, 0.68671888, 0.68840326, 0.54141425, 0.20421875,
       0.50817528, 0.21068631, 0.74426986, 0.43687316, 0.59038329,
       0.22501874, 0.43474443, 0.21701898, 0.39313905, 0.8143125 ,
       0.57069356, 0.69250264, 0.27274934, 0.20204647, 0.18057868,
       0.59237372, 0.34581089, 0.66771423, 0.28542145, 0.84957431,
       0.47045028, 0.88919506, 0.25614793, 0.31973858, 0.31768456,
       0.72178349, 0.6571659 , 0.31198576, 0.78711296, 0.19846624,
       0.26534346, 0.08192232, 0.23025544, 0.7270172 , 0.33464876,
       0.21066287, 0.29448939, 0.90909748, 0.43911695, 0.61982

## Save the model<br>
#### 'Pickle' is the standard Python tool for serialization and deserialization. In simple words, pickling means converting a Python object into a stream of bytes. Conversely, unpickling means converting a stream of bytes (that has been pickled) back into a Python object.<br>
#### 'Pickle' is not secure: a pickle can carry malicious code that runs when it is loaded, so never unpickle data from an untrusted source. Therefore a format such as JSON is recommended for untrusted data.

In [65]:
import pickle

In [67]:
# Serialize the fitted logistic regression to the binary file 'model'.
with open('model', 'wb') as model_file:
    pickle.dump(reg, model_file)

In [68]:
# Serialize the fitted scaler to the binary file 'scaler'.
# Pickle stores the class by reference, so CustomScaler must be importable
# wherever this file is loaded.
with open('scaler', 'wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)