# Importing relevant libraries

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the data

In [50]:
# Load the preprocessed absenteeism data (the CSV is looked up relative to the
# current working directory) and preview the first five rows.
df = pd.read_csv('Absenteeism_preprocessed.csv')
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


**The nice thing about regression is that the model itself will give us a fair indication of which variables are important for the analysis and which are not.**

## We will be using Logistic Regression to predict Absenteeism of an individual from work.<br>

### Since Logistic Regression is a binary classification method, we will categorize absenteeism into 2 classes :
1. Moderately absent.
2. Excessively absent.

To do so, we will take the median value of 'Absenteeism Time in Hours' and use it as a cut-off line.<br> If hours > median -> Excessively absent
<br> If hours <= median -> Moderately absent

### Create the targets

In [51]:
df['Absenteeism Time in Hours'].median()

3.0

In [52]:
# Binary target: 1 when the hours are strictly above the median, 0 otherwise.
targets = (
    df['Absenteeism Time in Hours'] > df['Absenteeism Time in Hours'].median()
).astype(int).values

In [53]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [54]:
df['Excessive Absenteeism'] = targets

In [55]:
df.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### Let's check for the balance of data

In [56]:
# Share of rows labelled 1 (excessively absent), expressed as a percentage.
targets.mean() * 100

45.57142857142858

**Here, excessive absenteeism is ~46% and moderate absenteeism is ~54%. Ideally we would have a 50-50 split between the two categories. However, a 60-40 split or a 45-55 split will usually work equally well for a logistic regression.<br> But this is not true for other algorithms like neural networks, etc.**

In [57]:
# Drop the raw hours (already encoded in 'Excessive Absenteeism') together with
# the three features removed during backward elimination.
df_with_targets = df.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Day of the Week',
        'Daily Work Load Average',
        'Distance to Work',
    ]
)

In [58]:
df_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,1,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,1,7,289,33,30,0,2,1,0


### Select the inputs for the regression

In [59]:
df_with_targets.shape

(700, 12)

In [76]:
# Every column except the last one is an input; the last column is the
# 'Excessive Absenteeism' target that was appended to the frame earlier.
# Slicing relative to the end (instead of a hard-coded column count) keeps the
# selection correct when the list of dropped features changes during backward
# elimination.
unscaled_inputs = df_with_targets.iloc[:, :-1]

In [77]:
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1


### Standardize the data

In [78]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

This custom scaler will only scale the columns specified and will leave the dummy variables untouched.

In [79]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is fitted only on ``columns``; every other column
    (e.g. 0/1 dummy variables) is returned unchanged, and the original column
    order is preserved.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy : bool, default=True
        Forwarded to ``StandardScaler``.
    with_mean : bool, default=True
        Forwarded to ``StandardScaler``.
    with_std : bool, default=True
        Forwarded to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_ : pandas.Series or None
        Per-column mean of the scaled columns; ``None`` before ``fit``.
    var_ : pandas.Series or None
        Per-column population variance (ddof=0) of the scaled columns;
        ``None`` before ``fit``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Keep every constructor argument as an attribute of the same name so
        # that BaseEstimator.get_params(), repr() and clone() work.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Keyword arguments are required: newer scikit-learn releases reject
        # positional constructor arguments for StandardScaler.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, x, y=None):
        """Fit the wrapped scaler on ``x[self.columns]`` and return ``self``.

        Parameters
        ----------
        x : pandas.DataFrame
            Must contain every name listed in ``columns``.
        y : ignored
            Accepted for scikit-learn API compatibility.
        """
        selected = x[self.columns]
        self.scaler.fit(selected, y)
        # Use the pandas reductions explicitly: np.mean/np.var on a DataFrame
        # do not reduce per column consistently across pandas versions.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        # Returning self enables fit(...).transform(...) and fit_transform().
        return self

    def transform(self, x, y=None, copy=None):
        """Return ``x`` with ``columns`` standardized and all other columns untouched.

        Parameters
        ----------
        x : pandas.DataFrame
            Must contain every name listed in ``columns``.
        y, copy : ignored
            Accepted for signature compatibility only.

        Returns
        -------
        pandas.DataFrame
            Same index and column order as ``x``.
        """
        init_col_order = x.columns
        # Re-use x's index: pd.concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows (and introduce NaNs) whenever x
        # has a non-default index, e.g. a shuffled or filtered frame.
        x_scaled = pd.DataFrame(
            self.scaler.transform(x[self.columns]),
            columns=self.columns,
            index=x.index,
        )
        x_not_scaled = x.loc[:, ~x.columns.isin(self.columns)]
        return pd.concat([x_not_scaled, x_scaled], axis=1)[init_col_order]

In [80]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [81]:
# columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work','Age', 
#                     'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children', 'Pets']

columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [82]:
# Scale every input column that is not explicitly listed in columns_to_omit.
columns_to_scale = [
    column_name
    for column_name in unscaled_inputs.columns
    if column_name not in columns_to_omit
]

In [83]:
columns_to_scale

['Month Value',
 'Transportation Expense',
 'Age',
 'Body Mass Index',
 'Children',
 'Pets']

In [84]:
scaler = CustomScaler(columns_to_scale)

In [85]:
scaler.fit(unscaled_inputs)

In [86]:
scaled_data = scaler.transform(unscaled_inputs)

In [87]:
scaled_data

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.182726,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [88]:
scaled_data.shape

(700, 11)

# Splitting data into train and test set and Shuffling

**In order to detect overfitting, we split the data into a train set and a test set.<br>We will also shuffle the data to remove any type of dependency that comes with the order of the data, e.g. Day of the week.**

In [89]:
from sklearn.model_selection import train_test_split

# Keep 80% of the rows for training and hold out the remaining 20% for testing.
# train_test_split shuffles the rows by default; random_state pins the shuffle
# so the same split is obtained on every run.
x_train, x_test, y_train, y_test = train_test_split(scaled_data, targets, train_size = 0.8, random_state = 20)

In [90]:
print(x_train.shape,y_train.shape)

(560, 11) (560,)


In [91]:
print(x_test.shape, y_test.shape)

(140, 11) (140,)


# Logistic Regression with sklearn

In [92]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Train the model

In [93]:
reg = LogisticRegression()

In [94]:
reg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [95]:
reg.score(x_train, y_train)

0.7928571428571428

#### Manually check the accuracy

In [96]:
model_outputs = reg.predict(x_train)

In [97]:
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [98]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [99]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True, False,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [100]:
np.sum(model_outputs == y_train)

444

**This is the total number of correct predictions (True entries).**

In [101]:
model_outputs.shape[0]

560

In [102]:
# Fraction of matching predictions, expressed as a percentage.
np.mean(model_outputs == y_train) * 100

79.28571428571428

### Finding the intercept and coefficients (a.k.a. bias and weights)

In [103]:
reg.intercept_ 

array([-1.58273409])

In [104]:
reg.coef_

array([[ 2.73763935,  0.95074858,  3.40015762,  0.57997714,  0.2125932 ,
         0.54808265, -0.15968526,  0.25363117, -0.3718915 ,  0.30751029,
        -0.42261264]])

In [105]:
feature_name = unscaled_inputs.columns.values

In [106]:
summary_table = pd.DataFrame({'Feature_names' : feature_name})

In [107]:
# reg.coef_ is a 2-D array with a single row for this binary problem;
# take that row so each feature gets its own coefficient.
summary_table['Coefficients'] = reg.coef_[0]

In [108]:
summary_table

Unnamed: 0,Feature_names,Coefficients
0,Reason_1,2.737639
1,Reason_2,0.950749
2,Reason_3,3.400158
3,Reason_4,0.579977
4,Month Value,0.212593
5,Transportation Expense,0.548083
6,Age,-0.159685
7,Body Mass Index,0.253631
8,Education,-0.371892
9,Children,0.30751


**Now we also want to include the intercept in this summary table. Two ways to do this are append() and concat(), but these would add the value at the bottom of the dataframe. We want to add it at the top of the dataframe. We do this in the following way:**

In [109]:
summary_table.index = summary_table.index + 1

In [110]:
# The previous cell shifted every index label up by one, so label 0 is free:
# store the intercept there, then re-sort by index so it becomes the first row.
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature_names,Coefficients
0,Intercept,-1.582734
1,Reason_1,2.737639
2,Reason_2,0.950749
3,Reason_3,3.400158
4,Reason_4,0.579977
5,Month Value,0.212593
6,Transportation Expense,0.548083
7,Age,-0.159685
8,Body Mass Index,0.253631
9,Education,-0.371892


### Interpreting the coefficients

**In logistic regression, the quantity that the linear part of the model predicts (Yhat) is called the 'log(odds)'.<br> Logistic regression is nothing but a linear function predicting log(odds), which are converted into probabilities. These are later transformed into 0's and 1's.**
<br><br>
**We compute the odds ratio, which is the exponential of a coefficient. The odds ratio tells us the predictive power of a particular feature: it is the factor by which the odds of excessive absenteeism are multiplied for a one-unit increase in that feature.**


In [111]:
# Odds ratio = e raised to the coefficient.
summary_table['Odds_ratio'] = np.exp(summary_table['Coefficients'])

In [112]:
summary_table

Unnamed: 0,Feature_names,Coefficients,Odds_ratio
0,Intercept,-1.582734,0.205413
1,Reason_1,2.737639,15.450469
2,Reason_2,0.950749,2.587646
3,Reason_3,3.400158,29.968823
4,Reason_4,0.579977,1.785998
5,Month Value,0.212593,1.236881
6,Transportation Expense,0.548083,1.729933
7,Age,-0.159685,0.852412
8,Body Mass Index,0.253631,1.288696
9,Education,-0.371892,0.689429


We now sort the dataframe by the Odds_ratio column using df.sort_values().

In [113]:
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,Feature_names,Coefficients,Odds_ratio
3,Reason_3,3.400158,29.968823
1,Reason_1,2.737639,15.450469
2,Reason_2,0.950749,2.587646
4,Reason_4,0.579977,1.785998
6,Transportation Expense,0.548083,1.729933
10,Children,0.30751,1.360035
8,Body Mass Index,0.253631,1.288696
5,Month Value,0.212593,1.236881
7,Age,-0.159685,0.852412
9,Education,-0.371892,0.689429


**The rows are sorted by odds ratio in descending order, with the features that most increase the odds of excessive absenteeism at the top.**

**A feature is not particularly important :**
- if its coefficient is around 0 
- if its odds ratio is around 1

This is because a weight (coefficient) of 0 implies that, no matter the feature value, we will multiply it by 0 (in the model).

# Now, we had made a mistake....

**When standardizing, we scaled the dummy variables as well, which is not a good practice. This is because, when we standardize we lose the whole interpretability of a dummy.<br><br>If we left them as 0's and 1's, we could have said that, for a unit change (in say, reason 1) it is 7.54 times more likely that a person will be excessively absent.**

**So if the reason given is Reason 1, we would have said: It is 7.54 times more likely that a person will be excessively absent as compared to no reason given. However, we standardized this variable and now the variable is completely uninterpretable. Now we don't know how the different reasons compare.**

We will go back to the part where we standardized our columns and apply custom scaling instead...

**After rerunning the code, we see that 'Daily Work Load Average', 'Distance to Work' and 'Day of the Week' have coefficients nearly zero.**

## Backward Elimination

The idea is that we can simplify our model by removing all features which have close to no contribution to the model. So, even if we remove these variables, the rest of our model should not really change in terms of coefficient values.<br><br>
Let's go back to the checkpoint where we created the targets, remove 'Daily Work Load Average', 'Distance to Work' and 'Day of the Week', and then check the accuracy again....

After running all the cells once again, we see that the accuracy is ~80% still after removing 3 variables. This means that those 3 variables were not having an impact on our model 

# Testing the model

In [114]:
reg.score(x_test, y_test)

0.7642857142857142

In [119]:
reg.predict(x_test)

array([0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0])

In [116]:
predicted_proba = reg.predict_proba(x_test)

In [117]:
predicted_proba

array([[0.71062021, 0.28937979],
       [0.64278296, 0.35721704],
       [0.46660236, 0.53339764],
       [0.7996291 , 0.2003709 ],
       [0.08580093, 0.91419907],
       [0.31568265, 0.68431735],
       [0.33302802, 0.66697198],
       [0.10982917, 0.89017083],
       [0.85077009, 0.14922991],
       [0.75789769, 0.24210231],
       [0.10493323, 0.89506677],
       [0.02375633, 0.97624367],
       [0.0548654 , 0.9451346 ],
       [0.20632285, 0.79367715],
       [0.24681669, 0.75318331],
       [0.63498734, 0.36501266],
       [0.72712682, 0.27287318],
       [0.13001974, 0.86998026],
       [0.417815  , 0.582185  ],
       [0.04114741, 0.95885259],
       [0.75651722, 0.24348278],
       [0.7996291 , 0.2003709 ],
       [0.35415618, 0.64584382],
       [0.35415618, 0.64584382],
       [0.2206953 , 0.7793047 ],
       [0.80801547, 0.19198453],
       [0.55066456, 0.44933544],
       [0.870697  , 0.129303  ],
       [0.12766294, 0.87233706],
       [0.7996291 , 0.2003709 ],
       [0.

The first column shows the probability of excessive absenteeism being 0.<br>
The second column shows the probability of excessive absenteeism being 1.

## Save the model

In [121]:
import pickle

# Serialize the fitted logistic regression to a file named 'model'
# (no extension) in the current working directory.
with open('model', 'wb') as file:
    pickle.dump(reg, file)

In [122]:
# Serialize the fitted CustomScaler to a file named 'scaler' so new data can be
# scaled with the same statistics before being passed to the saved model.
# NOTE(review): loading this pickle requires the CustomScaler class to be
# defined/importable in the loading code — confirm the consumer provides it.
with open('scaler', 'wb') as file:
    pickle.dump(scaler, file)