## Import the relevant libraries

In [60]:
import pandas as pd
import numpy as np

In [62]:
data_preprocessed = pd.read_csv('../data/processed/Absenteeism_preprocessed.csv')

In [63]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


## Create the targets

In [64]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [65]:
## here we find the median of the data so that we can:
# moderately absent<= 3 
# excressively absent >=4

In [66]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours']>
                   data_preprocessed['Absenteeism Time in Hours'].median(),1,0)

In [67]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [68]:
data_preprocessed['Excessive Absenteeism'] = targets

In [69]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [70]:
targets.sum()/targets.shape[0]

0.45571428571428574

In [71]:
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'],axis =1)

In [72]:
data_with_targets is data_preprocessed

False

In [73]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


## SElect the inputs for the regreesion

In [74]:
data_with_targets.shape

(700, 15)

In [75]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


In [76]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardize the Data

In [17]:
# from sklearn.preprocessing import StandardScaler
# absenteeism_scaler = StandardScaler()

In [57]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    def __init_(self,columns, copy =True, with_mean = True, with_std = True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns =columns
        self.mean = None
        self.var_ = None
    def fit(self, X , y=None):
        self.scaler.fit(X[self.columns],y)
        self.mean = np.mean(X[self.columns])
        self.var = np.var(X[self.columns])
        return self
    def transform(self, X, y = None, copy = None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transofrm(X[self.columns]), columns= self.columns)
        X_not_scaled = X.loc[:,X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis =1)[init_col_order]

In [77]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [79]:
columns_to_scale =[ 'Month Value','Day of the Week', 'Transportation Expense', 'Distance to Work','Age', 'Daily Work Load Average', 'Body Mass Index','Children', 'Pets']

In [80]:
absenteeism_scaler = CustomScaler(columns_to_scale)

TypeError: CustomScaler() takes no arguments

In [18]:
absenteeism_scaler.fit(unscaled_inputs)

In [19]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [20]:
scaled_inputs

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [21]:
scaled_inputs.shape

(700, 14)

## Splitting the Data into Train and Test and Shuffle


### Import the relevant module

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
#spliting

In [24]:
train_test_split(scaled_inputs, targets)

[array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997,  1.12666297],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.01928035,  1.12666297],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
          0.88046927,  0.26848661],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976]]),
 array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.01928035,  1.12666297],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
          0.88046927,  0.26848661],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
          0.88046927, -0.58968976],
        ...,
        [-0.57735027, -0.09298136, -0.31448545, ..., -

In [25]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [26]:
print(x_train)

[[-0.57735027 -0.09298136 -0.31448545 ... -0.44798003 -0.91902997
  -0.58968976]
 [-0.57735027 -0.09298136  3.17979734 ...  2.23224237  0.88046927
  -0.58968976]
 [ 1.73205081 -0.09298136 -0.31448545 ...  2.23224237 -0.91902997
  -0.58968976]
 ...
 [-0.57735027 -0.09298136 -0.31448545 ... -0.44798003 -0.91902997
  -0.58968976]
 [-0.57735027 -0.09298136 -0.31448545 ... -0.44798003 -0.01928035
   2.8430157 ]
 [-0.57735027 -0.09298136 -0.31448545 ... -0.44798003 -0.91902997
  -0.58968976]]


In [27]:
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [28]:
print(x_test.shape, y_test.shape)

(140, 14) (140,)


## Logistic regression with sklearn

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [30]:
reg = LogisticRegression()

In [31]:
reg.fit(x_train,y_train)

In [32]:
reg.score(x_train,y_train)

0.7839285714285714

### Manually check the accuracy


In [33]:
model_outputs = reg.predict(x_train)

In [34]:
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [35]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [36]:
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

In [37]:
np.sum(model_outputs==y_train)

439

In [38]:
model_outputs.shape[0]

560

In [39]:
np.sum(model_outputs==y_train)/model_outputs.shape[0]

0.7839285714285714

## Finding the intercept and coefficents

In [40]:
reg.intercept_

array([-0.22178284])

In [41]:
reg.coef_

array([[ 2.07458229,  0.33454073,  1.56073174,  1.32776177,  0.18829573,
        -0.07090252,  0.70640686, -0.03937135, -0.20039574, -0.00427805,
         0.31904773, -0.13505278,  0.38185537, -0.33342255]])

In [42]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [43]:
feature_name = unscaled_inputs.columns.values

In [44]:
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_name)
summary_table['Coefficient'] = np.transpose(reg.coef_)

In [45]:
summary_table

Unnamed: 0,Feature name,Coefficient
0,Reason_1,2.074582
1,Reason_2,0.334541
2,Reason_3,1.560732
3,Reason_4,1.327762
4,Month Value,0.188296
5,Day of the Week,-0.070903
6,Transportation Expense,0.706407
7,Distance to Work,-0.039371
8,Age,-0.200396
9,Daily Work Load Average,-0.004278


In [46]:
summary_table.index = summary_table.index +1
summary_table.loc[0] = ["Intercept",reg.intercept_[0]
]
summary_table = summary_table.sort_index()

In [47]:
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-0.221783
1,Reason_1,2.074582
2,Reason_2,0.334541
3,Reason_3,1.560732
4,Reason_4,1.327762
5,Month Value,0.188296
6,Day of the Week,-0.070903
7,Transportation Expense,0.706407
8,Distance to Work,-0.039371
9,Age,-0.200396


## Interpreting the coefficient 

In [48]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficient)

In [49]:
summary_table

Unnamed: 0,Feature name,Coefficient,Odds_ratio
0,Intercept,-0.221783,0.801089
1,Reason_1,2.074582,7.96122
2,Reason_2,0.334541,1.397299
3,Reason_3,1.560732,4.762305
4,Reason_4,1.327762,3.77259
5,Month Value,0.188296,1.20719
6,Day of the Week,-0.070903,0.931553
7,Transportation Expense,0.706407,2.026696
8,Distance to Work,-0.039371,0.961394
9,Age,-0.200396,0.818407


In [50]:
summary_table.sort_values('Odds_ratio', ascending = False)

Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason_1,2.074582,7.96122
3,Reason_3,1.560732,4.762305
4,Reason_4,1.327762,3.77259
7,Transportation Expense,0.706407,2.026696
13,Children,0.381855,1.465
2,Reason_2,0.334541,1.397299
11,Body Mass Index,0.319048,1.375817
5,Month Value,0.188296,1.20719
10,Daily Work Load Average,-0.004278,0.995731
8,Distance to Work,-0.039371,0.961394


In [51]:
summary_table.sort_values('Odds_ratio')

Unnamed: 0,Feature name,Coefficient,Odds_ratio
14,Pets,-0.333423,0.716467
0,Intercept,-0.221783,0.801089
9,Age,-0.200396,0.818407
12,Education,-0.135053,0.87367
6,Day of the Week,-0.070903,0.931553
8,Distance to Work,-0.039371,0.961394
10,Daily Work Load Average,-0.004278,0.995731
5,Month Value,0.188296,1.20719
11,Body Mass Index,0.319048,1.375817
2,Reason_2,0.334541,1.397299


In [52]:
summary_table.sort_values('Odds_ratio', ascending = False)


Unnamed: 0,Feature name,Coefficient,Odds_ratio
1,Reason_1,2.074582,7.96122
3,Reason_3,1.560732,4.762305
4,Reason_4,1.327762,3.77259
7,Transportation Expense,0.706407,2.026696
13,Children,0.381855,1.465
2,Reason_2,0.334541,1.397299
11,Body Mass Index,0.319048,1.375817
5,Month Value,0.188296,1.20719
10,Daily Work Load Average,-0.004278,0.995731
8,Distance to Work,-0.039371,0.961394


In [53]:
# A feature is not particularly important: 
# if its coefficient is around 0
# if its odds ratio is around 1

# A weight(coefficient ) of 0 implies that no matter the feature value, we will multipy i by 0 (in the model)

In [54]:
# 9 ma xu