In [3]:
import pandas as pd
import numpy as np

In [4]:
#Load data
data_preprocessed = pd.read_csv('../database/Absenteeism_preprocessed.csv')

In [5]:
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [6]:
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)
data_preprocessed['Excessive Absenteeism'] = targets
data_preprocessed.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [7]:
targets.sum() / targets.shape[0]

0.45571428571428574

In [8]:
data_with_targets = data_preprocessed.drop('Absenteeism Time in Hours', axis='columns')
data_with_targets.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


 ## Selecting and standardize the inputs for the regression

In [9]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

In [41]:
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
# absenteeism_scaler = StandardScaler() Before custom

class CustomScaler(BaseEstimator,TransformerMixin):
    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y = None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns=self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]
        

In [42]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [43]:
columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education', 'Children', 'Pets']

In [44]:
absenteeism_scaler = CustomScaler(columns_to_scale)


In [45]:
absenteeism_scaler.fit(unscaled_inputs)
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
scaled_inputs

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.447980,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.248310,-0.806331,1.002633,-0.447980,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,-0.447980,0.880469,-0.589690
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,-0.007725,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,2.232242,0.880469,-0.589690
696,1,0,0,0,-0.388293,-0.007725,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,-0.447980,-0.019280,1.126663
697,1,0,0,0,-0.388293,0.668253,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,2.232242,-0.919030,-0.589690
698,0,0,0,1,-0.388293,0.668253,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,2.232242,-0.919030,-0.589690


## Split the data into train & test and suffle

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size=0.8, random_state=20)

In [48]:
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [49]:
print(x_test.shape, y_test.shape)

(140, 14) (140,)


## Logistic regression with Sklearn

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the model

In [51]:
reg = LogisticRegression()
reg.fit(x_train, y_train)

In [52]:
reg.score(x_train, y_train)

0.7785714285714286

### Manually check the accuracy

In [53]:
model_outputs = reg.predict(x_train)

In [54]:
np.sum(model_outputs == y_train) / model_outputs.shape[0] ## Score

0.7785714285714286

### Finding the intercept and coefficients

In [55]:
print(reg.intercept_,  reg.coef_)

[-1.68969191] [[ 2.80088908e+00  9.37679263e-01  3.09784623e+00  8.55189097e-01
   1.65600560e-01 -8.38855532e-02  6.13417431e-01 -9.52263740e-03
  -1.66485910e-01 -9.39188259e-04  2.69858214e-01 -8.32092336e-02
   3.60649946e-01 -2.86053551e-01]]


In [56]:
feature_name = unscaled_inputs.columns.values

In [57]:
summary_table = pd.DataFrame(columns=['Feature name'], data=feature_name)

In [58]:
summary_table['Coefficient'] = np.transpose(reg.coef_)

In [59]:
summary_table.index = summary_table.index + 1
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature name,Coefficient
0,Intercept,-1.689692
1,Reason_1,2.800889
2,Reason_2,0.937679
3,Reason_3,3.097846
4,Reason_4,0.855189
5,Month Value,0.165601
6,Day of the Week,-0.083886
7,Transportation Expense,0.613417
8,Distance to Work,-0.009523
9,Age,-0.166486


### Interpretin the Coefficients

In [60]:
summary_table['Odds_ratio'] =  np.exp(summary_table.Coefficient)
sumarry_table.sort_values('Odds_value', ascending=False)

NameError: name 'sumarry_table' is not defined