In [1]:
import pandas as pd, numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

## Load the Data

In [2]:
# Read the preprocessed absenteeism table (path is relative to the working
# directory) and preview its first rows.
df_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
df_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,7,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,14,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,15,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,16,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,23,289,36,33,239.554,30,0,2,1,2


## Create the targets

Based on the median value of 'Absenteeism Time in Hours', we create 2 target classes:
0 : moderately absent -> at or below the median value
1 : excessively absent -> strictly above the median value

In [3]:
# Binary target: 1 where the absence hours are strictly above the median, else 0.
hours_absent = df_preprocessed['Absenteeism Time in Hours']
median_hours = hours_absent.median()
targets = np.where(hours_absent > median_hours, 1, 0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [4]:
# Attach the binary target to the loaded frame, then build a copy without the
# raw hours column the target was derived from.
df_preprocessed['Excessive Absenteeism'] = targets
data_with_targets = df_preprocessed.drop(columns=['Absenteeism Time in Hours'])
data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,7,7,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,14,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,15,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,16,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,23,289,36,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,23,179,22,40,237.656,22,1,2,0,1
696,1,0,0,0,5,23,225,26,28,237.656,24,0,1,2,0
697,1,0,0,0,5,24,330,16,28,237.656,25,1,0,0,1
698,0,0,0,1,5,24,235,16,32,237.656,25,1,0,0,0


## Select the Input

In [5]:
# Inputs are every column except the last one; the last column is the
# 'Excessive Absenteeism' target that was appended when the targets were created.
unscaled_inputs = data_with_targets.iloc[:,:-1] 

## Data Standardization

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    A ``StandardScaler`` is fitted on ``columns`` only. ``transform`` returns a
    DataFrame with the same column order as its input, in which the selected
    columns are replaced by their standardized values.

    Parameters
    ----------
    columns : list of str
        Names of the DataFrame columns to standardize.
    copy, with_mean, with_std : bool, default=True
        Forwarded unchanged to ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler.
    mean_, var_ : pandas.Series or None
        Per-column mean and population variance (ddof=0) of ``columns``,
        set by ``fit``; ``None`` before fitting.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        # Every constructor argument is kept as an attribute of the same name:
        # BaseEstimator reads them back for get_params(), repr() and clone().
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std
        # Keyword arguments, because current scikit-learn releases reject
        # positional arguments to StandardScaler.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Explicit per-column reductions: np.mean / np.var applied to a whole
        # DataFrame may reduce over both axes, depending on the pandas version.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with ``self.columns`` standardized.

        The original column order and row index of ``X`` are preserved.
        ``y`` is accepted for API compatibility and ignored; ``copy`` is
        forwarded to ``StandardScaler.transform``.
        """
        init_col_order = X.columns
        # Reuse X's index so the concat below aligns rows correctly even when
        # X does not carry a default 0..n-1 index (e.g. a shuffled subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns], copy=copy),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [7]:
# List the input column names.
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of Week', 'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [8]:
# Columns handed to the scaler; the Reason_* indicator columns are left out.
columns_to_scale = [
    'Month Value',
    'Day of Week',
    'Transportation Expense',
    'Distance to Work',
    'Age',
    'Daily Work Load Average',
    'Body Mass Index',
    'Education',
    'Children',
    'Pets',
]

In [9]:
# Scaler restricted to the columns named in columns_to_scale.
scaler = CustomScaler(columns_to_scale)

In [10]:
# Learn the scaling statistics for the selected columns.
# NOTE(review): this fits on every row, and the train/test split happens only
# afterwards, so the held-out rows contribute to the scaling statistics.
scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month Value', 'Day of Week', 'Transportation Expense',
                      'Distance to Work', 'Age', 'Daily Work Load Average',
                      'Body Mass Index', 'Education', 'Children', 'Pets'],
             copy=None, with_mean=None, with_std=None)

In [11]:
# Standardize the selected columns; all other columns pass through unchanged
# and the original column order is kept.
scaled_inputs = scaler.transform(unscaled_inputs)

In [12]:
# Inspect the standardized inputs.
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-1.103121,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.447980,0.880469,0.268487
1,0,0,0,0,0.182726,-0.228337,-1.574681,-1.141882,2.130803,-0.806331,1.002633,-0.447980,-0.019280,-0.589690
2,0,0,0,1,0.182726,-0.103367,-0.654143,1.426749,0.248310,-0.806331,1.002633,-0.447980,-0.919030,-0.589690
3,1,0,0,0,0.182726,0.021602,0.854936,-1.682647,0.405184,-0.806331,-0.643782,-0.447980,0.880469,-0.589690
4,0,0,0,1,0.182726,0.896386,1.005844,0.412816,-0.536062,-0.806331,0.767431,-0.447980,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.388293,0.896386,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,2.232242,0.880469,-0.589690
696,1,0,0,0,-0.388293,0.896386,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,-0.447980,-0.019280,1.126663
697,1,0,0,0,-0.388293,1.021355,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,2.232242,-0.919030,-0.589690
698,0,0,0,1,-0.388293,1.021355,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,2.232242,-0.919030,-0.589690


In [13]:
# Row and column counts of the scaled inputs.
scaled_inputs.shape

(700, 14)

## Train Test Split

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=40,
)

In [15]:
# Shapes of the training and test input frames.
print(X_train.shape, X_test.shape)

(560, 14) (140, 14)


In [16]:
# Shapes of the training and test target arrays.
print(y_train.shape, y_test.shape)

(560,) (140,)


## Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the Model

In [18]:
# Logistic regression classifier with the library's default settings.
logreg = LogisticRegression()

In [19]:
# Fit the classifier on the training split only.
logreg.fit(X_train, y_train)

LogisticRegression()

In [20]:
# Mean accuracy on the training split; this is an in-sample figure, not an
# estimate of performance on unseen rows.
logreg.score(X_train, y_train)

0.7821428571428571

## Intercept and Coefficient

In [21]:
# Fitted intercept (bias) term of the model.
logreg.intercept_

array([-1.69067969])

In [22]:
# Fitted weights, one per input column, in input-column order.
logreg.coef_

array([[ 3.07564864,  0.32019918,  3.01928174,  0.79489307,  0.04652154,
        -0.0679888 ,  0.51811901, -0.04770161, -0.2991186 , -0.1166305 ,
         0.24155801, -0.03911421,  0.3951578 , -0.29334256]])

In [23]:
# Input column names; the scaler preserves column order, so these line up
# one-to-one with the entries of logreg.coef_.
feature_names = unscaled_inputs.columns.values

In [24]:
# One row per input feature, paired with its fitted weight.
# logreg.coef_ holds a single row here, so that row is the whole weight vector.
summary_table = pd.DataFrame({
    'Features': feature_names,
    'Coefficient': logreg.coef_[0],
})
summary_table

Unnamed: 0,Features,Coefficient
0,Reason_1,3.075649
1,Reason_2,0.320199
2,Reason_3,3.019282
3,Reason_4,0.794893
4,Month Value,0.046522
5,Day of Week,-0.067989
6,Transportation Expense,0.518119
7,Distance to Work,-0.047702
8,Age,-0.299119
9,Daily Work Load Average,-0.11663


In [25]:
# Put the intercept in row 0, ahead of the feature coefficients.
# Any existing 'Intercept' row is dropped first and only the two base columns
# are kept, so re-running this cell rebuilds the same table instead of
# shifting the index again and stacking duplicate intercept rows.
intercept_row = pd.DataFrame(
    {'Features': ['Intercept'], 'Coefficient': [logreg.intercept_[0]]}
)
feature_rows = summary_table.loc[
    summary_table['Features'] != 'Intercept', ['Features', 'Coefficient']
]
summary_table = pd.concat([intercept_row, feature_rows], ignore_index=True)

In [26]:
# Show the table with the intercept as its first row.
summary_table

Unnamed: 0,Features,Coefficient
0,Intercept,-1.69068
1,Reason_1,3.075649
2,Reason_2,0.320199
3,Reason_3,3.019282
4,Reason_4,0.794893
5,Month Value,0.046522
6,Day of Week,-0.067989
7,Transportation Expense,0.518119
8,Distance to Work,-0.047702
9,Age,-0.299119


In [27]:
# exp(coefficient) turns each log-odds weight into an odds ratio.
odds_ratio = np.exp(summary_table['Coefficient'])
summary_table['Odds Ratio'] = odds_ratio
# Display only: rows ranked from largest to smallest odds ratio
# (summary_table itself keeps its original row order).
summary_table.sort_values('Odds Ratio', ascending=False)

Unnamed: 0,Features,Coefficient,Odds Ratio
1,Reason_1,3.075649,21.663929
3,Reason_3,3.019282,20.476579
4,Reason_4,0.794893,2.214204
7,Transportation Expense,0.518119,1.678867
13,Children,0.395158,1.484618
2,Reason_2,0.320199,1.377402
11,Body Mass Index,0.241558,1.273231
5,Month Value,0.046522,1.047621
12,Education,-0.039114,0.961641
8,Distance to Work,-0.047702,0.953418


Features whose coefficient is close to 0 (equivalently, whose odds ratio is close to 1) are considered unimportant, because they add little value to the logistic regression model.