In [1]:
import pandas as pd, numpy as np
import warnings
warnings.filterwarnings('ignore')

## Load the Data

In [2]:
df_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')
df_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,7,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,14,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,15,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,16,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,23,289,36,33,239.554,30,0,2,1,2


## Create the targets

Based on the median value of'Absenteeism Time in Hours', we create 2 targets:
0 : moderately absent -> below the median value
1 : excessively absent -> equal or more than the median value

In [3]:
targets = np.where(df_preprocessed['Absenteeism Time in Hours']> df_preprocessed['Absenteeism Time in Hours'].median(),1,0)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [4]:
df_preprocessed['Excessive Absenteeism'] = targets
#Drop 'Absenteeism Time in Hours', 'Month Value', 'Distance to Work', 'Day of Week'
data_with_targets = df_preprocessed.drop(['Absenteeism Time in Hours', 'Month Value', 'Distance to Work', 'Day of Week'], axis=1)
data_with_targets

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,289,33,239.554,30,0,2,1,1
1,0,0,0,0,118,50,239.554,31,0,1,0,0
2,0,0,0,1,179,38,239.554,31,0,0,0,0
3,1,0,0,0,279,39,239.554,24,0,2,0,1
4,0,0,0,1,289,33,239.554,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,179,40,237.656,22,1,2,0,1
696,1,0,0,0,225,28,237.656,24,0,1,2,0
697,1,0,0,0,330,28,237.656,25,1,0,0,1
698,0,0,0,1,235,32,237.656,25,1,0,0,0


## Select the Input

In [5]:
unscaled_inputs = data_with_targets.iloc[:,:-1] 

## Data Standardization

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self_var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self
    
    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), columns = self.columns)
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [7]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Age', 'Daily Work Load Average',
       'Body Mass Index', 'Education', 'Children', 'Pets'], dtype=object)

In [8]:
#columns_to_scale = ['Month Value', 'Day of Week', 'Transportation Expense', 'Distance to Work', 'Age',
       #'Daily Work Load Average', 'Body Mass Index','Children', 'Pets']
columns_to_omit = ['Reason_1, Reason_2', 'Reason_3', 'Reason_4', 'Education']
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [9]:
scaler = CustomScaler(columns_to_scale)

In [10]:
scaler.fit(unscaled_inputs)

CustomScaler(columns=['Reason_1', 'Reason_2', 'Transportation Expense', 'Age',
                      'Daily Work Load Average', 'Body Mass Index', 'Children',
                      'Pets'],
             copy=None, with_mean=None, with_std=None)

In [11]:
scaled_inputs = scaler.transform(unscaled_inputs)

In [12]:
scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,-0.577350,-0.092981,0,1,1.005844,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,-0.577350,-0.092981,0,0,-1.574681,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690
2,-0.577350,-0.092981,0,1,-0.654143,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690
3,1.732051,-0.092981,0,0,0.854936,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690
4,-0.577350,-0.092981,0,1,1.005844,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1.732051,-0.092981,0,0,-0.654143,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690
696,1.732051,-0.092981,0,0,0.040034,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663
697,1.732051,-0.092981,0,0,1.624567,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690
698,-0.577350,-0.092981,0,1,0.190942,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690


In [13]:
scaled_inputs.shape

(700, 11)

## Train Test Split

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(scaled_inputs, targets,  test_size=0.2, random_state=40)

In [15]:
print(X_train.shape, X_test.shape)

(560, 11) (140, 11)


In [16]:
print(y_train.shape, y_test.shape)

(560,) (140,)


## Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

### Training the Model

In [18]:
logreg = LogisticRegression()

In [19]:
logreg.fit(X_train, y_train)

LogisticRegression()

In [20]:
logreg.score(X_train, y_train)

0.7803571428571429

## Intercept and Coefficient

In [21]:
logreg.intercept_

array([-1.21283935])

In [22]:
logreg.coef_

array([[ 1.58379912,  0.11914421,  3.46481471,  1.24015662,  0.52066923,
        -0.29938859, -0.11955553,  0.25897448, -0.07969076,  0.42423594,
        -0.29757894]])

In [23]:
feature_names = unscaled_inputs.columns.values

In [24]:
summary_table = pd.DataFrame(feature_names, columns=['Features'])
summary_table['Coefficient'] = np.transpose(logreg.coef_)
summary_table

Unnamed: 0,Features,Coefficient
0,Reason_1,1.583799
1,Reason_2,0.119144
2,Reason_3,3.464815
3,Reason_4,1.240157
4,Transportation Expense,0.520669
5,Age,-0.299389
6,Daily Work Load Average,-0.119556
7,Body Mass Index,0.258974
8,Education,-0.079691
9,Children,0.424236


In [25]:
summary_table.index = summary_table.index+1
summary_table.loc[0] = ['Intercept', logreg.intercept_[0]]
summary_table = summary_table.sort_index()

In [26]:
summary_table

Unnamed: 0,Features,Coefficient
0,Intercept,-1.212839
1,Reason_1,1.583799
2,Reason_2,0.119144
3,Reason_3,3.464815
4,Reason_4,1.240157
5,Transportation Expense,0.520669
6,Age,-0.299389
7,Daily Work Load Average,-0.119556
8,Body Mass Index,0.258974
9,Education,-0.079691


In [27]:
odds_ratio = np.exp(summary_table.Coefficient)
summary_table['Odds Ratio'] = odds_ratio
summary_table.sort_values(by='Odds Ratio', ascending=False)

Unnamed: 0,Features,Coefficient,Odds Ratio
3,Reason_3,3.464815,31.970535
1,Reason_1,1.583799,4.873435
4,Reason_4,1.240157,3.456155
5,Transportation Expense,0.520669,1.683154
10,Children,0.424236,1.528422
8,Body Mass Index,0.258974,1.295601
2,Reason_2,0.119144,1.126532
9,Education,-0.079691,0.923402
7,Daily Work Load Average,-0.119556,0.887315
11,Pets,-0.297579,0.742614


## Simplifying the Model

Feature that have coefficient near to 0 and odds ratio near to 1 is considered not important because it does not add value to the logistic regression model. With or without those features, the model accuracy is remain the same. 
<br>
So I will remove/drop these features from the model and restart and run all the cells:
1. Month Value
<br>
2. Distance to Work
<br>
3. Day of Week
<br>

After I dropped those 3 features, the model accuracy has not change significantly.

## Model Interpretation

Seems like features 'Reasons' have the largest impact on the employee absenteeism with the descending order: Reason_3, Reason_1, Reason_4, Reason_2. Reason_3 is related with injury, poisoning, and health status check which means that employee with who got injured, food poisoned or had lab findings have to take more absence until they recovered. It's the same explanation with reason 1 and reason 4. However reason 2 get the lowest weight among other reasons because it's related to pregnancy and prenatal condition so maybe the employee's absence is just for visiting doctors or have short period check and can return to their work after that. 

## Testing the Model

In [28]:
logreg.fit(X_test, y_test)

LogisticRegression()

In [31]:
logreg.score(X_test, y_test)

0.7714285714285715

In [34]:
y_predict = logreg.predict_proba(X_test)[:,1]
y_predict

array([0.73263721, 0.512694  , 0.54973895, 0.24357788, 0.90415792,
       0.27828129, 0.89584495, 0.53389945, 0.27345886, 0.66104252,
       0.09772276, 0.24382104, 0.74595569, 0.61373183, 0.72523162,
       0.49864251, 0.24280717, 0.09943074, 0.37775255, 0.14203829,
       0.49020143, 0.09736235, 0.24177323, 0.5436891 , 0.23645362,
       0.51190885, 0.18960103, 0.35999521, 0.13856158, 0.09522207,
       0.1083441 , 0.26648171, 0.55646601, 0.5754745 , 0.20601979,
       0.28562072, 0.53069726, 0.50633493, 0.73708796, 0.612239  ,
       0.23645362, 0.62048446, 0.23311288, 0.74524493, 0.10467366,
       0.67033782, 0.69017407, 0.51515632, 0.49543213, 0.10467366,
       0.70688121, 0.53516482, 0.90763485, 0.24745887, 0.44017309,
       0.72667475, 0.26934776, 0.28034062, 0.49696272, 0.43198642,
       0.36403406, 0.57722058, 0.87425373, 0.107647  , 0.25217842,
       0.55040781, 0.25772689, 0.28562072, 0.10813311, 0.89522255,
       0.61784455, 0.27259875, 0.0995124 , 0.39598181, 0.72025