### Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
from logisticRegression import LogisticRegression

### Load the preprocessed data

In [2]:
# Load the preprocessed dataset from a CSV file located in the current working directory
df_preprocessed_with_targets = pd.read_csv('df_preprocessed_with_targets.csv')
# Show the first rows as a quick sanity check of the loaded columns
df_preprocessed_with_targets.head()

Unnamed: 0,Type_1,Type_2,Type_3,Type_4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Targets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


In [3]:
# Define the target: the 'Targets' column of the preprocessed data
targets = df_preprocessed_with_targets['Targets']

In [5]:
# Preview the first rows of the targets
targets.head()

0    1
1    0
2    0
3    1
4    0
Name: Targets, dtype: int64

In [4]:
# Define the inputs: every column of the preprocessed data except the target column
unscaled_inputs = df_preprocessed_with_targets.drop(columns=['Targets'])

In [6]:
# Preview the first rows of the (still unscaled) inputs
unscaled_inputs.head()

Unnamed: 0,Type_1,Type_2,Type_3,Type_4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


### Normalize the inputs (features)

In [7]:
# The problem: if we also normalize (scale) the categorical features, such as Reason for Absence and Education,
# we can no longer interpret how a change in those categorical values affects the result.
# Therefore, I build my own scaler, which scales only the desired columns.
from sklearn.base import BaseEstimator, TransformerMixin

# the custom scaler class 
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    The columns listed in ``columns`` are standardized with a wrapped
    ``StandardScaler``; every other column is passed through unchanged.
    The original column order and the original row index are preserved.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, fitted on ``X[columns]``.
    mean_ : numpy.ndarray or None
        Per-column mean of the scaled columns (``None`` before ``fit``).
    var_ : numpy.ndarray or None
        Per-column population variance of the scaled columns
        (``None`` before ``fit``).
    """

    def __init__(self, columns):
        self.scaler = StandardScaler()
        self.columns = columns
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Learn the mean and variance of the selected columns of ``X``.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        self.scaler.fit(X[self.columns])
        # Take the statistics from the fitted scaler instead of calling
        # np.mean / np.var on a DataFrame: how those dispatch on a DataFrame
        # (per column vs. over the whole frame) differs across pandas versions.
        self.mean_ = np.array(self.scaler.mean_)
        self.var_ = np.array(self.scaler.var_)
        return self

    def transform(self, X, y=None, copy=None):
        """Return a copy of ``X`` with the selected columns standardized.

        ``y`` and ``copy`` are accepted for API compatibility and ignored.
        """
        init_col_order = X.columns
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            # Reuse X's index: with a default RangeIndex the concat below would
            # align on labels and yield NaN rows whenever X is not indexed 0..n-1
            # (for example a train/test subset).
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]

        # Restore the original column order after gluing both parts together
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [8]:
# Choose the columns to scale. The Type_* and Education columns are not listed,
# so the custom scaler passes them through unchanged.
columns_to_scaled = ['Month', 'Day of Week', 'Transportation Expense', 'Distance to Work', 'Age',
                     'Daily Work Load Average', 'Body Mass Index','Children', 'Pets']

In [9]:
# Initialize the custom scaler so that it only standardizes the columns chosen above
absenteeism_scaler = CustomScaler(columns=columns_to_scaled)

In [10]:
# Feature scaling, step 1: fit the scaler on the unscaled inputs.
# This only learns the scaling statistics; nothing is transformed yet.
absenteeism_scaler.fit(unscaled_inputs)

In [11]:
# Feature scaling, step 2: apply the fitted scaler to obtain the model inputs
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [12]:
# Preview the scaled inputs to confirm that only the chosen columns changed
scaled_inputs.head()

Unnamed: 0,Type_1,Type_2,Type_3,Type_4,Month,Day of Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.01928,-0.58969
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,-0.91903,-0.58969
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.58969
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487


In [13]:
# With the features scaled and the targets defined, split the data into training and test sets.
# NOTE: the scaler was fitted on the full dataset before this split, so the test rows
# contributed to the scaling statistics. Splitting first and fitting the scaler on the
# training rows only would avoid this leakage.
# train_test_split is already imported in the first cell, so it is not re-imported here.
input_train, input_test, targets_train, targets_test = train_test_split(
    scaled_inputs, targets, test_size=0.2, random_state=42
)

In [14]:
# Initialize and train the Logistic Regression model.
# LogisticRegression here is imported from the local logisticRegression module, not from scikit-learn.
model = LogisticRegression()
model.fit(input_train,targets_train)

In [15]:
# Build my own function to check accuracy
def check_accuracy(model_need_to_check,input_check,target_check):
    """Return the fraction of samples the model classifies correctly.

    Parameters
    ----------
    model_need_to_check : object
        Any fitted model exposing ``predict(inputs)``.
    input_check : array-like or DataFrame
        Inputs passed unchanged to ``predict``.
    target_check : array-like or Series
        True labels, one per sample, in the same order as ``input_check``.

    Returns
    -------
    float
        Number of matching predictions divided by the number of targets.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of targets.
    """
    # Compare positionally on plain arrays: comparing two pandas Series with
    # different indexes raises, and lists have no ``.shape``.
    predicted_result = np.asarray(model_need_to_check.predict(input_check)).ravel()
    expected_result = np.asarray(target_check).ravel()
    if predicted_result.shape != expected_result.shape:
        raise ValueError(
            "predictions and targets differ in length: "
            f"{predicted_result.shape[0]} != {expected_result.shape[0]}"
        )
    # compared_result marks the cases where prediction and reality match
    compared_result = predicted_result == expected_result
    return compared_result.sum()/expected_result.shape[0]

In [17]:
# Accuracy of the trained model on the held-out test split
check_accuracy(model,input_test,targets_test)

0.7785714285714286

In [18]:
# With this limited dataset (about 700 samples), the accuracy on the test set is about 77.9%.
# With more samples, the accuracy would likely improve.
# This suggests that our data preprocessing is reasonable.

### Create a table for model summary

In [20]:
# Create a table with the name of each feature and its fitted weight (coefficient)
column_name = np.array(unscaled_inputs.columns.values)
# Flatten coef_ instead of hard-coding reshape(14), so the cell keeps working
# if the number of features changes.
model_summary = pd.DataFrame({'Feature Name':column_name, 'Coefficient':np.ravel(model.coef_)})
model_summary

Unnamed: 0,Feature Name,Coefficient
0,Type_1,2.928834
1,Type_2,0.732911
2,Type_3,3.071973
3,Type_4,0.994322
4,Month,0.079737
5,Day of Week,-0.158204
6,Transportation Expense,0.674526
7,Distance to Work,-0.056784
8,Age,-0.257761
9,Daily Work Load Average,-0.020752


In [21]:
# Exponentiate the coefficients to turn them into odds ratios.
# Flatten coef_ instead of hard-coding reshape(14), so the cell keeps working
# if the number of features changes.
odd_coef = np.exp(np.ravel(model.coef_))
# The column name is kept exactly as-is because the sorting cell refers to it by this name
model_summary['Coefficint Odds'] = odd_coef
model_summary

Unnamed: 0,Feature Name,Coefficient,Coefficint Odds
0,Type_1,2.928834,18.70581
1,Type_2,0.732911,2.08113
2,Type_3,3.071973,21.584445
3,Type_4,0.994322,2.70289
4,Month,0.079737,1.083002
5,Day of Week,-0.158204,0.853675
6,Transportation Expense,0.674526,1.963103
7,Distance to Work,-0.056784,0.944798
8,Age,-0.257761,0.77278
9,Daily Work Load Average,-0.020752,0.979462


In [22]:
# Rank the features by odds ratio, largest first (the result is displayed; model_summary itself is not reassigned)
model_summary.sort_values(by='Coefficint Odds',ascending=False)

Unnamed: 0,Feature Name,Coefficient,Coefficint Odds
2,Type_3,3.071973,21.584445
0,Type_1,2.928834,18.70581
3,Type_4,0.994322,2.70289
1,Type_2,0.732911,2.08113
6,Transportation Expense,0.674526,1.963103
12,Children,0.41818,1.519193
10,Body Mass Index,0.24536,1.278081
4,Month,0.079737,1.083002
9,Daily Work Load Average,-0.020752,0.979462
7,Distance to Work,-0.056784,0.944798


### Evaluate the model

In [None]:
# The coefficients and odds ratios of the logistic regression model provide insights into 
# the influence of each feature on predicting whether an employee will be moderately absent or excessively absent.
# The closer a coefficient is to 0 (equivalently, the closer its 'Coefficint Odds' value is to 1), the smaller the feature's effect on the result.

# Type_1, Type_3: Employees associated with these types have significantly higher odds 
# of being excessively absent compared to other types, as indicated by their high positive coefficients and odds ratios.

# Type_2, Type_4: While still influential, employees belonging to Type_2 and Type_4 exhibit relatively lower odds 
# of excessive absenteeism compared to Type_1 and Type_3, supported by their lower positive coefficients and odds ratios.

# Month: Month is standardized, so each one-standard-deviation increase in month slightly increases the odds of excessive absenteeism, 
# although the effect is relatively small.

# Day of Week: The negative coefficient (-0.158, odds ratio of about 0.85) suggests that a higher Day of Week value is associated with modestly lower odds of excessive absenteeism.

# Transportation Expense: Higher transportation expenses correspond to increased odds of excessive absenteeism.

# Distance to Work: The coefficient and odds ratio imply a minor decrease in the odds of excessive absenteeism 
# with an increase in distance to work, although the effect is relatively small.

# Age: Older employees tend to have slightly lower odds of excessive absenteeism.

# Daily Work Load Average: An increase in the daily workload average leads to a slight decrease in the odds of excessive absenteeism, 
# albeit the effect is minimal based on the low negative coefficient and odds ratio.

# Body Mass Index: Higher BMI is associated with increased odds of excessive absenteeism.

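# Children: A higher number of children is associated with increased odds of excessive absenteeism (coefficient 0.418, odds ratio of about 1.52).
# Education, Pets: These features exhibit relatively lower impact on absenteeism, 
# with coefficients and odds ratios suggesting minimal influence on the outcome.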


### Save model for deployment

In [23]:
# Once the Logistic Regression model reaches the desired accuracy,
# we will save it (together with the scaler) for deployment, so that the model can be applied to real-world data
# import pickle

# with open('model', 'wb') as f:
#     pickle.dump(model, f)

# with open('scaler', 'wb') as f:
#     pickle.dump(absenteeism_scaler, f)

In [None]:
# Now that we have a trained model, it is time to deploy it.
# We will create a module that provides a preprocessing function and a predict function.
# A module is convenient because we can simply import it like any other Python module.