In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.linear_model import LogisticRegression

In [2]:
# Load the preprocessed absenteeism spreadsheet into a DataFrame.
# NOTE(review): the path is relative, so it is resolved against the kernel's
# current working directory — confirm the notebook is started from the folder
# that contains 'Desktop/'.
data_preprocessed = pd.read_excel('Desktop/Tunde/Data Science Course 2020/Preprocessed_Absenteeism_data.xlsx')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


# Create the Target

In [4]:
data_preprocessed.describe(include='all')

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
count,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0,700.0
mean,0.25,0.008571,0.09,0.597143,6.36,2.011429,222.347143,29.892857,36.417143,271.801774,26.737143,0.167143,1.021429,0.687143,6.761429
std,0.433322,0.09225,0.286386,0.490823,3.50501,1.480396,66.31296,14.804446,6.379083,40.021804,4.254701,0.37337,1.112215,1.166095,12.670082
min,0.0,0.0,0.0,0.0,1.0,0.0,118.0,5.0,27.0,205.917,19.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,3.0,1.0,179.0,16.0,31.0,241.476,24.0,0.0,0.0,0.0,2.0
50%,0.0,0.0,0.0,1.0,6.0,2.0,225.0,26.0,37.0,264.249,25.0,0.0,1.0,0.0,3.0
75%,0.25,0.0,0.0,1.0,10.0,3.0,260.0,50.0,40.0,294.217,31.0,0.0,2.0,1.0,8.0
max,1.0,1.0,1.0,1.0,12.0,6.0,388.0,52.0,58.0,378.884,38.0,1.0,4.0,8.0,120.0


In [5]:
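# Use the median of 'Absenteeism Time in Hours' as the cut-off between moderate and excessive absence.
# Unlike the mean, the median is robust to outliers (the maximum here is 120 hours, while the median is 3).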

In [6]:
# Show the median number of absence hours, i.e. the cut-off used to build the target.
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [7]:
 data_preprocessed['Excessive Absenteeism'] = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                                                       data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [8]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


In [9]:
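# As mentioned earlier, the median is used as the cut-off because it is robust to outliers.
# It should also keep the 'Excessive Absenteeism' target roughly balanced, i.e. about 50% of rows below and 50% above the median
# (rows exactly equal to the median are labelled 0, so the split need not be exactly 50/50).
# Check whether this holds by computing the share of 1s in the target.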

In [10]:
# Share of rows labelled 1. Despite the 'Pct' in the name this is a fraction
# between 0 and 1; it is multiplied by 100 only when printed.
excessive_flags = data_preprocessed['Excessive Absenteeism']
Pct_of_1_target = excessive_flags.sum() / len(excessive_flags)

In [11]:
Pct_of_1_target

0.45571428571428574

In [12]:
# Report the share of positive targets as a percentage with two decimals.
print('Percentage of 1s Target: {0:.2f}%'.format(Pct_of_1_target*100))

Percentage of 1s Target: 45.57%


In [13]:
# The first model used every variable except 'Absenteeism Time in Hours', which is already encoded in 'Excessive Absenteeism'.
# After backward elimination we also remove 'Month', 'Daily Work Load Average' and 'Distance to Work', as they have little or no effect on the model.

In [14]:
# Columns left out of the modelling frame: the raw hours (already encoded in the
# binary target) plus three further features.
dropped_columns = [
    'Absenteeism Time in Hours',
    'Month',
    'Distance to Work',
    'Daily Work Load Average',
]
data_with_targets = data_preprocessed.drop(columns=dropped_columns)

In [15]:
# Identity check: False confirms that drop() returned a new DataFrame object
# rather than the original frame itself.
data_with_targets is data_preprocessed

False

In [16]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,1,1,289,33,30,0,2,1,1
1,0,0,0,0,1,118,50,31,0,1,0,0
2,0,0,0,1,2,179,38,31,0,0,0,0
3,1,0,0,0,3,279,39,24,0,2,0,1
4,0,0,0,1,3,289,33,30,0,2,1,0


# Selection of input for regression

In [17]:
# Model inputs: every column except the last one, which holds the target.
n_columns = data_with_targets.shape[1]
unscaled_inputs = data_with_targets.iloc[:, :n_columns - 1]

In [18]:
# Preview the model inputs. (To inspect every row instead, raise pandas'
# 'display.max_rows' option before displaying the frame.)
unscaled_inputs.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,1,289,33,30,0,2,1
1,0,0,0,0,1,118,50,31,0,1,0
2,0,0,0,1,2,179,38,31,0,0,0
3,1,0,0,0,3,279,39,24,0,2,0
4,0,0,0,1,3,289,33,30,0,2,1


# Standardize Data

In [19]:
from sklearn.preprocessing import StandardScaler

In [20]:
# StandardScaler with default arguments: centres each column on its mean and
# scales it to unit standard deviation.
input_scaler = StandardScaler()

In [21]:
# Learn the per-column mean and standard deviation used for scaling.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# further down, so test-set statistics leak into the scaling. Consider fitting
# on the training rows only.
# NOTE(review): the 0/1 dummy columns (Reason_1..Reason_4, Education) are
# standardized together with the numeric ones, which makes their weights harder
# to interpret — confirm this is intended.
input_scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [22]:
# Apply the fitted scaling. The result is a NumPy array, so the column names are
# no longer attached to the data.
scaled_input = input_scaler.transform(unscaled_inputs)

In [23]:
scaled_input

array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         0.88046927,  0.26848661],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.91902997, -0.58968976],
       ...,
       [ 1.73205081, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
        -0.91902997, -0.58968976],
       [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
        -0.01928035,  0.26848661]])

In [24]:
scaled_input.shape

(700, 11)

In [25]:
# Target vector: the last column of the modelling frame.
last_position = data_with_targets.shape[1] - 1
target = data_with_targets.iloc[:, last_position]

In [26]:
target.head()

0    1
1    0
2    0
3    1
4    0
Name: Excessive Absenteeism, dtype: int64

# Split Data into Train & Testing Samples and Shuffle

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# reproducible from run to run.
split_parts = train_test_split(
    scaled_input,
    target,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [29]:
print(x_train.shape,x_test.shape)

(560, 11) (140, 11)


In [30]:
print(y_train.shape,y_test.shape)

(560,) (140,)


# Logistic Regression with sklearn

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Training Model

In [32]:
# Logistic regression with scikit-learn's default settings.
model = LogisticRegression()

In [33]:
# Fit the weights and the intercept on the training split only.
model.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [34]:
# Accuracy on the training rows (the same figure is recomputed by hand below).
model.score(x_train,y_train)

0.7892857142857143

# Calculating Accuracy Manually

In [35]:
# Predicted class (0 or 1) for every training row, returned as a NumPy array.
train_output = model.predict(x_train)

In [36]:
train_output

array([1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,

In [37]:
# Replace the shuffled row labels left over from the split with 0..n-1, so the
# targets are labelled by position like the prediction array.
# Re-running this cell is harmless: the index is already 0..n-1 the second time.
y_train = y_train.reset_index(drop=True)

In [38]:
y_train

0      1
1      0
2      1
3      0
4      1
      ..
555    1
556    0
557    1
558    0
559    0
Name: Excessive Absenteeism, Length: 560, dtype: int64

In [39]:
train_output == y_train

0       True
1       True
2       True
3       True
4      False
       ...  
555    False
556     True
557     True
558     True
559     True
Name: Excessive Absenteeism, Length: 560, dtype: bool

In [40]:
np.sum(train_output == y_train)

442

In [41]:
train_output.shape[0]

560

In [42]:
# Manual accuracy: number of predictions that match the target, divided by the
# total number of predictions.
n_correct = np.sum(train_output == y_train)
n_correct / train_output.shape[0]

0.7892857142857143

# Finding the Intercept & Coefficients

In [43]:
model.intercept_

array([-0.18399469])

In [44]:
model.coef_

array([[ 2.11797791,  0.33850246,  1.52260921,  1.37877289, -0.155963  ,
         0.78334129, -0.26662921,  0.27062305, -0.12051211,  0.43104239,
        -0.36611714]])

In [45]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Day of the Week',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [46]:
# model.coef_ is a 2-D array with a single row (one weight per feature), so we transpose it to get one weight per row of the summary table

In [47]:
# One row per feature: its name and its fitted weight.
# model.coef_ holds a single row of weights, so [0] selects that row as a 1-D array.
summary = DataFrame({
    'Features': unscaled_inputs.columns.values,
    'Weights': model.coef_[0],
})

In [48]:
summary

Unnamed: 0,Features,Weights
0,Reason_1,2.117978
1,Reason_2,0.338502
2,Reason_3,1.522609
3,Reason_4,1.378773
4,Day of the Week,-0.155963
5,Transportation Expense,0.783341
6,Age,-0.266629
7,Body Mass Index,0.270623
8,Education,-0.120512
9,Children,0.431042


# Add intercept

In [49]:
# Add the intercept to the summary table under index label 0.
# The guard makes the cell safe to re-run: without it, every extra execution
# would shift the index again and insert another 'Intercept' row (or fail once
# the table has gained more columns than the two values supplied here).
if 'Intercept' not in summary['Features'].values:
    summary.index = summary.index + 1  # shift existing rows to free up label 0
    # The new row is appended at the end; sorting by index moves it to the top.
    summary.loc[0] = ['Intercept', model.intercept_[0]]

In [50]:
# Order rows by index label so that the intercept (label 0) comes first.
summary = summary.sort_index()

In [51]:
summary

Unnamed: 0,Features,Weights
0,Intercept,-0.183995
1,Reason_1,2.117978
2,Reason_2,0.338502
3,Reason_3,1.522609
4,Reason_4,1.378773
5,Day of the Week,-0.155963
6,Transportation Expense,0.783341
7,Age,-0.266629
8,Body Mass Index,0.270623
9,Education,-0.120512


In [52]:
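# The fitted weights (coefficients) of a logistic regression are log-odds.
# Taking the exponential of each weight gives an odds ratio, which is easier to interpret.
# Because the inputs were standardized, each odds ratio refers to a one-standard-deviation increase in that feature.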

In [53]:
# exp(weight) turns each log-odds weight into an odds ratio (1.0 means no effect).
summary['Odds Ratio'] = np.exp(summary['Weights'])

In [54]:
summary

Unnamed: 0,Features,Weights,Odds Ratio
0,Intercept,-0.183995,0.83194
1,Reason_1,2.117978,8.314308
2,Reason_2,0.338502,1.402845
3,Reason_3,1.522609,4.584171
4,Reason_4,1.378773,3.970027
5,Day of the Week,-0.155963,0.855591
6,Transportation Expense,0.783341,2.188773
7,Age,-0.266629,0.765957
8,Body Mass Index,0.270623,1.310781
9,Education,-0.120512,0.886466


In [55]:
# Rank the rows from the largest to the smallest odds ratio (display only;
# `summary` itself is left unchanged).
summary.sort_values(by='Odds Ratio', ascending=False)

Unnamed: 0,Features,Weights,Odds Ratio
1,Reason_1,2.117978,8.314308
3,Reason_3,1.522609,4.584171
4,Reason_4,1.378773,3.970027
6,Transportation Expense,0.783341,2.188773
10,Children,0.431042,1.538861
2,Reason_2,0.338502,1.402845
8,Body Mass Index,0.270623,1.310781
9,Education,-0.120512,0.886466
5,Day of the Week,-0.155963,0.855591
0,Intercept,-0.183995,0.83194


In [56]:
# If a weight is close to 0 (equivalently, its odds ratio is close to 1), the feature has little influence on the prediction.
# Reason_0 (someone was absent without giving a particular reason) was dropped when the dummy variables were created,
# so Reason_0 is our BASE CASE: the weights of Reason_1 to Reason_4 are measured relative to it.

In [57]:
# Backward Elimination - the idea is to simplify the model by removing every feature that contributes little or nothing to it
# On that basis 'Month', 'Daily Work Load Average' and 'Distance to Work' were removed

# Testing the Model

In [58]:
# Accuracy on the held-out test rows, which were not passed to model.fit().
model.score(x_test,y_test)

0.7857142857142857