# ML Modelling

#### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

import pickle


#### Loading Data

In [2]:
# Read the preprocessed absenteeism dataset from the working directory
data = pd.read_csv(filepath_or_buffer="Absenteeism Data Preprocessed.csv")

# Preview the first five rows
data.head(5)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3


#### Classification

There will be two classes in our case-

A = Moderately absent

B = Excessively absent



Since we need a cut-off that separates moderate from excessive absence, we will use the MEDIAN of the absenteeism hours as the limit

In [3]:
# Creating TARGET: the median absenteeism time (in hours) is the cut-off between the two classes

data['Absenteeism Time in Hours'].median()


3.0

The median is 3 

So, the limits and classification would be


<li>Moderately absent ( <= 3 )</li>
<li>Excessively absent ( >= 4 )</li>


In [4]:
# np.where is NumPy's vectorised if/else:
# rows whose absenteeism time is strictly above the median become 1, all others become 0
targets = np.where(
    data['Absenteeism Time in Hours'].gt(data['Absenteeism Time in Hours'].median()),
    1,
    0,
)

In [5]:
# Display the binary target array (1 = above the median, 0 = otherwise)
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [6]:
# Attach the binary targets to the DataFrame as a new 'Excessive Absenteeism' column

data['Excessive Absenteeism'] = targets

# Preview the first rows, now including the new target column
data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,4,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,2,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,4,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,2,7,3,0


#### Balancing Dataset

In [7]:
# Percentage of rows labelled 1 (the EXCESSIVE absence group).
# Roughly 46% of the rows fall in that group, so the two classes are almost balanced
# and no re-balancing is needed.
targets.sum() / len(targets) * 100

45.57142857142858

In [8]:
# CHECKPOINT: keep an independent copy of the DataFrame at this stage

data1 = data.copy()


Dropping column

In [9]:
# Remove the raw hours column: the binary target was derived from it,
# so it must not remain among the features
data1 = data.drop(columns=['Absenteeism Time in Hours'])

data1.head(5)


Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week,Excessive Absenteeism
0,0,0,0,1,289,36,33,239.554,30,0,2,1,7,1,1
1,0,0,0,0,118,13,50,239.554,31,0,1,0,7,1,0
2,0,0,0,1,179,51,38,239.554,31,0,0,0,7,2,0
3,1,0,0,0,279,5,39,239.554,24,0,2,0,7,3,1
4,0,0,0,1,289,36,33,239.554,30,0,2,1,7,3,0


#### Selecting the inputs

In [10]:
# Every column except the last one (the target) is an input feature
inputs = data1.iloc[:, :-1]
inputs.shape

(700, 14)

#### Standardization or Feature scaling

When we standardize the dataset, scaling DUMMY VARIABLES is not good practice. Although it will not affect the result much, we should leave the DUMMIES unscaled for a better model


##### Custom Scaler

In [11]:
class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    Wraps ``StandardScaler`` so that the columns listed in ``columns`` are
    scaled while every other column (e.g. dummy variables) is passed through
    unchanged. The column order and the row index of the input are preserved.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy : bool, default True
        Forwarded to ``StandardScaler``.
    with_mean : bool, default True
        Forwarded to ``StandardScaler``.
    with_std : bool, default True
        Forwarded to ``StandardScaler``.
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

    def fit(self, X, y=None):
        """Fit the wrapped ``StandardScaler`` on ``X[self.columns]``.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame containing at least the columns in ``self.columns``.
        y : ignored
            Forwarded to ``StandardScaler.fit``.

        Returns
        -------
        self
        """
        selected = X[self.columns]
        self.scaler = StandardScaler(copy=self.copy, with_mean=self.with_mean, with_std=self.with_std)
        self.scaler.fit(selected, y)
        # Use the DataFrame's own column-wise reductions. np.mean / np.var on a
        # DataFrame request an axis=None reduction, which pandas deprecated
        # (the warning emitted when fitting) and which newer releases reduce
        # over both axes instead of per column.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)  # ddof=0 matches np.var's population variance
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same columns that were seen in ``fit``.
        y, copy : ignored
            Not used by this method; kept in the signature for existing callers.

        Returns
        -------
        pandas.DataFrame
            Same columns (in the same order) and same index as ``X``.
        """
        init_col_order = X.columns
        # Keep X's index on the scaled frame: without it the new frame gets a
        # fresh 0..n-1 index and the concat below misaligns rows (producing
        # NaNs) whenever X does not have a default RangeIndex.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [12]:
# Numeric features to standardize.
# The dummy variables ('Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education')
# are deliberately left out so that they stay unscaled.
columns_to_scale = [
    'Transportation Expense',
    'Age',
    'Daily Work Load Average',
    'Distance to Work',
    'Body Mass Index',
    'Children',
    'Pets',
    'Month Value',
    'Day of the Week',
]

In [13]:
# Build a scaler that standardizes only the columns listed in columns_to_scale
absenteeism_scaler = CustomScaler(columns_to_scale)

In [14]:
# Fit the scaler on the input features.
# NOTE(review): the scaler is fit on the full dataset, before the train/test split
# performed later in the notebook, so test-set statistics leak into the scaling.
# Consider fitting on the training split only.
absenteeism_scaler.fit(inputs)


  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


In [15]:
# Apply the fitted scaler: the selected columns are standardized, the others pass through
scaled_inputs = absenteeism_scaler.transform(inputs)

scaled_inputs

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Month Value,Day of the Week
0,0,0,0,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,0.182726,-0.683704
1,0,0,0,0,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.019280,-0.589690,0.182726,-0.683704
2,0,0,0,1,-0.654143,1.426749,0.248310,-0.806331,1.002633,0,-0.919030,-0.589690,0.182726,-0.007725
3,1,0,0,0,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.589690,0.182726,0.668253
4,0,0,0,1,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487,0.182726,0.668253
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.654143,-0.533522,0.562059,-0.853789,-1.114186,1,0.880469,-0.589690,-0.388293,-0.007725
696,1,0,0,0,0.040034,-0.263140,-1.320435,-0.853789,-0.643782,0,-0.019280,1.126663,-0.388293,-0.007725
697,1,0,0,0,1.624567,-0.939096,-1.320435,-0.853789,-0.408580,1,-0.919030,-0.589690,-0.388293,0.668253
698,0,0,0,1,0.190942,-0.939096,-0.692937,-0.853789,-0.408580,1,-0.919030,-0.589690,-0.388293,0.668253


In [16]:
# Sanity check: scaling must not change the number of rows or columns
scaled_inputs.shape

(700, 14)

#### Splitting the Dataset

In [17]:
# Shuffle the rows and keep 80% for training, the rest for testing.
# The fixed random_state makes the shuffle identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs, targets, random_state=100, shuffle=True, train_size=0.8
)

In [18]:
# Shapes of the training pair on the first line, of the test pair on the second
print(f"{x_train.shape} {y_train.shape}\n{x_test.shape} {y_test.shape}")

(560, 14) (560,)
(140, 14) (140,)


#### Training the model

In [19]:
# Logistic regression classifier with the library's default settings
model = LogisticRegression()

In [20]:
# Train the classifier on the training split
model.fit(x_train, y_train)

In [21]:
# Accuracy on the training split (the same data the model was fitted on)
model.score(x_train, y_train)

0.7678571428571429

#### Manually finding accuracy

In [22]:
# Predicted class for every training example
model_output = model.predict(x_train)


In [23]:
# Element-wise comparison: True where the prediction matches the training target
model_output == y_train

array([ True,  True,  True, False,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True, False, False,  True,
       False, False,  True,  True,  True,  True, False, False,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
       False,  True, False,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,

In [24]:
# Count the matches: each True counts as 1 when the boolean array is summed
(model_output == y_train).sum()

430

In [25]:
# Total number of predictions made on the training split
model_output.shape[0]

560

In [26]:
# Training accuracy as a percentage: share of correct predictions, rounded to 2 decimals
accuracy = round((model_output == y_train).mean() * 100, 2)

accuracy

76.79

#### Summary Table

In [27]:
# Column labels of the input features, as a NumPy array
feature_names = inputs.columns.to_numpy()
feature_names

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4',
       'Transportation Expense', 'Distance to Work', 'Age',
       'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets', 'Month Value', 'Day of the Week'], dtype=object)

In [28]:
# One row per feature, starting with the feature names only
summary_table = pd.DataFrame({"Features": feature_names})

# Fitted coefficients, transposed so they line up with the feature rows
weights = model.coef_
summary_table['Weights'] = weights.T

# Shift the index up by one so that position 0 is free for the intercept
summary_table.index += 1

# Insert the intercept as row 0 and restore ascending index order
summary_table.loc[0] = ['BIAS', model.intercept_[0]]
summary_table = summary_table.sort_index()

summary_table

Unnamed: 0,Features,Weights
0,BIAS,-1.697203
1,Reason_1,2.698381
2,Reason_2,0.534055
3,Reason_3,3.239065
4,Reason_4,0.900046
5,Transportation Expense,0.588479
6,Distance to Work,-0.06662
7,Age,-0.217569
8,Daily Work Load Average,0.014939
9,Body Mass Index,0.179699


In [29]:
# Odds ratio: the weights of a logistic regression are on the log-odds scale,
# so exponentiating a weight gives the corresponding odds ratio
summary_table['Odds Ratio'] = np.exp(summary_table['Weights'])

# Show the table ordered from the largest odds ratio to the smallest
summary_table.sort_values(by='Odds Ratio', ascending=False)



Unnamed: 0,Features,Weights,Odds Ratio
3,Reason_3,3.239065,25.509866
1,Reason_1,2.698381,14.855664
4,Reason_4,0.900046,2.459716
5,Transportation Expense,0.588479,1.801246
2,Reason_2,0.534055,1.705836
11,Children,0.438213,1.549936
9,Body Mass Index,0.179699,1.196857
13,Month Value,0.125973,1.134252
10,Education,0.075695,1.078633
8,Daily Work Load Average,0.014939,1.015051


Insignificant FEATURE => 
    <li> If weight is close to 0 </li> OR
    <li> Odds ratio is close to 1 </li>
 

#### Backward Elimination

The idea is to eliminate those features which bring almost no value to the model

<b>If P-VALUE is greater than 0.05, then we remove them</b>

<hr>

#### Testing

In [30]:
# Accuracy on the held-out test split
test_score = model.score(x_test, y_test)
test_score

0.7571428571428571

In [31]:
predicted_proba = model.predict_proba(x_test)
predicted_proba

# The array has shape (140, 2): one row per test example, with the two columns holding
# the probabilities of that example being class 0 and class 1.
# If you sum both columns of a single row, you get 1.

array([[0.47685139, 0.52314861],
       [0.32678829, 0.67321171],
       [0.27224479, 0.72775521],
       [0.60907852, 0.39092148],
       [0.59786645, 0.40213355],
       [0.54228446, 0.45771554],
       [0.38553094, 0.61446906],
       [0.10080055, 0.89919945],
       [0.85914693, 0.14085307],
       [0.8161396 , 0.1838604 ],
       [0.90188637, 0.09811363],
       [0.2930008 , 0.7069992 ],
       [0.25866732, 0.74133268],
       [0.64243191, 0.35756809],
       [0.65680147, 0.34319853],
       [0.61303138, 0.38696862],
       [0.81136034, 0.18863966],
       [0.75504422, 0.24495578],
       [0.85914693, 0.14085307],
       [0.63124487, 0.36875513],
       [0.82518929, 0.17481071],
       [0.19111941, 0.80888059],
       [0.4652688 , 0.5347312 ],
       [0.1509933 , 0.8490067 ],
       [0.3272941 , 0.6727059 ],
       [0.34174486, 0.65825514],
       [0.1084391 , 0.8915609 ],
       [0.35860041, 0.64139959],
       [0.85272287, 0.14727713],
       [0.13441181, 0.86558819],
       [0.

In [32]:
# Accuracy of the model on the test split, as a percentage rounded to two decimals

print(f"{round(test_score * 100,2)}%")

75.71%


#### Saving the model

In [33]:
# Persist (PICKLE) the fitted objects to files so that they can be reloaded
# (UNPICKLED) later without retraining.
# NOTE: pickle stores classes by reference, so loading the scaler requires the
# CustomScaler class to be importable in the loading environment.

# MODEL
with open('model', mode='wb') as model_file:
    pickle.dump(model, model_file)

# SCALER
with open('scaler', mode='wb') as scaler_file:
    pickle.dump(absenteeism_scaler, scaler_file)