### Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler # To be inherited by CustomScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pickle

### Import the preprocessed dataset

In [2]:
# Load the preprocessed absenteeism table and continue on a copy,
# so `data_preprocessed` keeps the data exactly as it was read from disk.
data_preprocessed = pd.read_csv("Absenteeism_processed_data.csv")
df = data_preprocessed.copy()
df.head()

Unnamed: 0,Absence Reason 1,Absence Reason 2,Absence Reason 3,Absence Reason 4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


We will classify people into "moderately absent" and "excessively absent".

Total absence hours will be grouped into two classes, and the boundary between these two classes will be the median value of the absenteeism time.

In [3]:
# Median of the absenteeism hours; the target cell below uses it as the class boundary.
df["Absenteeism Time in Hours"].median()

3.0

In [4]:
# Inspect the dtype and the non-null count of the column that will become the target.
df["Absenteeism Time in Hours"].info()

<class 'pandas.core.series.Series'>
RangeIndex: 700 entries, 0 to 699
Series name: Absenteeism Time in Hours
Non-Null Count  Dtype
--------------  -----
700 non-null    int64
dtypes: int64(1)
memory usage: 5.6 KB


In [5]:
# Binarise the target: 1 = "excessively absent" (strictly above the median
# cutoff), 0 = "moderately absent" (at or below it).
absence_hours = df["Absenteeism Time in Hours"]

# Compute the cutoff once instead of once per row; it is truncated to an int
# exactly as before.
median_cutoff = int(absence_hours.median())

# One vectorised comparison replaces the row-by-row loop. Unlike the
# label-based df.loc[i, ...] lookup, it does not require the index labels
# to be 0..n-1.
targets = (absence_hours > median_cutoff).astype(int).tolist()

df["Target"] = targets
df = df.drop(["Absenteeism Time in Hours"],axis=1)
df.head()

Unnamed: 0,Absence Reason 1,Absence Reason 2,Absence Reason 3,Absence Reason 4,Month,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Target
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


We will drop month, daily work load average and distance to work. These features have odds-ratio weights very close to 1 (equivalently, coefficients very close to 0), so they barely change the likelihood of having excess absence. This decision was made after a model was fitted while keeping these features in.

This action is analogous to "failing to reject the hypothesis that their effect on absence is zero", i.e. throwing out independent variables whose weights' p-values are too large under the null hypothesis. In other words, even under the assumption that these features do not affect absence times, a sample of absences like the current one would not be surprising to encounter.

In [6]:
# Remove, by name, the three features judged to have negligible effect.
low_impact_features = ["Month", "Daily Work Load Average", "Distance to Work"]
df = df.drop(columns=low_impact_features)
df

Unnamed: 0,Absence Reason 1,Absence Reason 2,Absence Reason 3,Absence Reason 4,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Target
0,0,0,0,1,1,289,33,30,0,2,1,1
1,0,0,0,0,1,118,50,31,0,1,0,0
2,0,0,0,1,2,179,38,31,0,0,0,0
3,1,0,0,0,3,279,39,24,0,2,0,1
4,0,0,0,1,3,289,33,30,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,2,179,40,22,1,2,0,1
696,1,0,0,0,2,225,28,24,0,1,2,0
697,1,0,0,0,3,330,28,25,1,0,0,1
698,0,0,0,1,3,235,32,25,1,0,0,0


### Separate inputs and targets

In [7]:
# Positional split: every column except the last is an input feature,
# and the last column ("Target", appended in the target cell above) is the label.
input_data_nonscaled = df.iloc[:,:-1]
target_data = df.iloc[:,-1]

In [8]:
# Preview the unscaled inputs to confirm the target column is no longer among them.
input_data_nonscaled.head()

Unnamed: 0,Absence Reason 1,Absence Reason 2,Absence Reason 3,Absence Reason 4,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,1,289,33,30,0,2,1
1,0,0,0,0,1,118,50,31,0,1,0
2,0,0,0,1,2,179,38,31,0,0,0
3,1,0,0,0,3,279,39,24,0,2,0
4,0,0,0,1,3,289,33,30,0,2,1


### Standardize the inputs

In [9]:
# Print the StandardScaler documentation (the help is requested on a throwaway instance).
# NOTE(review): exploratory cell; consider removing it from the finished notebook.
help(StandardScaler())

Help on StandardScaler in module sklearn.preprocessing._data object:

class StandardScaler(sklearn.base.OneToOneFeatureMixin, sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
 |  StandardScaler(*, copy=True, with_mean=True, with_std=True)
 |  
 |  Standardize features by removing the mean and scaling to unit variance.
 |  
 |  The standard score of a sample `x` is calculated as:
 |  
 |      z = (x - u) / s
 |  
 |  where `u` is the mean of the training samples or zero if `with_mean=False`,
 |  and `s` is the standard deviation of the training samples or one if
 |  `with_std=False`.
 |  
 |  Centering and scaling happen independently on each feature by computing
 |  the relevant statistics on the samples in the training set. Mean and
 |  standard deviation are then stored to be used on later data using
 |  :meth:`transform`.
 |  
 |  Standardization of a dataset is a common requirement for many
 |  machine learning estimators: they might behave badly if the
 |  individual fea

In [10]:
# taken from the course resources
from sklearn.base import BaseEstimator, TransformerMixin

class CustomScaler(BaseEstimator,TransformerMixin):
    """Standardize only a chosen subset of a DataFrame's columns.

    A ``StandardScaler`` is fitted on ``columns`` only; every other column
    (for example dummy variables) passes through ``transform`` unchanged.

    Parameters
    ----------
    columns : list of str
        Names of the columns to standardize.
    copy : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.
    with_mean : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.
    with_std : bool, default=True
        Forwarded to the wrapped ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler that does the actual standardization.
    mean_ : pandas.Series or None
        Per-column mean of ``columns`` seen in ``fit`` (``None`` before fitting).
    var_ : pandas.Series or None
        Per-column population variance (``ddof=0``) of ``columns`` seen in
        ``fit`` (``None`` before fitting).
    """

    def __init__(self,columns,copy=True,with_mean=True,with_std=True):
        # Keep every constructor argument as an attribute of the same name:
        # BaseEstimator's get_params() (and therefore repr() and clone())
        # reads the parameters back with getattr.
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

        # StandardScaler's options are keyword-only (see its signature:
        # StandardScaler(*, copy=True, ...)), so they must be passed by name.
        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)

        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Learn the scaling statistics of the selected columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every name listed in ``columns``.
        y : ignored
            Passed through to ``StandardScaler.fit``.

        Returns
        -------
        self
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)

        # Use explicit per-column DataFrame reductions. np.mean / np.var on a
        # DataFrame call the reduction with axis=None, whose meaning depends on
        # the pandas version (per column vs. over the whole frame) and which
        # emits a warning for var.
        self.mean_ = selected.mean()
        self.var_ = selected.var(ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every name listed in ``columns``.
        y, copy : ignored
            Unused; kept so the signature stays unchanged.

        Returns
        -------
        pandas.DataFrame
            Same columns, column order and index as ``X``.
        """
        # Record the initial order of the columns so it can be restored.
        init_col_order = X.columns

        # Scale the chosen columns. Reusing X's index keeps the rows aligned
        # with the untouched columns in the concat below, even when X does not
        # have a default 0..n-1 index (e.g. a shuffled train/test subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )

        # Everything that was not selected for scaling.
        X_not_scaled = X.loc[:,~X.columns.isin(self.columns)]

        # Stitch both parts together and restore the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [11]:
# check what are all columns that we've got
# List every input column so the ones to standardize can be picked below.
input_data_nonscaled.columns.values

array(['Absence Reason 1', 'Absence Reason 2', 'Absence Reason 3',
       'Absence Reason 4', 'Day of the Week', 'Transportation Expense',
       'Age', 'Body Mass Index', 'Education', 'Children', 'Pets'],
      dtype=object)

In [12]:
# Columns left unscaled: the absence-reason dummies and Education.
columns_to_omit = [
    'Absence Reason 1', 'Absence Reason 2', 'Absence Reason 3',
    'Absence Reason 4', 'Education',
]

# Every remaining input column gets standardized; the original column order is kept.
columns_to_scale = [
    column
    for column in input_data_nonscaled.columns.values
    if column not in columns_to_omit
]
columns_to_scale

['Day of the Week',
 'Transportation Expense',
 'Age',
 'Body Mass Index',
 'Children',
 'Pets']

In [13]:
# Build the scaler for the selected columns, learn their statistics, then apply it.
# fit() returns the scaler itself, so both steps can be chained.
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so statistics from the test rows influence the scaling; consider fitting on the
# training rows only.
absenteeism_scaler = CustomScaler(columns=columns_to_scale)
input_data_scaled = absenteeism_scaler.fit(input_data_nonscaled).transform(input_data_nonscaled)
input_data_scaled

  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


Unnamed: 0,Absence Reason 1,Absence Reason 2,Absence Reason 3,Absence Reason 4,Day of the Week,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,-0.683704,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,-0.683704,-1.574681,2.130803,1.002633,0,-0.019280,-0.589690
2,0,0,0,1,-0.007725,-0.654143,0.248310,1.002633,0,-0.919030,-0.589690
3,1,0,0,0,0.668253,0.854936,0.405184,-0.643782,0,0.880469,-0.589690
4,0,0,0,1,0.668253,1.005844,-0.536062,0.767431,0,0.880469,0.268487
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,-0.007725,-0.654143,0.562059,-1.114186,1,0.880469,-0.589690
696,1,0,0,0,-0.007725,0.040034,-1.320435,-0.643782,0,-0.019280,1.126663
697,1,0,0,0,0.668253,1.624567,-1.320435,-0.408580,1,-0.919030,-0.589690
698,0,0,0,1,0.668253,0.190942,-0.692937,-0.408580,1,-0.919030,-0.589690


In [14]:
# Sanity check: scaling must not change the number of rows or columns.
input_data_scaled.shape

(700, 11)

### Prepare train and test data

In [15]:
# Print the train_test_split documentation.
# NOTE(review): exploratory cell; consider removing it from the finished notebook.
help(train_test_split)

Help on function train_test_split in module sklearn.model_selection._split:

train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
    Split arrays or matrices into random train and test subsets.
    
    Quick utility that wraps input validation,
    ``next(ShuffleSplit().split(X, y))``, and application to input data
    into a single call for splitting (and optionally subsampling) data into a
    one-liner.
    
    Read more in the :ref:`User Guide <cross_validation>`.
    
    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.
    
    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to

In [16]:
# Keep 83% of the rows for training and hold out the rest for testing.
# The fixed random_state makes the shuffled split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    input_data_scaled,
    target_data,
    train_size=0.83,
    shuffle=True,
    random_state=42,
)

### Model Fitting

In [17]:
# Fit a logistic regression with the library's default settings
# (fit() returns the estimator), then list the settings that were used.
reg = LogisticRegression().fit(x_train, y_train)
reg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [18]:
# Mean accuracy on the training rows.
reg.score(x_train,y_train)

0.7676419965576592

In [19]:
# Mean accuracy on the held-out test rows.
reg.score(x_test,y_test)

0.8235294117647058

In [20]:
reg.predict_proba(x_test)
# Each row holds the predicted class probabilities for one test sample:
# the first entry is P(Target = 0) and the second entry is P(Target = 1).

array([[0.80927939, 0.19072061],
       [0.88276698, 0.11723302],
       [0.74615201, 0.25384799],
       [0.51582683, 0.48417317],
       [0.50147608, 0.49852392],
       [0.08524432, 0.91475568],
       [0.59247801, 0.40752199],
       [0.3734374 , 0.6265626 ],
       [0.78314629, 0.21685371],
       [0.70382213, 0.29617787],
       [0.88276698, 0.11723302],
       [0.72158449, 0.27841551],
       [0.27062429, 0.72937571],
       [0.54889554, 0.45110446],
       [0.74158084, 0.25841916],
       [0.40950001, 0.59049999],
       [0.91755431, 0.08244569],
       [0.25266512, 0.74733488],
       [0.88276698, 0.11723302],
       [0.59937804, 0.40062196],
       [0.64554948, 0.35445052],
       [0.77946225, 0.22053775],
       [0.7154246 , 0.2845754 ],
       [0.69423878, 0.30576122],
       [0.87055492, 0.12944508],
       [0.21074517, 0.78925483],
       [0.59937804, 0.40062196],
       [0.59022407, 0.40977593],
       [0.77946225, 0.22053775],
       [0.59937804, 0.40062196],
       [0.

### Summarize the model

In [21]:
# Pull the fitted parameters out of the model and pair them with the input column names.
# The column-name array is already 1-D, so it is used as-is.
coefficient_names = input_data_nonscaled.columns.values
model_coefficients, model_intercept = reg.coef_, reg.intercept_

In [22]:
# Show the coefficients as a column vector, one row per input feature.
model_coefficients.T

array([[ 2.87035055],
       [ 0.64757244],
       [ 3.00812968],
       [ 1.01471336],
       [-0.19575089],
       [ 0.68355783],
       [-0.1918796 ],
       [ 0.21331314],
       [-0.26278644],
       [ 0.38579009],
       [-0.32807831]])

In [23]:
# One row per input feature: its name next to its fitted coefficient.
model_summary = pd.DataFrame(
    {
        "Feature Names": coefficient_names,
        # Flatten the (n_features, 1) column vector into a plain 1-D column.
        "Coefficients": model_coefficients.transpose().ravel(),
    }
)

#### Add the intercept value to the summary table

In [24]:
# Put the intercept in row 0, followed by the feature rows (index 0..n).
# Built with concat rather than by shifting the index in place, and any
# existing "Intercept" row is dropped first, so re-running this cell does not
# shift the index again or add a second intercept row.
intercept_row = pd.DataFrame(
    {"Feature Names": ["Intercept"], "Coefficients": [reg.intercept_[0]]}
)
feature_rows = model_summary[model_summary["Feature Names"] != "Intercept"]
model_summary = pd.concat([intercept_row, feature_rows], ignore_index=True)
model_summary

Unnamed: 0,Feature Names,Coefficients
0,Intercept,-1.704434
1,Absence Reason 1,2.870351
2,Absence Reason 2,0.647572
3,Absence Reason 3,3.00813
4,Absence Reason 4,1.014713
5,Day of the Week,-0.195751
6,Transportation Expense,0.683558
7,Age,-0.19188
8,Body Mass Index,0.213313
9,Education,-0.262786


### Add the change in the odds with respect to each feature to the summary table

In [25]:
# exp(coefficient) is the factor by which the odds of Target = 1 are multiplied
# when the feature increases by one unit (for the standardized columns, one unit
# is one standard deviation). A value of 1 means the odds do not change.
odds_weights = np.exp(model_summary["Coefficients"])
odds_weights

0      0.181875
1     17.643202
2      1.910896
3     20.249491
4      2.758573
5      0.822217
6      1.980913
7      0.825406
8      1.237772
9      0.768906
10     1.470776
11     0.720307
Name: Coefficients, dtype: float64

In [26]:
# Pair each feature name with its odds weight and rank from the largest weight
# to the smallest. Both columns are taken positionally from the summary table.
odds_summary = pd.DataFrame(
    {
        "Feature Names": model_summary["Feature Names"].values,
        "Weight Towards Odds Ratio": odds_weights.values,
    }
).sort_values(["Weight Towards Odds Ratio"], ascending=False)
odds_summary

Unnamed: 0,Feature Names,Weight Towards Odds Ratio
3,Absence Reason 3,20.249491
1,Absence Reason 1,17.643202
4,Absence Reason 4,2.758573
6,Transportation Expense,1.980913
2,Absence Reason 2,1.910896
10,Children,1.470776
8,Body Mass Index,1.237772
7,Age,0.825406
5,Day of the Week,0.822217
9,Education,0.768906


In [27]:
# Persist the fitted model and the fitted scaler under the file names "model" and "scaler".
# NOTE(review): CustomScaler is defined inside this notebook, so code that unpickles
# "scaler" presumably needs the same class definition available; confirm in the loader.
for target_path, fitted_object in (("model", reg), ("scaler", absenteeism_scaler)):
    with open(target_path, "wb") as file:
        pickle.dump(fitted_object, file)