In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the preprocessed absenteeism dataset (path is relative to the working directory).
df = pd.read_csv('absenteeism_preprocessed.csv')

In [3]:
# Lift pandas' display limits so frames are rendered without truncation.
for _option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option_name, None)

### Extracting the target from the DataFrame
- `Absenteeism Time in Hours` is our target

In [5]:
# Inspect the raw target column (printed in full because display.max_rows is None).
df['Absenteeism Time in Hours']

0        4
1        0
2        2
3        4
4        2
5        2
6        8
7        4
8       40
9        8
10       8
11       8
12       8
13       1
14       4
15       8
16       2
17       8
18       8
19       2
20       8
21       1
22      40
23       4
24       8
25       7
26       1
27       4
28       8
29       2
30       8
31       8
32       4
33       8
34       2
35       1
36       8
37       4
38       8
39       4
40       2
41       4
42       4
43       8
44       2
45       3
46       3
47       4
48       8
49      32
50       0
51       0
52       2
53       2
54       0
55       0
56       3
57       3
58       0
59       1
60       3
61       4
62       3
63       3
64       0
65       1
66       3
67       3
68       3
69       2
70       2
71       5
72       8
73       3
74      16
75       8
76       2
77       8
78       1
79       3
80       1
81       1
82       8
83       8
84       5
85      32
86       8
87      40
88       1
89       8
90       3

In [6]:
df['Absenteeism Time in Hours'].value_counts()

Absenteeism Time in Hours
8      195
2      149
3      106
1       87
4       57
0       39
16      18
24      15
5        7
40       7
32       6
64       3
56       2
80       2
120      2
112      2
7        1
104      1
48       1
Name: count, dtype: int64

In [7]:
# Binary target: 0 when the hours are strictly below the median, 1 otherwise.
# NOTE(review): rows exactly equal to the median are labelled 1 here; confirm
# that ">= median" (rather than "> median") is the intended definition of
# excessive absenteeism, since it determines how balanced the classes are.
targets=np.where(df['Absenteeism Time in Hours']<
        df['Absenteeism Time in Hours'].median(), 0, 1)

In [8]:
# Store the binary target as a column. Reuse `targets` (computed in the
# previous cell) so the threshold logic lives in exactly one place and the
# column can never drift from the array used for training.
df['Excessive Absenteeism'] = targets

In [9]:
df['Excessive Absenteeism'].value_counts()

Excessive Absenteeism
1    425
0    275
Name: count, dtype: int64

In [10]:
# Class balance check: share of observations labelled 1.
targets.mean()

0.6071428571428571

In [11]:
# Checkpoint: `data_targets` is `df` without the raw hours column and the
# three features listed below.
data_targets = df.drop(
    columns=[
        'Absenteeism Time in Hours',
        'Daily Work Load Average',
        'Distance to Work',
        'Day of the Week',
    ]
)

In [12]:
data_targets

Unnamed: 0,reason_type_1,reason_type_2,reason_type_3,reason_type_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets,Excessive Absenteeism
0,0,0,0,0,7,289,33,30,0,2,1,1
1,0,0,0,0,7,118,50,31,0,1,0,0
2,0,0,0,0,7,179,38,31,0,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0,1
4,0,0,0,0,7,289,33,30,0,2,1,0
5,0,0,0,0,10,179,38,31,0,0,0,0
6,0,0,0,0,7,361,28,27,0,1,4,1
7,0,0,0,0,7,260,36,23,0,4,0,1
8,0,0,1,1,6,155,34,25,0,2,0,1
9,0,0,0,0,7,235,37,29,1,1,1,1


In [13]:
# Features are every column except the target. Selecting by name (instead of
# "all but the last column") keeps this correct even if the column order of
# `data_targets` changes.
unscaled_inputs = data_targets.drop(columns=['Excessive Absenteeism'])

In [14]:
# standardizing the data

In [15]:
# creating the custom scaler

In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize a chosen subset of DataFrame columns and pass the rest through.

    Wraps a ``StandardScaler`` that is fitted on, and applied to, only the
    columns named in ``columns``. All other columns of the input frame are
    returned unchanged.

    Parameters
    ----------
    columns : list of column labels
        Columns of the input DataFrame to standardize.
    copy, with_mean, with_std
        Forwarded to the underlying ``StandardScaler``.

    Attributes
    ----------
    scaler : StandardScaler
        The wrapped scaler, created in ``__init__``.
    mean_, var_
        Column-wise mean and variance of the selected columns, set by ``fit``
        (``None`` before fitting).
    """

    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.columns = columns
        self.copy = copy
        self.with_mean = with_mean
        self.with_std = with_std

        self.scaler = StandardScaler(copy=copy, with_mean=with_mean, with_std=with_std)

        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X[self.columns]`` and return ``self``."""
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns], axis=0)
        self.var_ = np.var(X[self.columns], axis=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        The result keeps the column order and the row index of ``X``.
        ``y`` and ``copy`` are accepted for API compatibility and are not used.
        """
        init_col_order = X.columns
        # Carry over X's index: the scaler returns a bare array, and a frame
        # built from it would otherwise get a fresh 0..n-1 index, which makes
        # the column-wise concat below misalign rows (NaNs / extra rows)
        # whenever X is not indexed 0..n-1 (e.g. a train/test subset).
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [17]:
unscaled_inputs.columns.values

array(['reason_type_1', 'reason_type_2', 'reason_type_3', 'reason_type_4',
       'Month Value', 'Transportation Expense', 'Age', 'Body Mass Index',
       'Education', 'Children', 'Pets'], dtype=object)

In [18]:
# Columns excluded from scaling; every other input column gets standardized.
columns_to_drop = [
    'reason_type_1',
    'reason_type_2',
    'reason_type_3',
    'reason_type_4',
    'Education',
]
columns_to_Scale = [
    column_name
    for column_name in unscaled_inputs.columns
    if column_name not in columns_to_drop
]

In [19]:
absenteeism_scaler = CustomScaler(columns_to_Scale)

In [20]:
# Fit the scaler on the selected columns of the full input frame.
# NOTE(review): this fit sees every row, including those that later end up in
# the test split, so test-set statistics leak into the scaling. Consider
# splitting first and fitting on the training rows only.
absenteeism_scaler.fit(unscaled_inputs)

In [21]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

In [22]:
scaled_inputs

Unnamed: 0,reason_type_1,reason_type_2,reason_type_3,reason_type_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,0,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.030796,-1.574681,2.130803,1.002633,0,-0.01928,-0.58969
2,0,0,0,0,0.030796,-0.654143,0.24831,1.002633,0,-0.91903,-0.58969
3,1,0,0,0,0.030796,0.854936,0.405184,-0.643782,0,0.880469,-0.58969
4,0,0,0,0,0.030796,1.005844,-0.536062,0.767431,0,0.880469,0.268487
5,0,0,0,0,0.929019,-0.654143,0.24831,1.002633,0,-0.91903,-0.58969
6,0,0,0,0,0.030796,2.092381,-1.320435,0.061825,0,-0.01928,2.843016
7,0,0,0,0,0.030796,0.568211,-0.065439,-0.878984,0,2.679969,-0.58969
8,0,0,1,1,-0.268611,-1.016322,-0.379188,-0.40858,0,0.880469,-0.58969
9,0,0,0,0,0.030796,0.190942,0.091435,0.532229,1,-0.01928,0.268487


## Training and testing

In [24]:
scaled_inputs.shape

(700, 11)

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    test_size=0.2,
    random_state=20,
)

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [28]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
reg = LogisticRegression()

In [29]:
reg.fit(x_train, y_train)

In [30]:
reg.score(x_train, y_train)

0.7035714285714286

In [31]:
# extracting coefficient and intercept

In [32]:
reg.coef_

array([[ 2.08057447,  0.65579087,  1.19963515,  1.19963515,  0.0295087 ,
         0.37463449, -0.29286994,  0.36041554, -0.47123106,  0.32289834,
        -0.27595064]])

In [33]:
reg.intercept_[0]

-0.06882689634909465

In [34]:
feature_names = unscaled_inputs.columns.values

In [35]:
# One row per input feature, paired with its fitted coefficient.
summary_table = pd.DataFrame({'Feature Names': feature_names})
summary_table['Coefficients'] = reg.coef_.T

In [36]:
summary_table

Unnamed: 0,Feature Names,Coefficients
0,reason_type_1,2.080574
1,reason_type_2,0.655791
2,reason_type_3,1.199635
3,reason_type_4,1.199635
4,Month Value,0.029509
5,Transportation Expense,0.374634
6,Age,-0.29287
7,Body Mass Index,0.360416
8,Education,-0.471231
9,Children,0.322898


In [37]:
# Shift the index up by one to free position 0 for the intercept row.
# NOTE: not idempotent — re-running this cell shifts the index again.
summary_table.index = summary_table.index+1

In [38]:
# Add the intercept as row 0 of the summary table.
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]

In [39]:
summary_table = summary_table.sort_index()

In [40]:
summary_table

Unnamed: 0,Feature Names,Coefficients
0,Intercept,-0.068827
1,reason_type_1,2.080574
2,reason_type_2,0.655791
3,reason_type_3,1.199635
4,reason_type_4,1.199635
5,Month Value,0.029509
6,Transportation Expense,0.374634
7,Age,-0.29287
8,Body Mass Index,0.360416
9,Education,-0.471231


In [41]:
# interpreting the coefficients

In [42]:
# Odds ratio = exp(coefficient): the multiplicative change in the odds of
# class 1 for a one-unit increase in the corresponding input.
summary_table['Odds Ratio'] = np.exp(summary_table['Coefficients'])

In [43]:
# Rank features from largest to smallest odds ratio.
# NOTE(review): reason_type_3 and reason_type_4 show identical coefficients in
# the output — check whether the two columns are duplicates in the input CSV.
summary_table.sort_values('Odds Ratio', ascending=False)

Unnamed: 0,Feature Names,Coefficients,Odds Ratio
1,reason_type_1,2.080574,8.009069
3,reason_type_3,1.199635,3.318906
4,reason_type_4,1.199635,3.318906
2,reason_type_2,0.655791,1.926666
6,Transportation Expense,0.374634,1.45446
8,Body Mass Index,0.360416,1.433925
10,Children,0.322898,1.381125
5,Month Value,0.029509,1.029948
0,Intercept,-0.068827,0.933488
11,Pets,-0.275951,0.75885


### Testing the model on the held-out data


In [45]:
reg.score(x_test, y_test)

0.6642857142857143

In [46]:
predicted_probability = reg.predict_proba(x_test)

In [47]:
predicted_probability

array([[0.53291807, 0.46708193],
       [0.51357169, 0.48642831],
       [0.29990041, 0.70009959],
       [0.54827636, 0.45172364],
       [0.04945482, 0.95054518],
       [0.13160227, 0.86839773],
       [0.14096912, 0.85903088],
       [0.06786182, 0.93213818],
       [0.55363349, 0.44636651],
       [0.53950955, 0.46049045],
       [0.41292881, 0.58707119],
       [0.16425899, 0.83574101],
       [0.03739282, 0.96260718],
       [0.60305869, 0.39694131],
       [0.23817979, 0.76182021],
       [0.45638645, 0.54361355],
       [0.41078867, 0.58921133],
       [0.42152155, 0.57847845],
       [0.29884078, 0.70115922],
       [0.03241583, 0.96758417],
       [0.54488436, 0.45511564],
       [0.54827636, 0.45172364],
       [0.20278584, 0.79721416],
       [0.19713253, 0.80286747],
       [0.10364152, 0.89635848],
       [0.55581581, 0.44418419],
       [0.41369709, 0.58630291],
       [0.78082151, 0.21917849],
       [0.09613714, 0.90386286],
       [0.54827636, 0.45172364],
       [0.

In [48]:
predicted_probability[:, 1]

array([0.46708193, 0.48642831, 0.70009959, 0.45172364, 0.95054518,
       0.86839773, 0.85903088, 0.93213818, 0.44636651, 0.46049045,
       0.58707119, 0.83574101, 0.96260718, 0.39694131, 0.76182021,
       0.54361355, 0.58921133, 0.57847845, 0.70115922, 0.96758417,
       0.45511564, 0.45172364, 0.79721416, 0.80286747, 0.89635848,
       0.44418419, 0.58630291, 0.21917849, 0.90386286, 0.45172364,
       0.47760484, 0.86174641, 0.87530607, 0.5913481 , 0.45172364,
       0.73044573, 0.44855089, 0.91243651, 0.56801277, 0.70586843,
       0.4495364 , 0.68455266, 0.43982609, 0.70586843, 0.87774707,
       0.84391733, 0.88157062, 0.45610364, 0.36145096, 0.44735111,
       0.69961281, 0.55236704, 0.86839773, 0.35131871, 0.88352824,
       0.49084356, 0.94145587, 0.47950615, 0.69637648, 0.69824129,
       0.82008776, 0.87139738, 0.48612457, 0.90039113, 0.44078667,
       0.46488338, 0.35304501, 0.45073727, 0.85966712, 0.40543111,
       0.43547725, 0.46526353, 0.94965415, 0.56367184, 0.72343

### saving the model and scaler

In [50]:
import pickle

In [103]:
# Serialize the fitted logistic regression to the file 'model' in the working directory.
with open('model','wb') as file:
    pickle.dump(reg, file)

In [105]:
# Serialize the fitted scaler to the file 'scaler' in the working directory.
# NOTE(review): unpickling a CustomScaler presumably requires the class to be
# importable/defined in the loading process — confirm in the consuming code.
with open('scaler','wb') as file1:
    pickle.dump(absenteeism_scaler, file1)