### Libraries

In [36]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
# Import data
data_preprocessed = pd.read_csv('Absenteeism_preprocessed.csv')

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


### Create the targets

In [4]:
# Search for a division that will divide people into two classes
data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [6]:
# Add Targets
# Moderately absent <= 3 -- Target -- > 0
# Excessively absent >= 4 -- Target --> 1

# If value in Absenteeism Time in Hours is grater than median (in this case 3), put 1, else put 0.
# Implicitly balance the dataset
targets = np.where(data_preprocessed['Absenteeism Time in Hours'] > 
                   data_preprocessed['Absenteeism Time in Hours'].median(), 1, 0)

In [7]:
# Add new column Excessive Absenteeism
data_preprocessed['Excessive Absenteeism'] = targets

In [8]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


### A comment on the targets

In [11]:
# Around 46% of the targets are 1s
targets.sum() / targets.shape[0]

0.45571428571428574

In [12]:
# Note: A 60-40 split will usually work equally well for a logistic regression

In [13]:
# Checkpoint
data_with_targets = data_preprocessed.drop(['Absenteeism Time in Hours'], axis=1)

### Select the inputs for the regression

In [14]:
data_with_targets.shape

(700, 15)

In [16]:
data_with_targets.iloc[:,:-1]

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,2,179,22,40,237.656,22,1,2,0
696,1,0,0,0,5,2,225,26,28,237.656,24,0,1,2
697,1,0,0,0,5,3,330,16,28,237.656,25,1,0,0
698,0,0,0,1,5,3,235,16,32,237.656,25,1,0,0


In [17]:
unscaled_inputs = data_with_targets.iloc[:,:-1]

### Standardize the data

In [21]:
# Create empty StandardScaler object. 'absenteeism_scaler' will be used to substract the mean and divide
# by the standard deviation variablewise (featurewise)
absenteeism_scaler = StandardScaler()

In [22]:
# Calculate and store the mean and the standard deviation
absenteeism_scaler.fit(unscaled_inputs)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [23]:
# .transform() does the actual scaling
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)

### Split the data into train & test and shuffle

In [None]:
# Split the data into inputs and targets.
# Output: array 1: a training dataset with inputs
#         array 2: a training dataset with targets
#         array 3: a test dataset with inputs
#         array 4: a test dataset with targets
train_test_split(scaled_inputs, targets)

In [None]:
# Random_state is the method that always shuffle the observations in the same 'random' way
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [None]:
print(x_train.shape, y_train.shape)

In [None]:
print(x_test.shape, y_test.shape)

### Logistic regression

In [None]:
# Training the model
reg = LogisticRegression()

In [None]:
reg.fit(x_train, y_train)

In [None]:
# Based on the used data, the model learned to classify ~80% (0,7804) of the observations correctly
reg.score(x_train, y_train)

In [None]:
# Random_state is the method that always shuffle the observations in the same 'random' way
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [None]:
print(x_train.shape, y_train.shape)

In [None]:
print(x_test.shape, y_test.shape)

### Logistic regression

In [None]:
# Training the model
reg = LogisticRegression()

In [None]:
reg.fit(x_train, y_train)

In [None]:
# Based on the used data, the model learned to classify ~80% (0,7804) of the observations correctly
reg.score(x_train, y_train)

[array([[-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
          0.88046927, -0.58968976],
        [-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
         -0.01928035,  0.26848661],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.01928035, -0.58968976],
        ...,
        [ 1.73205081, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.01928035,  1.12666297],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.01928035,  1.12666297],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976]]),
 array([[-0.57735027, -0.09298136, -0.31448545, ...,  2.23224237,
         -0.91902997, -0.58968976],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.91902997, -0.58968976],
        [-0.57735027, -0.09298136, -0.31448545, ..., -0.44798003,
         -0.01928035,  2.8430157 ],
        ...,
        [ 1.73205081, -0.09298136, -0.31448545, ..., -

In [33]:
# Random_state is the method that always shuffle the observations in the same 'random' way
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8, random_state = 20)

In [34]:
print(x_train.shape, y_train.shape)

(560, 14) (560,)


In [35]:
print(x_test.shape, y_test.shape)

(140, 14) (140,)


### Logistic regression

In [40]:
# Training the model
reg = LogisticRegression()

In [41]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
# Based on the used data, the model learned to classify ~80% (0,7804) of the observations correctly
reg.score(x_train, y_train)

0.7839285714285714