## Import relevant libraries


In [None]:
import numpy as np 
import pandas as pd

## Load the data

In [None]:
# Load the preprocessed data from df_preprocessed.csv (path is relative to the working directory).
data_preprocessed = pd.read_csv("df_preprocessed.csv")

In [None]:
# Preview the first five rows to check that the file loaded as expected.
data_preprocessed.head()

In [None]:
### Logistic regression will be applied to the reasons, work transportation expenses, distance to work,
### age, daily work load average, education, children and pets in order to predict absenteeism.

### A nice property of regression is that the fitted model itself gives a fair indication of which
### variables are important for the analysis and which are not.


## Create the targets

In [None]:
# Absenteeism is split into two classes: 1 = excessively absent, 0 = moderately absent.
# The median is used as the cut-off: values above it fall into class 1,
# values at or below it fall into class 0.

data_preprocessed['Absenteeism Time in Hours'].median()

In [None]:
# Anyone absent for longer than the median number of hours is labelled
# excessively absent; everyone else is labelled moderately absent.
#   0 -> moderately absent (at or below the median)
#   1 -> excessively absent (strictly above the median)
# NOTE(review): the original note gives the median as 3 hours — confirm against the median cell's output.

targets = np.where(
    data_preprocessed['Absenteeism Time in Hours'].gt(
        data_preprocessed['Absenteeism Time in Hours'].median()
    ),
    1,
    0,
)

In [None]:
# Display the array of 0/1 target labels.
targets

In [None]:
# np.where returned a NumPy array, so this shows numpy.ndarray.
type(targets)

In [None]:
# pd.read_csv returned a pandas DataFrame.
type(data_preprocessed)

In [None]:
# Append the labels as a new last column; this modifies data_preprocessed in place.
data_preprocessed['Excessively Absent'] = targets

In [None]:
# Selecting a single column of a DataFrame yields a pandas Series.
type(data_preprocessed['Excessively Absent'])

In [None]:
# Show the first 15 rows, which now include the 'Excessively Absent' column.
data_preprocessed.head(15)

### Using the median as the cut-off is a simple, rigid way to keep the two classes roughly balanced (somewhere around 50-50, 55-45 or 60-40). Some methods, such as logistic regression, can cope with a 60-40 split, whereas neural networks may perform worse on it.
#### A 55-45 ratio is sufficient here


In [None]:
# Share of observations labelled 1 (excessively absent).
# For 0/1 labels the mean is exactly sum / count.
targets.mean()

In [None]:
# The raw hours column has been replaced by the binary target, so remove it.
# drop() without inplace=True returns a new DataFrame and leaves data_preprocessed untouched.
data_with_targets = data_preprocessed.drop(columns=['Absenteeism Time in Hours'])

In [None]:
# Note: data_with_targets = data_preprocessed.copy() would likewise create a separate object, not a second reference to the same DataFrame (even though both would have the same number of columns)

In [None]:
data_with_targets is data_preprocessed  #=> False: drop() returned a new DataFrame, and `is` compares object identity, not contents or column counts

In [None]:
# Show the first 10 rows of the frame that holds the inputs plus the target column.
data_with_targets.head(10)

## Selecting the inputs with pandas iloc

In [None]:
data_with_targets.shape  # (number of rows, number of columns)
# Original note for this dataset — TODO confirm against the displayed output:
#-> rows : positions 0 to 699
#-> cols : positions 0 to 14

In [None]:
# To select the inputs for the regression we exclude the last column, which holds the target.
data_with_targets.iloc[:,:-1] #=> every row, every column except the last
#=> iloc[rows_range , columns_range]
#### Assuming 15 columns (as noted for the shape), the following column ranges are equivalent ####
# 0 : 14 --> positions 0 to 13
# :14 --> from the beginning to position 13
# :-1 --> everything up to, but not including, the last column (a negative stop counts columns from the end)
# :shape[1]-1 --> same meaning

In [None]:
# Keep every column except the last one (the target) as the raw model inputs.
unscaled_inputs = data_with_targets.iloc[:,:-1]

## Standardize the data 

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler object; it stores the per-column mean and standard deviation once it has been fitted.
absenteeism_scaler  = StandardScaler()

In [None]:
# Learn each input column's mean and standard deviation.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling — consider fitting on
# the training portion only.
absenteeism_scaler.fit(unscaled_inputs)

In [None]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
# Each column is standardized: the fitted mean is subtracted and the result is divided by the fitted standard deviation

In [None]:
# Display the standardized inputs.
scaled_inputs

In [None]:
scaled_inputs.shape
# Original note: 700 observations, 14 features — TODO confirm against the displayed output

## Train-test data split 
##### If a model is trained on all of the available data it may overfit: it then fails badly when it is exposed to new data. One way to guard against this is to split the data into a training set, which the model learns from, and a held-out test set (the rest of the data), which is used only to evaluate the trained model

## Import the relevant module

In [None]:
from sklearn.model_selection import train_test_split

## Split

In [None]:
# Exploratory call: the returned arrays are only displayed, not stored (here with a 90% training share).
train_test_split(scaled_inputs,targets, train_size = 0.9 , random_state = 20 )

In [None]:
# Hold out 20% of the observations for testing; the model trains on the other 80%.
#   train_size   -> a float between 0 and 1 is read as the share used for training
#   shuffle      -> True by default, so rows are shuffled before splitting
#                   (shuffling alone does not guarantee robust, accurate results)
#   random_state -> seeds the shuffle; fixing it (20 here) makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    scaled_inputs,
    targets,
    train_size=0.8,
    random_state=20,
)

In [None]:
print (x_train.shape , y_train.shape) # training split: 80% here (train_test_split would use 75% by default)
# => both have the same number of observations but a different number of dimensions. Why?
# => x_train holds one column per feature, while y_train holds only the single target value per observation

In [None]:
print(x_test.shape , y_test.shape) # test split: 20% here (train_test_split would use 25% by default)

## Logistic regression with scikit-learn

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 

## Training the model

In [None]:
# Create the classifier with scikit-learn's default settings (no arguments are passed).
log_reg = LogisticRegression()

In [None]:
# Fit the classifier on the training split. fit() returns the fitted estimator,
# which the notebook shows as the cell output. The stray `LogisticRegression()`
# line that followed was pasted cell output: it built a throwaway, unfitted
# estimator and replaced the fitted model as the displayed value.
log_reg.fit(x_train, y_train)

In [None]:
log_reg.score(x_train,y_train) #=> mean accuracy of the model on the training data

## Manually checking the accuracy

In [None]:
# Predicted class label for every training observation; displayed for inspection.
model_outputs = log_reg.predict(x_train)
model_outputs

In [None]:
# The true training labels, for comparison with model_outputs.
y_train

In [None]:
# Element-wise comparison: True wherever the prediction equals the true label.
model_outputs == y_train

In [None]:
# Accuracy by hand: the fraction of predictions that equal the true labels.
# The mean of a boolean array is the share of True values, i.e. matches / total.
np.mean(model_outputs == y_train)