In [1]:
import numpy as np 
import pandas as pd

In [2]:
pre_data = pd.read_csv('df-preprocessed.csv')

In [3]:
pre_data.head(20)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0,2
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4,8
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0,40
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1,8


Logistic Regression is a type of classification

## Creating the Targets for the Logistic Regression
### Create two classes:
    - Moderately absent (<= 3 hours)
    - Excessively absent ( >= 4 hours)
    

In [4]:
pre_data['Absenteeism Time in Hours'].median()

3.0

- Using the median as a cut-off line 
- We've balanced the dataset
    - Half of the targets are 0 and the other half 1
- This will prevent the model from learning to output just one of the two classes exlusively 

In [5]:
targets = np.where(pre_data['Absenteeism Time in Hours'] > 
                   pre_data['Absenteeism Time in Hours'].median(), 1, 0)

In [6]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

#### Create a new column called "Excessive Absenteeism" where:
   - 1 if absenteesim time in hours > 3
   - 0 otherwise

In [7]:
pre_data['Excessive_Absenteeism'] = targets

In [8]:
pre_data.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessive_Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0


- Unfortunately, here I went from a pandas DF to a numpy array and back to a pandas DF

- ToDO: Fix to transform within the DF


#### A comment of the targets:
    - Prove the dataset is balanced:
        - Divide total 1's targets by total targets

In [9]:
targets.sum() / targets.shape[0]

0.45571428571428574

- Around 46% of the targets are 1's so 54% are 0's
#### This will be a sufficient split of the dataset
    - 60-40 split will work equally well for LOGISTIC REGRESSION
        - Not true for other algorithms like neural networks
    - However 55-45 split will almost always work equally well for all algorithms

In [10]:
data_with_targets = pre_data.drop(['Absenteeism Time in Hours'], axis=1)

### Check to make sure done correctly:
    - Check if data_with_targets points to same piece of memory as pre_data:

In [11]:
data_with_targets is pre_data

False

- They're different, so data_with_targets is the new checkpoint

 ## Selecting Inputs for the Regression:

In [12]:
data_with_targets.shape

(700, 15)

In [13]:
data_with_targets.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Excessive_Absenteeism
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,0


#### To select inputs for regression, select all the rows and columsn except "Excessive Absenteeism":

In [14]:
data_with_targets.iloc[:, :-1].head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1


In [15]:
unscaled_inputs = data_with_targets.iloc[:, :-1]

In [16]:
unscaled_inputs.head(20)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1


## Standardize data:

In [17]:
from sklearn.preprocessing import StandardScaler

# absentee_scaler = StandardScaler()

## Omitting the dummy variables from the Standardization:
### To preserve the dummy variable usability:

### THIS IS THE FIXED CUSTOM SCALER:
#### USUALLY WE'LL WANT TO STANDARDIZE BEFORE CREATING THE DUMMIES, but it's always good to control exactly what is being standardized:

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns, copy=True, with_mean=True, with_std=True):
        self.scaler = StandardScaler(copy, with_mean, with_std)
        self.columns = columns
        self.mean_ = None
        self.var_ = None
        
    def fit(self, X, y=None):
        self.scaler.fit(X[self.columns], y)
        self.mean_ = np.mean(X[self.columns])
        self.var_ = np.var(X[self.columns])
        return self

    def transform(self, X, y=None, copy=None):
        init_col_order = X.columns
        X_scaled = pd.DataFrame(self.scaler.transform(X[self.columns]), 
                               columns = self.columns)
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]    

### Still fixing the scaler:
#### See the columns available:

In [19]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

### Still fixing the scaler:
#### Create a new list of the features you want to scale (omitting the dummies: 'Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education'):

In [20]:
# columns_to_scale = ['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work',
#        'Age', 'Daily Work Load Average', 'Body Mass Index',
#        'Children', 'Pets']

### From later on in code from Backward Elimination:
- Create columns_to_omit variable:

In [21]:
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

In [22]:
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]
columns_to_scale

['Month Value',
 'Day of the Week',
 'Transportation Expense',
 'Distance to Work',
 'Age',
 'Daily Work Load Average',
 'Body Mass Index',
 'Children',
 'Pets']

### Still fixing the scaler:
#### Now create the fixed scaler object:

In [23]:
absentee_scaler = CustomScaler(columns_to_scale)

### Now the scaler is fixed and we can continue on:

- Below line will calculate and store the mean and standard deviation of each feature/variable from unscaled_inputs
    - Stored in absentee_scaler object
    
- Whenever we get new data the standardization info is contained in absentee_scaler

In [24]:
absentee_scaler.fit(unscaled_inputs)

  return self.partial_fit(X, y)


CustomScaler(columns=['Month Value', 'Day of the Week', 'Transportation Expense', 'Distance to Work', 'Age', 'Daily Work Load Average', 'Body Mass Index', 'Children', 'Pets'],
       copy=None, with_mean=None, with_std=None)

### We have just prepared the scaling
#### No let's apply it through 'transform':

In [25]:
scaled_inputs = absentee_scaler.transform(unscaled_inputs)



### Whenever you get new data, you will just apply below to reach same transformation as above (most common and useful way to transform new data when deploying a model):

new_data_raw = pd.read_csv('new_data.csv')

new_data_scaled = absentee_scaler.transform(new_data_raw)

In [26]:
# Below we can see that all the dummies remain untouched (0's and 1's):
scaled_inputs.head(20)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets
0,0,0,0,1,0.182726,-0.683704,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
1,0,0,0,0,0.182726,-0.683704,-1.574681,-1.141882,2.130803,-0.806331,1.002633,0,-0.01928,-0.58969
2,0,0,0,1,0.182726,-0.007725,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,-0.91903,-0.58969
3,1,0,0,0,0.182726,0.668253,0.854936,-1.682647,0.405184,-0.806331,-0.643782,0,0.880469,-0.58969
4,0,0,0,1,0.182726,0.668253,1.005844,0.412816,-0.536062,-0.806331,0.767431,0,0.880469,0.268487
5,0,0,0,1,0.182726,1.344231,-0.654143,1.426749,0.24831,-0.806331,1.002633,0,-0.91903,-0.58969
6,0,0,0,1,0.182726,1.344231,2.092381,1.494345,-1.320435,-0.806331,0.061825,0,-0.01928,2.843016
7,0,0,0,1,0.182726,1.344231,0.568211,1.359154,-0.065439,-0.806331,-0.878984,0,2.679969,-0.58969
8,0,0,1,0,0.182726,-1.359682,-1.016322,-1.209478,-0.379188,-0.806331,-0.40858,0,0.880469,-0.58969
9,0,0,0,1,0.182726,-1.359682,0.190942,-1.277074,0.091435,-0.806331,0.532229,1,-0.01928,0.268487


In [27]:
scaled_inputs.shape

(700, 14)

## Train/Test Split of Data:

#### Data Shuffling:
- In order to remove all types of dependencies that come from the order of the data set (day of the week)

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
train_test_split(scaled_inputs, targets)

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  Day of the Week  \
 384         1         0         0         0    -1.244823        -0.007725   
 578         0         0         0         1    -1.530333        -1.359682   
 164         1         0         0         0    -0.959313        -1.359682   
 92          1         0         0         0     1.324766         1.344231   
 444         0         0         0         1    -0.102784         0.668253   
 595         0         0         0         1    -1.244823        -1.359682   
 97          0         0         0         1     1.324766         0.668253   
 658         1         0         0         0    -0.673803         0.668253   
 202         1         0         0         0    -0.673803         1.344231   
 486         0         0         0         1     0.468236         0.668253   
 303         0         0         0         0     1.039256         2.696187   
 439         1         0         0         0    -0.388293       

In [30]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets)

- Train Inputs contains 525 observations along 14 variables
- Train Targets are a vector/array of length 525
    - Corresponding to 'Excessive Absenteesism" column
    
- Test Inputs contains 175 observations along 14 variables
- Test Targets are a vector/array of length 175
    - Corresponding to 'Excessive Absenteesism" column

In [31]:
x_train.shape, y_train.shape

((525, 14), (525,))

In [32]:
x_test.shape, y_test.shape

((175, 14), (175,))

#### Usually better for make testing smaller though:

In [33]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, train_size = 0.8)



In [34]:
x_train.shape, y_train.shape

((560, 14), (560,))

In [35]:
x_test.shape, y_test.shape

((140, 14), (140,))

Add in random_state parameter to have random shuffle be consistent to have same output everytime:

In [36]:
x_train, x_test, y_train, y_test = train_test_split(scaled_inputs, targets, random_state = 20, train_size = 0.8)

# Training the model and Assessing its Accuracy:

## Logistic Regression with sklearn:

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Training the Model:

In [38]:
reg = LogisticRegression()

In [39]:
reg.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [40]:
reg.score(x_train, y_train)

0.7714285714285715

## Manually check the accuracy
- Accuracy: 
    - In this case 78% of the model ouputs match the targets
- So to find accuracy of model manually, find the model outputs and compare them with the targets:

In [41]:
model_outputs = reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [43]:
# See which elements have been guessed correctly and whihc haven't
model_outputs == y_train

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True, False, False,  True,  True,  True,  True,
       False,  True, False,  True, False, False,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False, False,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
       False,  True, False,  True,  True, False, False, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,

#### How many matches are there?:

In [44]:
np.sum(model_outputs == y_train)

432

In [45]:
model_outputs.shape[0]

560

#### Check/verify the accuracy:

In [46]:
np.sum(model_outputs == y_train) / model_outputs.shape[0]

0.7714285714285715

## Extracting the Intercept (/Bias) and Coefficients (/ Weights) from a Logistic Regression:

#### Finding the intercept and coefficients:

In [47]:
reg.intercept_

array([-1.47082548])

In [48]:
reg.coef_

array([[ 2.62614283,  0.84432   ,  2.93971875,  0.67771914,  0.16145652,
        -0.08257256,  0.60695978, -0.00741802, -0.16862442, -0.00383568,
         0.26749958, -0.23085763,  0.35572764, -0.28559431]])

#### Which variables do these coefficients refer to:

In [49]:
scaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Day of the Week', 'Transportation Expense', 'Distance to Work',
       'Age', 'Daily Work Load Average', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

#### Above we get an error. We can go back and see when we created the scaler object that the DF was converted into an nd.array
#### Note: Whenever we employ sklearn, it works on pandas DF's, however the results are usually converted into arrays

In [50]:
feature_name = unscaled_inputs.columns.values

#### Create a DF that will contain the intercept, the feature names, and the corresponding coefficients:

In [51]:
summary_table = pd.DataFrame(columns=['Feature Name'],
                             data = feature_name)

# Let's transpose the array because by default nd.arrays are rows and 
# not columns:
summary_table['Coefficients'] = np.transpose(reg.coef_)

summary_table

Unnamed: 0,Feature Name,Coefficients
0,Reason_1,2.626143
1,Reason_2,0.84432
2,Reason_3,2.939719
3,Reason_4,0.677719
4,Month Value,0.161457
5,Day of the Week,-0.082573
6,Transportation Expense,0.60696
7,Distance to Work,-0.007418
8,Age,-0.168624
9,Daily Work Load Average,-0.003836


### Now just add the intercept info:
- Since this is an array, appending to it usually adds to the end.
- Shift up the indicies to leave the index at 0 empty:

In [52]:
summary_table.index = summary_table.index + 1

In [53]:
summary_table.loc[0] = ['Intercept', reg.intercept_[0]]
summary_table = summary_table.sort_index()
summary_table

Unnamed: 0,Feature Name,Coefficients
0,Intercept,-1.470825
1,Reason_1,2.626143
2,Reason_2,0.84432
3,Reason_3,2.939719
4,Reason_4,0.677719
5,Month Value,0.161457
6,Day of the Week,-0.082573
7,Transportation Expense,0.60696
8,Distance to Work,-0.007418
9,Age,-0.168624


# Interpreting the Logistic Regression Coefficients:
- Higher magnitude coefficients have larger weights and impact (when the data is standardized)

#### Create a new series in DF called 'Odds_ratio' = output after getting expoential of coefficients:

In [54]:
summary_table['Odds_ratio'] = np.exp(summary_table.Coefficients)

In [55]:
summary_table

Unnamed: 0,Feature Name,Coefficients,Odds_ratio
0,Intercept,-1.470825,0.229736
1,Reason_1,2.626143,13.82036
2,Reason_2,0.84432,2.326395
3,Reason_3,2.939719,18.910527
4,Reason_4,0.677719,1.969381
5,Month Value,0.161457,1.175221
6,Day of the Week,-0.082573,0.920745
7,Transportation Expense,0.60696,1.834845
8,Distance to Work,-0.007418,0.992609
9,Age,-0.168624,0.844826


#### A feature/variable is not particularly important if:
- its coefficient is close to 0
- its odds ratio is close  to 1
    - For one unit change in the standardized feature, the odds increase by a multiple equal to the odds ratio

In [56]:
summary_table.sort_values('Odds_ratio', ascending=False)

Unnamed: 0,Feature Name,Coefficients,Odds_ratio
3,Reason_3,2.939719,18.910527
1,Reason_1,2.626143,13.82036
2,Reason_2,0.84432,2.326395
4,Reason_4,0.677719,1.969381
7,Transportation Expense,0.60696,1.834845
13,Children,0.355728,1.427219
11,Body Mass Index,0.2675,1.306693
5,Month Value,0.161457,1.175221
10,Daily Work Load Average,-0.003836,0.996172
8,Distance to Work,-0.007418,0.992609


- Above we can see that given all the features, apparently 'Daily Work Load Average' and 'Distance to Work' are especially negligable 
- Reason_0 represented a situation where a person was absent but no reason was given; therefore the base model is when there is no reason

# Interpreting the Important Predictors:
- Reason_0 = No reason = baseline model (when no reason is given)
- Reason_1 = Various Diseases
- Reason_2 = Pregnancy and giving birth
- Reason_3 = Poisoning and peculiar reasons
- Reason_4 = Light Diseases

- The reasons weights mean that the odds of them being absent are the (odds_ratio) times higher than when no reason was given 
    - ex.) Reason_3 = about 19x more likely

### Now look to: Transportation Expense:
#### This is the most important non dummy feature in the model, but it's standardized.
    - We don't have direct interpretability of it
    - Odds ratio implies that for one standardized unit (or for one standard deviation increase in transportation expense) it's about 2x as likely to be excessively absent
- Standardized models almost always yield higher accuracy because the optimization algorithms work better

#### However should we opt for predicting values we definitely prefer higher accuracy so standardization is would be better

### Interpret a negative coefficient: Pet
- Pet is a continuous variable
- It's odds ratio is 0.751567
- Therefore its odds are 1 - 0.751567 ~= 25% lower than the base model (no pets)
    - One explanation could be that if you have several pets, you have somebody else taking care of them too


# Simplifying the Model:
## Backward Elimination:
- Simplify the model by removing all features which have close to no contribution to the model
    - From the p-values, get rid of all the coefficients with p-values > 0.05
- HOWEVER, with sklearn we don't have p-values because we don't really need them
    - Go back to 'columns_to_scale' variable and subtract from 'columns_to_omit' variable we now create

# Testing the Model:
## Model accuracy so far has meant 'training' accuracy form the training data
- So now, instead of testing, we use test data to train more, but now manually (We're not really testing)

In [57]:
reg.score(x_test, y_test)

0.7428571428571429

- This time the parameters were x_test and y_test.
- Accuracy = 0.7428

#### So based on data that the model has never seen before, in 74.28% of the cases, the model will predict correctly if the person is going to be excessively absent

### Now apart from the accuracy we can get the outputs themselves:

####  Geth the probability of an output being a 0 or 1:
    - First column: Probability model assigned the observation being 0
    - Second column: Probability model assigned the observation being 1
        - (Prob of being excessively absent)
    - Where (First column) + (Second column) = 1


In [59]:
predicted_probability = reg.predict_proba(x_test)
predicted_probability

array([[0.73685721, 0.26314279],
       [0.60918629, 0.39081371],
       [0.41367898, 0.58632102],
       [0.80123548, 0.19876452],
       [0.07360132, 0.92639868],
       [0.31482473, 0.68517527],
       [0.30996677, 0.69003323],
       [0.13143709, 0.86856291],
       [0.79426475, 0.20573525],
       [0.75019913, 0.24980087],
       [0.48367988, 0.51632012],
       [0.19799007, 0.80200993],
       [0.07753484, 0.92246516],
       [0.70995519, 0.29004481],
       [0.30224948, 0.69775052],
       [0.57267672, 0.42732328],
       [0.54083385, 0.45916615],
       [0.57389761, 0.42610239],
       [0.38014204, 0.61985796],
       [0.04814556, 0.95185444],
       [0.69716193, 0.30283807],
       [0.79219655, 0.20780345],
       [0.39173151, 0.60826849],
       [0.4186287 , 0.5813713 ],
       [0.25818169, 0.74181831],
       [0.75527615, 0.24472385],
       [0.51057945, 0.48942055],
       [0.86794852, 0.13205148],
       [0.19785515, 0.80214485],
       [0.78285796, 0.21714204],
       [0.

#### Slice out Second column for pure probability of being excessively absent:

In [60]:
predicted_probability[:, 1]

array([0.26314279, 0.39081371, 0.58632102, 0.19876452, 0.92639868,
       0.68517527, 0.69003323, 0.86856291, 0.20573525, 0.24980087,
       0.51632012, 0.80200993, 0.92246516, 0.29004481, 0.69775052,
       0.42732328, 0.45916615, 0.42610239, 0.61985796, 0.95185444,
       0.30283807, 0.20780345, 0.60826849, 0.5813713 , 0.74181831,
       0.24472385, 0.48942055, 0.13205148, 0.80214485, 0.21714204,
       0.3749067 , 0.68956837, 0.68976236, 0.53913229, 0.20780345,
       0.50790295, 0.21408487, 0.75084196, 0.43502114, 0.58927018,
       0.22777885, 0.43722182, 0.21924622, 0.43428676, 0.81295852,
       0.58244563, 0.69724136, 0.27338655, 0.20552251, 0.18317411,
       0.59131998, 0.38115576, 0.67045219, 0.28704924, 0.8497776 ,
       0.46817998, 0.89190316, 0.25870233, 0.35673457, 0.35514834,
       0.72095696, 0.6618861 , 0.31310986, 0.79285261, 0.20025819,
       0.26743725, 0.09841081, 0.2328088 , 0.73526565, 0.32952965,
       0.2134669 , 0.32994943, 0.90821459, 0.43626886, 0.61791

#### Logistic Regressions compute these in the background but it automatically reduces to:
- Prob < 0.5, it places a 0
- Prob > 0.5, it places a 1

# Next Steps:
## 1.) Save the model
- Use later on
- No need to train every time
- Just determine the weights once and save for later use

## 2.) Create a module

## 3.) Get new data, classify it, pass through SQL, and analyze in Tableau

# Save the Model:

### Train a model once and then save it

### If we go back we can see that the object reg where:
- reg = LogisticRegression()
- LogisticRegression( C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False)

### The logistic regression class saves all of these values 
### So saving the model is the same as saving the reg object

### We can do this through "pickle":

In [61]:
import pickle
# model_file => newly created filename
# wb => write bytes; or rb => read bytes for unpickeling
# dump => save info to file ; or load => load info from file for unpickeling
# reg => object to tbe dump

with open('model.pkl', 'wb') as file:
    pickle.dump(reg, file)

with open('model.pkl', 'rb') as file:
    mp = pickle.load(file)

reg.predict(x_train)


array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [62]:
mp.predict(x_test)

array([0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0])

### Must also save the scaler object too:
- It was used to standardize all numerical variables
- And stored the columns to scale, and the mean and std of each feature

### So pickle the scaler:
- Our code was dependent upon training data which trained the machine learning used, but once the model is trained and we've obtained the coefficients, we can save it 
- Info in scaler object (absentee_scaler) is used to preprocess new data using the same rules to preprocess the training data

In [64]:
with open('scaler_file.pkl', 'wb') as file:
    pickle.dump(absentee_scaler, file)

# Creating a Module for Later Use of the Model:
## In the next notebook we will:
- Apply the model to new data:
        - Create a module
        - Storing code in module will allow us to reuse it wihtout trouble