## Import relevant libraries


In [1]:
import numpy as np 
import pandas as pd

## Load the data

In [2]:
data_preprocessed = pd.read_csv("df_preprocessed.csv")

In [3]:
data_preprocessed.head()

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2


In [4]:
### Logistic regression will be applied to the reasons, transportation expense, distance to work,
### age, daily work load average, education, children and pets in order to predict absenteeism

### The nice thing about regressions is that the model itself will give us a fair indication of which variables 
### are important for the analysis and which aren't


## Create the targets

In [7]:
# We divide absenteeism into two classes: 1 - excessively absent, 0 - moderately absent.
# The median is the cut-off: values above it fall in class 1, values at or below it fall in class 0.

data_preprocessed['Absenteeism Time in Hours'].median()

3.0

In [8]:
# Anyone absent for more than the median number of hours (3 here) is labelled excessively absent.
# 0 : moderately absent
# 1 : excessively absent

absence_hours = data_preprocessed['Absenteeism Time in Hours']
targets = np.where(absence_hours > absence_hours.median(), 1, 0)

In [9]:
targets

array([1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,

In [10]:
type(targets)

numpy.ndarray

In [11]:
type(data_preprocessed)

pandas.core.frame.DataFrame

In [12]:
data_preprocessed['Excessively Absent'] = targets

In [13]:
type(data_preprocessed['Excessively Absent'])

pandas.core.series.Series

In [14]:
data_preprocessed.head(15)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Day of the Week,Transportation Expense,Distance to Work,Age,Daily Work Load Average,Body Mass Index,Education,Children,Pets,Absenteeism Time in Hours,Excessively Absent
0,0,0,0,1,7,1,289,36,33,239.554,30,0,2,1,4,1
1,0,0,0,0,7,1,118,13,50,239.554,31,0,1,0,0,0
2,0,0,0,1,7,2,179,51,38,239.554,31,0,0,0,2,0
3,1,0,0,0,7,3,279,5,39,239.554,24,0,2,0,4,1
4,0,0,0,1,7,3,289,36,33,239.554,30,0,2,1,2,0
5,0,0,0,1,7,4,179,51,38,239.554,31,0,0,0,2,0
6,0,0,0,1,7,4,361,52,28,239.554,27,0,1,4,8,1
7,0,0,0,1,7,4,260,50,36,239.554,23,0,4,0,4,1
8,0,0,1,0,7,0,155,12,34,239.554,25,0,2,0,40,1
9,0,0,0,1,7,0,235,11,37,239.554,29,1,1,1,8,1


### As we can see, splitting at the median is a simple and robust way to keep the dataset roughly balanced (somewhere between 50-50 and 60-40). Some methods, such as logistic regression, cope well with a 60-40 split, whereas neural networks tend to be more sensitive to class imbalance.
#### A 55-45 ratio is perfectly sufficient


In [15]:
targets.sum()/targets.shape[0]

0.45571428571428574

In [23]:
# Drop the raw target and the three features this notebook excludes from the inputs.
# drop() returns a new DataFrame; data_preprocessed itself is left untouched.
columns_to_drop = ['Absenteeism Time in Hours', 'Day of the Week', 'Daily Work Load Average', 'Distance to Work']
data_with_targets = data_preprocessed.drop(columns=columns_to_drop)

In [24]:
# data_with_targets = data_preprocessed.copy() --> doesn't make them refer to the same object (even if they have the same columns numbers)

In [25]:
data_with_targets is data_preprocessed  #=> False: drop() returned a new DataFrame object, so the two names refer to different objects

False

In [28]:
data_with_targets.head(10)  # not displayed: only the last expression of a cell is rendered
data_preprocessed.shape[1] - data_with_targets.shape[1] #=> difference in the number of columns

4

## Selecting the inputs with pandas iloc

In [29]:
data_with_targets.shape
#-> rows : 0 to 699 
#-> cols : 0 to 11 

(700, 12)

In [31]:
# to select the inputs for our regression we need to exclude the last column (the target)
data_with_targets.iloc[:,:-1] #=> excludes the last 
#=> iloc[rows_range , columns_range]
#### the following all select the same columns for this 12-column frame ####
# 0 : 11 --> from 0 to 10 
# :11 --> from the beginning to 10 
# :-1 --> everything except the last column (a negative stop counts back from the end)
# :shape[1]-1 --> same meaning

#=> note: loc is label-based, so these integer positions cannot be used with it (it gives a TypeError)

Unnamed: 0,Reason_1,Reason_2,Reason_3,Reason_4,Month Value,Transportation Expense,Age,Body Mass Index,Education,Children,Pets
0,0,0,0,1,7,289,33,30,0,2,1
1,0,0,0,0,7,118,50,31,0,1,0
2,0,0,0,1,7,179,38,31,0,0,0
3,1,0,0,0,7,279,39,24,0,2,0
4,0,0,0,1,7,289,33,30,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
695,1,0,0,0,5,179,40,22,1,2,0
696,1,0,0,0,5,225,28,24,0,1,2
697,1,0,0,0,5,330,28,25,1,0,0
698,0,0,0,1,5,235,32,25,1,0,0


In [34]:
unscaled_inputs = data_with_targets.iloc[:,:-1] 
type(unscaled_inputs)

pandas.core.frame.DataFrame

## Standardize the data 
#### Standardization is a z-score normalized data 
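#### Normal (Gaussian) distribution: $N(\mu, \sigma)$; standard normal: $Z \sim N(0, 1)$. Standardization can also be applied to non-Gaussian features.
#### Standardization rescales each feature to mean 0 and standard deviation 1 (it does not squeeze values into the 0-1 range). Dummy variables are left unscaled so that they stay 0/1 and their coefficients remain interpretable.
#### Drawbacks of standardization: 1- the numbers lose their original units and therefore some interpretability, 2- the fitted mean and standard deviation must be stored and re-applied to any new data (standardization itself works fine with neural networks, where scaled inputs usually help training)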

In [41]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

class CustomScaler(BaseEstimator, TransformerMixin):
    """Standardize only a chosen subset of DataFrame columns.

    The selected columns are passed through a ``StandardScaler``; every other
    column (e.g. dummy variables) is returned unchanged. The output is a
    DataFrame with the same column order and the same row index as the input.

    Parameters
    ----------
    columns : list of column labels
        The columns of the input DataFrame that should be standardized.
    """

    def __init__(self, columns):
        # Underlying scikit-learn scaler that does the actual standardization.
        self.scaler = StandardScaler()
        # Labels of the columns to standardize.
        self.columns = columns
        # Per-column mean and variance of the selected columns; filled in by fit().
        self.mean_ = None
        self.var_ = None

    def fit(self, X, y=None):
        """Fit the scaler on ``X[self.columns]`` and record per-column statistics.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every label listed in ``self.columns``.
        y : ignored
            Accepted for scikit-learn API compatibility and forwarded to the scaler.

        Returns
        -------
        self
        """
        selected = X[self.columns]
        self.scaler.fit(selected, y)
        # Statistics are taken down the rows (axis=0) so there is one value per
        # column. The previous axis=1 produced one mean per *row* instead.
        self.mean_ = selected.mean(axis=0)
        # ddof=0 is the population variance, the same convention StandardScaler uses.
        self.var_ = selected.var(axis=0, ddof=0)
        return self

    def transform(self, X, y=None, copy=None):
        """Return ``X`` with the selected columns standardized.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every label listed in ``self.columns``.
        y, copy : ignored
            Accepted for API compatibility only.

        Returns
        -------
        pandas.DataFrame
            Same columns (in the original order) and same index as ``X``.
        """
        init_col_order = X.columns
        # Reuse X's index: pd.concat(axis=1) aligns rows on index labels, so a
        # fresh RangeIndex here would misalign (and introduce NaNs) whenever X
        # is a shuffled, filtered or otherwise re-indexed frame.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X[self.columns]),
            columns=self.columns,
            index=X.index,
        )
        # Boolean mask over the column labels: True for every column that is
        # NOT in self.columns, i.e. the columns that are passed through as-is.
        X_not_scaled = X.loc[:, ~X.columns.isin(self.columns)]
        # Stitch both parts together and restore the original column order.
        return pd.concat([X_not_scaled, X_scaled], axis=1)[init_col_order]

In [42]:
unscaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [47]:
# Columns that must NOT be standardized: the four reason dummies and Education
# (which is 0/1 in the rows displayed above).
columns_to_omit = ['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Education']

# Everything else gets scaled. Deriving the list from the frame, instead of hard-coding
# ['Month Value', 'Transportation Expense', 'Age', 'Body Mass Index', 'Children', 'Pets'],
# keeps it correct if the set of input columns changes.
columns_to_scale = [x for x in unscaled_inputs.columns.values if x not in columns_to_omit]

In [48]:
# Custom scaler 
absenteeism_scaler = CustomScaler(columns_to_scale)


In [49]:
absenteeism_scaler.fit(unscaled_inputs)

CustomScaler(columns=['Month Value', 'Transportation Expense', 'Age',
                      'Body Mass Index', 'Children', 'Pets'])

In [50]:
scaled_inputs = absenteeism_scaler.transform(unscaled_inputs)
type(scaled_inputs) # => sklearn scalers return ndarrays by default; CustomScaler.transform wraps the result back into a DataFrame
# standardization: subtract the column mean and divide by the column standard deviation

pandas.core.frame.DataFrame

In [55]:
scaled_inputs.columns.values

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [60]:
scaled_inputs.shape
# 700 observations , 11 features

(700, 11)

## Train-test data split 
##### If a model is trained on all of our data it may overfit: it memorizes that data and then fails badly when it is exposed to new data. One way to guard against this is to split the data into a training set, which the model learns from, and a held-out test set, which stays hidden during training and is used only to evaluate the model

## Import the relevant module

In [53]:
from sklearn.model_selection import train_test_split

## Split

In [65]:
train_test_split(scaled_inputs,targets, train_size = 0.8 , random_state = 20 )

[     Reason_1  Reason_2  Reason_3  Reason_4  Month Value  \
 346         0         0         0         1     1.610276   
 91          0         0         1         0     1.324766   
 299         1         0         0         0     1.039256   
 129         0         0         1         0    -1.530333   
 695         1         0         0         0    -0.388293   
 ..        ...       ...       ...       ...          ...   
 218         1         0         0         0    -0.388293   
 223         0         0         0         1    -0.102784   
 271         0         0         0         1     0.753746   
 474         0         0         0         1     0.182726   
 355         0         0         0         1     1.610276   
 
      Transportation Expense       Age  Body Mass Index  Education  Children  \
 346               -0.654143  0.248310         1.002633          0 -0.919030   
 91                -0.654143  0.562059        -1.114186          1  0.880469   
 299               -0.6541

In [62]:
x_train , x_test , y_train , y_test = train_test_split(scaled_inputs,targets, train_size = 0.8, random_state = 20) 
# as a float, train_size is a fraction between 0 and 1 (here 80% of the rows go to the training set)
# shuffle defaults to True, so the rows are shuffled before splitting [shuffling alone does not guarantee robust results]
# random_state fixes the seed of that shuffle, so the same split is reproduced on every run (20 is an arbitrary choice)
# NOTE(review): scaled_inputs was standardized with a scaler fitted on ALL rows, so the test rows influenced
# the scaling statistics; fitting the scaler on the training rows only would avoid this leakage

In [66]:
print (x_train.shape , y_train.shape) # the default split is 75% train, but train_size = 0.8 gives 80% here
# => x_train and y_train have the same number of rows but different dimensions. Why? 
# => x_train holds 11 feature columns per row, while y_train is a 1-D array with one target value per row

(560, 11) (560,)


In [67]:
print(x_test.shape , y_test.shape) # 25% by default split but this is 20%

(140, 11) (140,)


## Logistic regression with sci-kit learn

In [68]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 

## Training the model

In [69]:
log_reg = LogisticRegression()

In [71]:
log_reg.fit(x_train,y_train)

LogisticRegression()

In [72]:
log_reg.score(x_train,y_train) #=> accuracy 

0.7732142857142857

## Manually checking the accuracy

In [42]:
model_outputs = log_reg.predict(x_train)
model_outputs

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,

In [43]:
y_train

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,

In [44]:
sum(model_outputs == y_train)

433

In [45]:
# sum(y_train==model_outputs)/y_train.shape[0]
np.sum(y_train==model_outputs)/model_outputs.shape[0]

0.7732142857142857

## Finding the intercepts and coefficients

In [79]:
log_reg.intercept_   #=> [bias]

array([-1.6474549])

In [80]:
log_reg.coef_ #=> good values , but I need to map them back again to my own inputs [weight]

array([[ 2.80019733,  0.95188356,  3.11555338,  0.83900082,  0.1589299 ,
         0.60528415, -0.16989096,  0.27981088, -0.21053312,  0.34826214,
        -0.27739602]])

In [81]:
unscaled_inputs.columns.values
# or 
log_reg.feature_names_in_

array(['Reason_1', 'Reason_2', 'Reason_3', 'Reason_4', 'Month Value',
       'Transportation Expense', 'Age', 'Body Mass Index', 'Education',
       'Children', 'Pets'], dtype=object)

In [82]:
feature_name = log_reg.feature_names_in_
# => It's a good idea to declare a new variable containing this information 

In [84]:
summary_table = pd.DataFrame(columns=['Feature name'], data = feature_name)
# we want to create this data frame - a neat one - to contain the intercepts , coef_ and their corresponding feature names
summary_table

Unnamed: 0,Feature name
0,Reason_1
1,Reason_2
2,Reason_3
3,Reason_4
4,Month Value
5,Transportation Expense
6,Age
7,Body Mass Index
8,Education
9,Children


In [85]:
summary_table['Coefficients'] = np.transpose(log_reg.coef_)
# coef_ is a 2-D array with a single row (one weight per feature); transposing turns that row into a column
summary_table

Unnamed: 0,Feature name,Coefficients
0,Reason_1,2.800197
1,Reason_2,0.951884
2,Reason_3,3.115553
3,Reason_4,0.839001
4,Month Value,0.15893
5,Transportation Expense,0.605284
6,Age,-0.169891
7,Body Mass Index,0.279811
8,Education,-0.210533
9,Children,0.348262


In [86]:
# NOTE: not idempotent - re-running this cell shifts the index labels by one again
summary_table.index = summary_table.index + 1 #=> relabels the rows starting from 1, so that label 0 is free for the intercept row


In [87]:
summary_table.loc[0] = ['Intercept',log_reg.intercept_[0]] # fills in the 0 index with desired tuple [at the end]


In [88]:
summary_table = summary_table.sort_index() # sort the 0 since the added index is formed at the end of the dataFrame
summary_table

Unnamed: 0,Feature name,Coefficients
0,Intercept,-1.647455
1,Reason_1,2.800197
2,Reason_2,0.951884
3,Reason_3,3.115553
4,Reason_4,0.839001
5,Month Value,0.15893
6,Transportation Expense,0.605284
7,Age,-0.169891
8,Body Mass Index,0.279811
9,Education,-0.210533


## Interpreting the coefficients and intercepts 


In [89]:
summary_table['Odd Ratio'] = np.exp(summary_table.Coefficients) #-> exp(coefficient) = odds ratio in the logit (logistic) model


# log(odds) = bias + coef1*feature1 + ..... --> linear in the features (the logit form)
# odds = exp(bias + coef1*feature1 + .....) --> exponentiating the line above
# Assuming a fair die, what are the odds of rolling a 5?
# Answer : (1/6) / (5/6) = 1/5 --> probability it happens / probability it does not --> p(x) / (1 - p(x))

# Here we compare the odds when a reason is given against the odds when it is not.
# The logit form is preferred for interpretation because the log of the odds is linear in the features.
# Note: the value computed for the Intercept row is exp(bias), i.e. the baseline odds, not an odds ratio.
summary_table

Unnamed: 0,Feature name,Coefficients,Odd Ratio
0,Intercept,-1.647455,0.192539
1,Reason_1,2.800197,16.447892
2,Reason_2,0.951884,2.590585
3,Reason_3,3.115553,22.545903
4,Reason_4,0.839001,2.314054
5,Month Value,0.15893,1.172256
6,Transportation Expense,0.605284,1.831773
7,Age,-0.169891,0.843757
8,Body Mass Index,0.279811,1.32288
9,Education,-0.210533,0.810152


In [92]:
summary_table = summary_table.sort_values('Odd Ratio' , ascending=False)

In [93]:
summary_table #=> sorted by odds ratio, largest first
# This shows which features make little difference:
# a coefficient close to 0 is the same as an odds ratio close to 1 -> the feature barely changes the odds
# (an odds ratio well below 1 also matters: it lowers the odds, so judge importance by the distance from 1)

Unnamed: 0,Feature name,Coefficients,Odd Ratio
3,Reason_3,3.115553,22.545903
1,Reason_1,2.800197,16.447892
2,Reason_2,0.951884,2.590585
4,Reason_4,0.839001,2.314054
6,Transportation Expense,0.605284,1.831773
10,Children,0.348262,1.416604
8,Body Mass Index,0.279811,1.32288
5,Month Value,0.15893,1.172256
7,Age,-0.169891,0.843757
9,Education,-0.210533,0.810152


In [58]:
# NOT YET UNDERSTOOD
# for a one-unit change in a standardized feature (i.e. one standard deviation), the odds are multiplied by that feature's odds ratio
# Omitting reason 0 makes it the baseline (apparently the case that every other reason is compared against)
# Standardizing dummies is bad
# Same order as in the video, different numbers

# UNDERSTOOD 
# odds : p(x) / 1 - p(x) 
# odds ratio : comparing the odds of two features (odds(reason 3) / odds(reason 0))
# Why not standardize dummies? Standardization rescales a feature to mean 0 and standard deviation 1; dummies are already on a 0/1 scale, and leaving them unscaled keeps their odds ratios interpretable (reason present vs. absent)
# log(odds) = bias + coef1*feature1 + ..... --> log(odds) = log(p(x) / 1 - p(x)) = bias + coef1*feature1 + ..... => linear regression
# odds = exp(bias + coef1*feature1 + .....) => logistic regression


## Interpretation of the upper summary table and drawbacks of standardizations 
### Daily Work Load Average , Distance to Work and Day of the Week have less impact as they approach 0 
### Features with positive Coefficients (Higher Odd Ratio) Implies the higher chances of absenteeism 
##### Ex: Reason 3 (poisoning): the odds of excessive absence are about 22 times the odds when no reason is given, which is hardly surprising
##### Ex: Pets has a negative coefficient, so more pets go with lower odds of excessive absence; one possible explanation is that such a person has someone at home to help out
### We call the intercept a calibrating factor (bias) for the model to ensure good, accurate results , it has no interpreting meaning to its value 

## Drawbacks of Standardization 
### Standardized coefficients are harder to interpret (a concern for statisticians), but the gain in accuracy makes up for it (what programmers care about)

## Backward Elimination : Removal of non-effective features after seeing their coefficients (Odd Ratios) 

In [59]:
#############################################################################################################################

## Testing the model 

In [60]:
log_reg.score(x_test,y_test) #=> tests score < train score 

0.75

In [61]:
predicted_proba = log_reg.predict_proba(x_test)
predicted_proba
#=> each row sums to 1 
#=> first column is the probability of moderate absence (class 0) , second is that of excessive absence (class 1)

array([[0.71340413, 0.28659587],
       [0.58724228, 0.41275772],
       [0.44020821, 0.55979179],
       [0.78159464, 0.21840536],
       [0.08410854, 0.91589146],
       [0.33487603, 0.66512397],
       [0.29984576, 0.70015424],
       [0.13103971, 0.86896029],
       [0.78625404, 0.21374596],
       [0.74903632, 0.25096368],
       [0.49397598, 0.50602402],
       [0.22484913, 0.77515087],
       [0.07129151, 0.92870849],
       [0.73178133, 0.26821867],
       [0.30934135, 0.69065865],
       [0.5471671 , 0.4528329 ],
       [0.55052275, 0.44947725],
       [0.5392707 , 0.4607293 ],
       [0.40201117, 0.59798883],
       [0.05361575, 0.94638425],
       [0.7003009 , 0.2996991 ],
       [0.78159464, 0.21840536],
       [0.42037128, 0.57962872],
       [0.42037128, 0.57962872],
       [0.24783565, 0.75216435],
       [0.74566259, 0.25433741],
       [0.51017274, 0.48982726],
       [0.85690195, 0.14309805],
       [0.20349733, 0.79650267],
       [0.78159464, 0.21840536],
       [0.

In [62]:
predicted_proba[:,1]
# if >0.5 , put 1 , else 0 


array([0.28659587, 0.41275772, 0.55979179, 0.21840536, 0.91589146,
       0.66512397, 0.70015424, 0.86896029, 0.21374596, 0.25096368,
       0.50602402, 0.77515087, 0.92870849, 0.26821867, 0.69065865,
       0.4528329 , 0.44947725, 0.4607293 , 0.59798883, 0.94638425,
       0.2996991 , 0.21840536, 0.57962872, 0.57962872, 0.75216435,
       0.25433741, 0.48982726, 0.14309805, 0.79650267, 0.21840536,
       0.36956558, 0.67906035, 0.68502567, 0.52868083, 0.21840536,
       0.53506551, 0.22147081, 0.73692105, 0.40498044, 0.60505988,
       0.21075848, 0.45224466, 0.23751292, 0.39833498, 0.82755447,
       0.56797575, 0.69113325, 0.28659587, 0.21935267, 0.2033097 ,
       0.57628256, 0.3294664 , 0.66512397, 0.26949499, 0.83321968,
       0.43491525, 0.88374612, 0.23127072, 0.33415858, 0.34432939,
       0.69909345, 0.65494263, 0.29244941, 0.79200758, 0.20750276,
       0.26840558, 0.08708566, 0.22147081, 0.73245417, 0.30530219,
       0.22147081, 0.29014408, 0.90438191, 0.46061297, 0.60174

## Saving the model

In [94]:
import pickle

In [95]:
# Persist the fitted logistic regression so it can be reused later without retraining
with open('model', 'wb') as file:
    pickle.dump(log_reg, file)

In [97]:
# Persist the fitted scaler so that new data can be standardized with the same statistics
# NOTE: loading this file later requires the CustomScaler class definition to be available
with open('scaler', 'wb') as file:
    pickle.dump(absenteeism_scaler, file)

# Recap 
### Further preprocessing on our data using machine learning 
### Using logistic model (Logit Regression in specific) , We wanted to study the weights of the features in our binary decision 
#### 1 : This employee has very high chance of being absent
#### 0:  This employee has low chance of being absent
### Our decision is based of the median of the Absenteeism Time in hours , which was 3 , if it's higher than 3 , then predict (mark as highly absent) 
### Our model covers the cases where a reason for absence is given and the case where no particular reason is given; together they form the sample space of reasons for absence. What we care about studying is the odds! (What are the odds?)
### The odds answer a question such as "what are the odds of being excessively absent because of reason 3 (poisoning)?": they are the probability of being excessively absent divided by the probability of not being excessively absent {Another example: the odds of getting tails when tossing a fair coin are 1, because heads has the same chance}
### The odds ratio is the ratio between two odds (like the odds of rolling a 4 with a die / the odds of rolling a 6). That looks pointless for a fair die or a fair coin, but it is meaningful here: an odds ratio of 22 for reason 3 (with reason 0 as the baseline) means that, if I am poisoned, my odds of being absent for more than 3 hours are 22 times my odds when I give no reason at all

### On_Condition_1_regressor_1_predictor = {Odd_Ratio_Meaning_in_Logistic_Regression_Model : R_squared_Meaning_in_Linear_Regression_Model} , i.e. with a single predictor, the odds ratio of a logistic model plays a role loosely similar to that of R squared in a linear model: both tell us how much the predictor matters (this is on a conceptual, theoretical basis only). Mathematically they are different things: R squared is the share of the variance of y that the regression explains (it is built from the _differences_ between the observed y and the predicted y), whereas the odds ratio is a _ratio_ of odds that measures the effect of one predictor; so the analogy holds only as a practical, business-level intuition

# Important Notes 

### Machine learning model should not train on all data 
### The score on the test set is usually (though not always) lower than the score on the training set
### We save (pickle) the trained model as bytes so that we do not have to retrain it every time
### We must also save the scaler, so that any new employee data is scaled in the same way (with the same columns being scaled)
### Dummy variables only take the values 0 and 1, so they need neither normalization nor standardization
### important keywords : median , StandardScaler , LogisticRegression , pickle (pickling) , fit (for scaling and regressing) , transform , feature_names_in_ , coef_ , intercept_ , sort_index, sklearn.base , sklearn.model_selection , sklearn.linear_model, predict, score , predict_proba, train_test_split , dump (pickle) , load (pickle)

# Hacks and Tips 
### Pickle is unsafe (never unpickle a file from an untrusted source), and a pickled model should be loaded with the same scikit-learn version that created it
### we are always interested in creating Python modules (.py format) that perform all the preprocessing steps
### Standardization is preferred with regression models 

