In [102]:
# import os
# import re
# import sys
# from hashlib import sha1

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import datasets

from sklearn.compose import (
    ColumnTransformer,
    TransformedTargetRegressor,
    make_column_transformer,
)

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer

from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC, SVR

import scipy
from scipy.stats import randint

In [103]:
# Load the absenteeism records; the file uses ';' as its field separator.
data = pd.read_csv(
    filepath_or_buffer="data/Absenteeism_at_work.csv",
    sep=";",
)

In [63]:
# Print each column's dtype and non-null count for the frame as loaded.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 740 entries, 0 to 739
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               740 non-null    int64  
 1   Reason for absence               740 non-null    int64  
 2   Month of absence                 740 non-null    int64  
 3   Day of the week                  740 non-null    int64  
 4   Seasons                          740 non-null    int64  
 5   Transportation expense           740 non-null    int64  
 6   Distance from Residence to Work  740 non-null    int64  
 7   Service time                     740 non-null    int64  
 8   Age                              740 non-null    int64  
 9   Work load Average/day            740 non-null    float64
 10  Hit target                       740 non-null    int64  
 11  Disciplinary failure             740 non-null    int64  
 12  Education             

In [104]:
# Cast the 0/1 flag columns to bool and the coded columns to pandas categoricals
# in a single pass.
data = data.astype(
    {
        "Social drinker": "bool",
        "Social smoker": "bool",
        "Disciplinary failure": "bool",
        "Seasons": "category",
        "Education": "category",
        "Month of absence": "category",
        "Reason for absence": "category",
    }
)
# Rows recorded with month 0 are discarded.
unuseful_idx = data.index[data["Month of absence"] == 0].tolist()
data = data.drop(index=unuseful_idx)

In [105]:
# Re-inspect dtypes and the remaining row count after the casts and the row drop.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 737 entries, 0 to 736
Data columns (total 21 columns):
 #   Column                           Non-Null Count  Dtype   
---  ------                           --------------  -----   
 0   ID                               737 non-null    int64   
 1   Reason for absence               737 non-null    category
 2   Month of absence                 737 non-null    category
 3   Day of the week                  737 non-null    int64   
 4   Seasons                          737 non-null    category
 5   Transportation expense           737 non-null    int64   
 6   Distance from Residence to Work  737 non-null    int64   
 7   Service time                     737 non-null    int64   
 8   Age                              737 non-null    int64   
 9   Work load Average/day            737 non-null    float64 
 10  Hit target                       737 non-null    int64   
 11  Disciplinary failure             737 non-null    bool    
 12  Educatio

In [106]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=123,
    test_size=0.3,
)

In [107]:
### identify target
# Name of the column the regressor is trained to predict.
target = "Absenteeism time in hours"

In [108]:
### split out features and target
# Features are every column except the target; the target is kept as a Series.
X_train = train_df.drop(columns=[target])
y_train = train_df[target]
X_test = test_df.drop(columns=[target])
y_test = test_df[target]

In [109]:
###
# identify features: each column is assigned to exactly one preprocessing group
drop_feature = ["ID"]
numeric_features = [
    "Transportation expense",
    "Distance from Residence to Work",
    "Service time",
    "Age",
    "Work load Average/day ",  # trailing space kept on purpose; presumably matches the CSV header
    "Hit target",
    "Son",
    "Pet",
    "Weight",
    "Height",
    "Body mass index",
]
categorical_features = [
    "Reason for absence",
    "Month of absence",
    "Day of the week",
    "Seasons",
]
binary_features = [
    "Disciplinary failure",
    "Social drinker",
    "Social smoker",
]
ordinal_feature = ["Education"]

In [54]:
# make transformers
# numeric_transformer = make_pipeline(StandardScaler())
# categorical_transformer = make_pipeline(OneHotEncoder(handle_unknown="ignore"))
# binary_transformer = make_pipeline(OneHotEncoder(handle_unknown="error", drop='if_binary'))
# ordinal_transformer = make_pipeline(OrdinalEncoder())

In [110]:
### make preprocessor
# Transformer names are spelled out exactly as make_column_transformer would
# auto-generate them, so lookups like named_transformers_['onehotencoder-1'] keep working.
preprocessor = ColumnTransformer(
    transformers=[
        ("drop", "drop", drop_feature),
        ("standardscaler", StandardScaler(), numeric_features),
        ("onehotencoder-1", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("onehotencoder-2", OneHotEncoder(handle_unknown="error", drop="if_binary"), binary_features),
        ("ordinalencoder", OrdinalEncoder(), ordinal_feature),
    ]
)

In [111]:
### make Random Forest Regressor pipeline
# Step names match what make_pipeline derives from the class names; the
# 'randomforestregressor__*' search parameters depend on them.
pipe_rf = Pipeline(
    steps=[
        ("columntransformer", preprocessor),
        ("randomforestregressor", RandomForestRegressor(random_state=123)),
    ]
)

In [112]:
### make scoring
# Maps the short label used in the result columns to the scorer name passed to cross_validate.
scoring = dict(
    r2="r2",
    neg_MSE="neg_mean_squared_error",
)

In [151]:
### mean train & cv scores across the folds (cv left at cross_validate's default)
scores_rf = cross_validate(
    pipe_rf,
    X_train,
    y_train,
    scoring=scoring,
    return_train_score=True,
)
# Average every timing/score entry over the folds into a one-column summary table.
rf_cv_result_df = pd.DataFrame({"Random Forest Regressor": pd.DataFrame(scores_rf).mean()})
rf_cv_result_df

Unnamed: 0,Random Forest Regressor
fit_time,0.582
score_time,0.016001
test_r2,-0.151878
train_r2,0.830825
test_neg_MSE,-241.953149
train_neg_MSE,-39.420397


In [115]:
### create hyperparameter distributions
# Keys follow the '<pipeline step>__<parameter>' convention. scipy's randint
# excludes `high`, so draws are 10..299 trees and depths 2..19.
rf_hyperparam_dist = dict(
    randomforestregressor__n_estimators=randint(low=10, high=300),
    randomforestregressor__max_depth=randint(low=2, high=20),
)

In [117]:
### perform hyperparameter tuning and fit the model
# 50 sampled configurations, each scored by 5-fold CV on r2, using all available cores.
random_search_rf = RandomizedSearchCV(
    estimator=pipe_rf,
    param_distributions=rf_hyperparam_dist,
    n_iter=50,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    random_state=123,
)
random_search_rf.fit(X_train, y_train)

RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('columntransformer',
                                              ColumnTransformer(transformers=[('drop',
                                                                               'drop',
                                                                               ['ID']),
                                                                              ('standardscaler',
                                                                               StandardScaler(),
                                                                               ['Transportation '
                                                                                'expense',
                                                                                'Distance '
                                                                                'from '
                                                                                'Re

In [118]:
# Report the best mean cross-validated r2 and the parameter draw that produced it.
print(
    f"The best r2 score from random search random forest is {random_search_rf.best_score_:.3f}.",
    f"The best hyperparameter value is {random_search_rf.best_params_}.",
    sep="\n",
)

The best r2 score from random search random forest is -0.053.
The best hyperparameter value is {'randomforestregressor__max_depth': 2, 'randomforestregressor__n_estimators': 234}.


In [119]:
# Score the refit best pipeline on the held-out test split (uses the pipeline's own default score).
random_search_rf.best_estimator_.score(X_test, y_test)

-0.09406859484354291

In [120]:
# Display the best pipeline found by the search.
random_search_rf.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('drop', 'drop', ['ID']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['Transportation expense',
                                                   'Distance from Residence to '
                                                   'Work',
                                                   'Service time', 'Age',
                                                   'Work load Average/day ',
                                                   'Hit target', 'Son', 'Pet',
                                                   'Weight', 'Height',
                                                   'Body mass index']),
                                                 ('onehotencoder-1',
                                                  OneHotEncoder(handle_unknown='ignore'),
  

In [142]:
# Pull the per-feature importances from the fitted forest inside the best pipeline;
# flatten() returns them as a 1-D copy.
fitted_forest = random_search_rf.best_estimator_.named_steps['randomforestregressor']
feature_importances = fitted_forest.feature_importances_.flatten()

In [132]:
# Expanded column names emitted by the one-hot encoder for the categorical features.
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
categorical_encoder = random_search_rf.best_estimator_.named_steps['columntransformer'].named_transformers_['onehotencoder-1']
if hasattr(categorical_encoder, "get_feature_names_out"):
    categorical_feature_names = categorical_encoder.get_feature_names_out(categorical_features).tolist()
else:
    categorical_feature_names = categorical_encoder.get_feature_names(categorical_features).tolist()

In [137]:
# Expanded column names emitted by the one-hot encoder for the binary features
# (one column per feature because of drop='if_binary').
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
binary_encoder = random_search_rf.best_estimator_.named_steps['columntransformer'].named_transformers_['onehotencoder-2']
if hasattr(binary_encoder, "get_feature_names_out"):
    binary_feature_names = binary_encoder.get_feature_names_out(binary_features).tolist()
else:
    binary_feature_names = binary_encoder.get_feature_names(binary_features).tolist()

In [138]:
# Concatenate in the order the column transformer lays out its output blocks:
# scaled numeric, categorical one-hot, binary one-hot, then ordinal ('ID' is dropped).
all_feature_names = [
    *numeric_features,
    *categorical_feature_names,
    *binary_feature_names,
    *ordinal_feature,
]

In [146]:
# Pair each transformed feature name with its importance, one row per feature.
feature_importances_df = pd.DataFrame(
    {
        "features": all_feature_names,
        "feature importances": feature_importances,
    }
)

In [147]:
# Display the feature / importance table.
feature_importances_df

Unnamed: 0,features,feature importances
0,Transportation expense,0.013957
1,Distance from Residence to Work,0.003951
2,Service time,0.002258
3,Age,0.230289
4,Work load Average/day,0.043336
...,...,...
59,Seasons_4,0.011644
60,Disciplinary failure_True,0.000000
61,Social drinker_True,0.000000
62,Social smoker_True,0.000000
