In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining "

absFilePath = os.path.dirname(os.path.dirname(os.getcwd()))
newPath = os.path.join(absFilePath, 'SplitSD4X')
sys.path.append(newPath)

from fill_missing_values import *
from missing_values_table import *
from neighbors_generation import *
from patterns_extraction import *
from performances import *
from subgroups_discovery import *
from sp_lime import *

## Data Preparation 

In [3]:
" Loading the dataset "
datasets_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'Datasets\\')
url = datasets_path + 'data_bike_hour.csv'
df = pd.read_csv(url)
df = df.drop(['instant','dteday','casual','registered'],axis =1)

In [4]:
" Handling some data "
df = df.drop(df[df.weathersit == 4].index)
df[df["weathersit"] == 4]
df.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1


In [5]:
" Decode Categorical Features "

weekday_mapper = {0 : 'Sun', 
                  1 : 'Mon',
                  2 : 'Tue',
                  3 : 'Wed',
                  4 : 'Thu',
                  5 : 'Fri',
                  6 : 'Sat' }
weekday_mapper_inv = dict(map(reversed, weekday_mapper.items()))
df['weekday'] = df['weekday'].replace(weekday_mapper)


holiday_mapper = {0 : 'No_Holiday',
                  1 : 'Holiday'}
holiday_mapper_inv = dict(map(reversed, holiday_mapper.items()))
df['holiday'] = df['holiday'].replace(holiday_mapper)



workingday_mapper = {0 : 'No_Working_Day',
                     1 : 'Working_Day'}
workingday_mapper_inv = dict(map(reversed, workingday_mapper.items()))
df['workingday'] = df['workingday'].replace(workingday_mapper)



season_mapper = {1 : 'Spring',
                 2 : 'Summer',
                 3 : 'Fall',
                 4 : 'Winter'}
season_mapper_inv = dict(map(reversed, season_mapper.items()))
df['season'] = df['season'].replace(season_mapper)

wethersit_mapper = {1 : 'Good',
                    2 : 'Misty',
                    3 : 'Rain_Snow_Storm'}
wethersit_mapper_inv = dict(map(reversed, wethersit_mapper.items()))
df['weathersit'] = df['weathersit'].replace(wethersit_mapper)



mnth_mapper = {1  : 'Jan',
               2  : 'Feb',
               3  : 'Mar',
               4  : 'Apr',
               5  : 'May',
               6  : 'Jun',
               7  : 'Jul',
               8  : 'Aug',
               9  : 'Sep',
               10 : 'Oct',
               11 : 'Nov',
               12 : 'Dec'}
mnth_mapper_inv = dict(map(reversed, mnth_mapper.items()))
df['mnth'] = df['mnth'].replace(mnth_mapper)


yr_mapper = {0 : '2011',
             1 : '2012'}
yr_mapper_inv = dict(map(reversed, yr_mapper.items()))
df['yr'] = df['yr'].replace(yr_mapper)

# Numerical Features
df['temp'] = df['temp'] * (39 - (-8)) + (-8)
df['atemp'] = df['atemp'] * (50 - (16)) + (16)
df['windspeed'] = df['windspeed'] * 67
df['hum'] = df['hum']*100
df.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,Spring,2011,Jan,0,No_Holiday,Sat,No_Working_Day,Good,3.28,25.7886,81.0,0.0,16
1,Spring,2011,Jan,1,No_Holiday,Sat,No_Working_Day,Good,2.34,25.2718,80.0,0.0,40
2,Spring,2011,Jan,2,No_Holiday,Sat,No_Working_Day,Good,2.34,25.2718,80.0,0.0,32
3,Spring,2011,Jan,3,No_Holiday,Sat,No_Working_Day,Good,3.28,25.7886,75.0,0.0,13
4,Spring,2011,Jan,4,No_Holiday,Sat,No_Working_Day,Good,3.28,25.7886,75.0,0.0,1


In [6]:
" display the features types "
df.dtypes

season         object
yr             object
mnth           object
hr              int64
holiday        object
weekday        object
workingday     object
weathersit     object
temp          float64
atemp         float64
hum           float64
windspeed     float64
cnt             int64
dtype: object

In [7]:
" Checking missing values "
df.replace('?', np.nan, inplace=True)
missing_values_table(df)

Your slelected dataframe has 13 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [8]:
" separate the data and the target "
data_df = df.drop(columns=['cnt'])
target_df = df['cnt']

In [9]:
" calculate the categorical features mask "
categorical_feature_mask = (data_df.dtypes == object)
categorical_feature_mask

season         True
yr             True
mnth           True
hr            False
holiday        True
weekday        True
workingday     True
weathersit     True
temp          False
atemp         False
hum           False
windspeed     False
dtype: bool

In [10]:
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()
categorical_cols_names

['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

In [11]:
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()
numerical_cols_names

['hr', 'temp', 'atemp', 'hum', 'windspeed']

In [12]:
" if no values missed we execute this code : "
data_df = pd.concat([data_df[numerical_cols_names], data_df[categorical_cols_names]],axis = 1)
data_df.head()

Unnamed: 0,hr,temp,atemp,hum,windspeed,season,yr,mnth,holiday,weekday,workingday,weathersit
0,0,3.28,25.7886,81.0,0.0,Spring,2011,Jan,No_Holiday,Sat,No_Working_Day,Good
1,1,2.34,25.2718,80.0,0.0,Spring,2011,Jan,No_Holiday,Sat,No_Working_Day,Good
2,2,2.34,25.2718,80.0,0.0,Spring,2011,Jan,No_Holiday,Sat,No_Working_Day,Good
3,3,3.28,25.7886,75.0,0.0,Spring,2011,Jan,No_Holiday,Sat,No_Working_Day,Good
4,4,3.28,25.7886,75.0,0.0,Spring,2011,Jan,No_Holiday,Sat,No_Working_Day,Good


In [13]:
" Encoding categorical features"

data_df['weekday'] = data_df['weekday'].replace(weekday_mapper_inv)
data_df['holiday'] = data_df['holiday'].replace(holiday_mapper_inv)
data_df['workingday'] = data_df['workingday'].replace(workingday_mapper_inv)
data_df['season'] = data_df['season'].replace(season_mapper_inv)
data_df['weathersit'] = data_df['weathersit'].replace(wethersit_mapper_inv)
data_df['mnth'] = data_df['mnth'].replace(mnth_mapper_inv)
data_df['yr'] = data_df['yr'].replace(yr_mapper_inv)
data_df.head()

Unnamed: 0,hr,temp,atemp,hum,windspeed,season,yr,mnth,holiday,weekday,workingday,weathersit
0,0,3.28,25.7886,81.0,0.0,1,0,1,0,6,0,1
1,1,2.34,25.2718,80.0,0.0,1,0,1,0,6,0,1
2,2,2.34,25.2718,80.0,0.0,1,0,1,0,6,0,1
3,3,3.28,25.7886,75.0,0.0,1,0,1,0,6,0,1
4,4,3.28,25.7886,75.0,0.0,1,0,1,0,6,0,1


In [14]:
data_target_df = pd.concat([data_df, target_df], axis=1) 

In [15]:
" generate the Test SET "
nb_test_instances = 1000 
test_df = data_target_df.sample(n=nb_test_instances)
data_test_df = test_df.drop(columns=['cnt'])
target_test_df = test_df['cnt']

In [16]:
" generate the Training SET "
train_df = pd.concat([data_target_df,test_df]).drop_duplicates(keep=False)
data_train_df = train_df.drop(columns=['cnt'])
target_train_df = train_df['cnt']

In [17]:
" Extract values of the test set to generate the neighbors"

data_test = data_test_df.values
target_test = target_test_df.values

In [18]:
numerical_cols = np.arange(0,len(numerical_cols_names)) 
categorical_cols = np.arange(len(numerical_cols_names),data_df.shape[1])

## Neighbors Generation

In [19]:
nb_neighbors = 50 
list_neigh = generate_all_neighbors(data_test,numerical_cols,categorical_cols,nb_neighbors)

In [20]:
" store all the neighbors together "
n = np.size(data_test,0)
all_neighbors = list_neigh[0]
for i in range(1,n) :
    all_neighbors = np.concatenate((all_neighbors, list_neigh[i]), axis=0)

###  One hot encoding 

In [21]:
df_neigh = pd.DataFrame(data = all_neighbors,columns= numerical_cols_names + categorical_cols_names)
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int,errors='ignore')

" Decode all the data neighbors to perform one hot encoding "
df_neigh['weekday'] = df_neigh['weekday'].replace(weekday_mapper)
df_neigh['holiday'] = df_neigh['holiday'].replace(holiday_mapper)
df_neigh['workingday'] = df_neigh['workingday'].replace(workingday_mapper)
df_neigh['season'] = df_neigh['season'].replace(season_mapper)
df_neigh['weathersit'] = df_neigh['weathersit'].replace(wethersit_mapper)
df_neigh['mnth'] = df_neigh['mnth'].replace(mnth_mapper)
df_neigh['yr'] = df_neigh['yr'].replace(yr_mapper)
df_neigh.head()

Unnamed: 0,hr,temp,atemp,hum,windspeed,season,yr,mnth,holiday,weekday,workingday,weathersit
0,14.999483,4.267091,23.283897,56.446964,29.306036,Spring,2012,Dec,No_Holiday,Thu,Working_Day,Misty
1,15.170446,2.422202,22.107838,56.443571,30.098421,Spring,2012,Jan,No_Holiday,Thu,Working_Day,Misty
2,15.289875,4.826115,23.518743,54.343997,30.960047,Spring,2012,Nov,No_Holiday,Fri,Working_Day,Misty
3,15.467399,5.396405,23.937963,55.65214,30.512264,Spring,2012,Dec,No_Holiday,Sat,No_Working_Day,Misty
4,15.514197,4.988442,23.811058,57.070115,29.069665,Spring,2012,Dec,No_Holiday,Mon,Working_Day,Good


In [22]:
" One hot encoding "
df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True)
df_neigh.head()

Unnamed: 0,hr,temp,atemp,hum,windspeed,season_Spring,season_Summer,season_Winter,yr_2012,mnth_Aug,...,holiday_No_Holiday,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed,workingday_Working_Day,weathersit_Misty,weathersit_Rain_Snow_Storm
0,14.999483,4.267091,23.283897,56.446964,29.306036,1,0,0,1,0,...,1,0,0,0,1,0,0,1,1,0
1,15.170446,2.422202,22.107838,56.443571,30.098421,1,0,0,1,0,...,1,0,0,0,1,0,0,1,1,0
2,15.289875,4.826115,23.518743,54.343997,30.960047,1,0,0,1,0,...,1,0,0,0,0,0,0,1,1,0
3,15.467399,5.396405,23.937963,55.65214,30.512264,1,0,0,1,0,...,1,0,1,0,0,0,0,0,1,0
4,15.514197,4.988442,23.811058,57.070115,29.069665,1,0,0,1,0,...,1,1,0,0,0,0,0,1,0,0


In [23]:
" Store the neighbors in a list"

data_neigh = df_neigh.values
n = np.size(data_test,0)
list_neigh = []
j = 0
for i in range(0,n):
    list_neigh.append(data_neigh[j:(j+nb_neighbors),:])
    j += nb_neighbors

####  One hot encoding for the training and the test sets

In [24]:
data_train_df['weekday'] = data_train_df['weekday'].replace(weekday_mapper)
data_train_df['holiday'] = data_train_df['holiday'].replace(holiday_mapper)
data_train_df['workingday'] = data_train_df['workingday'].replace(workingday_mapper)
data_train_df['season'] = data_train_df['season'].replace(season_mapper)
data_train_df['weathersit'] = data_train_df['weathersit'].replace(wethersit_mapper)
data_train_df['mnth'] = data_train_df['mnth'].replace(mnth_mapper)
data_train_df['yr'] = data_train_df['yr'].replace(yr_mapper)

In [25]:
data_train_df = pd.get_dummies(data_train_df, prefix_sep='_', drop_first=True)
data_train_df.head()

Unnamed: 0,hr,temp,atemp,hum,windspeed,season_Spring,season_Summer,season_Winter,yr_2012,mnth_Aug,...,holiday_No_Holiday,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed,workingday_Working_Day,weathersit_Misty,weathersit_Rain_Snow_Storm
0,0,3.28,25.7886,81.0,0.0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,1,2.34,25.2718,80.0,0.0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2,2,2.34,25.2718,80.0,0.0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
3,3,3.28,25.7886,75.0,0.0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
4,4,3.28,25.7886,75.0,0.0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [26]:
data_train = data_train_df.values
target_train = target_train_df.values

In [27]:
data_test_df['weekday'] = data_test_df['weekday'].replace(weekday_mapper)
data_test_df['holiday'] = data_test_df['holiday'].replace(holiday_mapper)
data_test_df['workingday'] = data_test_df['workingday'].replace(workingday_mapper)
data_test_df['season'] = data_test_df['season'].replace(season_mapper)
data_test_df['weathersit'] = data_test_df['weathersit'].replace(wethersit_mapper)
data_test_df['mnth'] = data_test_df['mnth'].replace(mnth_mapper)
data_test_df['yr'] = data_test_df['yr'].replace(yr_mapper)

In [28]:
data_test_df = pd.get_dummies(data_test_df, prefix_sep='_', drop_first=True)
data_test_df.head()

Unnamed: 0,hr,temp,atemp,hum,windspeed,season_Spring,season_Summer,season_Winter,yr_2012,mnth_Aug,...,holiday_No_Holiday,weekday_Mon,weekday_Sat,weekday_Sun,weekday_Thu,weekday_Tue,weekday_Wed,workingday_Working_Day,weathersit_Misty,weathersit_Rain_Snow_Storm
17274,15,4.22,23.2114,56.0,30.0026,1,0,0,1,0,...,1,0,0,0,1,0,0,1,1,0
16204,22,16.44,33.0,77.0,16.9979,0,0,1,1,0,...,0,1,0,0,0,0,0,0,0,0
4123,17,26.78,38.6678,42.0,15.0013,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1464,8,10.8,29.9094,100.0,19.9995,1,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
10341,0,9.86,29.3926,54.0,12.998,1,0,0,1,0,...,1,1,0,0,0,0,0,1,0,0


In [29]:
data_test = data_test_df.values
target_test = target_test_df.values

In [30]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [31]:
'SAVE THE DATA'

path = './saved_data/'
save_obj(data_train, path + 'data_train')
save_obj(target_train, path + 'target_train')
save_obj(data_test, path  + 'data_test')
save_obj(target_test, path + 'target_test')
save_obj(list_neigh, path + 'list_neighbors')

## Training the models

In [32]:
" Linear Regression : "
regression = LinearRegression()
model_reg = regression.fit(data_train, target_train)
target_pred_reg = model_reg.predict(data_test)

In [41]:
" Random Forest Regressor "
regr = RandomForestRegressor(max_depth=8, random_state=0)
model_rf = regr.fit(data_train, target_train)
target_pred_rf = model_rf.predict(data_test)

In [34]:
" Ada Boost regressor "
adaBoostRegr = AdaBoostRegressor(random_state=0, n_estimators=100)
model_abr = adaBoostRegr.fit(data_train, target_train)
target_pred_abr = model_abr.predict(data_test)

In [35]:
" Sklearn MLP regressor "

mlp = make_pipeline(StandardScaler(),
                    MLPRegressor(hidden_layer_sizes=(50, 50),
                                 tol=1e-2, 
                                 max_iter=1000, 
                                 random_state=0))
model_nt = mlp.fit(data_train, target_train)
target_pred_nt = model_nt.predict(data_test)

## Scores of the black box models 


In [42]:
print(f"{'The score of the linear regression model is ' :<60}{': {}'.format(round(model_reg.score(data_test, target_test),4))}")
print(f"{'The score of the Random Forest Regressor model is ':<60}{': {}'.format(round(model_rf.score(data_test, target_test),4))}")
print(f"{'The score of the AdaBoost Regressor model is ':<60}{': {}'.format(round(model_abr.score(data_test, target_test),4))}")
print(f"{'The score of the Multi-Layer-Perceptron Regressor model is ':<60}{': {}'.format(round(model_nt.score(data_test, target_test),4))}")

The score of the linear regression model is                 : 0.4057
The score of the Random Forest Regressor model is           : 0.8478
The score of the AdaBoost Regressor model is                : 0.6204
The score of the Multi-Layer-Perceptron Regressor model is  : 0.9353


## Execution of Split Based Selection Form Algorithm : 

In [37]:
split_point = len(numerical_cols)
nb_models = 100
(L_Subgroups,P) = SplitBasedSelectionForm (data_test, target_test, nb_models, model_nt, list_neigh,split_point)

In [38]:
'SAVE THE LIST OF THE SUBGROUPS'
save_obj(L_Subgroups, path + 'list_subgroups')

## Subgroups Descriptions

In [39]:
att_names = data_test_df.columns
patt_descriptions = patterns(P,split_point,data_test,att_names)

subrgoup 0
5.4 < hr <= 6.1
-------------------------------------------------------------------
subrgoup 1
4.0 < hr <= 5.4
-------------------------------------------------------------------
subrgoup 2
0.0 < hr <= 4.0
-------------------------------------------------------------------
subrgoup 3
16.1 < hr <= 19.4
workingday_Working_Day = 0
46.1 < hum <= 100.0
17.1 < windspeed <= 57.0
-------------------------------------------------------------------
subrgoup 4
19.4 < hr <= 21.2
workingday_Working_Day = 0
-4.24 < temp <= 13.15
-------------------------------------------------------------------
subrgoup 5
19.4 < hr <= 21.2
workingday_Working_Day = 1
12.12 < temp <= 37.12
0.0 < hum <= 45.0
-------------------------------------------------------------------
subrgoup 6
6.1 < hr <= 7.2
-4.24 < temp <= 6.48
-------------------------------------------------------------------
subrgoup 7
9.6 < hr <= 10.2
-4.24 < temp <= 13.06
season_Spring = 0
0.0 < hum <= 55.1
----------------------------------

In [40]:
'SAVE THE SUBGROUPS PATTERNS'
save_obj(patt_descriptions, path + 'patterns')
save_obj(att_names, path + 'att_names')