In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining and the supplementary scripts for neighbors generation"

# Directory three levels above the notebook's working directory
# (presumably the repository root -- confirm against the project layout).
absFilePath = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# Joining with a trailing '' appends the platform's own separator. On Windows
# the value is unchanged ('...\\SplitSD4X\\'); on POSIX systems, where a
# backslash is an ordinary file-name character, the path now points at the
# real 'SplitSD4X' directory instead of a directory literally named 'SplitSD4X\'.
newPath = os.path.join(absFilePath, 'SplitSD4X', '')
sys.path.append(newPath)

# Make the helper scripts in the 'supplementary' sub-directory importable too.
newPath_supp = os.path.join(newPath, 'supplementary')
sys.path.append(newPath_supp)

from fill_missing_values import *
from missing_values_table import *
from subgroups_discovery import *

from neighbors_generation import *

## Data Preparation 

In [3]:
" Loading the dataset "
# The trailing '' keeps a separator at the end of `datasets_path` (same value
# as before on Windows) while using the platform's own separator, so the
# notebook also finds the file on POSIX systems.
datasets_path = os.path.join(absFilePath, 'Datasets', '')
url = datasets_path + 'data_bike_hour.csv'
df = pd.read_csv(url)
# These four columns are not used as features or target anywhere below.
df = df.drop(['instant','dteday','casual','registered'],axis =1)

" Handling some data "
# Remove the rows whose weather situation is coded 4; the weather mapper
# defined further down only knows the codes 1-3.
df = df.drop(df[df.weathersit == 4].index)
# The former bare expression `df[df["weathersit"] == 4]` displayed nothing in
# the middle of a cell; state the intended check explicitly instead.
assert (df["weathersit"] != 4).all()

" Decode Categorical Features "

# Lookup tables translating the integer codes of each categorical column into
# readable labels. Every table has an inverse (label -> code) that is used
# later to re-encode the columns.
weekday_mapper = dict(enumerate(['Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat']))
weekday_mapper_inv = {label: code for code, label in weekday_mapper.items()}

holiday_mapper = dict(enumerate(['No_Holiday', 'Holiday']))
holiday_mapper_inv = {label: code for code, label in holiday_mapper.items()}

workingday_mapper = dict(enumerate(['No_Working_Day', 'Working_Day']))
workingday_mapper_inv = {label: code for code, label in workingday_mapper.items()}

# Season, weather and month codes start at 1.
season_mapper = dict(enumerate(['Spring', 'Summer', 'Fall', 'Winter'], start=1))
season_mapper_inv = {label: code for code, label in season_mapper.items()}

wethersit_mapper = dict(enumerate(['Good', 'Misty', 'Rain_Snow_Storm'], start=1))
wethersit_mapper_inv = {label: code for code, label in wethersit_mapper.items()}

mnth_mapper = dict(enumerate(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], start=1))
mnth_mapper_inv = {label: code for code, label in mnth_mapper.items()}

yr_mapper = dict(enumerate(['2011', '2012']))
yr_mapper_inv = {label: code for code, label in yr_mapper.items()}

# Replace the integer codes of each categorical column by their labels.
for column_name, code_to_label in [('weekday', weekday_mapper),
                                   ('holiday', holiday_mapper),
                                   ('workingday', workingday_mapper),
                                   ('season', season_mapper),
                                   ('weathersit', wethersit_mapper),
                                   ('mnth', mnth_mapper),
                                   ('yr', yr_mapper)]:
    df[column_name] = df[column_name].replace(code_to_label)

# Numerical Features
# Undo the dataset's normalisation so the features are in physical units.
# temp / atemp were min-max scaled: value = normalised * (t_max - t_min) + t_min.
df['temp'] = df['temp'] * (39 - (-8)) + (-8)
# Fix: the documented lower bound of `atemp` is -16 (the code used +16), which
# shifted the feeling temperature and shrank its range.
df['atemp'] = df['atemp'] * (50 - (-16)) + (-16)
# windspeed and hum were divided by their maxima (67 and 100 respectively).
df['windspeed'] = df['windspeed'] * 67
df['hum'] = df['hum']*100

" separate the data and the target "
data_df = df.drop(columns=['cnt'])
target_df = df['cnt']

" calculate the categorical features mask "
# At this point the decoded categorical columns hold strings (object dtype),
# which is what distinguishes them from the numerical columns.
categorical_feature_mask = (data_df.dtypes == object)
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()

" if no values missed we execute this code : "
# Reorder the columns: numerical features first, categorical features last.
data_df = pd.concat([data_df[numerical_cols_names], data_df[categorical_cols_names]],axis = 1)

" Encoding categorical features"
# Map every label back to its integer code. `.map` followed by an explicit
# int64 cast yields integer columns on every pandas version (the previous
# `replace` depended on implicit object -> int downcasting, which recent pandas
# deprecates) and raises instead of silently keeping an unmapped label.
for column_name, label_to_code in [('weekday', weekday_mapper_inv),
                                   ('holiday', holiday_mapper_inv),
                                   ('workingday', workingday_mapper_inv),
                                   ('season', season_mapper_inv),
                                   ('weathersit', wethersit_mapper_inv),
                                   ('mnth', mnth_mapper_inv),
                                   ('yr', yr_mapper_inv)]:
    data_df[column_name] = data_df[column_name].map(label_to_code).astype('int64')

data_target_df = pd.concat([data_df, target_df], axis=1) 

In [4]:
" generate the Test SET "
nb_test_instances = 1000 
# A fixed seed makes the split (and everything saved from it) reproducible
# after a kernel restart.
test_df = data_target_df.sample(n=nb_test_instances, random_state=0)
data_test_df = test_df.drop(columns=['cnt'])
target_test_df = test_df['cnt']

" generate the Training SET "
# Remove the sampled rows by index label. The former
# `concat(...).drop_duplicates(keep=False)` compared row *values*, so it also
# discarded training rows that merely had the same values as another row.
train_df = data_target_df.drop(index=test_df.index)
data_train_df = train_df.drop(columns=['cnt'])
target_train_df = train_df['cnt']

" Extract values of the test set to generate the neighbors"

data_test = data_test_df.values
target_test = target_test_df.values

# Positional indices of the numerical columns (first) and the categorical
# columns (last) inside `data_test`.
numerical_cols = np.arange(0,len(numerical_cols_names)) 
categorical_cols = np.arange(len(numerical_cols_names),data_df.shape[1])

## Neighbors Generation 

In [5]:
l_nb_neighbors = [20,50,100]
l_list_neigh = []

# Categorical column -> {integer code: label} table used to decode the neighbors.
_neigh_mappers = {'weekday': weekday_mapper,
                  'holiday': holiday_mapper,
                  'workingday': workingday_mapper,
                  'season': season_mapper,
                  'weathersit': wethersit_mapper,
                  'mnth': mnth_mapper,
                  'yr': yr_mapper}

for nb_neighbors in l_nb_neighbors : 
    
    list_neigh = generate_all_neighbors(data_test,numerical_cols,categorical_cols,nb_neighbors)

    " store all the neighbors together "
    n = np.size(data_test,0)
    # One concatenation over the first n neighbor blocks; growing the array
    # block by block copied the accumulated data on every iteration.
    all_neighbors = np.concatenate(list_neigh[:n], axis=0)

    " One hot encoding "

    df_neigh = pd.DataFrame(data = all_neighbors,columns= numerical_cols_names + categorical_cols_names)
    df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int,errors='ignore')

    " Decode all the data neighbors to perform one hot encoding "
    # Each decoded column becomes a categorical with the *full*, sorted label
    # set of its mapper. get_dummies then always emits the same columns in the
    # same order, even when some category is absent from the neighbors
    # (previously the layout depended on which labels happened to occur).
    for column_name, code_to_label in _neigh_mappers.items():
        labels = df_neigh[column_name].replace(code_to_label)
        df_neigh[column_name] = labels.astype(pd.CategoricalDtype(sorted(code_to_label.values())))

    " One hot encoding "
    df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True)

    " Store the neighbors in a list"

    # Request a float array explicitly: with boolean dummy columns `.values`
    # would otherwise produce an object array.
    data_neigh = df_neigh.to_numpy(dtype=float)
    # Cut the stacked rows back into one block of `nb_neighbors` rows per test instance.
    list_neigh = [data_neigh[start:(start+nb_neighbors),:]
                  for start in range(0, n * nb_neighbors, nb_neighbors)]
    
    l_list_neigh.append(list_neigh)

In [6]:
# Neighbor lists in the order of `l_nb_neighbors`: 20, 50 and 100 neighbors.
list_neigh_20, list_neigh_50, list_neigh_100 = (
    l_list_neigh[0],
    l_list_neigh[1],
    l_list_neigh[2],
)

#### One-hot encoding of the training and test sets

In [7]:
# Categorical column -> {integer code: label} table.
_one_hot_mappers = {'weekday': weekday_mapper,
                    'holiday': holiday_mapper,
                    'workingday': workingday_mapper,
                    'season': season_mapper,
                    'weathersit': wethersit_mapper,
                    'mnth': mnth_mapper,
                    'yr': yr_mapper}


def _one_hot_encode(encoded_df):
    """Decode the integer-coded categorical columns and one-hot encode them.

    Each categorical column is converted to a categorical dtype holding the
    full, sorted label set of its mapper, so the resulting dummy columns (and
    their order) do not depend on which labels occur in `encoded_df`. This
    keeps the training and test matrices column-aligned.

    Parameters
    ----------
    encoded_df : pandas.DataFrame
        Frame whose categorical columns hold the integer codes.

    Returns
    -------
    pandas.DataFrame
        New frame with the categorical columns replaced by dummy columns
        (first level of each column dropped). The input frame is not modified.
    """
    decoded_df = encoded_df.copy()
    for column_name, code_to_label in _one_hot_mappers.items():
        category_dtype = pd.CategoricalDtype(sorted(code_to_label.values()))
        labels = decoded_df[column_name].replace(code_to_label)
        decoded_df[column_name] = labels.astype(category_dtype)
    return pd.get_dummies(decoded_df, prefix_sep='_', drop_first=True)


# `to_numpy(dtype=float)` guarantees a numeric array; with boolean dummy
# columns `.values` would otherwise produce an object array.
data_train_df = _one_hot_encode(data_train_df)
data_train = data_train_df.to_numpy(dtype=float)
target_train = target_train_df.values

data_test_df = _one_hot_encode(data_test_df)
data_test = data_test_df.to_numpy(dtype=float)
target_test = target_test_df.values

## Training the MLP model

In [8]:
" Sklearn MLP regressor "

# Hyper-parameters of the regressor: two hidden layers of 50 units each,
# a tolerance of 1e-2, at most 1000 iterations, and a fixed random state.
mlp_settings = dict(hidden_layer_sizes=(50, 50),
                    tol=1e-2,
                    max_iter=1000,
                    random_state=0)

# Standardise the features, then fit the MLP regressor on the scaled data.
mlp = make_pipeline(StandardScaler(), MLPRegressor(**mlp_settings))
model_nt = mlp.fit(data_train, target_train)

# Predictions of the fitted model on the test set.
target_pred_nt = model_nt.predict(data_test)

## Running the Split-Based Selection Form algorithm

In [9]:
# Number of numerical columns, passed to the algorithm as the split point.
split_point = len(numerical_cols)
nb_models = 100

# Run the algorithm once per neighborhood size, in the order 20, 50, 100.
subgroup_runs = []
for neighbors in (list_neigh_20, list_neigh_50, list_neigh_100):
    subgroup_runs.append(
        SplitBasedSelectionForm(data_test, target_test, nb_models, model_nt, neighbors, split_point)
    )

# Each run returns a pair; unpack them under the names used further down.
(L_Subgroups_20, P_1), (L_Subgroups_50, P_2), (L_Subgroups_100, P_3) = subgroup_runs

In [10]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    """Pickle `obj` into the file `name + '.pkl'`.

    Parameters
    ----------
    obj : object
        Any picklable object.
    name : str
        Target path without the '.pkl' extension.

    The parent directory of the target file is created when it does not exist
    yet, so saving into a fresh output folder no longer fails with
    FileNotFoundError.
    """
    target_file = name + '.pkl'
    parent_dir = os.path.dirname(target_file)
    # `parent_dir` is empty for a bare file name; os.makedirs('') would raise.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(target_file, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file `name + '.pkl'`.

    Parameters
    ----------
    name : str
        Path of the pickle file without the '.pkl' extension.

    Returns
    -------
    object
        The object stored in the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files that were
    # written by a trusted source.
    pickle_file = name + '.pkl'
    with open(pickle_file, 'rb') as stream:
        restored = pickle.load(stream)
    return restored

In [11]:
'SAVE THE DATA'

path = './saved_data/'

# (file stem, object) pairs, written in this order below `path`.
arrays_to_save = [
    ('data_train_n', data_train),
    ('target_train_n', target_train),
    ('data_test_n', data_test),
    ('target_test_n', target_test),
    ('list_neighbors_20', list_neigh_20),
    ('list_neighbors_50', list_neigh_50),
    ('list_neighbors_100', list_neigh_100),
]
for file_stem, payload in arrays_to_save:
    save_obj(payload, path + file_stem)

In [12]:
'SAVE THE LIST OF THE SUBGROUPS'
# One pickle file per neighborhood size, stored below `path`.
for file_stem, subgroups in (('list_subgroups_20', L_Subgroups_20),
                             ('list_subgroups_50', L_Subgroups_50),
                             ('list_subgroups_100', L_Subgroups_100)):
    save_obj(subgroups, path + file_stem)