In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining and the supplementary scripts for neighbors generation"

# Project root: three directory levels above the notebook's working directory.
absFilePath = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# Joining with an empty last component appends the platform's own separator.
# The previous hard-coded Windows backslash ('SplitSD4X\\') produced a wrong
# directory name on POSIX systems, so the imports below could not be resolved.
newPath = os.path.join(absFilePath, 'SplitSD4X', '')
sys.path.append(newPath)

# Folder holding the supplementary scripts, made importable as well.
newPath_supp = os.path.join(newPath, 'supplementary')
sys.path.append(newPath_supp)

from fill_missing_values import *
from missing_values_table import *
from subgroups_discovery import *

from neighbors_generation import *
from discretization import *

## Data Preparation 

In [3]:
" Loading the dataset "
# Build the CSV location with os.path.join so the separator matches the host
# platform; the previous hard-coded backslash ('Datasets\\') only worked on
# Windows. The trailing empty component keeps the trailing separator.
datasets_path = os.path.join(absFilePath, 'Datasets', '')
url = os.path.join(datasets_path, 'data_gpu_performances.csv')
df = pd.read_csv(url)

# Work on a random subset of 10000 rows. The fixed seed makes a fresh
# "Restart & Run All" draw the same rows every time.
df = df.sample(n=10000, random_state=0)

# Ordinal encoding of the tuning parameters: each raw value is replaced by the
# code given in its mapper (1-based position among the listed values).
mwg_nwg_mapper = {16 : 1, 32 : 2, 64 : 3, 128 : 4}
kwg_mapper = {16 : 1, 32 : 2}
mdi_ndi_mapper =  {8 : 1, 16 : 2, 32 : 3}
kwi_mapper = {2 : 1, 8 : 2}
vwm_vwn_mapper =  {1 : 1, 2 : 2, 4 : 3, 8 : 4}

# Column -> mapper table. VWM and VWN now use their own mapper: previously
# 'VWM' was remapped with mdi_ndi_mapper and then overwritten by the remapped
# 'VWN' column, while 'VWN' itself was never encoded.
ordinal_mappers = {
    'MWG': mwg_nwg_mapper,
    'NWG': mwg_nwg_mapper,
    'KWG': kwg_mapper,
    'MDIMC': mdi_ndi_mapper,
    'NDIMC': mdi_ndi_mapper,
    'MDIMA': mdi_ndi_mapper,
    'NDIMB': mdi_ndi_mapper,
    'KWI': kwi_mapper,
    'VWM': vwm_vwn_mapper,
    'VWN': vwm_vwn_mapper,
}
for column, mapper in ordinal_mappers.items():
    df[column] = df[column].replace(mapper)

# Collapse the four timing columns into their arithmetic mean, stored as
# 'Run', then discard the individual measurements.
run_columns = ['Run1 (ms)', 'Run2 (ms)', 'Run3 (ms)', 'Run4 (ms)']
run_total = df[run_columns[0]]
for run_name in run_columns[1:]:
    run_total = run_total + df[run_name]
df['Run'] = run_total / 4
df = df.drop(columns=run_columns)

# Replace the 0/1 codes of the flag columns with string labels. The inverse
# dictionaries map each label back to its numeric code.
str_mapper = {0 : 'disable', 1 : 'enable'}
s_mapper = {0 : 'N', 1 : 'T'}
str_mapper_inv = {label: code for code, label in str_mapper.items()}
s_mapper_inv = {label: code for code, label in s_mapper.items()}

for flag_col in ('STRM', 'STRN'):
    df[flag_col] = df[flag_col].replace(str_mapper)
for flag_col in ('SA', 'SB'):
    df[flag_col] = df[flag_col].replace(s_mapper)

" separate the data and the target "
# The averaged runtime 'Run' is the target; every other column is a feature.
target_df = df['Run']
data_df = df.drop(columns=['Run'])

# Columns stored with object dtype are treated as categorical, the remaining
# ones as numerical.
categorical_feature_mask = (data_df.dtypes == object)
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()

# Reorder the frame so that all numerical columns precede the categorical ones.
numerical_part = data_df[numerical_cols_names]
categorical_part = data_df[categorical_cols_names]
data_df = pd.concat([numerical_part, categorical_part], axis=1)


" Encoding categorical features"

# Map the string labels of the flag columns back to their numeric codes.
for flag_col in ('STRM', 'STRN'):
    data_df[flag_col] = data_df[flag_col].replace(str_mapper_inv)
for flag_col in ('SA', 'SB'):
    data_df[flag_col] = data_df[flag_col].replace(s_mapper_inv)

# Features and target side by side; the target is the last column ('Run').
data_target_df = pd.concat([data_df, target_df], axis=1)

In [4]:
" generate the Test SET "
# Hold out a random test set. The fixed seed keeps the split identical across
# kernel restarts.
nb_test_instances = 1000
test_df = data_target_df.sample(n=nb_test_instances, random_state=0)
data_test_df = test_df.drop(columns=['Run'])
target_test_df = test_df['Run']

# Training set = every row whose index label was not drawn for the test set.
# The previous concat + drop_duplicates(keep=False) compared row *values*, so
# it also removed training rows that merely had equal values to another row.
# NOTE(review): assumes the row index is unique, which holds for the default
# index created by pd.read_csv — confirm if the loading step changes.
train_df = data_target_df.drop(index=test_df.index)
data_train_df = train_df.drop(columns=['Run'])
target_train_df = train_df['Run']

" Extract values of the test set to generate the neighbors"

# Plain numpy arrays of the held-out features and targets.
data_test = data_test_df.values
target_test = target_test_df.values

# Positional indices of the two column groups: numerical columns come first,
# categorical columns fill the remaining positions of data_df.
nb_numerical = len(numerical_cols_names)
nb_features = data_df.shape[1]
numerical_cols = np.arange(nb_numerical)
categorical_cols = np.arange(nb_numerical, nb_features)

## Neighbors Generation 

In [5]:
# Generate `nb_neighbors` neighbors for the test instances; list_neigh is
# indexed by test-instance position below.
nb_neighbors = 20
list_neigh = generate_all_neighbors(data_test,numerical_cols,categorical_cols,nb_neighbors)

# Stack the per-instance neighbor arrays row-wise with a single concatenate
# call. Growing the array inside a loop re-copied all previously stacked rows
# at every iteration (quadratic in the number of test instances).
n = np.size(data_test,0)
all_neighbors = np.concatenate([list_neigh[i] for i in range(n)], axis=0)
    
" One hot encoding "

# Wrap the stacked neighbors in a DataFrame whose columns follow the
# numerical-then-categorical layout, and cast the categorical columns to int
# where the conversion succeeds.
neigh_columns = numerical_cols_names + categorical_cols_names
df_neigh = pd.DataFrame(data=all_neighbors, columns=neigh_columns)
categorical_as_int = df_neigh[categorical_cols_names].astype(int, errors='ignore')
df_neigh[categorical_cols_names] = categorical_as_int

" Decode all the data neighbors to perform one hot encoding "
for flag_col, decoder in (('STRM', str_mapper), ('STRN', str_mapper),
                          ('SA', s_mapper), ('SB', s_mapper)):
    df_neigh[flag_col] = df_neigh[flag_col].replace(decoder)

" One hot encoding "
df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True)

" Store the neighbors in a list"

# Cut the encoded matrix back into one chunk of `nb_neighbors` consecutive
# rows per test instance, in the same order in which they were stacked.
data_neigh = df_neigh.values
n = np.size(data_test,0)
list_neigh = [
    data_neigh[i * nb_neighbors:(i + 1) * nb_neighbors, :]
    for i in range(n)
]

####  One hot encoding for the training and the test sets

In [6]:
# Flag column -> decoder used to restore its string labels before one-hot
# encoding.
binary_decoders = (('STRM', str_mapper), ('STRN', str_mapper),
                   ('SA', s_mapper), ('SB', s_mapper))

# Training split: decode the flags, one-hot encode, then extract numpy arrays.
for flag_col, decoder in binary_decoders:
    data_train_df[flag_col] = data_train_df[flag_col].replace(decoder)
data_train_df = pd.get_dummies(data_train_df, prefix_sep='_', drop_first=True)
data_train = data_train_df.values
target_train = target_train_df.values

# Test split: same treatment.
for flag_col, decoder in binary_decoders:
    data_test_df[flag_col] = data_test_df[flag_col].replace(decoder)
data_test_df = pd.get_dummies(data_test_df, prefix_sep='_', drop_first=True)
data_test = data_test_df.values
target_test = target_test_df.values

## Training the MLP model

In [7]:
" Sklearn MLP regressor "

# Two-stage pipeline: feature standardization followed by an MLP regressor
# with two hidden layers of 50 units each.
scaler = StandardScaler()
regressor = MLPRegressor(hidden_layer_sizes=(50, 50),
                         tol=1e-2,
                         max_iter=1000,
                         random_state=0)
mlp = make_pipeline(scaler, regressor)

# Fit on the training split, then predict the held-out test instances.
model_nt = mlp.fit(data_train, target_train)
target_pred_nt = model_nt.predict(data_test)

## Execution of the Split Based Selection Form Algorithm

#### Discretization : Equal Frequency 

In [8]:
# Number of numerical columns, passed on as the split point.
split_point = len(numerical_cols)
nb_models = 100

# Run the equal-frequency variant once for each value 4..10 of its last
# argument, keeping the first element of every result.
# NOTE(review): the last argument is presumably the number of discretization
# intervals — confirm against subgroups_discovery.
L_Subgroups_freq = []
for nb_intervals in range(4, 11):
    freq_result = SplitBasedSelectionForm_freq(data_test, target_test, nb_models,
                                               model_nt, list_neigh, split_point,
                                               nb_intervals)
    L_Subgroups_freq.append(freq_result[0])


#### Discretization : Equal Width

In [9]:
# Run the equal-width variant once for each value 4..10 of its last argument,
# keeping the first element of every result.
# NOTE(review): the last argument is presumably the number of discretization
# intervals — confirm against subgroups_discovery.
L_Subgroups_width = []
for nb_intervals in range(4, 11):
    width_result = SplitBasedSelectionForm_width(data_test, target_test, nb_models,
                                                 model_nt, list_neigh, split_point,
                                                 nb_intervals)
    L_Subgroups_width.append(width_result[0])

In [10]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    """Pickle `obj` into the file `name` + '.pkl'.

    Parameters
    ----------
    obj : any picklable object
        The value to serialize.
    name : str
        Destination path without the '.pkl' extension.

    The file is opened in binary write mode, so an existing file is
    overwritten. The highest pickle protocol available is used.
    """
    destination = name + '.pkl'
    with open(destination, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file `name` + '.pkl'.

    Parameters
    ----------
    name : str
        Source path without the '.pkl' extension.

    Returns
    -------
    The deserialized object.
    """
    source = name + '.pkl'
    # Security note: unpickling can execute arbitrary code, so only load
    # files that come from a trusted source.
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [11]:
'SAVE THE DATA'

path = './saved_data/'

# save_obj opens its target with open(..., 'wb'), which does not create
# missing directories; make sure the output folder exists so a fresh checkout
# does not fail with FileNotFoundError.
os.makedirs(path, exist_ok=True)

# Persist the train/test arrays and the per-instance neighbor list, one pickle
# file per object.
for saved_obj, stem in ((data_train, 'data_train_d'),
                        (target_train, 'target_train_d'),
                        (data_test, 'data_test_d'),
                        (target_test, 'target_test_d'),
                        (list_neigh, 'list_neighbors_d')):
    save_obj(saved_obj, path + stem)

In [12]:
'SAVE THE LIST OF THE SUBGROUPS'
# Persist both subgroup collections (equal-frequency and equal-width), one
# pickle file each, under the same output folder.
for subgroups, stem in ((L_Subgroups_freq, 'l_list_subgroups_freq'),
                        (L_Subgroups_width, 'l_list_subgroups_width')):
    save_obj(subgroups, path + stem)