In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining "

absFilePath = os.path.dirname(os.path.dirname(os.getcwd()))
newPath = os.path.join(absFilePath, 'SplitSD4X')
sys.path.append(newPath)

from fill_missing_values import *
from missing_values_table import *
from neighbors_generation import *
from patterns_extraction import *
from performances import *
from subgroups_discovery import *
from sp_lime import *

## Data Preparation 

In [3]:
" Loading the dataset "
datasets_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'Datasets\\')
url = datasets_path + 'data_gpu_performances.csv'
df = pd.read_csv(url)
df = df.sample(n=10000)
df.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM,STRN,SA,SB,Run1 (ms),Run2 (ms),Run3 (ms),Run4 (ms)
156531,128,32,16,16,16,32,16,8,1,2,0,0,1,1,45.13,45.11,45.08,45.18
74596,64,32,16,8,16,16,8,8,1,1,0,1,0,0,129.5,129.4,129.43,129.91
127559,64,128,32,8,16,8,32,8,4,1,0,1,1,1,193.04,194.38,193.78,194.45
115959,64,128,16,8,16,16,32,2,2,1,0,1,1,1,151.85,151.54,151.77,152.04
210042,128,128,16,16,8,8,16,2,2,2,1,0,1,0,513.24,515.08,514.07,514.99


In [4]:
" Handling data "

mwg_nwg_mapper = {16 : 1, 32 : 2, 64 : 3, 128 : 4}
df['MWG'] = df['MWG'].replace(mwg_nwg_mapper)
df['NWG'] = df['NWG'].replace(mwg_nwg_mapper)

kwg_mapper = {16 : 1, 32 : 2}
df['KWG'] = df['KWG'].replace(kwg_mapper)

mdi_ndi_mapper =  {8 : 1, 16 : 2, 32 : 3}
df['MDIMC'] = df['MDIMC'].replace(mdi_ndi_mapper)
df['NDIMC'] = df['NDIMC'].replace(mdi_ndi_mapper)
df['MDIMA'] = df['MDIMA'].replace(mdi_ndi_mapper)
df['NDIMB'] = df['NDIMB'].replace(mdi_ndi_mapper)

kwi_mapper = {2 : 1, 8 : 2}
df['KWI'] = df['KWI'].replace(kwi_mapper)

vwm_vwn_mapper =  {1 : 1, 2 : 2, 4 : 3, 8 : 4}
df['VWM'] = df['VWM'].replace(mdi_ndi_mapper)
df['VWM'] = df['VWN'].replace(mdi_ndi_mapper)

df['Run'] = (df['Run1 (ms)'] + df['Run2 (ms)'] + df['Run3 (ms)'] + df['Run4 (ms)']) / 4
df = df.drop(['Run1 (ms)','Run2 (ms)','Run3 (ms)','Run4 (ms)'],axis =1)
df.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM,STRN,SA,SB,Run
156531,4,2,1,2,2,3,2,2,2,2,0,0,1,1,45.125
74596,3,2,1,1,2,2,1,2,1,1,0,1,0,0,129.56
127559,3,4,2,1,2,1,3,2,1,1,0,1,1,1,193.9125
115959,3,4,1,1,2,2,3,1,1,1,0,1,1,1,151.8
210042,4,4,1,2,1,1,2,1,2,2,1,0,1,0,514.345


In [5]:
" Decode Categorical Features "

str_mapper = {0 : 'disable', 1 : 'enable'}
str_mapper_inv = dict(map(reversed, str_mapper.items()))
df['STRM'] = df['STRM'].replace(str_mapper)
df['STRN'] = df['STRN'].replace(str_mapper)

s_mapper = {0 : 'N', 1 : 'T'}
s_mapper_inv = dict(map(reversed, s_mapper.items()))
df['SA'] = df['SA'].replace(s_mapper)
df['SB'] = df['SB'].replace(s_mapper)
df.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM,STRN,SA,SB,Run
156531,4,2,1,2,2,3,2,2,2,2,disable,disable,T,T,45.125
74596,3,2,1,1,2,2,1,2,1,1,disable,enable,N,N,129.56
127559,3,4,2,1,2,1,3,2,1,1,disable,enable,T,T,193.9125
115959,3,4,1,1,2,2,3,1,1,1,disable,enable,T,T,151.8
210042,4,4,1,2,1,1,2,1,2,2,enable,disable,T,N,514.345


In [6]:
a = df['MWG']
steps = (pd.cut(a,5, retbins=True,include_lowest=True))[1][1:-1]
steps = np.trunc(steps)
steps

array([1., 2., 2., 3.])

In [7]:
" display the features types "
df.dtypes

MWG        int64
NWG        int64
KWG        int64
MDIMC      int64
NDIMC      int64
MDIMA      int64
NDIMB      int64
KWI        int64
VWM        int64
VWN        int64
STRM      object
STRN      object
SA        object
SB        object
Run      float64
dtype: object

In [8]:
" Checking missing values "
df.replace('?', np.nan, inplace=True)
missing_values_table(df)

Your slelected dataframe has 15 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [9]:
" separate the data and the target "
data_df = df.drop(columns=['Run'])
target_df = df['Run']

In [10]:
" calculate the categorical features mask "
categorical_feature_mask = (data_df.dtypes == object)
categorical_feature_mask

MWG      False
NWG      False
KWG      False
MDIMC    False
NDIMC    False
MDIMA    False
NDIMB    False
KWI      False
VWM      False
VWN      False
STRM      True
STRN      True
SA        True
SB        True
dtype: bool

In [11]:
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()
categorical_cols_names

['STRM', 'STRN', 'SA', 'SB']

In [12]:
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()
numerical_cols_names

['MWG', 'NWG', 'KWG', 'MDIMC', 'NDIMC', 'MDIMA', 'NDIMB', 'KWI', 'VWM', 'VWN']

In [13]:
" if no values missed we execute this code : "
data_df = pd.concat([data_df[numerical_cols_names], data_df[categorical_cols_names]],axis = 1)
data_df.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM,STRN,SA,SB
156531,4,2,1,2,2,3,2,2,2,2,disable,disable,T,T
74596,3,2,1,1,2,2,1,2,1,1,disable,enable,N,N
127559,3,4,2,1,2,1,3,2,1,1,disable,enable,T,T
115959,3,4,1,1,2,2,3,1,1,1,disable,enable,T,T
210042,4,4,1,2,1,1,2,1,2,2,enable,disable,T,N


In [14]:
" Encoding categorical features"

data_df['STRM'] = data_df['STRM'].replace(str_mapper_inv)
data_df['STRN'] = data_df['STRN'].replace(str_mapper_inv)
data_df['SA'] = data_df['SA'].replace(s_mapper_inv)
data_df['SB'] = data_df['SB'].replace(s_mapper_inv)
data_df.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM,STRN,SA,SB
156531,4,2,1,2,2,3,2,2,2,2,0,0,1,1
74596,3,2,1,1,2,2,1,2,1,1,0,1,0,0
127559,3,4,2,1,2,1,3,2,1,1,0,1,1,1
115959,3,4,1,1,2,2,3,1,1,1,0,1,1,1
210042,4,4,1,2,1,1,2,1,2,2,1,0,1,0


In [15]:
data_target_df = pd.concat([data_df, target_df], axis=1) 

In [16]:
" generate the Test SET "
nb_test_instances = 1000 
test_df = data_target_df.sample(n=nb_test_instances)
data_test_df = test_df.drop(columns=['Run'])
target_test_df = test_df['Run']

In [17]:
" generate the Training SET "
train_df = pd.concat([data_target_df,test_df]).drop_duplicates(keep=False)
data_train_df = train_df.drop(columns=['Run'])
target_train_df = train_df['Run']

In [18]:
" Extract values of the test set to generate the neighbors"

data_test = data_test_df.values
target_test = target_test_df.values

In [19]:
numerical_cols = np.arange(0,len(numerical_cols_names)) 
categorical_cols = np.arange(len(numerical_cols_names),data_df.shape[1])

## Neighbors Generation

In [20]:
nb_neighbors = 50 
list_neigh = generate_all_neighbors(data_test,numerical_cols,categorical_cols,nb_neighbors)

In [21]:
" store all the neighbors together "
n = np.size(data_test,0)
all_neighbors = list_neigh[0]
for i in range(1,n) :
    all_neighbors = np.concatenate((all_neighbors, list_neigh[i]), axis=0)

###  One hot encoding 

In [22]:
df_neigh = pd.DataFrame(data = all_neighbors,columns= numerical_cols_names + categorical_cols_names)
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int,errors='ignore')

" Decode all the data neighbors to perform one hot encoding "
df_neigh['STRM'] = df_neigh['STRM'].replace(str_mapper)
df_neigh['STRN'] = df_neigh['STRN'].replace(str_mapper)
df_neigh['SA'] = df_neigh['SA'].replace(s_mapper)
df_neigh['SB'] = df_neigh['SB'].replace(s_mapper)

df_neigh.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM,STRN,SA,SB
0,4.186105,3.140434,1.002349,1.978606,0.994704,2.012724,2.012767,1.932645,4.15425,4.377014,disable,disable,T,T
1,3.997399,3.114142,0.938841,1.946713,0.982081,1.961076,2.026842,2.049204,3.976246,4.110474,disable,disable,T,T
2,3.927249,2.967254,1.003999,2.01454,0.91605,1.993997,1.988218,2.067772,4.123034,3.520577,disable,disable,T,T
3,3.996907,3.041041,0.922809,1.936176,0.970281,2.088889,1.952451,1.961817,4.011859,3.969447,disable,disable,T,T
4,4.223511,2.85577,0.910055,2.031062,0.995447,2.050172,2.025981,2.007337,3.749771,3.758769,disable,disable,T,T


In [23]:
" One hot encoding "
df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True)
df_neigh.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM_enable,STRN_enable,SA_T,SB_T
0,4.186105,3.140434,1.002349,1.978606,0.994704,2.012724,2.012767,1.932645,4.15425,4.377014,0,0,1,1
1,3.997399,3.114142,0.938841,1.946713,0.982081,1.961076,2.026842,2.049204,3.976246,4.110474,0,0,1,1
2,3.927249,2.967254,1.003999,2.01454,0.91605,1.993997,1.988218,2.067772,4.123034,3.520577,0,0,1,1
3,3.996907,3.041041,0.922809,1.936176,0.970281,2.088889,1.952451,1.961817,4.011859,3.969447,0,0,1,1
4,4.223511,2.85577,0.910055,2.031062,0.995447,2.050172,2.025981,2.007337,3.749771,3.758769,0,0,1,1


In [24]:
" Store the neighbors in a list"

data_neigh = df_neigh.values
n = np.size(data_test,0)
list_neigh = []
j = 0
for i in range(0,n):
    list_neigh.append(data_neigh[j:(j+nb_neighbors),:])
    j += nb_neighbors

####  One hot encoding for the training and the test sets

In [25]:
data_train_df['STRM'] = data_train_df['STRM'].replace(str_mapper)
data_train_df['STRN'] = data_train_df['STRN'].replace(str_mapper)
data_train_df['SA'] = data_train_df['SA'].replace(s_mapper)
data_train_df['SB'] = data_train_df['SB'].replace(s_mapper)

In [26]:
data_train_df = pd.get_dummies(data_train_df, prefix_sep='_', drop_first=True)
data_train_df.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM_enable,STRN_enable,SA_T,SB_T
156531,4,2,1,2,2,3,2,2,2,2,0,0,1,1
74596,3,2,1,1,2,2,1,2,1,1,0,1,0,0
127559,3,4,2,1,2,1,3,2,1,1,0,1,1,1
115959,3,4,1,1,2,2,3,1,1,1,0,1,1,1
210042,4,4,1,2,1,1,2,1,2,2,1,0,1,0


In [27]:
data_train = data_train_df.values
target_train = target_train_df.values

In [28]:
data_test_df['STRM'] = data_test_df['STRM'].replace(str_mapper)
data_test_df['STRN'] = data_test_df['STRN'].replace(str_mapper)
data_test_df['SA'] = data_test_df['SA'].replace(s_mapper)
data_test_df['SB'] = data_test_df['SB'].replace(s_mapper)

In [29]:
data_test_df = pd.get_dummies(data_test_df, prefix_sep='_', drop_first=True)
data_test_df.head()

Unnamed: 0,MWG,NWG,KWG,MDIMC,NDIMC,MDIMA,NDIMB,KWI,VWM,VWN,STRM_enable,STRN_enable,SA_T,SB_T
177951,4,3,1,2,1,2,2,2,4,4,1,1,1,1
6349,1,3,1,1,2,1,2,1,2,2,1,1,0,1
195722,4,3,2,2,3,2,2,1,1,1,1,0,1,0
94571,3,3,1,2,1,3,2,2,4,4,1,0,1,1
227785,4,4,2,2,1,1,1,1,1,1,1,0,0,1


In [30]:
data_test = data_test_df.values
target_test = target_test_df.values

In [31]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [32]:
'SAVE THE DATA'

path = './saved_data/'
save_obj(data_train, path + 'data_train')
save_obj(target_train, path + 'target_train')
save_obj(data_test, path  + 'data_test')
save_obj(target_test, path + 'target_test')
save_obj(list_neigh, path + 'list_neighbors')

## Training the models

In [33]:
" Linear Regression : "
regression = LinearRegression()
model_reg = regression.fit(data_train, target_train)
target_pred_reg = model_reg.predict(data_test)

In [34]:
" Random Forest Regressor "
regr = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=0)
model_rf = regr.fit(data_train, target_train)
target_pred_rf = model_rf.predict(data_test)

In [35]:
" Ada Boost regressor "
adaBoostRegr = AdaBoostRegressor(random_state=0, n_estimators=10)
model_abr = adaBoostRegr.fit(data_train, target_train)
target_pred_abr = model_abr.predict(data_test)

In [36]:
" Sklearn MLP regressor "

mlp = make_pipeline(StandardScaler(),
                    MLPRegressor(hidden_layer_sizes=(50, 50),
                                 tol=1e-2, 
                                 max_iter=1000, 
                                 random_state=0))
model_nt = mlp.fit(data_train, target_train)
target_pred_nt = model_nt.predict(data_test)

## Scores of the black box models 


In [37]:
print(f"{'The score of the linear regression model is ' :<60}{': {}'.format(round(model_reg.score(data_test, target_test),4))}")
print(f"{'The score of the Random Forest Regressor model is ':<60}{': {}'.format(round(model_rf.score(data_test, target_test),4))}")
print(f"{'The score of the AdaBoost Regressor model is ':<60}{': {}'.format(round(model_abr.score(data_test, target_test),4))}")
print(f"{'The score of the Multi-Layer-Perceptron Regressor model is ':<60}{': {}'.format(round(model_nt.score(data_test, target_test),4))}")

The score of the linear regression model is                 : 0.4203
The score of the Random Forest Regressor model is           : 0.8788
The score of the AdaBoost Regressor model is                : 0.6787
The score of the Multi-Layer-Perceptron Regressor model is  : 0.9916


## Execution of Split Based Selection Form Algorithm : 

In [38]:
split_point = len(numerical_cols)
nb_models = 100
(L_Subgroups,P) = SplitBasedSelectionForm (data_test, target_test, nb_models, model_nt, list_neigh,split_point)

In [39]:
'SAVE THE LIST OF THE SUBGROUPS'
save_obj(L_Subgroups, path + 'list_subgroups')

## Subgroups Descriptions

In [40]:
att_names = data_test_df.columns
patt_descriptions = patterns(P,split_point,data_test,att_names)

subrgoup 0
3.0 < NWG <= 4
1 < NDIMC <= 1.0
1.0 < MDIMC <= 3
1 < MWG <= 2.0
-------------------------------------------------------------------
subrgoup 1
2.0 < NWG <= 3.0
3.0 < MWG <= 4
1.0 < NDIMC <= 3
1.0 < MDIMC <= 3
-------------------------------------------------------------------
subrgoup 2
3.0 < NWG <= 4
1.0 < NDIMC <= 3
2.0 < MWG <= 3.0
1.0 < MDIMC <= 3
-------------------------------------------------------------------
subrgoup 3
2.0 < NWG <= 3.0
3.0 < MWG <= 4
1 < NDIMC <= 1.0
1 < MDIMC <= 1.0
SA_T = 0
SB_T = 0
-------------------------------------------------------------------
subrgoup 4
3.0 < NWG <= 4
1 < NDIMC <= 1.0
1.0 < MDIMC <= 2.0
3.0 < MWG <= 4
SA_T = 1
SB_T = 0
-------------------------------------------------------------------
subrgoup 5
3.0 < NWG <= 4
1 < NDIMC <= 1.0
1 < MDIMC <= 1.0
2.0 < MWG <= 3.0
SA_T = 0
SB_T = 0
-------------------------------------------------------------------
subrgoup 6
1 < NWG <= 1.0
3.0 < MWG <= 4
1 < NDIMC <= 1.0
1 < MDIMC <= 1.0
---

1 < NDIMC <= 1.0
1.0 < MDIMC <= 2.0
SA_T = 1
-------------------------------------------------------------------
subrgoup 87
2.0 < NWG <= 3.0
3.0 < MWG <= 4
1 < NDIMC <= 1.0
1.0 < MDIMC <= 2.0
SA_T = 0
-------------------------------------------------------------------
subrgoup 88
3.0 < NWG <= 4
1.0 < NDIMC <= 3
1 < MWG <= 2.0
SB_T = 1
-------------------------------------------------------------------
subrgoup 89
3.0 < NWG <= 4
1.0 < NDIMC <= 3
1 < MWG <= 2.0
SB_T = 0
-------------------------------------------------------------------
subrgoup 90
3.0 < NWG <= 4
1 < NDIMC <= 1.0
2.0 < MDIMC <= 3
3.0 < MWG <= 4
STRM_enable = 1
-------------------------------------------------------------------
subrgoup 91
3.0 < NWG <= 4
1 < NDIMC <= 1.0
2.0 < MDIMC <= 3
3.0 < MWG <= 4
STRM_enable = 0
-------------------------------------------------------------------
subrgoup 92
1 < NWG <= 2.0
2.0 < MWG <= 3.0
SA_T = 1
1.0 < KWI <= 2
-------------------------------------------------------------------
su

In [41]:
'SAVE THE SUBGROUPS PATTERNS'
save_obj(patt_descriptions, path + 'patterns')
save_obj(att_names, path + 'att_names')