In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining "

absFilePath = os.path.dirname(os.path.dirname(os.getcwd()))
newPath = os.path.join(absFilePath, 'SplitSD4X')
sys.path.append(newPath)

from fill_missing_values import *
from missing_values_table import *
from neighbors_generation import *
from patterns_extraction import *
from performances import *
from subgroups_discovery import *
from sp_lime import *

## Data Preparation 

In [3]:
" Loading the dataset "
datasets_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'Datasets\\')
url = datasets_path + 'data_parkinsons_updrs.csv'
df = pd.read_csv(url)
df = df.drop(['subject#','motor_UPDRS'],axis =1)
df.head()

Unnamed: 0,age,sex,test_time,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,72,0,5.6431,34.398,0.00662,3.4e-05,0.00401,0.00317,0.01204,0.02565,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,72,0,12.666,34.894,0.003,1.7e-05,0.00132,0.0015,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,72,0,19.681,35.389,0.00481,2.5e-05,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,72,0,25.647,35.81,0.00528,2.7e-05,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,72,0,33.642,36.375,0.00335,2e-05,0.00093,0.0013,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [4]:
" Decode Categorical Features "

sex_mapper = {0 : 'male', 
              1 : 'female' }
sex_mapper_inv = dict(map(reversed, sex_mapper.items()))
df['sex'] = df['sex'].replace(sex_mapper)
df.head()

Unnamed: 0,age,sex,test_time,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,72,male,5.6431,34.398,0.00662,3.4e-05,0.00401,0.00317,0.01204,0.02565,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,72,male,12.666,34.894,0.003,1.7e-05,0.00132,0.0015,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,72,male,19.681,35.389,0.00481,2.5e-05,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,72,male,25.647,35.81,0.00528,2.7e-05,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,72,male,33.642,36.375,0.00335,2e-05,0.00093,0.0013,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [5]:
" display the features types "
df.dtypes

age                int64
sex               object
test_time        float64
total_UPDRS      float64
Jitter(%)        float64
Jitter(Abs)      float64
Jitter:RAP       float64
Jitter:PPQ5      float64
Jitter:DDP       float64
Shimmer          float64
Shimmer(dB)      float64
Shimmer:APQ3     float64
Shimmer:APQ5     float64
Shimmer:APQ11    float64
Shimmer:DDA      float64
NHR              float64
HNR              float64
RPDE             float64
DFA              float64
PPE              float64
dtype: object

In [6]:
" Checking missing values "
df.replace('?', np.nan, inplace=True)
missing_values_table(df)

Your slelected dataframe has 20 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [7]:
" separate the data and the target "
data_df = df.drop(columns=['total_UPDRS'])
target_df = df['total_UPDRS']

In [8]:
" calculate the categorical features mask "
categorical_feature_mask = (data_df.dtypes == object)
categorical_feature_mask

age              False
sex               True
test_time        False
Jitter(%)        False
Jitter(Abs)      False
Jitter:RAP       False
Jitter:PPQ5      False
Jitter:DDP       False
Shimmer          False
Shimmer(dB)      False
Shimmer:APQ3     False
Shimmer:APQ5     False
Shimmer:APQ11    False
Shimmer:DDA      False
NHR              False
HNR              False
RPDE             False
DFA              False
PPE              False
dtype: bool

In [9]:
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()
categorical_cols_names

['sex']

In [10]:
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()
numerical_cols_names

['age',
 'test_time',
 'Jitter(%)',
 'Jitter(Abs)',
 'Jitter:RAP',
 'Jitter:PPQ5',
 'Jitter:DDP',
 'Shimmer',
 'Shimmer(dB)',
 'Shimmer:APQ3',
 'Shimmer:APQ5',
 'Shimmer:APQ11',
 'Shimmer:DDA',
 'NHR',
 'HNR',
 'RPDE',
 'DFA',
 'PPE']

In [11]:
" if no values missed we execute this code : "
data_df = pd.concat([data_df[numerical_cols_names], data_df[categorical_cols_names]],axis = 1)
data_df.head()

Unnamed: 0,age,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,sex
0,72,5.6431,0.00662,3.4e-05,0.00401,0.00317,0.01204,0.02565,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006,male
1,72,12.666,0.003,1.7e-05,0.00132,0.0015,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081,male
2,72,19.681,0.00481,2.5e-05,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014,male
3,72,25.647,0.00528,2.7e-05,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277,male
4,72,33.642,0.00335,2e-05,0.00093,0.0013,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361,male


In [12]:
" Encoding categorical features "
data_df['sex'] = data_df['sex'].replace(sex_mapper_inv)

In [13]:
data_target_df = pd.concat([data_df, target_df], axis=1) 

In [14]:
" generate the Test SET "
nb_test_instances = 1000 
test_df = data_target_df.sample(n=nb_test_instances)
data_test_df = test_df.drop(columns=['total_UPDRS'])
target_test_df = test_df['total_UPDRS']

In [15]:
" generate the Training SET "
train_df = pd.concat([data_target_df,test_df]).drop_duplicates(keep=False)
data_train_df = train_df.drop(columns=['total_UPDRS'])
target_train_df = train_df['total_UPDRS']

In [16]:
" Extract values of the test set to generate the neighbors"

data_test = data_test_df.values
target_test = target_test_df.values

In [17]:
numerical_cols = np.arange(0,len(numerical_cols_names)) 
categorical_cols = np.arange(len(numerical_cols_names),data_df.shape[1])

## Neighbors Generation

In [18]:
nb_neighbors = 50 
list_neigh = generate_all_neighbors(data_test,numerical_cols,categorical_cols,nb_neighbors)

In [19]:
" store all the neighbors together "
n = np.size(data_test,0)
all_neighbors = list_neigh[0]
for i in range(1,n) :
    all_neighbors = np.concatenate((all_neighbors, list_neigh[i]), axis=0)

###  One hot encoding 

In [20]:
df_neigh = pd.DataFrame(data = all_neighbors,columns= numerical_cols_names + categorical_cols_names)
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int,errors='ignore')

" Decode all the data neighbors to perform one hot encoding "
df_neigh['sex'] = df_neigh['sex'].replace(sex_mapper)
df_neigh.head()

Unnamed: 0,age,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,sex
0,72.066701,98.659117,0.004072,3.9e-05,0.001738,0.002366,0.005205,0.024804,0.2131,0.011645,0.015545,0.023496,0.034935,0.008132,23.030725,0.549659,0.587949,0.206242,male
1,70.546883,107.723381,0.004353,3.9e-05,0.001856,0.002575,0.005558,0.026597,0.225856,0.012879,0.016913,0.023853,0.038637,0.013734,22.6296,0.523855,0.586936,0.206067,male
2,72.326985,102.890964,0.00425,3.7e-05,0.001771,0.002481,0.005302,0.025016,0.219846,0.011968,0.016195,0.023423,0.035904,0.015806,23.183551,0.5256,0.572226,0.197111,male
3,73.179925,96.703031,0.00453,4e-05,0.001936,0.002684,0.005797,0.028588,0.247328,0.013687,0.018185,0.026357,0.041062,0.016496,22.682033,0.535223,0.577245,0.208496,male
4,71.193294,104.981597,0.003719,3.5e-05,0.00155,0.002076,0.004639,0.025502,0.217516,0.012211,0.0162,0.023964,0.036634,0.007995,22.941752,0.528982,0.584647,0.195377,male


In [21]:
" One hot encoding "
df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True)
df_neigh.head()

Unnamed: 0,age,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,sex_male
0,72.066701,98.659117,0.004072,3.9e-05,0.001738,0.002366,0.005205,0.024804,0.2131,0.011645,0.015545,0.023496,0.034935,0.008132,23.030725,0.549659,0.587949,0.206242,1
1,70.546883,107.723381,0.004353,3.9e-05,0.001856,0.002575,0.005558,0.026597,0.225856,0.012879,0.016913,0.023853,0.038637,0.013734,22.6296,0.523855,0.586936,0.206067,1
2,72.326985,102.890964,0.00425,3.7e-05,0.001771,0.002481,0.005302,0.025016,0.219846,0.011968,0.016195,0.023423,0.035904,0.015806,23.183551,0.5256,0.572226,0.197111,1
3,73.179925,96.703031,0.00453,4e-05,0.001936,0.002684,0.005797,0.028588,0.247328,0.013687,0.018185,0.026357,0.041062,0.016496,22.682033,0.535223,0.577245,0.208496,1
4,71.193294,104.981597,0.003719,3.5e-05,0.00155,0.002076,0.004639,0.025502,0.217516,0.012211,0.0162,0.023964,0.036634,0.007995,22.941752,0.528982,0.584647,0.195377,1


In [22]:
" Store the neighbors in a list"

data_neigh = df_neigh.values
n = np.size(data_test,0)
list_neigh = []
j = 0
for i in range(0,n):
    list_neigh.append(data_neigh[j:(j+nb_neighbors),:])
    j += nb_neighbors

####  One hot encoding for the training and the test sets

In [23]:
data_train_df['sex'] = data_train_df['sex'].replace(sex_mapper)

In [24]:
data_train_df = pd.get_dummies(data_train_df, prefix_sep='_', drop_first=True)
data_train_df.head()

Unnamed: 0,age,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,sex_male
0,72,5.6431,0.00662,3.4e-05,0.00401,0.00317,0.01204,0.02565,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006,1
1,72,12.666,0.003,1.7e-05,0.00132,0.0015,0.00395,0.02024,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081,1
2,72,19.681,0.00481,2.5e-05,0.00205,0.00208,0.00616,0.01675,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014,1
3,72,25.647,0.00528,2.7e-05,0.00191,0.00264,0.00573,0.02309,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277,1
4,72,33.642,0.00335,2e-05,0.00093,0.0013,0.00278,0.01703,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361,1


In [25]:
data_train = data_train_df.values
target_train = target_train_df.values

In [26]:
data_test_df['sex'] = data_test_df['sex'].replace(sex_mapper)

In [27]:
data_test_df = pd.get_dummies(data_test_df, prefix_sep='_', drop_first=True)
data_test_df.head()

Unnamed: 0,age,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,sex_male
922,72,105.3,0.00407,3.7e-05,0.00169,0.00234,0.00506,0.02647,0.229,0.01278,0.01702,0.0245,0.03834,0.012899,22.759,0.5315,0.57912,0.20233,1
539,74,79.669,0.0024,1.8e-05,0.00094,0.00131,0.00283,0.0288,0.247,0.01373,0.01676,0.02476,0.04119,0.011364,22.645,0.70869,0.60409,0.10738,1
220,58,166.76,0.00621,4.5e-05,0.003,0.00335,0.009,0.04671,0.454,0.02602,0.0302,0.0354,0.07806,0.020905,20.639,0.47387,0.70712,0.17682,1
3507,49,66.851,0.00883,9.1e-05,0.00511,0.00488,0.01534,0.04086,0.362,0.0212,0.02495,0.0356,0.06359,0.024856,17.633,0.67118,0.74831,0.348,1
3144,59,107.52,0.00322,1.7e-05,0.00168,0.00172,0.00503,0.01727,0.155,0.00972,0.01011,0.01254,0.02916,0.012371,23.947,0.46907,0.658,0.14207,0


In [28]:
data_test = data_test_df.values
target_test = target_test_df.values

In [29]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [30]:
'SAVE THE DATA'

path = './saved_data/'
save_obj(data_train, path + 'data_train')
save_obj(target_train, path + 'target_train')
save_obj(data_test, path  + 'data_test')
save_obj(target_test, path + 'target_test')
save_obj(list_neigh, path + 'list_neighbors')

## Training the models

In [31]:
" Linear Regression : "
regression = LinearRegression()
model_reg = regression.fit(data_train, target_train)
target_pred_reg = model_reg.predict(data_test)

In [32]:
" Random Forest Regressor "
regr = RandomForestRegressor(n_estimators=300, max_depth=6, random_state=0)
model_rf = regr.fit(data_train, target_train)
target_pred_rf = model_rf.predict(data_test)

In [33]:
" using Ada Boost regressor as black box: "
adaBoostRegr = AdaBoostRegressor(random_state=0, n_estimators=500)
model_abr = adaBoostRegr.fit(data_train, target_train)
target_pred_abr = model_abr.predict(data_test)

In [34]:
" using Sklearn MLP regressor as black box: "

mlp = make_pipeline(StandardScaler(),
                    MLPRegressor(hidden_layer_sizes=(100, 100),
                                 tol=1e-2, 
                                 max_iter=1000, 
                                 random_state=0))
model_nt = mlp.fit(data_train, target_train)
target_pred_nt = model_nt.predict(data_test)

## Scores of the black box models 


In [35]:
print(f"{'The score of the linear regression model is ' :<60}{': {}'.format(round(model_reg.score(data_test, target_test),4))}")
print(f"{'The score of the Random Forest Regressor model is ':<60}{': {}'.format(round(model_rf.score(data_test, target_test),4))}")
print(f"{'The score of the AdaBoost Regressor model is ':<60}{': {}'.format(round(model_abr.score(data_test, target_test),4))}")
print(f"{'The score of the Multi-Layer-Perceptron Regressor model is ':<60}{': {}'.format(round(model_nt.score(data_test, target_test),4))}")

The score of the linear regression model is                 : 0.1697
The score of the Random Forest Regressor model is           : 0.7922
The score of the AdaBoost Regressor model is                : 0.3774
The score of the Multi-Layer-Perceptron Regressor model is  : 0.858


## Execution of Split Based Selection Form Algorithm : 

In [36]:
split_point = len(numerical_cols)
nb_models = 100
(L_Subgroups,P) = SplitBasedSelectionForm (data_test, target_test, nb_models, model_rf, list_neigh,split_point)

In [37]:
'SAVE THE LIST OF THE SUBGROUPS'
save_obj(L_Subgroups, path + 'list_subgroups')

## Subgroups Descriptions

In [38]:
att_names = data_test_df.columns
patt_descriptions = patterns(P,split_point,data_test,att_names)

subrgoup 0
49.3 < age <= 55.2
-------------------------------------------------------------------
subrgoup 1
36.0 < age <= 49.3
sex_male = 1
-------------------------------------------------------------------
subrgoup 2
36.0 < age <= 49.3
sex_male = 0
-------------------------------------------------------------------
subrgoup 3
75.1 < age <= 85.0
sex_male = 0
-------------------------------------------------------------------
subrgoup 4
66.2 < age <= 70.3
0.52 < DFA <= 0.64
-------------------------------------------------------------------
subrgoup 5
76.2 < age <= 85.0
sex_male = 1
-------------------------------------------------------------------
subrgoup 6
56.1 < age <= 57.8
sex_male = 1
-------------------------------------------------------------------
subrgoup 7
73.8 < age <= 75.1
0.7 < DFA <= 0.82
107.83 < test_time <= 212.39
-------------------------------------------------------------------
subrgoup 8
73.8 < age <= 75.1
0.7 < DFA <= 0.82
-4.26 < test_time <= 107.83
---------

In [39]:
'SAVE THE SUBGROUPS PATTERNS'
save_obj(patt_descriptions, path + 'patterns')
save_obj(att_names, path + 'att_names')