In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining "

# The SplitSD4X scripts live two directory levels above the notebook's
# working directory; expose that folder to the import system.
absFilePath = os.path.dirname(os.path.dirname(os.getcwd()))
newPath = os.path.join(absFilePath, 'SplitSD4X')
# Guard the append so re-running this cell does not pile up duplicate
# entries in sys.path.
if newPath not in sys.path:
    sys.path.append(newPath)

from fill_missing_values import *
from missing_values_table import *
from neighbors_generation import *
from patterns_extraction import *
from performances import *
from subgroups_discovery import *
from sp_lime import *

### Data Preparation 

In [3]:
"Loading and preparing data" 

# Build the dataset location with os.path.join so the separator matches the
# host OS (the previous hard-coded 'Datasets\\' only worked on Windows).
# The trailing '' keeps a trailing separator on datasets_path, so any code
# that still does `datasets_path + filename` keeps working.
datasets_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'Datasets', '')
url = os.path.join(datasets_path, 'data_anuran.csv')
df = pd.read_csv(url)
# Drop the identifier column and the two label columns that are not used as
# the target, then discard incomplete rows.
df = df.drop(columns=['RecordID','Genus','Species'])
df = df.dropna()
df.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,0.082245,0.135752,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038,Leptodactylidae
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,0.022786,0.16332,0.012022,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056,Leptodactylidae
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,0.050791,0.207338,0.083536,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.011567,0.100413,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,...,0.037439,0.219153,0.062837,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244,Leptodactylidae


In [4]:
" Handling data "

# Integer code of each family = its position in this tuple.
family_mapper = {
    family: code
    for code, family in enumerate(
        ('Bufonidae', 'Dendrobatidae', 'Hylidae', 'Leptodactylidae')
    )
}
# Swap the family names in the target column for their integer codes.
df['Family'] = df['Family'].replace(family_mapper)

In [5]:
" display the features types "
# Show the dtype of every column of df (last expression, so it is displayed).
df.dtypes

MFCCs_ 1    float64
MFCCs_ 2    float64
MFCCs_ 3    float64
MFCCs_ 4    float64
MFCCs_ 5    float64
MFCCs_ 6    float64
MFCCs_ 7    float64
MFCCs_ 8    float64
MFCCs_ 9    float64
MFCCs_10    float64
MFCCs_11    float64
MFCCs_12    float64
MFCCs_13    float64
MFCCs_14    float64
MFCCs_15    float64
MFCCs_16    float64
MFCCs_17    float64
MFCCs_18    float64
MFCCs_19    float64
MFCCs_20    float64
MFCCs_21    float64
MFCCs_22    float64
Family        int64
dtype: object

In [6]:
" Checking missing values "
# Turn '?' placeholders into NaN so they are counted as missing values.
# Rebinding (instead of inplace=True) keeps the cell free of hidden
# in-place mutation of the frame.
# NOTE(review): dropna() was already applied when the data was loaded, so any
# '?' converted here would only be reported by the table, not removed.
df = df.replace('?', np.nan)
missing_values_table(df)

Your slelected dataframe has 23 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [7]:
" separate the data and the target "
# Target: the encoded 'Family' column; data: every other column.
target_df = df['Family']
data_df = df.drop('Family', axis='columns')

In [8]:
" calculate the categorical features mask "
# Boolean Series indexed by column name: True where the column dtype is object.
categorical_feature_mask = data_df.dtypes.eq(object)
categorical_feature_mask

MFCCs_ 1    False
MFCCs_ 2    False
MFCCs_ 3    False
MFCCs_ 4    False
MFCCs_ 5    False
MFCCs_ 6    False
MFCCs_ 7    False
MFCCs_ 8    False
MFCCs_ 9    False
MFCCs_10    False
MFCCs_11    False
MFCCs_12    False
MFCCs_13    False
MFCCs_14    False
MFCCs_15    False
MFCCs_16    False
MFCCs_17    False
MFCCs_18    False
MFCCs_19    False
MFCCs_20    False
MFCCs_21    False
MFCCs_22    False
dtype: bool

In [9]:
# Names of the columns flagged as categorical by the mask.
categorical_cols_names = [
    column
    for column, is_categorical in zip(data_df.columns, categorical_feature_mask)
    if is_categorical
]
categorical_cols_names

[]

In [10]:
# Names of the columns NOT flagged as categorical by the mask.
numerical_cols_names = [
    column
    for column, is_categorical in zip(data_df.columns, categorical_feature_mask)
    if not is_categorical
]
numerical_cols_names

['MFCCs_ 1',
 'MFCCs_ 2',
 'MFCCs_ 3',
 'MFCCs_ 4',
 'MFCCs_ 5',
 'MFCCs_ 6',
 'MFCCs_ 7',
 'MFCCs_ 8',
 'MFCCs_ 9',
 'MFCCs_10',
 'MFCCs_11',
 'MFCCs_12',
 'MFCCs_13',
 'MFCCs_14',
 'MFCCs_15',
 'MFCCs_16',
 'MFCCs_17',
 'MFCCs_18',
 'MFCCs_19',
 'MFCCs_20',
 'MFCCs_21',
 'MFCCs_22']

In [11]:
" if no values missed we execute this code : "
# Reorder the frame: numerical columns (cast to float) first, categorical
# columns last.
data_df = pd.concat(
    [
        data_df[numerical_cols_names].astype('float64'),
        data_df[categorical_cols_names],
    ],
    axis='columns',
)
data_df.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.156436,0.082245,0.135752,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.254341,0.022786,0.16332,0.012022,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.237384,0.050791,0.207338,0.083536,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.317084,-0.011567,0.100413,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,...,-0.298524,0.037439,0.219153,0.062837,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244


In [12]:
# Put the features and the target back side by side (aligned on the index).
data_target_df = pd.concat([data_df, target_df], axis='columns')

In [13]:
" generate the Test SET "
nb_test_instances = 1000
# Fixed seed: without random_state every run draws a different test set,
# which makes the saved arrays, model scores and subgroups irreproducible.
test_df = data_target_df.sample(n=nb_test_instances, random_state=0)
data_test_df = test_df.drop(columns=['Family'])
target_test_df = test_df['Family']
data_test_df.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
4711,1.0,0.051928,0.425507,0.716878,0.171392,-0.068375,-0.062532,0.203029,0.284906,-0.071257,...,0.209079,0.067251,-0.132956,-0.099386,0.090031,0.12559,0.08002,-0.014546,-0.098098,0.020151
2141,1.0,0.003547,0.207452,0.538393,0.08053,0.041937,-0.092426,0.028645,0.247987,-0.033075,...,0.322633,-0.134339,-0.126806,0.145191,0.07466,-0.04362,-0.013037,-0.012308,0.086435,0.129924
1108,1.0,0.271116,0.085903,0.29901,0.089331,0.121719,0.255655,0.112551,-0.125651,-0.080977,...,-0.25887,0.167555,0.227067,-0.144172,-0.138246,0.017622,0.006341,-0.01267,0.091385,0.106009
2849,1.0,0.427336,0.259256,0.566474,0.213056,0.009584,-0.146234,0.087943,0.325763,0.070633,...,0.322685,0.007587,-0.239809,-0.028797,0.162817,0.061375,-0.022426,-0.147935,-0.027738,0.169884
3389,1.0,0.141197,0.034381,0.488721,0.270525,0.095071,-0.13242,-0.00938,0.247782,0.117731,...,0.332931,-0.020529,-0.28748,-0.012996,0.25744,0.125054,-0.100404,-0.224178,-0.078883,0.200265


In [14]:
" generate the Training SET "
# Training set = every row whose index label was not sampled into the test set.
# The previous concat + drop_duplicates(keep=False) compared row VALUES, so it
# also discarded training rows that merely duplicated each other (and both
# copies of any row whose twin landed in the test set).
train_df = data_target_df.drop(index=test_df.index)
data_train_df = train_df.drop(columns=['Family'])
target_train_df = train_df['Family']
data_train_df.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.156436,0.082245,0.135752,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.237384,0.050791,0.207338,0.083536,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.317084,-0.011567,0.100413,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,...,-0.298524,0.037439,0.219153,0.062837,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244
5,1.0,0.099704,-0.033408,0.349895,0.344535,0.247569,0.022407,-0.213767,-0.127916,0.277353,...,-0.295123,0.012486,0.180641,0.055242,-0.080487,-0.130089,-0.171478,-0.071569,0.077643,0.064903


In [15]:
" Extract values of the test set to generate the neighbors"

# NumPy views of the test features / labels, as fed to the neighbor generator.
data_test = data_test_df.to_numpy()
target_test = target_test_df.to_numpy()

In [16]:
# Positional indices: numerical columns come first, categorical ones after
# (same order as the columns of data_df).
numerical_cols = np.arange(len(numerical_cols_names))
categorical_cols = np.arange(numerical_cols.size, data_df.shape[1])

## Neighbors Generation

In [17]:
# Number of neighbors requested per test instance.
nb_neighbors = 50
# generate_all_neighbors comes from the SplitSD4X scripts (star import).
# NOTE(review): presumably returns one array of nb_neighbors rows per row of
# data_test, and is likely stochastic (no seed is set here) -- confirm.
list_neigh = generate_all_neighbors(data_test,numerical_cols,categorical_cols,nb_neighbors)

In [18]:
" store all the neighbors together "
n = np.size(data_test,0)
# Stack the neighbor blocks of the first n test instances in ONE call.
# Concatenating inside a loop re-copies the growing array at every step
# (quadratic in the number of test instances).
all_neighbors = np.concatenate(list_neigh[:n], axis=0)

### One hot encoding (nothing to encode here: the dataset has no categorical features)

In [19]:
# Wrap the stacked neighbors in a DataFrame carrying the original column names
# (numerical first, then categorical).
df_neigh = pd.DataFrame(
    all_neighbors,
    columns=[*numerical_cols_names, *categorical_cols_names],
)
# Categorical columns are cast to int where possible; failures are ignored.
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int, errors='ignore')
df_neigh.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
0,1.003737,0.051992,0.405012,0.719765,0.196474,-0.076103,-0.079482,0.229841,0.309117,-0.074338,...,0.222567,0.083912,-0.141837,-0.099996,0.095553,0.115565,0.068243,-0.018097,-0.097654,0.024797
1,0.99529,0.068751,0.448414,0.746943,0.170076,-0.07018,-0.080117,0.194517,0.302975,-0.066114,...,0.224273,0.057814,-0.140006,-0.089786,0.086931,0.11409,0.087016,-0.004206,-0.09575,0.015046
2,1.002796,0.060097,0.421984,0.714818,0.179371,-0.067473,-0.03853,0.212444,0.256341,-0.066935,...,0.171457,0.094967,-0.100876,-0.114665,0.074172,0.133454,0.08834,-0.0106,-0.097801,0.010901
3,1.001177,0.036171,0.428023,0.715226,0.184059,-0.061669,-0.081233,0.191554,0.295547,-0.063493,...,0.217751,0.05655,-0.141677,-0.094625,0.099228,0.125218,0.083375,-0.003049,-0.09751,0.011129
4,1.002158,0.046519,0.404379,0.721402,0.176152,-0.094616,-0.073001,0.212649,0.275563,-0.081906,...,0.197732,0.068999,-0.128767,-0.11082,0.086448,0.13365,0.071369,-0.021161,-0.085031,0.023322


In [20]:
" Store the neighbors in a list "
data_neigh = df_neigh.to_numpy()
n = data_test.shape[0]
# Cut the stacked array back into n consecutive blocks of nb_neighbors rows,
# one block per test instance.
list_neigh = [
    data_neigh[block * nb_neighbors:(block + 1) * nb_neighbors, :]
    for block in range(n)
]

#### One hot encoding for the training and the test sets (no categorical features here, so the sets are only converted to arrays)

In [21]:
# NumPy views of the training features / labels used to fit the models.
data_train = data_train_df.to_numpy()
target_train = target_train_df.to_numpy()

In [22]:
# Same extraction as the one done before neighbor generation: NumPy views of
# the test features / labels.
data_test = data_test_df.to_numpy()
target_test = target_test_df.to_numpy()

In [23]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'``.

    Parameters
    ----------
    obj : any picklable object
    name : str
        Target path WITHOUT the ``.pkl`` extension. Missing parent
        directories are created, so saving into a fresh ``./saved_data/``
        folder no longer raises ``FileNotFoundError``.
    """
    target = name + '.pkl'
    parent = os.path.dirname(target)
    # dirname is '' for a bare file name; makedirs('') would raise.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(target, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled in ``name + '.pkl'``.

    WARNING: unpickling can execute arbitrary code -- only load files that
    were produced by this notebook (or another trusted source).
    """
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [24]:
'SAVE THE DATA'

path = './saved_data/'
# Persist each artifact as <path><file stem>.pkl, in the same order as before.
for _file_stem, _payload in (
    ('data_train', data_train),
    ('target_train', target_train),
    ('data_test', data_test),
    ('target_test', target_test),
    ('list_neighbors', list_neigh),
):
    save_obj(_payload, path + _file_stem)

## Training the models

In [25]:
" Logistic Regression : "
# Class-balanced logistic regression; seeded, with a generous iteration cap.
lr = LogisticRegression(
    class_weight='balanced',
    random_state=0,
    max_iter=5000,
)
model_lr = lr.fit(X=data_train, y=target_train)
# Predicted labels for the held-out test instances.
target_pred_lr = model_lr.predict(X=data_test)

In [26]:
" Random Forest : "
# Seeded random forest: 600 trees, depth capped at 20.
rdclassifier = RandomForestClassifier(
    n_estimators=600,
    max_depth=20,
    random_state=0,
)
model_rd = rdclassifier.fit(X=data_train, y=target_train)
# Predicted labels for the held-out test instances.
target_pred_rd = model_rd.predict(X=data_test)

In [27]:
" SVM : "
# probability=True fits an internal cross-validated calibration that shuffles
# the data; seed it (like the other models) so the probability estimates are
# reproducible from run to run.
clf = svm.SVC(probability=True, random_state=0)
model_svm = clf.fit(data_train, target_train)
# Predicted labels for the held-out test instances.
target_pred_svm = model_svm.predict(data_test)

In [28]:
" Sklearn MLP Classifier : "
# Seeded MLP with two hidden layers of 100 units, trained with Adam.
mlp = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    solver='adam',
    learning_rate_init=0.1,
    max_iter=1000,
    random_state=1,
)

model_nt = mlp.fit(X=data_train, y=target_train)
# Predicted labels for the held-out test instances.
target_pred_mlp = model_nt.predict(X=data_test)

## Scores of the black box models 

In [29]:
# (label, predictions) pairs, reported in the same order as before.
score_report = (
    ('The score of the logistic regression model is ', target_pred_lr),
    ('The score of the Random Forest  model is ', target_pred_rd),
    ('The score of the SVM model is ', target_pred_svm),
    ('The score of the Multi-Layer-Perceptron model is ', target_pred_mlp),
)
for label, predictions in score_report:
    # Macro-averaged F1 on the test set, rounded to 4 decimals.
    macro_f1 = round(f1_score(target_test, predictions, average='macro'), 4)
    # Label left-aligned in a 50-character column, then ': <score>'.
    print(f"{label:<50}: {macro_f1}")

The score of the logistic regression model is     : 0.7515
The score of the Random Forest  model is          : 0.9465
The score of the SVM model is                     : 0.9389
The score of the Multi-Layer-Perceptron model is  : 0.9466


## Execution of Split Based Selection Form Algorithm : 


In [30]:
# Index of the first categorical column (= number of numerical columns).
split_point = numerical_cols.shape[0]
nb_models = 100
# SplitBasedSelectionForm comes from the SplitSD4X scripts (star import); the
# MLP (model_nt) is the black box being explained.
# NOTE(review): the meaning of the trailing 4 is not visible from this
# notebook -- confirm against the function's definition.
L_Subgroups, P = SplitBasedSelectionForm(
    data_test,
    target_test,
    nb_models,
    model_nt,
    list_neigh,
    split_point,
    4,
)

In [31]:
'SAVE THE LIST OF THE SUBGROUPS'
# Writes L_Subgroups to path + 'list_subgroups.pkl' (path is set in the
# 'SAVE THE DATA' cell).
save_obj(L_Subgroups, path + 'list_subgroups')

## Subgroups Descriptions

In [32]:
# Column labels of the test features (a pandas Index), passed as attribute names.
att_names = data_test_df.columns
# patterns() comes from the SplitSD4X scripts (star import); it prints one
# description per subgroup (see the output below).
# NOTE(review): the structure of the returned patt_descriptions is not
# visible from this notebook -- confirm before relying on it.
patt_descriptions = patterns(P,split_point,data_test,att_names)

subrgoup 0
0.13 < MFCCs_13 <= 0.95
-0.35 < MFCCs_ 4 <= 0.42
-0.3 < MFCCs_ 3 <= 0.25
0.33 < MFCCs_ 5 <= 0.53
-------------------------------------------------------------------
subrgoup 1
-0.61 < MFCCs_13 <= 0.01
-0.3 < MFCCs_ 3 <= 0.35
-0.01 < MFCCs_18 <= 0.43
-0.37 < MFCCs_16 <= -0.05
-0.17 < MFCCs_22 <= 0.43
-------------------------------------------------------------------
subrgoup 2
-0.61 < MFCCs_13 <= 0.01
-0.3 < MFCCs_ 3 <= 0.35
-0.01 < MFCCs_18 <= 0.43
-0.37 < MFCCs_16 <= -0.05
-0.38 < MFCCs_22 <= -0.17
-------------------------------------------------------------------
subrgoup 3
0.01 < MFCCs_13 <= 0.95
-0.35 < MFCCs_ 4 <= 0.42
0.25 < MFCCs_ 3 <= 1.0
-0.41 < MFCCs_ 8 <= 0.01
0.03 < MFCCs_ 5 <= 0.53
-0.01 < MFCCs_ 7 <= 0.62
-------------------------------------------------------------------
subrgoup 4
-0.61 < MFCCs_13 <= -0.22
-0.3 < MFCCs_ 3 <= 0.35
-0.51 < MFCCs_18 <= -0.01
-0.41 < MFCCs_ 8 <= -0.14
0.34 < MFCCs_ 2 <= 1.0
------------------------------------------------------

0.01 < MFCCs_20 <= 0.47
-------------------------------------------------------------------
subrgoup 40
0.01 < MFCCs_13 <= 0.95
0.42 < MFCCs_ 4 <= 1.0
-0.38 < MFCCs_22 <= 0.13
-0.02 < MFCCs_16 <= 0.67
-0.45 < MFCCs_ 9 <= 0.23
0.26 < MFCCs_ 5 <= 0.53
-0.19 < MFCCs_ 7 <= 0.62
-0.14 < MFCCs_15 <= 0.66
-------------------------------------------------------------------
subrgoup 41
0.01 < MFCCs_13 <= 0.95
0.42 < MFCCs_ 4 <= 1.0
-0.38 < MFCCs_22 <= 0.13
-0.02 < MFCCs_16 <= 0.67
-0.45 < MFCCs_ 9 <= 0.23
0.26 < MFCCs_ 5 <= 0.53
-0.19 < MFCCs_ 7 <= 0.62
-0.72 < MFCCs_15 <= -0.14
-------------------------------------------------------------------
subrgoup 42
0.01 < MFCCs_13 <= 0.95
-0.35 < MFCCs_ 4 <= 0.42
0.25 < MFCCs_ 3 <= 1.0
-0.13 < MFCCs_ 8 <= 0.01
-0.49 < MFCCs_ 5 <= 0.03
0.08 < MFCCs_14 <= 0.46
-0.13 < MFCCs_15 <= -0.08
-------------------------------------------------------------------
subrgoup 43
0.01 < MFCCs_13 <= 0.95
-0.35 < MFCCs_ 4 <= 0.42
0.25 < MFCCs_ 3 <= 1.0
-0.13 < MFCCs_ 8 <=

In [33]:
'SAVE THE SUBGROUPS PATTERNS'
# Writes path + 'patterns.pkl' and path + 'att_names.pkl'.
save_obj(patt_descriptions, path + 'patterns')
# att_names is the pandas Index taken from data_test_df.columns, pickled as-is.
save_obj(att_names, path + 'att_names')