In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from statsmodels.api import OLS

from sklearn.metrics import f1_score

In [2]:
" Import the scripts of SD for Explaining and the supplementary scripts for neighbors generation"

# Base directory: three directory levels above the current working directory.
absFilePath = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# The trailing '' component makes os.path.join end the path with the
# platform's own separator. The previous hard-coded 'SplitSD4X\\' only
# resolved on Windows; elsewhere the backslash became part of the folder name.
newPath = os.path.join(absFilePath, 'SplitSD4X', '')
newPath_supp = os.path.join(newPath, 'supplementary')

# Register each folder only once so re-running the cell does not keep
# appending duplicate entries to sys.path.
for module_dir in (newPath, newPath_supp):
    if module_dir not in sys.path:
        sys.path.append(module_dir)

from fill_missing_values import *
from missing_values_table import *
from subgroups_discovery import *

from neighbors_generation import *
from neighbors_generation_2 import *
from neighbors_generation_3 import *
from neighbors_generation_4 import *

## Data Preparation 

In [3]:
" Loading the dataset "
# The trailing '' component ends the folder path with the platform's own
# separator (the hard-coded 'Datasets\\' only resolved on Windows), so code
# that still does `datasets_path + <file name>` keeps working.
datasets_path = os.path.join(absFilePath, 'Datasets', '')
data = np.loadtxt(fname=os.path.join(datasets_path, "thyroid_data.dat"), delimiter=',')
with open(os.path.join(datasets_path, 'thyroid_names.dat'), 'r') as f:
    string = f.read()
# strip() keeps a trailing newline of the names file out of the last column name.
columns_names = string.strip().split(', ')
columns_names.append('Class')
df = pd.DataFrame(data=data, columns=columns_names)

" Handling some data "
df = df.drop(['Hypopituitary'], axis=1)

" Decode Categorical Features "
# Numeric code -> label, plus the inverse (label -> numeric code) mappings.
sex_mapper = {0: 'M',
              1: 'F'}
sex_mapper_inv = {label: code for code, label in sex_mapper.items()}

ft_mapper = {0: 'F',
             1: 'T'}
ft_mapper_inv = {label: code for code, label in ft_mapper.items()}

# Every column that is decoded with ft_mapper ('Sex' has its own mapper).
binary_flag_cols = [
    'On_thyroxine', 'Query_on_thyroxine', 'On_antithyroid_medication',
    'Sick', 'Pregnant', 'Thyroid_surgery', 'I131_treatment',
    'Query_hypothyroid', 'Query_hyperthyroid', 'Lithium', 'Goitre',
    'Tumor', 'Psych',
]

df['Sex'] = df['Sex'].replace(sex_mapper)
for flag_col in binary_flag_cols:
    df[flag_col] = df[flag_col].replace(ft_mapper)

" separate the data and the target "
data_df = df.drop(columns=['Class'])
target_df = df['Class']

" calculate the categorical features mask "
# The decoded (string) columns are the ones reported with dtype object.
categorical_feature_mask = (data_df.dtypes == object)
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()

" if no values are missing we execute this code : "
# Reorder the columns: numerical features first, categorical features last.
data_df = pd.concat([data_df[numerical_cols_names], data_df[categorical_cols_names]], axis=1)

" Encoding categorical features"
# Back to numeric codes, now with the numerical-then-categorical column order.
data_df['Sex'] = data_df['Sex'].replace(sex_mapper_inv)
for flag_col in binary_flag_cols:
    data_df[flag_col] = data_df[flag_col].replace(ft_mapper_inv)

data_target_df = pd.concat([data_df, target_df], axis=1)

In [4]:
" generate the Test SET "
nb_test_instances = 1000
# Fixed seed so that "Restart & Run All" always draws the same test rows.
test_df = data_target_df.sample(n=nb_test_instances, random_state=1)
data_test_df = test_df.drop(columns=['Class'])
target_test_df = test_df['Class']

" generate the Training SET "
# Remove the sampled rows by index label. The previous
# concat(...).drop_duplicates(keep=False) compared row *values*, so it also
# threw away training rows that merely had identical values to another row
# (of the test set or of the training set itself).
train_df = data_target_df.drop(index=test_df.index)
assert len(train_df) + len(test_df) == len(data_target_df)
data_train_df = train_df.drop(columns=['Class'])
target_train_df = train_df['Class']

" Extract values of the test set to generate the neighbors"

data_test = data_test_df.values
target_test = target_test_df.values

# Positional indices of the feature columns: numerical block first, then categorical.
numerical_cols = np.arange(0, len(numerical_cols_names))
categorical_cols = np.arange(len(numerical_cols_names), data_df.shape[1])

## Neighbors Generation (*Version 1*)

In [5]:
nb_neighbors = 50
list_neigh = generate_all_neighbors(data_test, numerical_cols, categorical_cols, nb_neighbors)

" store all the neighbors together "
n = np.size(data_test, 0)
# One concatenate call instead of growing the array inside a loop, which
# re-copied all previously stacked rows at every iteration.
all_neighbors = np.concatenate(list_neigh[:n], axis=0)

" One hot encoding "

df_neigh = pd.DataFrame(data=all_neighbors, columns=numerical_cols_names + categorical_cols_names)
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int, errors='ignore')

" Decode all the data neighbors to perform one hot encoding "
# Fixed category lists make get_dummies emit the same dummy columns even when a
# label never occurs among the neighbours; values outside the mapper become
# missing and are encoded as all-zero dummies.
for col_name in categorical_cols_names:
    mapper = sex_mapper if col_name == 'Sex' else ft_mapper
    df_neigh[col_name] = pd.Categorical(df_neigh[col_name].replace(mapper),
                                        categories=sorted(mapper.values()))

" One hot encoding "
# dtype=float keeps .values a numeric array (pandas >= 2 defaults to bool dummies,
# which would turn the mixed float/bool frame into an object array).
df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True, dtype=float)

" Store the neighbors in a list"

data_neigh = df_neigh.values
# Cut the stacked matrix back into one block of nb_neighbors rows per test instance.
list_neigh = [data_neigh[start:start + nb_neighbors, :]
              for start in range(0, n * nb_neighbors, nb_neighbors)]

## Neighbors Generation (*Version 2*)

In [6]:
# Number of distinct values observed in each categorical column of data_df.
mat_nb_categ = [np.size(data_df[name].unique()) for name in categorical_cols_names]

list_neigh_2 = generate_all_neighbors_2(data_test, numerical_cols, categorical_cols, mat_nb_categ, nb_neighbors)
" store all the neighbors together "
n = np.size(data_test, 0)
# One concatenate call instead of growing the array inside a loop, which
# re-copied all previously stacked rows at every iteration.
all_neighbors_2 = np.concatenate(list_neigh_2[:n], axis=0)

df_neigh_2 = pd.DataFrame(data=all_neighbors_2, columns=numerical_cols_names + categorical_cols_names)
df_neigh_2[categorical_cols_names] = df_neigh_2[categorical_cols_names].astype(int, errors='ignore')

" Decode all the data neighbors to perform one hot encoding "
# Fixed category lists make get_dummies emit the same dummy columns even when a
# label never occurs among the neighbours; values outside the mapper become
# missing and are encoded as all-zero dummies.
for col_name in categorical_cols_names:
    mapper = sex_mapper if col_name == 'Sex' else ft_mapper
    df_neigh_2[col_name] = pd.Categorical(df_neigh_2[col_name].replace(mapper),
                                          categories=sorted(mapper.values()))

" One hot encoding "
# dtype=float keeps .values a numeric array (pandas >= 2 defaults to bool dummies,
# which would turn the mixed float/bool frame into an object array).
df_neigh_2 = pd.get_dummies(df_neigh_2, prefix_sep='_', drop_first=True, dtype=float)

data_neigh_2 = df_neigh_2.values
# Cut the stacked matrix back into one block of nb_neighbors rows per test instance.
list_neigh_2 = [data_neigh_2[start:start + nb_neighbors, :]
                for start in range(0, n * nb_neighbors, nb_neighbors)]

## Neighbors Generation (*Version 3*)

In [7]:
list_neigh_3 = generate_all_neighbors_3(data_test, numerical_cols, categorical_cols, mat_nb_categ, nb_neighbors)

" store all the neighbors together "
n = np.size(data_test, 0)
# One concatenate call instead of growing the array inside a loop, which
# re-copied all previously stacked rows at every iteration.
all_neighbors_3 = np.concatenate(list_neigh_3[:n], axis=0)

df_neigh_3 = pd.DataFrame(data=all_neighbors_3, columns=numerical_cols_names + categorical_cols_names)
df_neigh_3[categorical_cols_names] = df_neigh_3[categorical_cols_names].astype(int, errors='ignore')

" Decode all the data neighbors to perform one hot encoding "
# Fixed category lists make get_dummies emit the same dummy columns even when a
# label never occurs among the neighbours; values outside the mapper become
# missing and are encoded as all-zero dummies.
for col_name in categorical_cols_names:
    mapper = sex_mapper if col_name == 'Sex' else ft_mapper
    df_neigh_3[col_name] = pd.Categorical(df_neigh_3[col_name].replace(mapper),
                                          categories=sorted(mapper.values()))

" One hot encoding "
# dtype=float keeps .values a numeric array (pandas >= 2 defaults to bool dummies,
# which would turn the mixed float/bool frame into an object array).
df_neigh_3 = pd.get_dummies(df_neigh_3, prefix_sep='_', drop_first=True, dtype=float)

data_neigh_3 = df_neigh_3.values
# Cut the stacked matrix back into one block of nb_neighbors rows per test instance.
list_neigh_3 = [data_neigh_3[start:start + nb_neighbors, :]
                for start in range(0, n * nb_neighbors, nb_neighbors)]

## Neighbors Generation (*Version 4*)

In [8]:
# Extra argument of generate_all_neighbors_4; left empty here.
# NOTE(review): its meaning is defined in neighbors_generation_4 — confirm there.
special = []
list_neigh_4 = generate_all_neighbors_4(data_test, numerical_cols, categorical_cols, mat_nb_categ, nb_neighbors, special)

" store all the neighbors together "
n = np.size(data_test, 0)
# One concatenate call instead of growing the array inside a loop, which
# re-copied all previously stacked rows at every iteration.
all_neighbors_4 = np.concatenate(list_neigh_4[:n], axis=0)

df_neigh_4 = pd.DataFrame(data=all_neighbors_4, columns=numerical_cols_names + categorical_cols_names)
df_neigh_4[categorical_cols_names] = df_neigh_4[categorical_cols_names].astype(int, errors='ignore')

" Decode all the data neighbors to perform one hot encoding "
# Fixed category lists make get_dummies emit the same dummy columns even when a
# label never occurs among the neighbours; values outside the mapper become
# missing and are encoded as all-zero dummies.
for col_name in categorical_cols_names:
    mapper = sex_mapper if col_name == 'Sex' else ft_mapper
    df_neigh_4[col_name] = pd.Categorical(df_neigh_4[col_name].replace(mapper),
                                          categories=sorted(mapper.values()))

" One hot encoding "
# dtype=float keeps .values a numeric array (pandas >= 2 defaults to bool dummies,
# which would turn the mixed float/bool frame into an object array).
df_neigh_4 = pd.get_dummies(df_neigh_4, prefix_sep='_', drop_first=True, dtype=float)

data_neigh_4 = df_neigh_4.values
# Cut the stacked matrix back into one block of nb_neighbors rows per test instance.
list_neigh_4 = [data_neigh_4[start:start + nb_neighbors, :]
                for start in range(0, n * nb_neighbors, nb_neighbors)]

####  One hot encoding for the training and the test sets

In [9]:
def _decode_and_one_hot(encoded_df):
    """Decode the categorical codes to labels and one-hot encode them.

    Parameters
    ----------
    encoded_df : pandas.DataFrame
        Feature frame whose `categorical_cols_names` columns hold the numeric
        codes of `sex_mapper` / `ft_mapper`. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The other columns unchanged, followed by one float dummy column per
        categorical feature (first label dropped, prefix separator '_').
    """
    decoded_df = encoded_df.copy()
    for col_name in categorical_cols_names:
        mapper = sex_mapper if col_name == 'Sex' else ft_mapper
        # Fixed category lists give the train and test frames the same dummy
        # columns even when a label is absent from one of the two splits.
        decoded_df[col_name] = pd.Categorical(decoded_df[col_name].replace(mapper),
                                              categories=sorted(mapper.values()))
    # dtype=float keeps .values a numeric array (pandas >= 2 defaults to bool dummies).
    return pd.get_dummies(decoded_df, prefix_sep='_', drop_first=True, dtype=float)


# Rebuild from train_df / test_df rather than from the previous value of
# data_*_df, so the cell can be re-run without failing on already-encoded frames.
data_train_df = _decode_and_one_hot(train_df.drop(columns=['Class']))
data_train = data_train_df.values
target_train = target_train_df.values

# NOTE: data_test now holds the one-hot encoded matrix; the neighbour-generation
# cells expect the earlier, non-encoded data_test and must not be re-run after this cell.
data_test_df = _decode_and_one_hot(test_df.drop(columns=['Class']))
data_test = data_test_df.values
target_test = target_test_df.values

assert data_train.shape[1] == data_test.shape[1], "train/test feature matrices differ in width"

## Training the MLP model

In [10]:
" Sklearn MLP classifier "

# Settings of the MLP that is trained on the one-hot encoded training set.
mlp_settings = dict(
    hidden_layer_sizes=(5,),
    max_iter=500,
    solver='sgd',
    random_state=1,
    learning_rate_init=.1,
)
mlp = MLPClassifier(**mlp_settings)

# Train on the training split, then predict the labels of the test split.
model_nt = mlp.fit(data_train, target_train)
target_pred_mlp = model_nt.predict(data_test)

## Execution of the Split-Based Selection Form algorithm

In [11]:
split_point = len(numerical_cols)
nb_models = 100


def _run_split_selection(neighbors):
    """Call SplitBasedSelectionForm on the test set for one list of neighbours."""
    # NOTE(review): the meaning of the trailing positional argument 3 is defined
    # by SplitBasedSelectionForm (subgroups_discovery) — document it there.
    return SplitBasedSelectionForm(data_test, target_test, nb_models, model_nt,
                                   neighbors, split_point, 3)


# One run per neighbour-generation version, in the same order as before.
L_Subgroups_1, P_1 = _run_split_selection(list_neigh)
L_Subgroups_2, P_2 = _run_split_selection(list_neigh_2)
L_Subgroups_3, P_3 = _run_split_selection(list_neigh_3)
L_Subgroups_4, P_4 = _run_split_selection(list_neigh_4)

In [12]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    """Pickle `obj` into the file `name + '.pkl'`.

    Parameters
    ----------
    obj : object
        Any picklable Python object.
    name : str
        Target path without the '.pkl' extension.

    The parent directory of the target is created when it does not exist yet,
    so saving into a fresh folder (e.g. './saved_data/') no longer fails with
    FileNotFoundError.
    """
    target_file = name + '.pkl'
    parent_dir = os.path.dirname(target_file)
    if parent_dir:  # empty when the file goes into the current directory
        os.makedirs(parent_dir, exist_ok=True)
    with open(target_file, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from the file `name + '.pkl'`.

    `name` is the path without the '.pkl' extension. Unpickling can execute
    arbitrary code, so only load files that come from a trusted source.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [13]:
'SAVE THE DATA'

path = './saved_data/'

# (file stem, object) pairs; each object is written to <path><file stem>.pkl
# by save_obj, in the order listed.
objects_to_save = [
    ('data_train', data_train),
    ('target_train', target_train),
    ('data_test', data_test),
    ('target_test', target_test),
    ('list_neighbors_1', list_neigh),
    ('list_neighbors_2', list_neigh_2),
    ('list_neighbors_3', list_neigh_3),
    ('list_neighbors_4', list_neigh_4),
]
for file_stem, payload in objects_to_save:
    save_obj(payload, path + file_stem)

In [14]:
'SAVE THE LIST OF THE SUBGROUPS'
# One pickle per neighbour-generation version: <path>list_subgroups_<k>.pkl
subgroup_lists = (L_Subgroups_1, L_Subgroups_2, L_Subgroups_3, L_Subgroups_4)
for version, subgroups in enumerate(subgroup_lists, start=1):
    save_obj(subgroups, path + 'list_subgroups_' + str(version))