In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining and the supplementary scripts for neighbors generation"

# Repository root: three directory levels above the notebook's working directory.
absFilePath = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))

# Make the SplitSD4X sources importable. os.path.join supplies the separator:
# the former hard-coded 'SplitSD4X\\' suffix only named a real directory on
# Windows. The empty last component keeps the trailing separator the old value
# had, so the result is unchanged on Windows.
newPath = os.path.join(absFilePath, 'SplitSD4X', '')
sys.path.append(newPath)

# The neighbour-generation helpers live in the 'supplementary' sub-folder.
newPath_supp = os.path.join(newPath, 'supplementary')
sys.path.append(newPath_supp)

from fill_missing_values import *
from missing_values_table import *
from subgroups_discovery import *

from neighbors_generation import *
from neighbors_generation_2 import *
from neighbors_generation_3 import *
from neighbors_generation_4 import *

## Data Preparation 

In [3]:
" Loading the dataset "
# Build the dataset location with os.path.join instead of the hard-coded
# 'Datasets\\' suffix, which only names a real directory on Windows. The empty
# last component keeps the trailing separator, so datasets_path can still be
# concatenated directly with a bare file name.
datasets_path = os.path.join(absFilePath, 'Datasets', '')
url = datasets_path + 'data_adult_income.csv'
df = pd.read_csv(url)

" Handling some data "

# Treat '?' placeholders as missing values, discard incomplete rows and remove
# the three columns that are not used further in this notebook.
df = (
    df.replace('?', np.nan)
      .dropna()
      .drop(['capital-gain', 'capital-loss', 'native-country'], axis=1)
)

# Store the numeric attributes as floats.
df = df.astype({
    'age': float,
    'fnlwgt': float,
    'educational-num': float,
    'hours-per-week': float,
})

# Encode the income label as an integer class (1 for "<=50K", 2 for ">50K").
df['income'] = df['income'].map({"<=50K": 1, ">50K": 2})

# Separate the label from the predictors.
target_df = df['income']
data_df = df.drop(columns=['income'])

# Columns held with object dtype are treated as categorical, the rest as numerical.
categorical_feature_mask = (data_df.dtypes == object)
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()

# Reorder the columns: numerical block first, categorical block second.
data_df = data_df[numerical_cols_names + categorical_cols_names]
data_target_df = pd.concat([data_df, target_df], axis=1)

In [4]:
" generate the Test SET "
# Draw the test set. A sample is accepted only when every categorical column
# shows as many distinct values in the sample as in the full data set, i.e.
# the test set covers all categories.
nb_test_instances = 1000
_coverage_cols = ['workclass', 'education', 'marital-status', 'occupation',
                  'relationship', 'race', 'gender']

# One seeded generator shared by all attempts: the split is reproducible across
# runs, yet each retry still draws a different sample because the generator
# state advances (a fixed integer seed would redraw the same rows forever).
_split_rng = np.random.RandomState(1)

while True:
    test_df = data_target_df.sample(n=nb_test_instances, random_state=_split_rng)
    data_test_df = test_df.drop(columns=['income'])
    target_test_df = test_df['income']

    if all(data_df[_col].unique().size == data_test_df[_col].unique().size
           for _col in _coverage_cols):
        break

" generate the Training SET "
# The training set is every row that was not sampled into the test set.
# Rows are removed by index label. The previous concat + drop_duplicates(keep=False)
# compared row *values*, so besides the test rows it also discarded every group
# of naturally identical records, whether or not any of them was in the test set.
train_df = data_target_df.drop(index=test_df.index)
data_train_df = train_df.drop(columns=['income'])
target_train_df = train_df['income']

" Decode Categorical Features "
# Each *_mapper assigns an integer code (the position in the list) to a
# category label; the matching *_mapper_inv goes from label back to code.
workclass_mapper = dict(enumerate([
    'Private',
    'Local-gov',
    'Self-emp-not-inc',
    'Federal-gov',
    'State-gov',
    'Self-emp-inc',
    'Without-pay',
]))
workclass_mapper_inv = {label: code for code, label in workclass_mapper.items()}

'------------------------------------------------------------------------'

education_mapper = dict(enumerate([
    '11th', 'HS-grad', 'Assoc-acdm', 'Some-college',
    '10th', 'Prof-school', '7th-8th', 'Bachelors',
    'Masters', '5th-6th', 'Assoc-voc', '9th',
    'Doctorate', '12th', '1st-4th', 'Preschool',
]))
education_mapper_inv = {label: code for code, label in education_mapper.items()}

'------------------------------------------------------------------------'

maritalStatus_mapper = dict(enumerate([
    'Never-married', 'Married-civ-spouse', 'Widowed',
    'Separated', 'Divorced', 'Married-spouse-absent',
    'Married-AF-spouse',
]))
maritalStatus_mapper_inv = {label: code for code, label in maritalStatus_mapper.items()}

'------------------------------------------------------------------------'

occupation_mapper = dict(enumerate([
    'Machine-op-inspct', 'Farming-fishing', 'Protective-serv', 'Other-service',
    'Prof-specialty', 'Craft-repair', 'Adm-clerical', 'Exec-managerial',
    'Tech-support', 'Sales', 'Priv-house-serv', 'Transport-moving',
    'Handlers-cleaners', 'Armed-Forces',
]))
occupation_mapper_inv = {label: code for code, label in occupation_mapper.items()}

'------------------------------------------------------------------------'

relationship_mapper = dict(enumerate([
    'Own-child', 'Husband', 'Not-in-family',
    'Unmarried', 'Wife', 'Other-relative',
]))
relationship_mapper_inv = {label: code for code, label in relationship_mapper.items()}

'------------------------------------------------------------------------'

race_mapper = dict(enumerate([
    'Black', 'White', 'Other',
    'Amer-Indian-Eskimo', 'Asian-Pac-Islander',
]))
race_mapper_inv = {label: code for code, label in race_mapper.items()}

'------------------------------------------------------------------------'
gender_mapper = dict(enumerate(['Male', 'Female']))
gender_mapper_inv = {label: code for code, label in gender_mapper.items()}


# Work on a copy of the test predictors in which every categorical label is
# replaced by its integer code, so the frame can be exported as a plain array.
data_test_df_copy = data_test_df.copy()

for _column, _encoder in (
    ('workclass', workclass_mapper_inv),
    ('education', education_mapper_inv),
    ('marital-status', maritalStatus_mapper_inv),
    ('occupation', occupation_mapper_inv),
    ('relationship', relationship_mapper_inv),
    ('race', race_mapper_inv),
    ('gender', gender_mapper_inv),
):
    data_test_df_copy[_column] = data_test_df_copy[_column].replace(_encoder)

data_test_copy = data_test_df_copy.values

# Positional indices of the numerical columns (leading block) and of the
# categorical columns (trailing block) in that array.
nb_numerical = len(numerical_cols_names)
numerical_cols = np.arange(nb_numerical)
categorical_cols = np.arange(nb_numerical, data_df.shape[1])

## Neighbors Generation (*Version 1*)

In [5]:
# Generate nb_neighbors neighbours for each test instance (version 1 generator).
nb_neighbors = 50
list_neigh = generate_all_neighbors(data_test_copy,numerical_cols,categorical_cols,nb_neighbors)

# Stack the per-instance neighbour arrays into one matrix with a single
# np.concatenate call. Appending one array per loop iteration re-copied the
# growing result each time (quadratic work for the same output).
n = np.size(data_test_copy,0)
all_neighbors = np.concatenate([list_neigh[i] for i in range(n)], axis=0)

# Rebuild a labelled frame and cast the categorical codes to integers
# (presumably so they match the integer keys of the decoding dictionaries).
df_neigh = pd.DataFrame(data = all_neighbors,columns= numerical_cols_names + categorical_cols_names)
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int,errors='ignore')

# Decode the integer codes back to category labels before one-hot encoding.
for _column, _decoder in (
    ('workclass', workclass_mapper),
    ('education', education_mapper),
    ('marital-status', maritalStatus_mapper),
    ('occupation', occupation_mapper),
    ('relationship', relationship_mapper),
    ('race', race_mapper),
    ('gender', gender_mapper),
):
    df_neigh[_column] = df_neigh[_column].replace(_decoder)

# One-hot encoding.
df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True)

# Standardise the encoded neighbours with a scaler fitted on the neighbours themselves.
data_neigh = df_neigh.values

scaler_neigh = StandardScaler()
data_neigh_s = scaler_neigh.fit_transform(data_neigh)

# Cut the matrix back into one block of nb_neighbors consecutive rows per test
# instance (assumes the generator returned exactly nb_neighbors rows each).
list_neigh = [data_neigh_s[k * nb_neighbors:(k + 1) * nb_neighbors, :] for k in range(n)]

## Neighbors Generation (*Version 2*)

In [6]:
# Number of distinct values of every categorical column in the full data set.
mat_nb_categ = [np.size(data_df[name].unique()) for name in categorical_cols_names]

# Generate nb_neighbors neighbours for each test instance (version 2 generator).
nb_neighbors = 50
list_neigh_2 = generate_all_neighbors_2(data_test_copy,numerical_cols,categorical_cols,mat_nb_categ,nb_neighbors)

# Stack the per-instance neighbour arrays into one matrix with a single
# np.concatenate call. Appending one array per loop iteration re-copied the
# growing result each time (quadratic work for the same output).
n = np.size(data_test_copy,0)
all_neighbors_2 = np.concatenate([list_neigh_2[i] for i in range(n)], axis=0)

# Rebuild a labelled frame and cast the categorical codes to integers
# (presumably so they match the integer keys of the decoding dictionaries).
df_neigh_2 = pd.DataFrame(data = all_neighbors_2,columns= numerical_cols_names + categorical_cols_names)
df_neigh_2[categorical_cols_names] = df_neigh_2[categorical_cols_names].astype(int,errors='ignore')

# Decode the integer codes back to category labels before one-hot encoding.
for _column, _decoder in (
    ('workclass', workclass_mapper),
    ('education', education_mapper),
    ('marital-status', maritalStatus_mapper),
    ('occupation', occupation_mapper),
    ('relationship', relationship_mapper),
    ('race', race_mapper),
    ('gender', gender_mapper),
):
    df_neigh_2[_column] = df_neigh_2[_column].replace(_decoder)

# One-hot encoding.
df_neigh_2 = pd.get_dummies(df_neigh_2, prefix_sep='_', drop_first=True)

# Standardise the encoded neighbours with a scaler fitted on the neighbours themselves.
data_neigh_2 = df_neigh_2.values

scaler_neigh = StandardScaler()
data_neigh_2s = scaler_neigh.fit_transform(data_neigh_2)

# Cut the matrix back into one block of nb_neighbors consecutive rows per test
# instance (assumes the generator returned exactly nb_neighbors rows each).
list_neigh_2 = [data_neigh_2s[k * nb_neighbors:(k + 1) * nb_neighbors, :] for k in range(n)]

## Neighbors Generation (*Version 3*)

In [7]:
# Generate nb_neighbors neighbours for each test instance (version 3 generator).
list_neigh_3 = generate_all_neighbors_3(data_test_copy,numerical_cols,categorical_cols,mat_nb_categ,nb_neighbors)

# Stack the per-instance neighbour arrays into one matrix with a single
# np.concatenate call. Appending one array per loop iteration re-copied the
# growing result each time (quadratic work for the same output).
n = np.size(data_test_copy,0)
all_neighbors_3 = np.concatenate([list_neigh_3[i] for i in range(n)], axis=0)

# Rebuild a labelled frame and cast the categorical codes to integers
# (presumably so they match the integer keys of the decoding dictionaries).
df_neigh_3 = pd.DataFrame(data = all_neighbors_3,columns= numerical_cols_names + categorical_cols_names)
df_neigh_3[categorical_cols_names] = df_neigh_3[categorical_cols_names].astype(int,errors='ignore')

# Decode the integer codes back to category labels before one-hot encoding.
for _column, _decoder in (
    ('workclass', workclass_mapper),
    ('education', education_mapper),
    ('marital-status', maritalStatus_mapper),
    ('occupation', occupation_mapper),
    ('relationship', relationship_mapper),
    ('race', race_mapper),
    ('gender', gender_mapper),
):
    df_neigh_3[_column] = df_neigh_3[_column].replace(_decoder)

# One-hot encoding.
df_neigh_3 = pd.get_dummies(df_neigh_3, prefix_sep='_', drop_first=True)

# Standardise the encoded neighbours with a scaler fitted on the neighbours themselves.
data_neigh_3 = df_neigh_3.values

scaler_neigh = StandardScaler()
data_neigh_3s = scaler_neigh.fit_transform(data_neigh_3)

# Cut the matrix back into one block of nb_neighbors consecutive rows per test
# instance (assumes the generator returned exactly nb_neighbors rows each).
list_neigh_3 = [data_neigh_3s[k * nb_neighbors:(k + 1) * nb_neighbors, :] for k in range(n)]

## Neighbors Generation (*Version 4*)

In [8]:
# Extra argument required by the version 4 generator; left empty here.
# NOTE(review): its meaning is defined in neighbors_generation_4 — confirm there.
special = []

# Generate nb_neighbors neighbours for each test instance (version 4 generator).
list_neigh_4 = generate_all_neighbors_4(data_test_copy,numerical_cols,categorical_cols,mat_nb_categ,nb_neighbors,special)

# Stack the per-instance neighbour arrays into one matrix with a single
# np.concatenate call. Appending one array per loop iteration re-copied the
# growing result each time (quadratic work for the same output).
n = np.size(data_test_copy,0)
all_neighbors_4 = np.concatenate([list_neigh_4[i] for i in range(n)], axis=0)

# Rebuild a labelled frame and cast the categorical codes to integers
# (presumably so they match the integer keys of the decoding dictionaries).
df_neigh_4 = pd.DataFrame(data = all_neighbors_4,columns= numerical_cols_names + categorical_cols_names)
df_neigh_4[categorical_cols_names] = df_neigh_4[categorical_cols_names].astype(int,errors='ignore')

# Decode the integer codes back to category labels before one-hot encoding.
for _column, _decoder in (
    ('workclass', workclass_mapper),
    ('education', education_mapper),
    ('marital-status', maritalStatus_mapper),
    ('occupation', occupation_mapper),
    ('relationship', relationship_mapper),
    ('race', race_mapper),
    ('gender', gender_mapper),
):
    df_neigh_4[_column] = df_neigh_4[_column].replace(_decoder)

# One-hot encoding.
df_neigh_4 = pd.get_dummies(df_neigh_4, prefix_sep='_', drop_first=True)

# Standardise the encoded neighbours with a scaler fitted on the neighbours themselves.
data_neigh_4 = df_neigh_4.values

scaler_neigh = StandardScaler()
data_neigh_4s = scaler_neigh.fit_transform(data_neigh_4)

# Cut the matrix back into one block of nb_neighbors consecutive rows per test
# instance (assumes the generator returned exactly nb_neighbors rows each).
list_neigh_4 = [data_neigh_4s[k * nb_neighbors:(k + 1) * nb_neighbors, :] for k in range(n)]

#### One-hot encoding of the training and test sets

In [9]:
# One-hot encode the training predictors and pull out plain arrays.
data_train_df = pd.get_dummies(data_train_df, drop_first=True, prefix_sep='_')
data_train, target_train = data_train_df.values, target_train_df.values

# Same treatment for the test predictors (encoded independently of the training frame).
data_test_df = pd.get_dummies(data_test_df, drop_first=True, prefix_sep='_')
data_test, target_test = data_test_df.values, target_test_df.values

" Scale the training and the test sets data"
# Each set is standardised with its own scaler.
# NOTE(review): the test set is scaled with statistics fitted on the test set,
# not with scaler_train — confirm this is intended.
scaler_train = StandardScaler()
scaler_test = StandardScaler()

data_train_s = scaler_train.fit_transform(data_train)
data_test_s = scaler_test.fit_transform(data_test)

## Training the MLP model

In [10]:
" Sklearn MLP Classifier "

# Multi-layer perceptron with two hidden layers of 50 logistic units, trained
# with SGD; random_state is fixed so the fit is repeatable for a given input.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    activation='logistic',
    solver='sgd',
    learning_rate_init=0.1,
    alpha=0.1,
    max_iter=5000,
    random_state=1,
)

# Keep the object returned by fit() as the trained black-box model, then
# predict the labels of the scaled test set.
model_nt = mlp.fit(data_train_s, target_train)
target_pred_mlp = model_nt.predict(data_test_s)

## Execution of the Split-Based Selection Form algorithm

In [11]:
# split_point is the number of numerical columns; nb_models is passed through
# to SplitBasedSelectionForm unchanged.
# NOTE(review): the meaning of the trailing argument 2 is defined in
# subgroups_discovery — confirm there.
split_point = len(numerical_cols)
nb_models = 100

# Arguments shared by the four runs; only the neighbour list differs.
_shared_args = (data_test_s, target_test, nb_models, model_nt)

L_Subgroups_1, P_1 = SplitBasedSelectionForm(*_shared_args, list_neigh, split_point, 2)
L_Subgroups_2, P_2 = SplitBasedSelectionForm(*_shared_args, list_neigh_2, split_point, 2)
L_Subgroups_3, P_3 = SplitBasedSelectionForm(*_shared_args, list_neigh_3, split_point, 2)
L_Subgroups_4, P_4 = SplitBasedSelectionForm(*_shared_args, list_neigh_4, split_point, 2)

In [12]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Parameters
    ----------
    obj : object
        Any picklable Python object.
    name : str
        Destination path without the ``.pkl`` extension.

    Notes
    -----
    Missing parent directories are created first, so saving into a folder
    that does not exist yet (e.g. ``./saved_data/`` on a fresh checkout)
    no longer raises ``FileNotFoundError``.
    """
    target = name + '.pkl'
    parent = os.path.dirname(target)
    if parent:
        # exist_ok: the folder may already be there from an earlier save.
        os.makedirs(parent, exist_ok=True)
    with open(target, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object stored in the pickle file ``name + '.pkl'``.

    ``name`` is the file path without the ``.pkl`` extension.

    Unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'rb') as stream:
        restored = pickle.load(stream)
    return restored

In [13]:
'SAVE THE DATA'

# Folder prefix for every pickle written below.
path = './saved_data/'

# (file stem, object) pairs, written in this order: the scaled train/test
# arrays with their targets, then the four neighbour lists.
for _stem, _payload in (
    ('data_train', data_train_s),
    ('target_train', target_train),
    ('data_test', data_test_s),
    ('target_test', target_test),
    ('list_neighbors_1', list_neigh),
    ('list_neighbors_2', list_neigh_2),
    ('list_neighbors_3', list_neigh_3),
    ('list_neighbors_4', list_neigh_4),
):
    save_obj(_payload, path + _stem)

In [14]:
'SAVE THE LIST OF THE SUBGROUPS'
# One pickle per neighbour-generation version, written under the same `path` prefix.
for _stem, _subgroups in (
    ('list_subgroups_1', L_Subgroups_1),
    ('list_subgroups_2', L_Subgroups_2),
    ('list_subgroups_3', L_Subgroups_3),
    ('list_subgroups_4', L_Subgroups_4),
):
    save_obj(_subgroups, path + _stem)