In [1]:
" Import the libraries " 

import os
import sys 
import math
import copy

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
" Import the scripts of SD for Explaining "

absFilePath = os.path.dirname(os.path.dirname(os.getcwd()))
newPath = os.path.join(absFilePath, 'SplitSD4X')
sys.path.append(newPath)

from fill_missing_values import *
from missing_values_table import *
from neighbors_generation import *
from patterns_extraction import *
from performances import *
from subgroups_discovery import *
from sp_lime import *

### Data Preparation 

In [3]:
"Loading and preparing data"
datasets_path = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), 'Datasets\\')
url = datasets_path + 'data_adult_income.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [4]:
" Handling some data "

df.replace('?', np.nan, inplace=True)
df = df.dropna()
df = df.drop(['capital-gain','capital-loss','native-country'],axis =1)
df['age'] = df['age'].astype(float)
df['fnlwgt'] = df['fnlwgt'].astype(float)
df['educational-num'] = df['educational-num'].astype(float)
df['hours-per-week'] = df['hours-per-week'].astype(float)
df['income'] = df['income'].map({ "<=50K": 1, ">50K": 2 })

In [5]:
data_df = df.drop(columns=['income'])
target_df = df['income']

In [6]:
categorical_feature_mask = (data_df.dtypes == object)
categorical_cols_names = data_df.columns[categorical_feature_mask].tolist()
numerical_cols_names = data_df.columns[~categorical_feature_mask].tolist()

In [7]:
data_df = pd.concat([data_df[numerical_cols_names], data_df[categorical_cols_names]],axis = 1)
data_df.head()

Unnamed: 0,age,fnlwgt,educational-num,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender
0,25.0,226802.0,7.0,40.0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male
1,38.0,89814.0,9.0,50.0,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male
2,28.0,336951.0,12.0,40.0,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male
3,44.0,160323.0,10.0,40.0,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male
5,34.0,198693.0,6.0,30.0,Private,10th,Never-married,Other-service,Not-in-family,White,Male


In [8]:
data_target_df = pd.concat([data_df, target_df], axis=1) 

In [9]:
" generate the Test SET "
a = False 
b = False
c = False
d = False 
e = False
f = False
g = False

while (not a or not b or not c or not d or not e or not f or not g ) : 

    nb_test_instances = 1000 
    test_df = data_target_df.sample(n=nb_test_instances)
    data_test_df = test_df.drop(columns=['income'])
    target_test_df = test_df['income']


    a = (data_df['workclass'].unique().size == data_test_df['workclass'].unique().size)
    b = (data_df['education'].unique().size == data_test_df['education'].unique().size)
    c = (data_df['marital-status'].unique().size == data_test_df['marital-status'].unique().size)
    d = (data_df['occupation'].unique().size == data_test_df['occupation'].unique().size)
    e = (data_df['relationship'].unique().size == data_test_df['relationship'].unique().size)
    f = (data_df['race'].unique().size == data_test_df['race'].unique().size)
    g = (data_df['gender'].unique().size == data_test_df['gender'].unique().size)

data_test_df.head()

Unnamed: 0,age,fnlwgt,educational-num,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender
13471,23.0,39551.0,12.0,12.0,Private,Assoc-acdm,Never-married,Sales,Own-child,White,Female
33969,47.0,103020.0,4.0,40.0,Private,7th-8th,Married-civ-spouse,Machine-op-inspct,Wife,Other,Female
38310,23.0,139012.0,11.0,40.0,Private,Assoc-voc,Never-married,Transport-moving,Own-child,Asian-Pac-Islander,Male
44991,22.0,141028.0,9.0,30.0,Private,HS-grad,Never-married,Other-service,Own-child,Black,Male
14830,49.0,340755.0,14.0,40.0,Self-emp-not-inc,Masters,Married-civ-spouse,Exec-managerial,Husband,White,Male


In [10]:
" generate the Training SET "
train_df = pd.concat([data_target_df,test_df]).drop_duplicates(keep=False)
data_train_df = train_df.drop(columns=['income'])
target_train_df = train_df['income']
data_train_df.head()

Unnamed: 0,age,fnlwgt,educational-num,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender
0,25.0,226802.0,7.0,40.0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male
1,38.0,89814.0,9.0,50.0,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male
2,28.0,336951.0,12.0,40.0,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male
3,44.0,160323.0,10.0,40.0,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male
5,34.0,198693.0,6.0,30.0,Private,10th,Never-married,Other-service,Not-in-family,White,Male


In [11]:
" Decode Categorical Features "
workclass_mapper = {
    0 : 'Private',
    1 : 'Local-gov',
    2 : 'Self-emp-not-inc',
    3 : 'Federal-gov',
    4 : 'State-gov',
    5 : 'Self-emp-inc',
    6 : 'Without-pay'
}
workclass_mapper_inv = dict(map(reversed, workclass_mapper.items()))

'------------------------------------------------------------------------'

education_mapper = {
    0 : '11th', 1 : 'HS-grad', 2 : 'Assoc-acdm', 3 : 'Some-college',
    4 : '10th', 5 : 'Prof-school', 6 : '7th-8th', 7 : 'Bachelors', 
    8 : 'Masters' , 9 : '5th-6th', 10 : 'Assoc-voc', 11 : '9th' ,
    12 : 'Doctorate', 13 : '12th', 14 : '1st-4th', 15 : 'Preschool'
}
education_mapper_inv = dict(map(reversed, education_mapper.items()))

'------------------------------------------------------------------------'

maritalStatus_mapper = {
    0 : 'Never-married', 1 : 'Married-civ-spouse', 2 : 'Widowed', 
    3 : 'Separated',4 : 'Divorced', 5 : 'Married-spouse-absent', 
    6 : 'Married-AF-spouse'
}
maritalStatus_mapper_inv = dict(map(reversed, maritalStatus_mapper.items()))

'------------------------------------------------------------------------'

occupation_mapper = {
    0 : 'Machine-op-inspct', 1 : 'Farming-fishing', 2 : 'Protective-serv', 3 : 'Other-service',
    4 : 'Prof-specialty', 5 : 'Craft-repair', 6 : 'Adm-clerical', 7 : 'Exec-managerial', 
    8 : 'Tech-support' , 9 : 'Sales', 10 : 'Priv-house-serv', 11 : 'Transport-moving' ,
    12 : 'Handlers-cleaners', 13 : 'Armed-Forces'
}
occupation_mapper_inv = dict(map(reversed, occupation_mapper.items()))

'------------------------------------------------------------------------'

relationship_mapper = {
    0 : 'Own-child', 1 : 'Husband', 2 : 'Not-in-family', 
    3 : 'Unmarried', 4 : 'Wife', 5 : 'Other-relative'
}
relationship_mapper_inv = dict(map(reversed, relationship_mapper.items()))

'------------------------------------------------------------------------'

race_mapper = {
    0 : 'Black', 1 : 'White', 2 : 'Other', 
    3 : 'Amer-Indian-Eskimo', 4 : 'Asian-Pac-Islander'
}
race_mapper_inv = dict(map(reversed, race_mapper.items()))

'------------------------------------------------------------------------'
gender_mapper = {
    0 : 'Male', 1 : 'Female'
}
gender_mapper_inv = dict(map(reversed, gender_mapper.items()))

In [12]:
data_test_df_copy = data_test_df.copy()

In [13]:
data_test_df_copy['workclass'] = data_test_df_copy['workclass'].replace(workclass_mapper_inv)
data_test_df_copy['education'] = data_test_df_copy['education'].replace(education_mapper_inv)
data_test_df_copy['marital-status'] = data_test_df_copy['marital-status'].replace(maritalStatus_mapper_inv)
data_test_df_copy['occupation'] = data_test_df_copy['occupation'].replace(occupation_mapper_inv)
data_test_df_copy['relationship'] = data_test_df_copy['relationship'].replace(relationship_mapper_inv)
data_test_df_copy['race'] = data_test_df_copy['race'].replace(race_mapper_inv)
data_test_df_copy['gender'] = data_test_df_copy['gender'].replace(gender_mapper_inv)
data_test_df_copy.head()

Unnamed: 0,age,fnlwgt,educational-num,hours-per-week,workclass,education,marital-status,occupation,relationship,race,gender
13471,23.0,39551.0,12.0,12.0,0,2,0,9,0,1,1
33969,47.0,103020.0,4.0,40.0,0,6,1,0,4,2,1
38310,23.0,139012.0,11.0,40.0,0,10,0,11,0,4,0
44991,22.0,141028.0,9.0,30.0,0,1,0,3,0,0,0
14830,49.0,340755.0,14.0,40.0,2,8,1,7,1,1,0


In [14]:
data_test_copy = data_test_df_copy.values
numerical_cols = np.arange(0,len(numerical_cols_names)) 
categorical_cols = np.arange(len(numerical_cols_names),data_df.shape[1])

## Neighbors Generation

In [15]:
# generate neighbors : 
nb_neighbors = 50
list_neigh = generate_all_neighbors(data_test_copy,numerical_cols,categorical_cols,nb_neighbors)

In [16]:
" store all the neighbors together "
n = np.size(data_test_copy,0)
all_neighbors = list_neigh[0]
for i in range(1,n) :
    all_neighbors = np.concatenate((all_neighbors, list_neigh[i]), axis=0)

### One hot encoding 

In [17]:
df_neigh = pd.DataFrame(data = all_neighbors,columns= numerical_cols_names + categorical_cols_names)
df_neigh[categorical_cols_names] = df_neigh[categorical_cols_names].astype(int,errors='ignore')

" Decode all the data neighbors to perform one hot encoding "

df_neigh['workclass'] = df_neigh['workclass'].replace(workclass_mapper)
df_neigh['education'] = df_neigh['education'].replace(education_mapper)
df_neigh['marital-status'] = df_neigh['marital-status'].replace(maritalStatus_mapper)
df_neigh['occupation'] = df_neigh['occupation'].replace(occupation_mapper)
df_neigh['relationship'] = df_neigh['relationship'].replace(relationship_mapper)
df_neigh['race'] = df_neigh['race'].replace(race_mapper)
df_neigh['gender'] = df_neigh['gender'].replace(gender_mapper)

In [18]:
" One hot encoding "
df_neigh = pd.get_dummies(df_neigh, prefix_sep='_', drop_first=True)
df_neigh.head()

Unnamed: 0,age,fnlwgt,educational-num,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male
0,21.822362,23673.568824,12.566528,13.061515,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,1
1,23.125444,44901.575241,11.82097,13.87539,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,22.702125,64941.747443,11.990287,12.540356,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
3,25.494887,54006.638258,12.144477,14.44478,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,22.932401,36988.576943,11.742537,11.915675,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [19]:
" Scale the neighbors data "
data_neigh = df_neigh.values

scaler_neigh = StandardScaler()
data_neigh_s = scaler_neigh.fit_transform(data_neigh)

" Store the neighbors in a list "
n = np.size(data_test_copy,0)
list_neigh = []
j = 0
for i in range(0,n):
    list_neigh.append(data_neigh_s[j:(j+nb_neighbors),:])
    j += nb_neighbors

####  One hot encoding for the training and the test sets

In [20]:
data_train_df = pd.get_dummies(data_train_df, prefix_sep='_', drop_first=True)
data_train_df.head()

Unnamed: 0,age,fnlwgt,educational-num,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male
0,25.0,226802.0,7.0,40.0,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
1,38.0,89814.0,9.0,50.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,28.0,336951.0,12.0,40.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,44.0,160323.0,10.0,40.0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
5,34.0,198693.0,6.0,30.0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,1


In [21]:
data_train = data_train_df.values
target_train = target_train_df.values

In [22]:
data_test_df = pd.get_dummies(data_test_df, prefix_sep='_', drop_first=True)
data_test_df.head()

Unnamed: 0,age,fnlwgt,educational-num,hours-per-week,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,...,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Asian-Pac-Islander,race_Black,race_Other,race_White,gender_Male
13471,23.0,39551.0,12.0,12.0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
33969,47.0,103020.0,4.0,40.0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
38310,23.0,139012.0,11.0,40.0,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,1
44991,22.0,141028.0,9.0,30.0,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
14830,49.0,340755.0,14.0,40.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,1


In [23]:
data_test = data_test_df.values
target_test = target_test_df.values

In [24]:
" Scale the training and the test sets data"
scaler_train = StandardScaler()
data_train_s = scaler_train.fit_transform(data_train)

scaler_test = StandardScaler()
data_test_s = scaler_test.fit_transform(data_test)

In [25]:
" Define the functions to save and load data "
import pickle
def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [26]:
'SAVE THE DATA'

path = './saved_data/'
save_obj(data_train_s, path + 'data_train_s')
save_obj(target_train, path + 'target_train')
save_obj(data_test, path  + 'data_test')
save_obj(data_test_s, path  + 'data_test_s')
save_obj(target_test, path + 'target_test')
save_obj(list_neigh, path + 'list_neighbors')

## Training the models

In [27]:
" Logistic Regression : "
lr = LogisticRegression(class_weight = "balanced",random_state=0,max_iter = 1000)
model_lr = lr.fit(data_train_s,target_train)
target_pred_lr = model_lr.predict(data_test_s)

In [40]:
" Random Forest : "
rdclassifier = RandomForestClassifier(max_depth=8, random_state=0) 
model_rd = rdclassifier.fit(data_train_s,target_train)
target_pred_rd = model_rd.predict(data_test_s)

In [29]:
" SVM : "
clf = svm.SVC(probability=True,decision_function_shape='ovr')
model_svm = clf.fit(data_train_s, target_train)
target_pred_svm = model_svm.predict(data_test_s)

In [30]:
" MLP Classifier : "
mlp = MLPClassifier(activation='logistic',hidden_layer_sizes=(50,50), max_iter=5000,
                    solver='sgd', random_state=1,alpha=0.1,
                    learning_rate_init=.1)
model_nt = mlp.fit(data_train_s, target_train)
target_pred_mlp = model_nt.predict(data_test_s)

## Scores of the black box models 

In [41]:
print(f"{'The score of the logistic regression model is ' :<50}{': {}'.format(round(f1_score(target_test,target_pred_lr),4))}")
print(f"{'The score of the Random Forest  model is ' :<50}{': {}'.format(round(f1_score(target_test,target_pred_rd),4))}")
print(f"{'The score of the SVM model is ' :<50}{': {}'.format(round(f1_score(target_test,target_pred_svm),4))}")
print(f"{'The score of the Multi-Layer-Perceptron model is ' :<50}{': {}'.format(round(f1_score(target_test,target_pred_mlp),4))}")

The score of the logistic regression model is     : 0.8488
The score of the Random Forest  model is          : 0.8871
The score of the SVM model is                     : 0.8793
The score of the Multi-Layer-Perceptron model is  : 0.8882


## Execution of Split Based Selection Form Algorithm : 


In [32]:
split_point = len(numerical_cols)
nb_models = 100
(L_Subgroups,P) = SplitBasedSelectionForm (data_test_s, target_test, nb_models, model_nt, list_neigh,split_point,2)

In [33]:
'SAVE THE LIST OF THE SUBGROUPS'
save_obj(L_Subgroups, path + 'list_subgroups')

### Define patterns


In [34]:
att_names = data_test_df.columns
data_test_means = scaler_test.mean_
data_test_stds = np.sqrt(scaler_test.var_)
patt_descriptions = patterns_sc(P,split_point,data_test_s,att_names,data_test_means,data_test_stds)

subrgoup 0
31.6 < age <= 90.0
7.0 < educational-num <= 10.4
32.3 < hours-per-week <= 35.3
marital-status_Married-civ-spouse = 1
-------------------------------------------------------------------
subrgoup 1
17.0 < age <= 31.6
12.3 < educational-num <= 16.0
55.6 < hours-per-week <= 99.0
-------------------------------------------------------------------
subrgoup 2
31.6 < age <= 90.0
7.0 < educational-num <= 10.4
1.0 < hours-per-week <= 24.2
marital-status_Married-civ-spouse = 0
-------------------------------------------------------------------
subrgoup 3
31.6 < age <= 49.0
13.0 < educational-num <= 16.0
48.6 < hours-per-week <= 59.8
-------------------------------------------------------------------
subrgoup 4
31.6 < age <= 41.0
7.0 < educational-num <= 9.6
48.6 < hours-per-week <= 99.0
-------------------------------------------------------------------
subrgoup 5
31.6 < age <= 90.0
10.4 < educational-num <= 16.0
1.0 < hours-per-week <= 24.8
--------------------------------------------

In [35]:
'SAVE THE SUBGROUPS PATTERNS'
save_obj(patt_descriptions, path + 'patterns')
save_obj(att_names, path + 'att_names')