In [98]:
import lore
from datamanager import *

from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, accuracy_score

from util import record2str

ipynb to py in pycharm:
- jupyter nbconvert --to script new_run.ipynb

## Dataset

In [99]:
## Iris Dataset
# dataset_name = 'dataset/iris.csv'
# dataset = prepare_iris_dataset(dataset_name)

## wine
# dataset_name = 'dataset/wine.csv'
# dataset = prepare_wine_dataset(dataset_name)

##############################################
#           Categorical dataset              #
##############################################
## german: (0 = Good, 1 = Bad)
# dataset_name = 'dataset/german_credit.csv'
# dataset = prepare_german_dataset(dataset_name)

## adult: ['<=50K', '>50K']
dataset_name = 'dataset/adult.csv'
dataset = prepare_adult_dataset(dataset_name)

## compas-scores-two-years: ['High', 'Low', 'Medium']
# dataset_name = 'dataset/compas-scores-two-years.csv'
# dataset = prepare_compass_dataset(dataset_name)

dataframe = dataset[0]
class_name = dataset[1]
dataset_fin = prepare_dataset(dataframe, class_name)

In [100]:
df = dataset_fin[0] #dataframe with unique numeric class values(0, 1, ...)
feature_names = dataset_fin[1]
class_values = dataset_fin[2]
numeric_columns = dataset_fin[3]
rdf = dataset_fin[4] #real dataframe
real_feature_names = dataset_fin[5]
features_map = dataset_fin[6] #map each class name to its unique numeric value

In [101]:
rdf.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass,education,marital-status,occupation,relationship,race,sex,native-country,class
0,39,2174,0,40,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,50,0,0,13,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,38,0,0,40,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,53,0,0,40,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,28,0,0,40,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [102]:
df.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass=Federal-gov,workclass=Local-gov,workclass=Never-worked,workclass=Private,workclass=Self-emp-inc,workclass=Self-emp-not-inc,...,native-country=Puerto-Rico,native-country=Scotland,native-country=South,native-country=Taiwan,native-country=Thailand,native-country=Trinadad&Tobago,native-country=United-States,native-country=Vietnam,native-country=Yugoslavia,class
0,39,2174,0,40,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,0,0,13,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,38,0,0,40,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,0,0,40,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,0,0,40,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Black box classifier

In [103]:
X = df.loc[:, df.columns != class_name].values
y = df[class_name].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
blackbox = RandomForestClassifier()
blackbox.fit(X_train, y_train)

RandomForestClassifier()

In [104]:
y_pred = blackbox.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.850


## select an instance _x_

In [105]:
i = 10
x = X_test[i]
y_val = blackbox.predict(x.reshape(1,-1))[0]

print(class_values)
class_prob = blackbox.predict_proba(x.reshape(1,-1))[0]
print(class_prob)

y_val_name = class_values[y_val]
print('blackbox(x) = { %s }' % y_val_name)


['<=50K', '>50K']
[0.42 0.58]
blackbox(x) = { >50K }


In [106]:
print('x = %s' % record2str(x, feature_names, numeric_columns))

x = { age = 30, capital-gain = 0, capital-loss = 1977, hours-per-week = 40, workclass = Private, education = Doctorate, marital-status = Married-civ-spouse, occupation = Prof-specialty, relationship = Husband, race = Asian-Pac-Islander, sex = Male, native-country = China }


# LORE explainer (explaining an instance x)

In [107]:
lore_obj = lore.LORE(X_test, blackbox, feature_names, class_name, class_values,
                 numeric_columns, features_map, neigh_type='ngmusx', verbose=False)

In [108]:
# just to check
Z = lore_obj.neighgen_fn(x)
print('Z is:',Z)
Z.shape

Z is: [[ 3.00000000e+01  0.00000000e+00  1.97700000e+03 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 2.99867599e+01 -3.25485724e-02  1.97701552e+03 ...  1.38403217e-03
   1.51129317e-03 -1.83794464e-02]
 [ 2.99804587e+01  2.53484207e-03  1.97701009e+03 ...  2.24775341e-03
  -4.51324608e-03  1.17690940e-02]
 ...
 [ 2.97063858e+01 -5.36434170e-02  1.96196730e+03 ...  8.65295518e-02
  -8.85785446e-02 -7.86957162e-02]
 [ 2.97429244e+01 -4.70509344e-02  1.96198181e+03 ...  7.30957001e-02
  -8.59698818e-02 -5.74963411e-02]
 [ 2.97216115e+01 -5.57548152e-02  1.96197492e+03 ...  9.02634721e-02
  -1.00604553e-01 -7.98644057e-02]]


(1010, 103)

In [109]:
explanation = lore_obj.explain_instance(x, samples=1000, nbr_runs=10, verbose=True)

print(explanation)

generating neighborhood - ngmusx
synthetic neighborhood class counts {'<=50K': 778, '>50K': 232}
r = { hours-per-week > 39.93 } --> { class: >50K }
c = { { hours-per-week <= 39.93, occupation = Transport-moving },
      { hours-per-week <= 39.93, marital-status = Never-married },
      { hours-per-week <= 39.93, occupation = Farming-fishing } }


In [110]:
print('x = %s' % record2str(x, feature_names, numeric_columns))

x = { age = 30, capital-gain = 0, capital-loss = 1977, hours-per-week = 40, workclass = Private, education = Doctorate, marital-status = Married-civ-spouse, occupation = Prof-specialty, relationship = Husband, race = Asian-Pac-Islander, sex = Male, native-country = China }


## check the borderline

In [74]:
temp_x = x.copy()

In [75]:
temp_x[3] = temp_x[3]+1
print('x = %s' % record2str(temp_x, feature_names, numeric_columns))

x = { sepal_length = 6.0, sepal_width = 2.9, petal_length = 4.5, petal_width = 2.5 }


In [76]:
print(class_values)
print(blackbox.predict_proba(temp_x.reshape(1,-1))[0])
print(class_values[blackbox.predict(temp_x.reshape(1,-1))[0]])

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
[0.   0.55 0.45]
Iris-versicolor


In [77]:
explanation_temp = lore_obj.explain_instance(temp_x, samples=1000, nbr_runs=10, verbose=True)

print(explanation_temp)

generating neighborhood - ngmusx
synthetic neighborhood class counts {'Iris-versicolor': 714, 'Iris-virginica': 296}
r = { sepal_length <= 6.05, sepal_width > 2.82 } --> { class: Iris-versicolor }
c = { { sepal_length > 6.05, sepal_width <= 2.85 } }


In [78]:
temp_x[0] = 6.5
print('x = %s' % record2str(temp_x, feature_names, numeric_columns))

x = { sepal_length = 6.5, sepal_width = 2.9, petal_length = 4.5, petal_width = 2.5 }


In [79]:
print(class_values)
print(blackbox.predict_proba(temp_x.reshape(1,-1))[0])
print(class_values[blackbox.predict(temp_x.reshape(1,-1))[0]])

['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
[0.   0.43 0.57]
Iris-virginica


In [80]:
explanation_temp2 = lore_obj.explain_instance(temp_x, samples=1000, nbr_runs=10, verbose=True)

print(explanation_temp2)

generating neighborhood - ngmusx
synthetic neighborhood class counts {'Iris-versicolor': 469, 'Iris-virginica': 541}
r = { sepal_width <= 3.10, petal_length > 4.41 } --> { class: Iris-virginica }
c = { { sepal_width > 3.10 } }
