In [1]:
import lore
from datamanager import *

from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, accuracy_score

from util import record2str

ipynb to py in pycharm:
- jupyter nbconvert --to script new_run.ipynb

## Dataset

In [2]:
## Iris Dataset
# dataset_name = 'dataset/iris.csv'
# dataset = prepare_iris_dataset(dataset_name)

## wine
# dataset_name = 'dataset/wine.csv'
# dataset = prepare_wine_dataset(dataset_name)

##############################################
#           Categorical dataset              #
##############################################
## german: (0 = Good, 1 = Bad)
# dataset_name = 'dataset/german_credit.csv'
# dataset = prepare_german_dataset(dataset_name)

## adult: ['<=50K', '>50K']
# dataset_name = 'dataset/adult.csv'
# dataset = prepare_adult_dataset(dataset_name)

## compas-scores-two-years: ['High', 'Low', 'Medium']
dataset_name = 'dataset/compas-scores-two-years.csv'
dataset = prepare_compass_dataset(dataset_name)

dataframe = dataset[0]
class_name = dataset[1]
dataset_fin = prepare_dataset(dataframe, class_name)

In [3]:
df = dataset_fin[0] #dataframe with unique numeric class values(0, 1, ...)
feature_names = dataset_fin[1]
class_values = dataset_fin[2]
numeric_columns = dataset_fin[3]
rdf = dataset_fin[4] #real dataframe
real_feature_names = dataset_fin[5]
features_map = dataset_fin[6] #map each class name to its unique numeric value
df_categorical_idx = dataset_fin[7]

In [4]:
rdf.head()

Unnamed: 0,age,priors_count,days_b_screening_arrest,is_recid,is_violent_recid,two_year_recid,length_of_stay,age_cat,sex,race,c_charge_degree,class
0,69,0,1,0,0,0,0,Greater than 45,Male,Other,F,Low
1,34,0,1,1,1,1,10,25 - 45,Male,African-American,F,Low
2,24,4,1,1,0,1,1,Less than 25,Male,African-American,F,Low
3,23,1,1,0,0,0,0,Less than 25,Male,African-American,F,High
4,43,2,1,0,0,0,0,25 - 45,Male,Other,F,Low


In [5]:
df.head()

Unnamed: 0,age,priors_count,days_b_screening_arrest,is_recid,is_violent_recid,two_year_recid,length_of_stay,age_cat=25 - 45,age_cat=Greater than 45,age_cat=Less than 25,...,sex=Male,race=African-American,race=Asian,race=Caucasian,race=Hispanic,race=Native American,race=Other,c_charge_degree=F,c_charge_degree=M,class
0,69,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,1,0,1
1,34,0,1,1,1,1,10,1,0,0,...,1,1,0,0,0,0,0,1,0,1
2,24,4,1,1,0,1,1,0,0,1,...,1,1,0,0,0,0,0,1,0,1
3,23,1,1,0,0,0,0,0,0,1,...,1,1,0,0,0,0,0,1,0,0
4,43,2,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,1,0,1


## Black box classifier

In [6]:
X = df.loc[:, df.columns != class_name].values
y = df[class_name].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
blackbox = RandomForestClassifier()
blackbox.fit(X_train, y_train)

RandomForestClassifier()

In [7]:
y_pred = blackbox.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.591


## select an instance _x_

In [8]:
i = 10
x = X_test[i]
y_val = blackbox.predict(x.reshape(1,-1))[0]

print(class_values)
class_prob = blackbox.predict_proba(x.reshape(1,-1))[0]
print(class_prob)

y_val_name = class_values[y_val]
print('blackbox(x) = { %s }' % y_val_name)


['High', 'Low', 'Medium']
[0.15 0.26 0.59]
blackbox(x) = { Medium }


In [10]:
print('x = %s' % record2str(x, feature_names, numeric_columns))

x = { age = 22, priors_count = 1, days_b_screening_arrest = 19, is_recid = 1, is_violent_recid = 0, two_year_recid = 1, length_of_stay = 17, age_cat = Less than 25, sex = Male, race = Hispanic, c_charge_degree = F }


# LORE explainer (explaining an instance x)

In [11]:
lore_obj = lore.LORE(X_test, blackbox, feature_names, class_name, class_values,
                 numeric_columns, features_map, df_categorical_idx, neigh_type='ngmusx', verbose=False)

In [12]:
# just to check
Z = lore_obj.neighgen_fn(x, categorical_columns=df_categorical_idx)
print('Z is:',Z)
Z.shape

Z is: [[22.          1.         19.         ...  0.          1.
   0.        ]
 [22.05366026  1.00119021 18.98883769 ...  0.          1.
   0.        ]
 [21.97541032  0.97211349 19.07417378 ...  0.          1.
   0.        ]
 ...
 [21.58142125  1.55975748 19.16343229 ...  0.          1.
   0.        ]
 [21.45515226  1.5654605  19.16522187 ...  0.          1.
   0.        ]
 [21.52885987  1.5216077  19.18362474 ...  0.          1.
   0.        ]]


(1010, 20)

In [17]:
explanation = lore_obj.explain_instance(x, samples=1000, nbr_runs=10)

print(explanation)

r = { age > 21.50 } --> { class: Medium }
c = { { age <= 21.23, priors_count > 1.50 } }


In [14]:
print('x = %s' % record2str(x, feature_names, numeric_columns))

x = { age = 22, priors_count = 1, days_b_screening_arrest = 19, is_recid = 1, is_violent_recid = 0, two_year_recid = 1, length_of_stay = 17, age_cat = Less than 25, sex = Male, race = Hispanic, c_charge_degree = F }
