In [57]:
import lore
from datamanager import *

from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, accuracy_score

from util import record2str

ipynb to py in pycharm:
- jupyter nbconvert --to script new_run.ipynb

## Dataset

In [66]:
## Iris Dataset
# dataset_name = 'dataset/iris.csv'
# dataset = prepare_iris_dataset(dataset_name)

## wine
# dataset_name = 'dataset/wine.csv'
# dataset = prepare_wine_dataset(dataset_name)

##############################################
#           Categorical dataset              #
##############################################
## german: (0 = Good, 1 = Bad)
# dataset_name = 'dataset/german_credit.csv'
# dataset = prepare_german_dataset(dataset_name)

## adult: ['<=50K', '>50K']
# dataset_name = 'dataset/adult.csv'
# dataset = prepare_adult_dataset(dataset_name)

## compas-scores-two-years: ['High', 'Low', 'Medium']
dataset_name = 'dataset/compas-scores-two-years.csv'
dataset = prepare_compass_dataset(dataset_name)

dataframe = dataset[0]
class_name = dataset[1]
dataset_fin = prepare_dataset(dataframe, class_name)

In [67]:
df = dataset_fin[0] #dataframe with unique numeric class values(0, 1, ...)
feature_names = dataset_fin[1]
class_values = dataset_fin[2]
numeric_columns = dataset_fin[3]
rdf = dataset_fin[4] #real dataframe
real_feature_names = dataset_fin[5]
features_map = dataset_fin[6] #map each class name to its unique numeric value

In [68]:
rdf.head()

Unnamed: 0,age,priors_count,days_b_screening_arrest,is_recid,is_violent_recid,two_year_recid,length_of_stay,age_cat,sex,race,c_charge_degree,class
0,69,0,1,0,0,0,0,Greater than 45,Male,Other,F,Low
1,34,0,1,1,1,1,10,25 - 45,Male,African-American,F,Low
2,24,4,1,1,0,1,1,Less than 25,Male,African-American,F,Low
3,23,1,1,0,0,0,0,Less than 25,Male,African-American,F,High
4,43,2,1,0,0,0,0,25 - 45,Male,Other,F,Low


In [69]:
df.head()

Unnamed: 0,age,priors_count,days_b_screening_arrest,is_recid,is_violent_recid,two_year_recid,length_of_stay,age_cat=25 - 45,age_cat=Greater than 45,age_cat=Less than 25,...,sex=Male,race=African-American,race=Asian,race=Caucasian,race=Hispanic,race=Native American,race=Other,c_charge_degree=F,c_charge_degree=M,class
0,69,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,1,0,1
1,34,0,1,1,1,1,10,1,0,0,...,1,1,0,0,0,0,0,1,0,1
2,24,4,1,1,0,1,1,0,0,1,...,1,1,0,0,0,0,0,1,0,1
3,23,1,1,0,0,0,0,0,0,1,...,1,1,0,0,0,0,0,1,0,0
4,43,2,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,1,0,1


## Black box classifier

In [70]:
X = df.loc[:, df.columns != class_name].values
y = df[class_name].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
blackbox = RandomForestClassifier()
blackbox.fit(X_train, y_train)

RandomForestClassifier()

In [71]:
y_pred = blackbox.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.611


## select an instance _x_

In [72]:
i = 10
x = X_test[i]
y_val = blackbox.predict(x.reshape(1,-1))[0]

print(class_values)
class_prob = blackbox.predict_proba(x.reshape(1,-1))[0]
print(class_prob)

y_val_name = class_values[y_val]
print('blackbox(x) = { %s }' % y_val_name)


['High', 'Low', 'Medium']
[0.38 0.01 0.61]
blackbox(x) = { Medium }


In [73]:
print('x = %s' % record2str(x, feature_names, numeric_columns))

x = { age = 26, priors_count = 8, days_b_screening_arrest = 1, is_recid = 1, is_violent_recid = 1, two_year_recid = 1, length_of_stay = 41, age_cat = 25 - 45, sex = Male, race = Caucasian, c_charge_degree = F }


# LORE explainer (explaining an instance x)

In [78]:
lore_obj = lore.LORE(X_test, blackbox, feature_names, class_name, class_values,
                 numeric_columns, features_map, neigh_type='ngmusx', multi_label=False , verbose=False)

In [79]:
# just to check
Z = lore_obj.neighgen_fn(x, upper_threshold=4)
print('Z is:',Z)
Z.shape

Z is: [[ 2.60000000e+01  8.00000000e+00  1.00000000e+00 ...  0.00000000e+00
   1.00000000e+00  0.00000000e+00]
 [ 2.59592168e+01  7.99745029e+00  1.07568820e+00 ... -2.37842458e-02
   9.84631595e-01  1.35616222e-02]
 [ 2.59997265e+01  8.01182259e+00  1.00569185e+00 ... -1.42198793e-02
   9.95998601e-01  1.26461802e-02]
 ...
 [ 2.54964916e+01  7.91231829e+00  9.75403229e-01 ... -9.89460775e-02
   8.92675609e-01 -9.72957107e-02]
 [ 2.54651186e+01  7.95086991e+00  9.11758533e-01 ... -3.69276344e-02
   9.01797746e-01 -6.02134276e-02]
 [ 2.55165808e+01  7.93693373e+00  9.32498211e-01 ... -3.02494268e-02
   9.29798635e-01 -1.29595123e-01]]


(1010, 20)

In [80]:
explanation = lore_obj.explain_instance(x, samples=1000, nbr_runs=10, upper_threshold=32)

print(explanation)

r = { age > 25.50, race = Caucasian } --> { class: Medium }
c = { { race != Caucasian } }


In [81]:
explanation = lore_obj.explain_instance(x, samples=1000, nbr_runs=10, upper_threshold=4)

print(explanation)

r = { age > 25.50, age_cat = 25 - 45 } --> { class: Medium }
c = { { age <= 25.50 } }


In [82]:
print('x = %s' % record2str(x, feature_names, numeric_columns))

x = { age = 26, priors_count = 8, days_b_screening_arrest = 1, is_recid = 1, is_violent_recid = 1, two_year_recid = 1, length_of_stay = 41, age_cat = 25 - 45, sex = Male, race = Caucasian, c_charge_degree = F }


## check the borderline

In [85]:
temp_x = x.copy()

In [86]:
features_map

defaultdict(dict,
            {0: {'age': 0},
             1: {'priors_count': 1},
             2: {'days_b_screening_arrest': 2},
             3: {'is_recid': 3},
             4: {'is_violent_recid': 4},
             5: {'two_year_recid': 5},
             6: {'length_of_stay': 6},
             7: {'25 - 45': 7, 'Greater than 45': 8, 'Less than 25': 9},
             8: {'Female': 10, 'Male': 11},
             9: {'African-American': 12,
              'Asian': 13,
              'Caucasian': 14,
              'Hispanic': 15,
              'Native American': 16,
              'Other': 17},
             10: {'F': 18, 'M': 19}})

In [87]:
#c = { { age <= 25.50 } }
temp_x[0] = 25
print('x = %s' % record2str(temp_x, feature_names, numeric_columns))

x = { age = 25, priors_count = 8, days_b_screening_arrest = 1, is_recid = 1, is_violent_recid = 1, two_year_recid = 1, length_of_stay = 41, age_cat = 25 - 45, sex = Male, race = Caucasian, c_charge_degree = F }


In [89]:
print(class_values)
print(blackbox.predict_proba(temp_x.reshape(1,-1))[0])
print(class_values[blackbox.predict(temp_x.reshape(1,-1))[0]])

['High', 'Low', 'Medium']
[0.66 0.02 0.32]
High


In [91]:
explanation_temp = lore_obj.explain_instance(temp_x, samples=1000, nbr_runs=10)

print(explanation_temp)

r = { age_cat != Greater than 45, c_charge_degree != M } --> { class: High }
c = { { age_cat = Greater than 45, c_charge_degree != F },
      { c_charge_degree=M > 0.51, race = Hispanic },
      { c_charge_degree=M > 0.50, race = Hispanic },
      { age_cat = Greater than 45, sex = Female } }


In [92]:
#c = { { race != Caucasian } }
temp_x = x.copy()
temp_x[14] = 0
temp_x[13] = 1
print('x = %s' % record2str(temp_x, feature_names, numeric_columns))

x = { age = 26, priors_count = 8, days_b_screening_arrest = 1, is_recid = 1, is_violent_recid = 1, two_year_recid = 1, length_of_stay = 41, age_cat = 25 - 45, sex = Male, race = Asian, c_charge_degree = F }


In [93]:
print(class_values)
print(blackbox.predict_proba(temp_x.reshape(1,-1))[0])
print(class_values[blackbox.predict(temp_x.reshape(1,-1))[0]])

['High', 'Low', 'Medium']
[0.47 0.12 0.41]
High


In [98]:
explanation_temp2 = lore_obj.explain_instance(temp_x, samples=1000, nbr_runs=10, upper_threshold=4)

print(explanation_temp2)

r = { length_of_stay > 40.50, c_charge_degree = M } --> { class: High }
c = { { length_of_stay <= 40.50, age > 26.00 },
      { length_of_stay <= 40.39, age > 26.00 } }
