In [1]:
import lore
from datamanager import *

from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, accuracy_score

from util import record2str

ipynb to py in pycharm:
- jupyter nbconvert --to script new_run.ipynb

## Dataset

In [21]:
## Iris Dataset
# dataset_name = 'dataset/iris.csv'
# dataset = prepare_iris_dataset(dataset_name)

## wine
# dataset_name = 'dataset/wine.csv'
# dataset = prepare_wine_dataset(dataset_name)

##############################################
#           Categorical dataset              #
##############################################
## german: (0 = Good, 1 = Bad)
# dataset_name = 'dataset/german_credit.csv'
# dataset = prepare_german_dataset(dataset_name)

## adult: ['<=50K', '>50K']
# dataset_name = 'dataset/adult.csv'
# dataset = prepare_adult_dataset(dataset_name)

## compas-scores-two-years: ['High', 'Low', 'Medium']
dataset_name = 'dataset/compas-scores-two-years.csv'
dataset = prepare_compass_dataset(dataset_name)

dataframe = dataset[0]
class_name = dataset[1]
dataset_fin = prepare_dataset(dataframe, class_name)

In [22]:
df = dataset_fin[0] #dataframe with unique numeric class values(0, 1, ...)
feature_names = dataset_fin[1]
class_values = dataset_fin[2]
numeric_columns = dataset_fin[3]
rdf = dataset_fin[4] #real dataframe
real_feature_names = dataset_fin[5]
features_map = dataset_fin[6] #map each class name to its unique numeric value

In [23]:
rdf.head()

Unnamed: 0,age,priors_count,days_b_screening_arrest,is_recid,is_violent_recid,two_year_recid,length_of_stay,age_cat,sex,race,c_charge_degree,class
0,69,0,1,0,0,0,0,Greater than 45,Male,Other,F,Low
1,34,0,1,1,1,1,10,25 - 45,Male,African-American,F,Low
2,24,4,1,1,0,1,1,Less than 25,Male,African-American,F,Low
3,23,1,1,0,0,0,0,Less than 25,Male,African-American,F,High
4,43,2,1,0,0,0,0,25 - 45,Male,Other,F,Low


In [24]:
df.head()

Unnamed: 0,age,priors_count,days_b_screening_arrest,is_recid,is_violent_recid,two_year_recid,length_of_stay,age_cat=25 - 45,age_cat=Greater than 45,age_cat=Less than 25,...,sex=Male,race=African-American,race=Asian,race=Caucasian,race=Hispanic,race=Native American,race=Other,c_charge_degree=F,c_charge_degree=M,class
0,69,0,1,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,1,0,1
1,34,0,1,1,1,1,10,1,0,0,...,1,1,0,0,0,0,0,1,0,1
2,24,4,1,1,0,1,1,0,0,1,...,1,1,0,0,0,0,0,1,0,1
3,23,1,1,0,0,0,0,0,0,1,...,1,1,0,0,0,0,0,1,0,0
4,43,2,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,1,1,0,1


## Black box classifier

In [28]:
X = df.loc[:, df.columns != class_name].values
y = df[class_name].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
blackbox = RandomForestClassifier()
blackbox.fit(X_train, y_train)

RandomForestClassifier()

In [29]:
y_pred = blackbox.predict(X_test)
print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))

Accuracy: 0.607


## select an instance _x_

In [30]:
i = 10
x = X_test[i]
y_val = blackbox.predict(x.reshape(1,-1))[0]

print(class_values)
class_prob = blackbox.predict_proba(x.reshape(1,-1))[0]
print(class_prob)

y_val_name = class_values[y_val]
print('blackbox(x) = { %s }' % y_val_name)


['High', 'Low', 'Medium']
[0.04 0.45 0.51]
blackbox(x) = { Medium }


In [31]:
print('x = %s' % record2str(x, feature_names, numeric_columns))

x = { age = 37, priors_count = 9, days_b_screening_arrest = 1, is_recid = 0, is_violent_recid = 0, two_year_recid = 0, length_of_stay = 2, age_cat = 25 - 45, sex = Male, race = Hispanic, c_charge_degree = F }


# LORE explainer (explaining an instance x)

In [36]:
lore_obj = lore.LORE(X_test, blackbox, feature_names, class_name, class_values,
                 numeric_columns, features_map, neigh_type='ngmusx', verbose=False)

In [37]:
# just to check
Z = lore_obj.neighgen_fn(x)
print('Z is:',Z)
Z.shape

Z is: [[ 3.70000000e+01  9.00000000e+00  1.00000000e+00 ...  0.00000000e+00
   1.00000000e+00  0.00000000e+00]
 [ 3.70167552e+01  9.07807269e+00  9.60334502e-01 ... -1.47134683e-02
   9.59340391e-01  2.66767921e-02]
 [ 3.69719518e+01  8.99628487e+00  1.01959314e+00 ... -7.13768198e-03
   1.03303459e+00 -3.14706207e-02]
 ...
 [ 3.64655662e+01  8.97898788e+00  1.08179356e+00 ...  1.54733953e-01
   9.66369602e-01 -1.17993381e-01]
 [ 3.64445754e+01  9.00390223e+00  1.09486637e+00 ...  3.39886497e-02
   9.64748199e-01 -1.19137124e-01]
 [ 3.65187730e+01  8.99190464e+00  1.06826529e+00 ...  1.09027263e-01
   9.75887212e-01 -1.38678863e-01]]


(1010, 20)

In [38]:
explanation = lore_obj.explain_instance(x, samples=1000, nbr_runs=10, verbose=True)

print(explanation)

generating neighborhood - ngmusx
synthetic neighborhood class counts {'Low': 582, 'Medium': 428}
r = { age > 36.50, length_of_stay > 1.75 } --> { class: Medium }
c = { { age <= 36.50 } }


In [35]:
print('x = %s' % record2str(x, feature_names, numeric_columns))

x = { age = 37, priors_count = 9, days_b_screening_arrest = 1, is_recid = 0, is_violent_recid = 0, two_year_recid = 0, length_of_stay = 2, age_cat = 25 - 45, sex = Male, race = Hispanic, c_charge_degree = F }


## check the borderline

In [39]:
temp_x = x.copy()

In [40]:
temp_x[0] = 36
print('x = %s' % record2str(temp_x, feature_names, numeric_columns))

x = { age = 36, priors_count = 9, days_b_screening_arrest = 1, is_recid = 0, is_violent_recid = 0, two_year_recid = 0, length_of_stay = 2, age_cat = 25 - 45, sex = Male, race = Hispanic, c_charge_degree = F }


In [41]:
print(class_values)
print(blackbox.predict_proba(temp_x.reshape(1,-1))[0])
print(class_values[blackbox.predict(temp_x.reshape(1,-1))[0]])

['High', 'Low', 'Medium']
[0.05 0.51 0.44]
Low


In [42]:
explanation_temp = lore_obj.explain_instance(temp_x, samples=1000, nbr_runs=10, verbose=True)

print(explanation_temp)

generating neighborhood - ngmusx
synthetic neighborhood class counts {'Low': 412, 'Medium': 598}
r = { age > 35.92, sex = Male } --> { class: Low }
c = { { age <= 35.92, race != Hispanic },
      { age <= 35.92, is_violent_recid > 0.50 },
      { age <= 35.92, age_cat != 25 - 45 },
      { age <= 35.92, race = African-American },
      { age <= 35.92, age_cat = Greater than 45 } }


In [59]:
features_map

defaultdict(dict,
            {0: {'age': 0},
             1: {'priors_count': 1},
             2: {'days_b_screening_arrest': 2},
             3: {'is_recid': 3},
             4: {'is_violent_recid': 4},
             5: {'two_year_recid': 5},
             6: {'length_of_stay': 6},
             7: {'25 - 45': 7, 'Greater than 45': 8, 'Less than 25': 9},
             8: {'Female': 10, 'Male': 11},
             9: {'African-American': 12,
              'Asian': 13,
              'Caucasian': 14,
              'Hispanic': 15,
              'Native American': 16,
              'Other': 17},
             10: {'F': 18, 'M': 19}})

In [67]:
temp_x[15]

1

In [70]:
# c =  { age <= 35.92, race = African-American }
temp_x[0] = 35
temp_x[15] = 0
temp_x[12] = 1

print('x = %s' % record2str(temp_x, feature_names, numeric_columns))

x = { age = 35, priors_count = 9, days_b_screening_arrest = 1, is_recid = 0, is_violent_recid = 0, two_year_recid = 0, length_of_stay = 2, age_cat = 25 - 45, sex = Male, race = African-American, c_charge_degree = F }


In [71]:
print(class_values)
print(blackbox.predict_proba(temp_x.reshape(1,-1))[0])
print(class_values[blackbox.predict(temp_x.reshape(1,-1))[0]])

['High', 'Low', 'Medium']
[0.09 0.08 0.83]
Medium


In [72]:
explanation_temp2 = lore_obj.explain_instance(temp_x, samples=1000, nbr_runs=10, verbose=True)

print(explanation_temp2)

generating neighborhood - ngmusx
synthetic neighborhood class counts {'High': 330, 'Low': 334, 'Medium': 346}
r = { race = African-American, age_cat != Less than 25, c_charge_degree != M } --> { class: Medium }
c = { { race != African-American, priors_count <= 8.50 },
      { race != African-American, age <= 34.50 },
      { race != African-American, race = Caucasian } }
