In [8]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import time
import pickle
from rf_counterfactuals import RandomForestExplainer, visualize, evaluate_counterfactual, evaluate_counterfactual_set
from rf_counterfactuals.single_cf_costs_functions import heterogeneous_euclidean_overlap_metric, unmatched_components_distance
import os
from collections import defaultdict

from sklearn import preprocessing
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, pairwise_distances
from sklearn.neighbors import NearestNeighbors
# Directory holding the dataset CSV files, relative to the working directory
# (the loading cell reads "adult.csv" from here).
DATASET_PATH = "./datasets/"

In [91]:
# Load the Adult dataset, label-encode its categorical columns in place and
# build the feature matrix X / target vector y used by the experiment below.
adult_dataset = pd.read_csv(os.path.join(DATASET_PATH, "adult.csv"))

class_feature = "income"
feature_names = [c for c in adult_dataset.columns if c != class_feature]
# Positions within `feature_names` (class column excluded) of the columns
# that are treated as categorical.
categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]
to_encode = [c for no, c in enumerate(feature_names) if no in categorical_features]

# One LabelEncoder per encoded column, created lazily and keyed by column
# name; the fitted encoders stay available in `d` after this cell runs.
# `defaultdict` comes from the notebook's import cell.
d = defaultdict(preprocessing.LabelEncoder)
adult_dataset[to_encode] = adult_dataset[to_encode].apply(
    lambda column: d[column.name].fit_transform(column)
)

X = adult_dataset[feature_names]
y = adult_dataset[class_feature]

# Only the first N_ROWS rows are used downstream
# (presumably to keep the counterfactual search time manageable - confirm).
N_ROWS = 1000
X = X.iloc[:N_ROWS]
y = y.iloc[:N_ROWS]

In [93]:
# Cross-validated comparison of multi-metric counterfactual sets against
# single-metric counterfactual sets, scored with evaluate_counterfactual_set.
SPLITS = 3
SEED = 1000            # one seed for the CV split, the forest and the test-row sampling
N_TEST_SAMPLES = 100   # upper bound on explained test rows per fold
K = 5                  # `k` passed to the explainer and to the evaluation
METRICS = ('hoem', 'unmatched_components', 'implausibility_single')
NEGATIVE_CLASS, POSITIVE_CLASS = '<=50K', '>50K'


def _collect_scores(explainer, instances, pareto_sets, single_metric_sets, scores, k=K):
    """Evaluate counterfactual sets per instance and append the results to ``scores``.

    Parameters
    ----------
    explainer : explainer object forwarded to ``evaluate_counterfactual_set``.
    instances : DataFrame whose i-th row (by position) is the explained instance.
    pareto_sets : sequence of multi-metric counterfactual sets, indexed like ``instances``.
    single_metric_sets : dict mapping a ``scores`` key to a sequence of
        single-metric counterfactual sets, indexed like ``instances``.
    scores : dict of lists, mutated in place.
    k : forwarded to ``evaluate_counterfactual_set``.

    The size of every multi-metric set is always recorded under "cfs_count".
    Instances with fewer than two multi-metric counterfactuals are not
    evaluated. Single-metric sets are cut to the size of the multi-metric set
    before evaluation.
    """
    for i in range(len(pareto_sets)):
        pareto_cfs = pareto_sets[i]
        cfs_count = len(pareto_cfs)
        scores["cfs_count"].append(cfs_count)

        if cfs_count < 2:
            continue

        instance = instances.iloc[i]
        evaluation = evaluate_counterfactual_set(explainer, instance, pareto_cfs, k=k)
        scores["pareto"].append(list(evaluation.values()))

        for score_key, cf_sets in single_metric_sets.items():
            truncated_cfs = cf_sets[i].iloc[:cfs_count]
            evaluation = evaluate_counterfactual_set(explainer, instance, truncated_cfs, k=k)
            scores[score_key].append(list(evaluation.values()))


skf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=SEED)
eps = 0.1
categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]
frozen_features = [8, 9]
left_frozen_features = [0]

scores = defaultdict(list)
accuracy = []

for split, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    start_time = time.time()
    print(split, "/", SPLITS)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Seeded so that accuracy and the generated counterfactual inputs are
    # reproducible across kernel restarts.
    rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=SEED)
    rf.fit(X_train, y_train)
    accuracy.append(accuracy_score(y_test, rf.predict(X_test)))

    # Explain only a seeded subsample of the fold; the cap avoids a ValueError
    # when a fold holds fewer rows than N_TEST_SAMPLES.
    X_test = X_test.sample(min(N_TEST_SAMPLES, len(X_test)), random_state=SEED)

    y_hat = rf.predict(X_test)
    X_test_0 = X_test[y_hat == NEGATIVE_CLASS]
    X_test_1 = X_test[y_hat == POSITIVE_CLASS]

    rfe = RandomForestExplainer(rf, X_train, y_train, categorical_features=categorical_features,
                                left_frozen_features=left_frozen_features, frozen_features=frozen_features)

    # Multi-metric counterfactuals: flip each predicted class to the other one.
    cfs0 = rfe.explain_with_multiple_metrics(X_test_0, POSITIVE_CLASS, eps=eps, k=K, metrics=METRICS)
    cfs1 = rfe.explain_with_multiple_metrics(X_test_1, NEGATIVE_CLASS, eps=eps, k=K, metrics=METRICS)

    # NOTE(review): the explainer is rebuilt with identical arguments before the
    # single-metric runs. Kept as in the original because it is not visible here
    # whether the explainer carries state between calls - confirm and drop if not.
    rfe = RandomForestExplainer(rf, X_train, y_train, categorical_features=categorical_features,
                                left_frozen_features=left_frozen_features, frozen_features=frozen_features)

    cfs0_single_hoem = rfe.explain_with_single_metric(X_test_0, POSITIVE_CLASS, eps=eps, metric='hoem', k=K, limit=None)
    cfs1_single_hoem = rfe.explain_with_single_metric(X_test_1, NEGATIVE_CLASS, eps=eps, metric='hoem', k=K, limit=None)

    cfs0_single_ucr = rfe.explain_with_single_metric(X_test_0, POSITIVE_CLASS, eps=eps, metric='unmatched_components', k=K, limit=None)
    cfs1_single_ucr = rfe.explain_with_single_metric(X_test_1, NEGATIVE_CLASS, eps=eps, metric='unmatched_components', k=K, limit=None)

    cfs0_single_im = rfe.explain_with_single_metric(X_test_0, POSITIVE_CLASS, eps=eps, metric='implausibility_single', k=K, limit=None)
    cfs1_single_im = rfe.explain_with_single_metric(X_test_1, NEGATIVE_CLASS, eps=eps, metric='implausibility_single', k=K, limit=None)

    _collect_scores(
        rfe, X_test_0, cfs0,
        {"hoem": cfs0_single_hoem, "ucr": cfs0_single_ucr, "implausibility": cfs0_single_im},
        scores,
    )
    _collect_scores(
        rfe, X_test_1, cfs1,
        {"hoem": cfs1_single_hoem, "ucr": cfs1_single_ucr, "implausibility": cfs1_single_im},
        scores,
    )

    print("Split time: ", time.time() - start_time, "s")

accuracy_mean, accuracy_std = np.mean(accuracy), np.std(accuracy)
accuracy_mean

1 / 3
[1/3] Extracting positive paths.
[2/3] Generating counterfactual examples for each tree. Total number of tasks: 100


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done  96 out of 100 | elapsed:    5.5s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.6s finished


[3/3] Calculating loss function. Total number of tasks: 94


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:  2.9min

KeyboardInterrupt

Process LokyProcess-162:
Traceback (most recent call last):
  File "/home/mleszczyk/python-venvs/rf-counterfactuals-venv/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 595, in __call__
    return self.func(*args, **kwargs)
  File "/home/mleszczyk/python-venvs/rf-counterfactuals-venv/lib/python3.8/site-packages/joblib/parallel.py", line 262, in __call__
    return [func(*args, **kwargs)
  File "/home/mleszczyk/python-venvs/rf-counterfactuals-venv/lib/python3.8/site-packages/joblib/parallel.py", line 262, in <li

In [74]:
# Predicted labels for the sampled test rows of the fold processed last by
# the experiment loop.
# NOTE(review): this cell's execution count (74) precedes the experiment
# cell's (93), so the saved output may come from an earlier run.
y_hat

array(['<=50K', '<=50K', '<=50K', '>50K', '>50K', '<=50K', '<=50K',
       '<=50K', '<=50K', '<=50K', '>50K', '<=50K', '<=50K', '<=50K',
       '<=50K', '>50K', '<=50K', '<=50K', '<=50K', '<=50K'], dtype=object)

In [75]:
# Shape of the sampled test rows predicted as '>50K'
# (same selection the experiment loop stores as X_test_1).
X_test[y_hat=='>50K'].shape

(4, 14)

In [84]:
# Multi-metric counterfactuals generated for the rows predicted as '<=50K'
# in the fold processed last; appears to hold one DataFrame per explained row.
# NOTE(review): this dumps every DataFrame in the list - consider inspecting
# a single entry instead.
cfs0

[     age  workclass         fnlwgt  education  educational-num  \
 93  59.0        1.0   98984.000000        9.0             13.0   
 93  59.0        1.0  321981.961552        9.0             13.0   
 93  59.0        1.0   98984.000000        9.0             13.0   
 93  59.0        1.0  329690.461552        9.0             13.0   
 93  59.0        1.0   98984.000000        9.0             13.0   
 93  59.0        1.0   98984.000000        9.0             13.0   
 93  59.0        4.0   98984.000000        9.0             13.0   
 
     marital-status  occupation  relationship  race  gender  capital-gain  \
 93             0.0         1.0           1.0   4.0     1.0   2803.182602   
 93             0.0         1.0           1.0   4.0     1.0   4900.682602   
 93             0.0         1.0           0.0   4.0     1.0   2803.182602   
 93             0.0         1.0           1.0   4.0     1.0      0.000000   
 93             0.0         1.0           1.0   4.0     1.0   8293.682602   


In [82]:
# Single-metric ('hoem') counterfactuals for the row at position 4 of X_test_0.
cfs0_single_hoem[4]

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
63,33.0,4.0,281384.0,11.0,9.0,4.0,7.0,3.0,4.0,0.0,8293.682602,0.0,44.706034,39.0
63,33.0,4.0,281384.0,11.0,9.0,4.0,7.0,0.0,4.0,0.0,2803.182602,0.0,50.206034,39.0
