Experiment in which statistics are computed for the Adult and German Statlog datasets, for comparison with gradient-based optimization methods.

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import time
import pickle
from rf_counterfactuals import RandomForestExplainer, visualize, evaluate_counterfactual
from rf_counterfactuals.single_cf_costs_functions import heterogeneous_euclidean_overlap_metric, unmatched_components_distance
import os
from collections import defaultdict

from sklearn import preprocessing
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, pairwise_distances
from sklearn.neighbors import NearestNeighbors
# Directory (relative to the working directory) that holds the CSV data sets read below.
DATASET_PATH = "./datasets/"

In [2]:
# Load the Adult data set and integer-encode its categorical columns.
adult_dataset = pd.read_csv(os.path.join(DATASET_PATH, "adult.csv"))

# Every column except the target is a model feature.
class_feature = "income"
feature_names = [c for c in adult_dataset.columns if c != class_feature]

# Positions (within feature_names) of the columns treated as categorical,
# and the matching column names that need encoding.
categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]
to_encode = [feature_names[pos] for pos in categorical_features]

# One LabelEncoder per encoded column, created lazily and keyed by column name,
# so the fitted encoders stay available after this cell has run.
# (defaultdict comes from the import cell at the top of the notebook.)
d = defaultdict(preprocessing.LabelEncoder)
adult_dataset[to_encode] = adult_dataset[to_encode].apply(
    lambda column: d[column.name].fit_transform(column)
)

# Feature matrix and target vector used by every later cell.
X = adult_dataset[feature_names]
y = adult_dataset[class_feature]

In [3]:
# Train a forest on the complete data set and wrap it in an explainer that is
# given no categorical-feature information.
rf = RandomForestClassifier(max_depth=5, n_estimators=100)
rf.fit(X, y)
rfe_global_no_cat = RandomForestExplainer(rf, X, y)

# Time the construction and fitting of a ball-tree neighbour index (first 100
# rows only) that uses the HEOM distance with the explainer's feature split.
start_time = time.time()
knn_no_cat = NearestNeighbors(
    n_neighbors=2,
    algorithm='ball_tree',
    metric=heterogeneous_euclidean_overlap_metric,
    metric_params={
        'feature_range': rfe_global_no_cat.X_train_stats['range'],
        'cat_features': rfe_global_no_cat.categorical_features,
        'non_cat_features': rfe_global_no_cat.non_categorical_features,
    },
)
knn_no_cat.fit(X.iloc[:100].values)

print("no cat: ", time.time() - start_time, "s")

no cat:  0.22920465469360352 s


In [4]:
# Same forest, but this explainer is told which feature positions are categorical.
rfe_global_cat = RandomForestExplainer(rf, X, y, categorical_features=categorical_features)

# Time the construction and fitting of the HEOM ball-tree index (first 100
# rows only) when the categorical/non-categorical split comes from that explainer.
start_time = time.time()
knn_cat = NearestNeighbors(
    n_neighbors=2,
    algorithm='ball_tree',
    metric=heterogeneous_euclidean_overlap_metric,
    metric_params={
        'feature_range': rfe_global_cat.X_train_stats['range'],
        'cat_features': rfe_global_cat.categorical_features,
        'non_cat_features': rfe_global_cat.non_categorical_features,
    },
)
knn_cat.fit(X.iloc[:100].values)

print("cat: ", time.time() - start_time, "s")

cat:  0.26864194869995117 s


In [8]:
# Cross-validated counterfactual experiment on the Adult data set.
#
# For every fold a forest is trained, 100 test instances are sampled, and for
# each (eps, constraint configuration) pair counterfactuals are generated for
# the sampled instances and for their nearest same-prediction neighbours.
# Per-configuration means of [proximity, beta, sparsity, delta] plus the share
# of instances with a counterfactual are appended to `scores`.

# --- Configuration ------------------------------------------------------------
SPLITS = 3
EPSILON = [0.0001, 0.001, 0.1]
CONFIGURATION = ['no_constraints', 'only_categorical', 'cat+mono', 'cat+freeze']
SEED = 1000        # one seed for folds, forests and sampling so reruns are repeatable
SAMPLE_SIZE = 100  # number of test instances explained per fold

skf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=SEED)

categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]
frozen_features = [8, 9]
left_frozen_features = [0]

# Explainer keyword arguments for each configuration name. Looking the
# constraints up here (instead of reassigning the three lists above inside the
# loop) keeps the module-level lists intact for other cells.
CONSTRAINT_SETS = {
    'no_constraints': {'categorical_features': [],
                       'left_frozen_features': [],
                       'frozen_features': []},
    'only_categorical': {'categorical_features': categorical_features,
                         'left_frozen_features': [],
                         'frozen_features': []},
    'cat+mono': {'categorical_features': categorical_features,
                 'left_frozen_features': left_frozen_features,
                 'frozen_features': []},
    'cat+freeze': {'categorical_features': categorical_features,
                   'left_frozen_features': [],
                   'frozen_features': frozen_features},
}


def _fit_hoem_knn(data, cat_features, non_cat_features):
    """Fit a 2-neighbour ball-tree index on `data` using the HEOM distance.

    Feature ranges always come from the global `rfe_global_cat` explainer;
    `cat_features` / `non_cat_features` are the feature positions passed to
    the metric as categorical and non-categorical respectively.
    """
    knn = NearestNeighbors(n_neighbors=2, algorithm='ball_tree',
                           metric=heterogeneous_euclidean_overlap_metric,
                           metric_params={'feature_range': rfe_global_cat.X_train_stats['range'],
                                          'cat_features': cat_features,
                                          'non_cat_features': non_cat_features})
    knn.fit(data.values)
    return knn


def _score_counterfactuals(explainer, model, instances, cfs, neighbour_cfs, target_class):
    """Return one [proximity, beta, sparsity, delta] row per instance that has a counterfactual.

    `cfs[i]` / `neighbour_cfs[i]` are the counterfactual results for row i of
    `instances` and for that row's nearest neighbour. Instances without a
    counterfactual are skipped. When only the neighbour lacks one, beta is
    recorded as NaN instead of raising IndexError on `.iloc[0]`.
    """
    hoem_args = (explainer.X_train_stats['range'],
                 explainer.categorical_features,
                 explainer.non_categorical_features)
    rows = []
    for i in range(len(cfs)):
        if len(cfs[i]) == 0:
            continue
        instance = instances.iloc[i]
        counterfactual = cfs[i].iloc[0]

        proximity = heterogeneous_euclidean_overlap_metric(instance, counterfactual, *hoem_args)
        sparsity = unmatched_components_distance(instance, counterfactual)
        # Does the forest really assign the requested class to the counterfactual?
        delta = model.predict(counterfactual.to_frame(0).T) == target_class

        if len(neighbour_cfs[i]) == 0:
            beta = np.nan
        else:
            beta = heterogeneous_euclidean_overlap_metric(counterfactual, neighbour_cfs[i].iloc[0],
                                                          *hoem_args)
        rows.append([proximity, beta, sparsity, int(delta[0])])
    return rows


# --- Experiment ---------------------------------------------------------------
scores = defaultdict(list)
accuracy = []
all_feature_positions = list(range(X.shape[1]))

for split, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print(split, "/", SPLITS)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=SEED)
    rf.fit(X_train, y_train)

    # Predict once and reuse the result for both accuracy and the class split.
    y_hat = rf.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_hat))

    X_0 = X_test[y_hat == '<=50K']
    X_1 = X_test[y_hat == '>50K']

    # Neighbour indexes per predicted class, with and without the categorical split.
    start_time = time.time()
    knn_cat_0 = _fit_hoem_knn(X_0, rfe_global_cat.categorical_features,
                              rfe_global_cat.non_categorical_features)
    knn_cat_1 = _fit_hoem_knn(X_1, rfe_global_cat.categorical_features,
                              rfe_global_cat.non_categorical_features)
    knn_no_cat_0 = _fit_hoem_knn(X_0, [], all_feature_positions)
    knn_no_cat_1 = _fit_hoem_knn(X_1, [], all_feature_positions)
    print(time.time() - start_time, "s")

    X_test_sample = X_test.sample(SAMPLE_SIZE, random_state=SEED)
    y_test_sample = rf.predict(X_test_sample)

    X_test_sample_0 = X_test_sample[y_test_sample == '<=50K']
    X_test_sample_1 = X_test_sample[y_test_sample == '>50K']

    # Every sampled row is also part of the fitted set, so the first neighbour
    # is normally the row itself; column 1 is the closest *other* instance.
    start_time = time.time()
    closest_neighbors_cat_0 = knn_cat_0.kneighbors(X_test_sample_0.values, return_distance=False)[:, 1]
    closest_neighbors_cat_1 = knn_cat_1.kneighbors(X_test_sample_1.values, return_distance=False)[:, 1]

    closest_neighbors_no_cat_0 = knn_no_cat_0.kneighbors(X_test_sample_0.values, return_distance=False)[:, 1]
    closest_neighbors_no_cat_1 = knn_no_cat_1.kneighbors(X_test_sample_1.values, return_distance=False)[:, 1]

    X_0_cat_closest = X_0.iloc[closest_neighbors_cat_0]
    X_1_cat_closest = X_1.iloc[closest_neighbors_cat_1]

    X_0_no_cat_closest = X_0.iloc[closest_neighbors_no_cat_0]
    X_1_no_cat_closest = X_1.iloc[closest_neighbors_no_cat_1]
    print(time.time() - start_time, "s")

    for eps in EPSILON:
        for conf in CONFIGURATION:
            print(eps, conf)

            # Fresh list copies so the explainer never shares the config lists.
            constraints = {name: list(positions) for name, positions in CONSTRAINT_SETS[conf].items()}
            rfe = RandomForestExplainer(rf, X_train, y_train, **constraints)

            cfs0 = rfe.explain_with_single_metric(X_test_sample_0, '>50K', eps=eps, metric='hoem', k=1, limit=1)
            cfs1 = rfe.explain_with_single_metric(X_test_sample_1, '<=50K', eps=eps, metric='hoem', k=1, limit=1)

            # Neighbours must come from the index whose metric matches this configuration.
            if conf == 'no_constraints':
                neighbours_0, neighbours_1 = X_0_no_cat_closest, X_1_no_cat_closest
            else:
                neighbours_0, neighbours_1 = X_0_cat_closest, X_1_cat_closest

            cfs0_closest_neighbour = rfe.explain_with_single_metric(neighbours_0, '>50K', eps=eps, metric='hoem', k=1, limit=1)
            cfs1_closest_neighbour = rfe.explain_with_single_metric(neighbours_1, '<=50K', eps=eps, metric='hoem', k=1, limit=1)

            s = (_score_counterfactuals(rfe, rf, X_test_sample_0, cfs0, cfs0_closest_neighbour, '>50K')
                 + _score_counterfactuals(rfe, rf, X_test_sample_1, cfs1, cfs1_closest_neighbour, '<=50K'))
            cfs_count = len(s)  # one row per instance with a counterfactual

            if s:
                # nanmean ignores the NaN betas of instances whose neighbour had no counterfactual.
                metric_means = np.nanmean(np.array(s, dtype=float), axis=0)
            else:
                # No counterfactual at all: keep the 5-column row shape.
                metric_means = np.full(4, np.nan)

            scores[f'{eps}_{conf}'].append(np.hstack([metric_means, cfs_count / len(X_test_sample)]))


accuracy_mean, accuracy_std = np.mean(accuracy), np.std(accuracy)
accuracy_mean

1 / 3
336.7198398113251 s
2343.049512386322 s
0.0001 no_constraints
[1/3] Extracting positive paths.
[2/3] Generating counterfactual examples for each tree. Total number of tasks: 100


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  9.2min
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed: 13.9min

KeyboardInterrupt



In [None]:
# Average the per-fold score vectors of every "<eps>_<conf>" key and show them
# as a table with one row per key.
scores_mean = {
    key: np.mean(fold_scores, axis=0)
    for key, fold_scores in scores.items()
}
pd.DataFrame.from_dict(
    scores_mean,
    orient='index',
    columns=['proximity', 'beta', 'sparsity', 'delta', 'found cfs'],
)