Experiment that computes summary statistics for the Adult and German (Statlog) datasets, for comparison with gradient-based optimization methods

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import time
import pickle
from rf_counterfactuals import RandomForestExplainer, visualize, evaluate_counterfactual
from rf_counterfactuals.single_cf_costs_functions import heterogeneous_euclidean_overlap_metric, unmatched_components_distance
import os
from collections import defaultdict

from joblib import Parallel, delayed
from functools import partial

from sklearn import preprocessing
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, pairwise_distances
from sklearn.neighbors import NearestNeighbors
# Directory that holds the dataset CSV files read by this notebook.
DATASET_PATH = "./datasets/"

# Hyper-parameters passed to every RandomForestClassifier trained in the experiment.
N_ESTIMATORS = 100
MAX_DEPTH = 10
# Worker count handed to joblib.Parallel and to the explainer's n_jobs argument
# (-1 is joblib's convention for "use all available CPUs").
N_JOBS = -1

In [None]:
# --- German credit dataset -------------------------------------------------
DATASET_NAME = 'German'

# Column names, in file order; the file itself carries no header row.
COLUMNS = [
    'checking_account', 'duration', 'credit_history', 'purpose', 'credit_amount',
    'saving_account', 'present_employement', 'installment_rate', 'personal_status',
    'other_debtors', 'present_residence', 'property', 'age', 'installment_plans',
    'housing', 'existing_credits_number', 'job', 'people_to_provide_maintenance',
    'has_telephone', 'foreign', 'risk',
]

german_csv = os.path.join(DATASET_PATH, "german_data.csv")
german_dataset = pd.read_csv(german_csv, sep=' ', names=COLUMNS)

# Name of the target column; every other column is used as a feature.
class_feature = "risk"

# Positional indices into COLUMNS.  CATEGORICAL_FEATURES are label-encoded here;
# FROZEN_FEATURES and LEFT_FROZEN_FEATURES are later passed to
# RandomForestExplainer as frozen_features / left_frozen_features.
CATEGORICAL_FEATURES = [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]
FROZEN_FEATURES = [2]
LEFT_FROZEN_FEATURES = [12]
to_encode = [COLUMNS[position] for position in CATEGORICAL_FEATURES]

# One LabelEncoder per categorical column, created on first access and kept in
# `d` under the column name.
d = defaultdict(preprocessing.LabelEncoder)

for column_name in to_encode:
    german_dataset[column_name] = d[column_name].fit_transform(german_dataset[column_name])

feature_columns = [column for column in german_dataset.columns if column != class_feature]
X = german_dataset[feature_columns]
y = german_dataset[class_feature]

In [None]:
# DATASET_NAME = 'Adult'

# adult_dataset = pd.read_csv(os.path.join(DATASET_PATH, "adult.csv"))
# adult_dataset = adult_dataset.loc[:, adult_dataset.columns!='fnlwgt']

# class_feature = "income"
# feature_names = [c for c in adult_dataset.columns if c != class_feature]
# CATEGORICAL_FEATURES = [1, 2, 4, 5, 6, 7, 8, 12]
# FROZEN_FEATURES = [7, 8]
# LEFT_FROZEN_FEATURES = [0]
# to_encode = [c for no, c in enumerate(feature_names) if no in CATEGORICAL_FEATURES]

# d = defaultdict(preprocessing.LabelEncoder)

# adult_dataset[to_encode] = adult_dataset[to_encode].apply(lambda x: d[x.name].fit_transform(x))

# X = adult_dataset.loc[:, adult_dataset.columns!=class_feature]
# y = adult_dataset[class_feature]

In [None]:
def find_closest_neighbor(rfe, X, x_cf):
    """Return the index label of the row of ``X`` nearest to ``x_cf``, other than ``x_cf`` itself.

    Distances are computed with ``heterogeneous_euclidean_overlap_metric`` using
    the feature ranges and the categorical / non-categorical feature split
    stored on ``rfe``.

    The query row is excluded by its index label (``x_cf.name``) instead of by
    assuming it is the single smallest distance: when ``X`` contains an exact
    duplicate of the query that appears earlier, a rank-based pick would return
    the query row itself.  If the label is not present in ``X``, nothing is
    excluded and the nearest row is returned.

    Args:
        rfe: explainer exposing ``X_train_stats['range']``,
            ``categorical_features`` and ``non_categorical_features``.
        X: DataFrame of candidate rows.
        x_cf: Series holding the query instance.

    Returns:
        Index label (from ``X.index``) of the selected neighbour.

    Raises:
        ValueError: if no candidate row is left after excluding the query.
    """
    distances = X.apply(
        lambda x: heterogeneous_euclidean_overlap_metric(
            x, x_cf,
            rfe.X_train_stats['range'], rfe.categorical_features,
            rfe.non_categorical_features),
        axis=1)

    query_label = getattr(x_cf, 'name', None)
    if query_label is not None:
        # errors='ignore': the query may legitimately be absent from X.
        distances = distances.drop(labels=query_label, errors='ignore')

    if distances.empty:
        raise ValueError("X holds no candidate row other than the query instance")
    return distances.idxmin()

SPLITS = 10
EPSILON = [0.05]
CONFIGURATION = ['no_constraints', 'only_categorical', 'cat+mono', 'cat+freeze']
# Upper bound on the number of test instances explained in each fold.
TEST_SAMPLE_SIZE = 100
# Seed shared by the fold split, the forest and the test-instance sampling.
RANDOM_STATE = 1000


def _constraints_for(conf):
    """Return ``(categorical, left_frozen, frozen)`` feature index lists for ``conf``.

    Raises:
        ValueError: if ``conf`` is not a known configuration name.
    """
    if conf == 'no_constraints':
        return [], [], []
    if conf == 'only_categorical':
        return CATEGORICAL_FEATURES, [], []
    if conf == 'cat+mono':
        return CATEGORICAL_FEATURES, LEFT_FROZEN_FEATURES, []
    if conf == 'cat+freeze':
        return CATEGORICAL_FEATURES, LEFT_FROZEN_FEATURES, FROZEN_FEATURES
    raise ValueError(f"Unknown configuration: {conf!r}")


def _closest_neighbors(rfe, X_pool, X_sample):
    """Return the rows of ``X_pool`` picked by ``find_closest_neighbor`` for each row of ``X_sample``."""
    indices = Parallel(n_jobs=N_JOBS)(
        delayed(find_closest_neighbor)(rfe, X_pool, X_sample.iloc[i])
        for i in range(X_sample.shape[0])
    )
    return X_pool.loc[indices]


def _hoem(rfe, a, b):
    """Heterogeneous Euclidean-overlap distance between ``a`` and ``b`` using the statistics stored on ``rfe``."""
    return heterogeneous_euclidean_overlap_metric(a, b,
                                                  rfe.X_train_stats['range'], rfe.categorical_features,
                                                  rfe.non_categorical_features)


def _score_counterfactuals(rf, rfe, X_sample, cfs, cfs_closest_neighbour, target_class):
    """Score the counterfactuals found for ``X_sample``.

    Instance ``i`` is skipped (after printing both result lengths) when either
    its own counterfactual set or the one of its closest neighbour is empty.

    Returns:
        A list with one ``[proximity, beta, sparsity, delta]`` row per scored
        instance, where
        * proximity - distance between the instance and its counterfactual,
        * beta      - distance between that counterfactual and the
                      counterfactual of the instance's closest neighbour,
        * sparsity  - ``unmatched_components_distance`` between the instance
                      and its counterfactual,
        * delta     - 1 if ``rf`` predicts ``target_class`` for the
                      counterfactual, else 0.
    """
    rows = []
    # Only len() and integer indexing are used on the explainer's results.
    for i in range(len(cfs)):
        if len(cfs[i]) == 0 or len(cfs_closest_neighbour[i]) == 0:
            print(len(cfs[i]), len(cfs_closest_neighbour[i]))
            continue
        instance = X_sample.iloc[i]
        counterfactual = cfs[i].iloc[0]

        proximity = _hoem(rfe, instance, counterfactual)
        sparsity = unmatched_components_distance(instance, counterfactual)
        delta = rf.predict(counterfactual.to_frame(0).T) == target_class
        beta = _hoem(rfe, counterfactual, cfs_closest_neighbour[i].iloc[0])

        rows.append([proximity, beta, sparsity, int(delta[0])])
    return rows


skf = StratifiedKFold(n_splits=SPLITS, shuffle=True, random_state=RANDOM_STATE)

CLASSES = y.unique()

# scores['<eps>_<conf>'] collects one vector per fold:
# [mean proximity, mean beta, mean sparsity, mean delta, fraction of scored instances].
scores = defaultdict(list)

accuracy = []

for split, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print(split, "/", SPLITS)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf = RandomForestClassifier(n_estimators=N_ESTIMATORS, max_depth=MAX_DEPTH, random_state=RANDOM_STATE)
    rf.fit(X_train, y_train)
    accuracy.append(accuracy_score(y_test, rf.predict(X_test)))

    # Neighbour pools: every instance of the dataset, grouped by the class the
    # forest predicts for it.
    y_hat = rf.predict(X)
    X_0 = X[y_hat == CLASSES[0]]
    X_1 = X[y_hat == CLASSES[1]]

    # Seeded so reruns explain the same instances; capped so that a fold
    # smaller than TEST_SAMPLE_SIZE does not make sample() raise.
    X_test_sample = X_test.sample(n=min(TEST_SAMPLE_SIZE, len(X_test)), random_state=RANDOM_STATE)
    y_test_sample = rf.predict(X_test_sample)

    X_test_sample_0 = X_test_sample[y_test_sample == CLASSES[0]]
    X_test_sample_1 = X_test_sample[y_test_sample == CLASSES[1]]

    for eps in EPSILON:
        for conf in CONFIGURATION:
            categorical_features, left_frozen_features, frozen_features = _constraints_for(conf)
            print(eps, conf)

            rfe = RandomForestExplainer(rf, X_train, y_train, categorical_features=categorical_features,
                                        left_frozen_features=left_frozen_features, frozen_features=frozen_features)

            # The neighbour search depends on rfe (its feature split drives the
            # distance), so it is repeated for every configuration.
            X_closest_neighbors_0 = _closest_neighbors(rfe, X_0, X_test_sample_0)
            X_closest_neighbors_1 = _closest_neighbors(rfe, X_1, X_test_sample_1)

            cfs0 = rfe.explain_with_single_metric(X_test_sample_0, CLASSES[1], eps=eps, metric='hoem', k=1, limit=1, n_jobs=N_JOBS)
            cfs1 = rfe.explain_with_single_metric(X_test_sample_1, CLASSES[0], eps=eps, metric='hoem', k=1, limit=1, n_jobs=N_JOBS)

            cfs0_closest_neighbour = rfe.explain_with_single_metric(X_closest_neighbors_0, CLASSES[1], eps=eps, metric='hoem', k=1, limit=1, n_jobs=N_JOBS)
            cfs1_closest_neighbour = rfe.explain_with_single_metric(X_closest_neighbors_1, CLASSES[0], eps=eps, metric='hoem', k=1, limit=1, n_jobs=N_JOBS)

            s = _score_counterfactuals(rf, rfe, X_test_sample_0, cfs0, cfs0_closest_neighbour, CLASSES[1])
            s += _score_counterfactuals(rf, rfe, X_test_sample_1, cfs1, cfs1_closest_neighbour, CLASSES[0])
            cfs_count = len(s)

            # With nothing scored the four metric means are undefined; keep the
            # vector length fixed so the per-fold results can still be stacked.
            metric_means = np.mean(np.array(s), axis=0) if s else np.full(4, np.nan)
            scores[f'{eps}_{conf}'].append(np.hstack([metric_means, cfs_count/len(X_test_sample)]))


accuracy_mean, accuracy_std = np.mean(accuracy), np.std(accuracy)
accuracy_mean

In [None]:
# Collapse the per-fold score vectors of every configuration into their
# element-wise mean and standard deviation across folds.
scores_mean = {}
scores_std = {}
for key, fold_vectors in scores.items():
    scores_mean[key] = np.mean(fold_vectors, axis=0)
    scores_std[key] = np.std(fold_vectors, axis=0)

# Notebook display: one row per configuration, one column per score.
pd.DataFrame.from_dict(
    scores_mean,
    orient='index',
    columns=['proximity', 'beta', 'sparsity', 'delta', 'found cfs'],
)

In [None]:
constraints_text = ["brak", "tylko atr. nominalne", "nominalne + monotonniczność", "nominalne + zamrożone"]

# Print the mean scores as LaTeX table rows: one \multirow group per complete
# run of four result keys (in insertion order), with the first four entries of
# each mean vector as the numeric cells.
result_keys = list(scores_mean.keys())
for group in range(len(result_keys) // 4):
    print("\\multirow{4}{*}{FT} ")
    group_keys = result_keys[group * 4:(group + 1) * 4]
    for no, k in enumerate(group_keys):
        means = scores_mean[k]
        cells = " & ".join(f"{means[column]:1.4f}" for column in range(4))
        print(f"& {constraints_text[no]} & {cells}\\\\")
        # Partial rule between rows, but not after the last row of the group.
        if no < 3:
            print("\\hhline{~-----}")
    print("\\hline")
    

In [None]:
constraints_text = ["brak", "tylko atr. nominalne", "nominalne + monotonniczność", "nominalne + zamrożone"]

# Print "mean(std)" LaTeX table rows for the first four entries of each score
# vector.  Keys are taken in insertion order and handled in complete groups of
# four (one per constraint label), so more than four result keys no longer run
# past the end of constraints_text.  The partial \hhline is printed between
# rows only; each group is closed by \hline alone.
rows_per_group = len(constraints_text)
result_keys = list(scores_mean.keys())

for group in range(len(result_keys) // rows_per_group):
    print("\\multirow{4}{*}{FT} ")
    group_keys = result_keys[group * rows_per_group:(group + 1) * rows_per_group]
    for no, k in enumerate(group_keys):
        cells = " & ".join(
            f"{scores_mean[k][column]:1.4f}({scores_std[k][column]:1.4f})"
            for column in range(4)
        )
        print(f"& {constraints_text[no]} & {cells}\\\\")
        if no != rows_per_group - 1:
            print("\\hhline{~-----}")
    print("\\hline")