This experiment is run on the Adult dataset.

Here, attribute constraints are tested on examples extracted from the test set. The counterfactuals that are found are also visualised.

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import time
import pickle
from rf_counterfactuals import RandomForestExplainer, visualize
import os

from sklearn import preprocessing
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Directory that holds the dataset CSV files; joined with the file name when a dataset is loaded.
DATASET_PATH = "./datasets/"

# Encode categorical features as ordinal integer codes

In [None]:
from collections import defaultdict

# Load the Adult dataset; every column except the target is treated as a feature.
class_feature = "income"
adult_dataset = pd.read_csv(os.path.join(DATASET_PATH, "adult.csv"))
feature_names = [column for column in adult_dataset.columns if column != class_feature]

# Positions (within feature_names) of the features to label-encode.
categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]
to_encode = [name for position, name in enumerate(feature_names) if position in categorical_features]

# One LabelEncoder per encoded column, created on first access and kept in `d`
# so the fitted encoders stay available after the encoding step.
d = defaultdict(preprocessing.LabelEncoder)


def _encode_column(column):
    """Fit the encoder registered under the column's name and return its integer codes."""
    return d[column.name].fit_transform(column)


adult_dataset[to_encode] = adult_dataset[to_encode].apply(_encode_column)

# Split the dataset into train/test sets

In [None]:
# Seeded 67/33 split, stratified on the target column.
train, test = train_test_split(
    adult_dataset,
    train_size=0.67,
    random_state=1000,
    stratify=adult_dataset[class_feature],
)

# Separate the feature columns from the target column in each part.
X_train, y_train = train.drop(columns=class_feature), train[class_feature]
X_test, y_test = test.drop(columns=class_feature), test[class_feature]

# Last expression of the cell: displays the shapes of both feature tables.
X_train.shape, X_test.shape

## Class labels distribution

In [None]:
# Plot how many rows of the dataset carry each class label.
#
# The counts are selected by label, in the same order as the tick labels.
# Relying on the order of first appearance in the data could pair a bar
# with the wrong label.
class_labels = ['<=50K', '>50K']
class_counts = adult_dataset[class_feature].value_counts()[class_labels]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(class_labels, class_counts)
ax.bar_label(bars)  # print the exact count above each bar
ax.set_xlabel("Label")
ax.set_ylabel("Count")
ax.set_title("Adult dataset. Class labels distribution")
plt.show()

# Train RF and evaluate it

In [None]:
# Train a small random forest. The seed is fixed (same value as the train/test
# split) so the model and the scores below are reproducible between runs.
rf = RandomForestClassifier(n_estimators=30, max_depth=5, random_state=1000)
rf.fit(X_train, y_train)

# Predict once and reuse the result for every metric.
y_pred = rf.predict(X_test)

# '>50K' is treated as the positive class for the class-specific metrics.
print(f"Accuracy score: {accuracy_score(y_test, y_pred):1.3f}")
print(f"Recall score: {recall_score(y_test, y_pred, pos_label='>50K'):1.3f}")
print(f"Precision score: {precision_score(y_test, y_pred, pos_label='>50K'):1.3f}")
print(f"F1 score: {f1_score(y_test, y_pred, pos_label='>50K'):1.3f}")

# Here is the fun part. 

## 1. Extract 10k samples from test data that have label='<=50K'
## 2. Get counterfactuals without any constraints on attributes
## 3. Get counterfactuals with constraints on attributes

These constraints are: 

Categorical attributes: 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country'

Frozen attributes: 'race', 'gender'

Left frozen attributes: 'age'

In [None]:
# Take up to 10k test rows labelled '<=50K'. The sample size is capped by the
# number of available rows (sampling more without replacement raises
# ValueError) and the draw is seeded so both runs below are repeatable.
low_income_rows = X_test[y_test == '<=50K']
X_test_label_0 = low_income_rows.sample(n=min(10000, len(low_income_rows)), random_state=1000)
print(X_test_label_0.shape)


def _explain_and_time(categorical_features, frozen_features, left_frozen_features):
    """Search counterfactuals towards '>50K' for X_test_label_0 and time the search.

    Builds a RandomForestExplainer for `rf` with the given attribute
    constraints, runs the search, and prints the total number of
    counterfactuals found and the elapsed time.

    Returns:
        (explainer, counterfactuals, elapsed_seconds)
    """
    explainer = RandomForestExplainer(rf, X_train, y_train, categorical_features=categorical_features,
                                      left_frozen_features=left_frozen_features, frozen_features=frozen_features)
    # perf_counter is a monotonic clock intended for measuring intervals.
    started = time.perf_counter()
    counterfactuals = explainer.explain_with_single_metric(X_test_label_0, '>50K', metric='hoem', limit=None)
    elapsed = time.perf_counter() - started

    print(f"Total counterfactuals found: {sum(len(c) for c in counterfactuals)}")
    print(f"Finished in {elapsed: 1.4f}s")
    return explainer, counterfactuals, elapsed


# Run 1: no constraints on any attribute.
rfe_no_constraints, cfs_no_constraints, time_no_constraints = _explain_and_time(
    categorical_features=[], frozen_features=[], left_frozen_features=[])

# Run 2: with constraints. The values are column positions in the feature table.
categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]
frozen_features = [8, 9]    # 'race', 'gender'
left_frozen_features = [0]  # 'age'

rfe_constraints, cfs_constraints, time_constraints = _explain_and_time(
    categorical_features=categorical_features, frozen_features=frozen_features,
    left_frozen_features=left_frozen_features)

# The code below visualizes some of the counterfactuals

In [None]:
# Choose which entry of the unconstrained results, and which counterfactual within it, to show.
sample_no, cf_no = 0, 0

found_unconstrained = cfs_no_constraints[sample_no]
# The counterfactual's index label is used to look up the original row it was generated for.
original_row = X_test_label_0.loc[found_unconstrained.index[cf_no]]
visualize(rfe_no_constraints, original_row, found_unconstrained.iloc[cf_no], d)

In [None]:
# Choose which entry of the constrained results, and which counterfactual within it, to show.
sample_no, cf_no = 0, 0

found_constrained = cfs_constraints[sample_no]
# The counterfactual's index label is used to look up the original row it was generated for.
original_row = X_test_label_0.loc[found_constrained.index[cf_no]]
visualize(rfe_constraints, original_row, found_constrained.iloc[cf_no], d)