## Generate counterfactuals with MCCE using the `Data` and `RandomForestModel` classes

In [10]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

from mcce.mcce import MCCE
from mcce.data import Data
from mcce.rf import RandomForestModel
from mcce.metrics import distance, feasibility, constraint_violation, success_rate

## Load data

In [2]:
# Column name -> dtype label for every column read from the CSV. The dict's
# insertion order doubles as the column order handed to `Data` below.
dtypes = {
    "age": "float",
    "workclass": "category",
    "fnlwgt": "float",
    "education-num": "float",
    "marital-status": "category",
    "occupation": "category",
    "relationship": "category",
    "race": "category",
    "sex": "category",
    "capital-gain": "float",
    "capital-loss": "float",
    "hours-per-week": "float",
    "country": "category",
    "income": "category",
}
feature_order = list(dtypes)

# Name of the response (target) column.
response = 'income'

# NOTE(review): presumably the features that must stay unchanged in a
# counterfactual -- confirm against the `Data` / `MCCE` documentation.
fixed_features = ['age', 'sex']

path = '../Data/adult_data.csv'

# The last two positional arguments look like the categorical-encoding and
# continuous-scaling methods -- TODO confirm against `Data.__init__`.
dataset = Data(path, feature_order, dtypes, response, fixed_features, "OneHot_drop_first", "MinMax")

## Train predictive model

In [3]:
# Build the predictive model from the prepared dataset; `ml_model` is the model
# that the later cells query with `predict_proba` and hand to `MCCE`.
ml_model = RandomForestModel(dataset)

## Select observations to generate counterfactuals for

In [4]:
# Predicted probability of the positive class (column 1) for every row.
preds = ml_model.predict_proba(dataset.df)[:,1]

# Rows predicted below the 0.5 cutoff are the "factuals" that need a
# counterfactual. `np.where` returns a 1-tuple holding an array of row
# *positions*.
factual_id = np.where(preds < 0.5)

# The positions must be applied with the position-based `.iloc`; the original
# label-based `.loc` only gave the right rows while `dataset.df` happened to
# carry a default 0..n-1 index.
factuals = dataset.df.iloc[factual_id[0]]

# Keep the notebook fast: only the first five factuals are explained.
test_factual = factuals.iloc[:5]

print(test_factual.head(2))

        age    fnlwgt  education-num  capital-gain  capital-loss  \
0  0.301370  0.044131            0.8       0.02174           0.0   
1  0.452055  0.048052            0.8       0.00000           0.0   

   hours-per-week  income  workclass_1  workclass_2  workclass_3  ...  \
0        0.397959       0          0.0          0.0          1.0  ...   
1        0.122449       0          1.0          0.0          0.0  ...   

   relationship_1  relationship_2  relationship_3  race_1  race_2  race_3  \
0             1.0             0.0             0.0     0.0     0.0     0.0   
1             0.0             0.0             0.0     0.0     0.0     0.0   

   sex_1  country_1  country_2  country_3  
0    0.0        0.0        0.0        0.0  
1    0.0        0.0        0.0        0.0  

[2 rows x 26 columns]


## Create objects to feed into the MCCE method

In [5]:
# Pull the column groupings off the dataset object.
y_col = dataset.target
cont_feat = dataset.continuous
cat_feat = dataset.categorical
cat_feat_encoded = dataset.categorical_encoded

# dtype map for the *encoded* frame: continuous columns as float, encoded
# categorical columns as category. Note this rebinds `dtypes`, which the
# data-loading cell used for the raw-CSV schema.
dtypes = {
    **{col: "float" for col in cont_feat},
    **{col: "category" for col in cat_feat_encoded},
}

# Typed copy of the processed data; `dataset.df` itself is left untouched.
df = dataset.df.astype(dtypes)

## Fit MCCE method

In [6]:
mcce = MCCE(dataset=dataset, model=ml_model)

# Step 1: fit on the feature columns only (response column removed).
print("Fit trees")
train_features = df.drop(columns=dataset.target)
mcce.fit(train_features, dtypes)

# Step 2: generate candidates for the selected factuals, again without the
# response column. NOTE(review): `k=100` is presumably the number of samples
# drawn per factual -- confirm against `MCCE.generate`.
print("Sample observations from tree nodes")
factual_features = test_factual.drop(columns=dataset.target)
cfs = mcce.generate(factual_features, k=100)

# Step 3: post-process the sampled candidates against the factuals; the
# results are read back from `mcce.results_sparse` in the next cell.
print("Process sampled observations")
mcce.postprocess(cfs, test_factual, cutoff=0.5)


Fit trees
Sample observations from tree nodes
Process sampled observations


## Print counterfactuals

In [7]:
# Take a copy of the post-processed counterfactuals. Assigning a column
# directly on `mcce.results_sparse` would mutate the fitted MCCE object's
# stored results (and re-running the cell would act on already-modified state).
cfs = mcce.results_sparse.copy()

# Add back the original response. The assignment aligns on the row index, so
# each counterfactual receives the response of the factual with the same label.
cfs['income'] = test_factual['income']

# Invert the features to their original form and restore the raw column order.
print("Original factuals:")
decoded_factuals = dataset.inverse_transform(test_factual)[feature_order]

decoded_factuals

Original factuals:


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,income
0,39.0,3,77516.0,13.0,1,3,1,0,0,2174.0,0.0,40.0,0,0
1,50.0,1,83311.0,13.0,0,2,0,0,0,0.0,0.0,13.0,0,0
2,38.0,0,215646.0,9.0,2,3,1,0,0,0.0,0.0,40.0,0,0
3,53.0,0,234721.0,7.0,0,3,0,1,0,0.0,0.0,40.0,0,0
4,28.0,0,338409.0,13.0,0,0,3,1,1,0.0,0.0,40.0,3,0


In [9]:
print("Generated counterfactuals:")

# Map the counterfactuals back to their original form, then restore the raw
# column order so the table lines up with the decoded factuals.
cfs_inverted = dataset.inverse_transform(cfs)
decoded_cfs = cfs_inverted[feature_order]

decoded_cfs

Generated counterfactuals:


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,income
0,39.0,3,1455435.0,13.0,0,3,0,0,0,0.0,0.0,72.0,0,0
1,50.0,0,128143.0,13.0,0,2,0,0,0,0.0,0.0,50.0,0,0
2,38.0,0,207066.0,9.0,0,3,0,0,0,0.0,0.0,48.0,0,0
3,53.0,0,141340.0,9.0,0,3,0,1,0,0.0,0.0,40.0,0,0
4,28.0,0,115677.0,13.0,0,0,0,0,1,7688.0,0.0,40.0,0,0


## Calculate evaluation metrics (distance, feasibility, constraint violation, success rate)

In [12]:
# Encoded categorical columns followed by the continuous ones: the column
# list used by both the feasibility and the success-rate metric.
model_features = dataset.categorical_encoded + dataset.continuous

# Distance between each counterfactual and its factual (encoded space).
distance_values = distance(cfs, test_factual, dataset)
distance_pd = pd.DataFrame(distance_values)

# Feasibility of each counterfactual, computed against the typed frame `df`.
feasibility_values = feasibility(cfs, df, model_features)
feasibility_pd = pd.DataFrame(feasibility_values, columns=['feasibility'])

# Constraint violations are computed on the decoded (original-form) frames.
violation_values = constraint_violation(decoded_cfs, decoded_factuals, dataset)
const_pd = pd.DataFrame(violation_values, columns=['violation'])

# Success metric computed from the model on the counterfactual features.
success_values = success_rate(cfs[model_features], ml_model)
success_pd = pd.DataFrame(success_values, columns=['success'])


In [13]:
# Place the metric columns to the right of the decoded counterfactuals.
# `pd.concat` along columns aligns the frames on their row index.
metric_frames = [distance_pd, feasibility_pd, const_pd, success_pd]
results = pd.concat([decoded_cfs, *metric_frames], axis="columns")

results

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,income,L0,L1,L2,feasibility,violation,success
0,39.0,3,1455435.0,13.0,0,3,0,0,0,0.0,0.0,72.0,0,0,5.0,3.280484,2.976117,0.516814,0,1
1,50.0,0,128143.0,13.0,0,2,0,0,0,0.0,0.0,50.0,0,0,3.0,1.407882,1.143465,0.019199,0,1
2,38.0,0,207066.0,9.0,0,3,0,0,0,0.0,0.0,48.0,0,0,4.0,2.087437,2.006698,0.010791,0,1
3,53.0,0,141340.0,9.0,0,3,0,1,0,0.0,0.0,40.0,0,0,2.0,0.196509,0.021769,0.03413,0,1
4,28.0,0,115677.0,13.0,0,0,0,0,1,7688.0,0.0,40.0,0,0,5.0,3.227567,3.028617,1.002213,0,1
