## Use MCCE method without using the Data or RandomForest class

In [None]:
import warnings
warnings.filterwarnings('ignore')

import re
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn import preprocessing, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from mcce.mcce import MCCE
from mcce.metrics import distance, feasibility, constraint_violation, success_rate

## Load data

In [2]:
feature_order = ['age', 'workclass', 'fnlwgt', 'education-num', 'marital-status', 'occupation', 
                 'relationship', 'race', 'sex', 'hours-per-week',]
                 
dtypes = {"age": "float", 
          "workclass": "category", 
          "fnlwgt": "float", 
          "education-num": "float",
          "marital-status": "category", 
          "occupation": "category", 
          "relationship": "category", 
          "race": "category",
          "sex": "category", 
          "hours-per-week": "float",
          "income": "category"}

categorical = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex']
continuous = ['age', 'fnlwgt', 'education-num', 'hours-per-week']
fixed_features = ['age', 'sex']
target = ['income']
features = categorical + continuous

path = '../Data/adult_data.csv'

df = pd.read_csv(path)
df = df[features + target]

print(f"The fixed features are {fixed_features}")

The fixed features are ['age', 'sex']


## Scale the continuous features between 0 and 1. Encode the categorical features using one-hot encoding

In [3]:
encoder = preprocessing.OneHotEncoder(drop="first", sparse=False).fit(df[categorical])
df_encoded = encoder.transform(df[categorical])

scaler = preprocessing.MinMaxScaler().fit(df[continuous])
df_scaled = scaler.transform(df[continuous])

categorical_encoded = encoder.get_feature_names(categorical).tolist()
df_scaled = pd.DataFrame(df_scaled, columns=continuous)
df_encoded = pd.DataFrame(df_encoded, columns=categorical_encoded)

df = pd.concat([df_scaled, df_encoded, df[target]], axis=1)

print(f"The encoded categorical features are {categorical_encoded}")

The encoded categorical features are ['workclass_1', 'workclass_2', 'workclass_3', 'marital-status_1', 'marital-status_2', 'marital-status_3', 'occupation_1', 'occupation_2', 'occupation_3', 'relationship_1', 'relationship_2', 'relationship_3', 'race_1', 'race_2', 'race_3', 'sex_1']


## Define an inverse_transform function to go easily back to the non-scaled/encoded feature version

In [4]:
def inverse_transform(df, 
                      scaler, 
                      encoder, 
                      continuous,
                      categorical,
                      categorical_encoded, 
                      ):

    df_categorical = pd.DataFrame(encoder.inverse_transform(df[categorical_encoded]), columns=categorical)
    df_continuous = pd.DataFrame(scaler.inverse_transform(df[continuous]), columns=continuous)

    return pd.concat([df_categorical, df_continuous], axis=1)

## Find the fixed features in their encoded form

In [5]:
fixed_features_encoded = []
for fixed in fixed_features:
    if fixed in categorical:
        for new_col in categorical_encoded:
            match = re.search(fixed, new_col)
            if match:
                fixed_features_encoded.append(new_col)
    else:
        fixed_features_encoded.append(fixed)

print(f"Encoded fixed features are: {fixed_features_encoded}")

Encoded fixed features are: ['age', 'sex_1']


## Train predictive model

In [6]:
y = df[target]
X = df.drop(target, axis=1)
test_size = 0.33

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
clf = RandomForestClassifier(max_depth=None, random_state=0)
ml_model = clf.fit(X_train, y_train)

pred_train = ml_model.predict(X_train)
pred_test = ml_model.predict(X_test)

fpr, tpr, _ = metrics.roc_curve(y_train, pred_train, pos_label=1)
train_auc = metrics.auc(fpr, tpr)

fpr, tpr, _ = metrics.roc_curve(y_test, pred_test, pos_label=1)
test_auc = metrics.auc(fpr, tpr)

model_prediction = clf.predict(X)

print(f"The out-of-sample AUC is {round(test_auc, 2)}")

The out-of-sample AUC is 0.73


## Select observations to generate counterfactuals for

In [7]:
preds = ml_model.predict_proba(df.drop(target, axis=1))[:,1]
factual_id = np.where(preds < 0.5)
factuals = df.loc[factual_id]
test_factual = factuals.iloc[:5]

print(test_factual.head(2))

        age    fnlwgt  education-num  hours-per-week  workclass_1  \
0  0.301370  0.044131       0.800000        0.397959          0.0   
2  0.287671  0.137581       0.533333        0.397959          0.0   

   workclass_2  workclass_3  marital-status_1  marital-status_2  \
0          0.0          1.0               1.0               0.0   
2          0.0          0.0               0.0               1.0   

   marital-status_3  occupation_1  occupation_2  occupation_3  relationship_1  \
0               0.0           0.0           0.0           1.0             1.0   
2               0.0           0.0           0.0           1.0             1.0   

   relationship_2  relationship_3  race_1  race_2  race_3  sex_1  income  
0             0.0             0.0     0.0     0.0     0.0    0.0       0  
2             0.0             0.0     0.0     0.0     0.0    0.0       0  


## Create objects to feed into MCCE method

In [8]:
class Dataset():
    def __init__(self, 
                 fixed_features, 
                 target,
                 categorical,
                 fixed_features_encoded,
                 continuous,
                 features,
                 encoder,
                 scaler,
                 inverse_transform,
                 ):
        
        self.fixed_features = fixed_features
        self.target = target
        self.feature_order = feature_order
        self.dtypes = dtypes

        self.categorical = categorical
        self.continuous = continuous
        self.features = self.categorical + self.continuous
        self.cols = self.features + [self.target]
        self.fixed_features_encoded = fixed_features_encoded
        self.encoder = encoder
        self.scaler = scaler
        self.inverse_transform = inverse_transform
        
        

In [9]:
dataset = Dataset(fixed_features, 
                  target,
                  categorical,
                  fixed_features_encoded,
                  continuous,
                  features,
                  encoder,
                  scaler,
                  inverse_transform)

In [10]:
dtypes = dict([(x, "float") for x in continuous])
for x in categorical_encoded:
    dtypes[x] = "category"
df = (df).astype(dtypes)

## Fit MCCE method

In [11]:
mcce = MCCE(dataset=dataset,
            model=ml_model)

print("Fit trees")
mcce.fit(df.drop(target, axis=1), dtypes)

print("Sample observations for the specific test observations")
cfs = mcce.generate(test_factual.drop(target, axis=1), k=100)

print("Process the sampled observations")
mcce.postprocess(cfs=cfs, test_factual=test_factual, cutoff=0.5)


Fit trees
Sample observations for the specific test observations
Process the sampled observations


## Print counterfactuals

In [18]:
cfs = mcce.results_sparse
cfs['income'] = test_factual['income'] # add back the original response

# invert the features to their original form
print("Original factuals:")
decoded_factuals = dataset.inverse_transform(test_factual,
                                             scaler, 
                                             encoder, 
                                             continuous,
                                             categorical,
                                             categorical_encoded)[feature_order]

decoded_factuals

Original factuals:


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,hours-per-week
0,39.0,3,77516.0,13.0,1,3,1,0,0,40.0
1,38.0,0,215646.0,9.0,2,3,1,0,0,40.0
2,53.0,0,234721.0,7.0,0,3,0,1,0,40.0
3,28.0,0,338409.0,13.0,0,0,3,1,1,40.0
4,37.0,0,284582.0,14.0,0,2,3,0,1,40.0


In [17]:
print("Generated counterfactuals:")
decoded_cfs = dataset.inverse_transform(cfs,
                                        scaler, 
                                        encoder, 
                                        continuous,
                                        categorical,
                                        categorical_encoded)[feature_order]
decoded_cfs

Generated counterfactuals:


Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,hours-per-week
0,39.0,0,175232.0,13.0,1,0,1,0,0,40.0
1,38.0,0,86643.0,16.0,2,0,1,0,0,45.0
2,53.0,0,184176.0,9.0,0,3,0,0,0,40.0
3,28.0,0,132686.0,14.0,0,0,0,0,1,40.0
4,37.0,0,174150.0,14.0,0,2,0,0,1,40.0


## Calculate some metrics

In [23]:
distance_pd = pd.DataFrame(distance(cfs, test_factual, dataset))

feasibility_pd = pd.DataFrame(feasibility(cfs, df, categorical_encoded + continuous), columns=['feasibility'])

const_pd = pd.DataFrame(constraint_violation(decoded_cfs, decoded_factuals, dataset), columns=['violation'])

success_pd = pd.DataFrame(success_rate(cfs[categorical_encoded + continuous], ml_model), columns=['success'])


In [24]:
results = pd.concat([decoded_cfs, distance_pd, feasibility_pd, const_pd, success_pd], axis=1)
results

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,L0,L1,L2,feasibility,violation,success
0,39.0,0,175232.0,13.0,1,0,1,0,0,40.0,3.0,2.066109,2.00437,0.0246,0,0
1,38.0,0,86643.0,16.0,2,0,1,0,0,45.0,4.0,1.604962,1.227998,0.102196,0,0
2,53.0,0,184176.0,9.0,0,3,0,0,0,40.0,3.0,1.167529,1.018947,0.00598,0,0
3,28.0,0,132686.0,14.0,0,0,0,0,1,40.0,4.0,2.205846,2.023815,1.000421,0,0
4,37.0,0,174150.0,14.0,0,2,0,0,1,40.0,2.0,1.074711,1.005582,1.000401,0,0
