## Results for Tables 1, 2, 3 for CARLA methods

In [7]:
from carla.data.catalog import OnlineCatalog
import pandas as pd
import os
import numpy as np

# Choose which CARLA catalog dataset this notebook works on. Exactly one
# assignment should be active; the alternatives are kept as comments.
# data_name = "adult"
data_name = "give_me_some_credit"
# data_name = "compas"

# Build the catalog dataset object for the selected name.
dataset = OnlineCatalog(data_name)


In [8]:
# Display the column labels of the loaded dataset's DataFrame.
dataset.df.columns

Index(['RevolvingUtilizationOfUnsecuredLines', 'age',
       'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
       'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
       'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
       'NumberOfDependents', 'SeriousDlqin2yrs'],
      dtype='object')

In [9]:
# Share of rows whose target label equals 0. The target column is looked up
# through `dataset.target`, so the cell works for whichever dataset
# `data_name` selects rather than for a single hard-coded column.
int((dataset.df[dataset.target] == 0).sum()) / dataset.df.shape[0]

0.06787157980385537

In [None]:
from carla.models.catalog import MLModelCatalog
import torch
# Seed PyTorch's global random number generator before building the model.
torch.manual_seed(0)

ml_model = MLModelCatalog(
    dataset,
    model_type="ann",
    load_online=False,
    backend="pytorch",
)

# (epochs, batch_size) per dataset; every other hyperparameter is shared.
training_schedule = {
    'adult': (20, 1024),
    'give_me_some_credit': (20, 2048),
    'compas': (25, 25),
}

# Datasets without an entry above are left untrained.
if data_name in training_schedule:
    n_epochs, n_batch = training_schedule[data_name]
    ml_model.train(
        learning_rate=0.002,
        epochs=n_epochs,
        batch_size=n_batch,
        hidden_size=[18, 9, 3],
        # Always retrain; otherwise an older model might be loaded from disk.
        force_train=True,
    )


In [None]:
from sklearn import metrics

# Label column of the test split for each supported dataset.
label_column_by_dataset = {
    'adult': 'income',
    'give_me_some_credit': 'SeriousDlqin2yrs',
    'compas': 'score',
}
if data_name in label_column_by_dataset:
    y = dataset.df_test[label_column_by_dataset[data_name]]

# Keep the second entry of every probability row; it is used below as the
# score for `pos_label=1`.
pred = [class_probs[1] for class_probs in ml_model.predict_proba(dataset.df_test)]

# Area under the ROC curve on the test split.
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1)
metrics.auc(fpr, tpr)

In [None]:
from carla.models.negative_instances import predict_negative_instances
import carla.recourse_methods.catalog as recourse_catalog

# Rows of the full dataset returned by `predict_negative_instances` for the
# trained model; the first 100 of them are used as test factuals.
factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.head(100)

In [None]:
# Report the number of factuals and their share of the whole dataset, each
# under its own label so the two printed values cannot be confused.
n_factuals = factuals.shape[0]
print(f"Factuals: {n_factuals}")
print(f"Fraction of factuals: {n_factuals / dataset.df.shape[0]}")


## Violation

In [None]:
def intersection(lst1, lst2):
    """Return the elements present in both sequences, without duplicates.

    The result keeps the order of first appearance in ``lst1``, so column
    selections built from it are identical on every run instead of depending
    on set iteration order. Elements must be hashable.
    """
    members = set(lst2)
    return [item for item in dict.fromkeys(lst1) if item in members]

# Count, for every counterfactual, how many immutable features differ from the
# factual with the same index. Counts are appended method by method.
# NOTE(review): the final assignment assumes the non-NaN rows of the CSV are
# grouped by method in the order of the list below — confirm against the file.
results = []

# None of these inputs depend on the method, so they are prepared once
# instead of on every loop iteration.
cfs = pd.read_csv(f"Results/{data_name}_manifold_results.csv")
cfs.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
cfs.set_index(['index'], inplace=True)

factuals = predict_negative_instances(ml_model, dataset.df)
first_100_factuals = factuals.iloc[:100]

continuous_immutable = intersection(dataset.continuous, dataset.immutables)
categorical_immutable = intersection(dataset.categorical, dataset.immutables)

for method in ['cchvae', 'cem-vae', 'revise', 'clue', 'crud', 'face']:
    df_cfs = cfs[cfs['method'] == method].drop(['method', 'data'], axis=1)

    # Counterfactual rows containing NaN are dropped, together with the
    # factual rows carrying the same index labels.
    nan_idx = df_cfs.index[df_cfs.isnull().any(axis=1)]
    test_factual = first_100_factuals.drop(index=nan_idx)
    df_cfs = df_cfs.drop(index=nan_idx)

    df_decoded_cfs = dataset.inverse_transform(df_cfs.copy())
    df_factuals = dataset.inverse_transform(test_factual.copy())

    # Continuous immutables: a violation is any value that is not numerically
    # close to the factual's value.
    continuous_violations = np.invert(
        np.isclose(
            df_decoded_cfs[continuous_immutable],
            df_factuals[continuous_immutable],
        )
    )
    continuous_violations = np.sum(continuous_violations, axis=1).reshape(
        (-1, 1)
    )  # sum over features

    # Categorical immutables: a violation is any value that differs.
    categorical_violations = (
        df_decoded_cfs[categorical_immutable] != df_factuals[categorical_immutable]
    )
    categorical_violations = np.sum(categorical_violations.values, axis=1).reshape(
        (-1, 1)
    )  # sum over features

    total_violations = continuous_violations + categorical_violations
    results.extend(total_violations[:, 0])

# Attach the counts to the NaN-free counterfactual rows.
final_results = cfs.copy()
final_results.dropna(inplace=True)
final_results['violations'] = results

## Distance

In [None]:
def intersection(lst1, lst2):
    """Return the elements present in both sequences, without duplicates.

    The result keeps the order of first appearance in ``lst1``, so column
    selections built from it are identical on every run instead of depending
    on set iteration order. Elements must be hashable.
    """
    members = set(lst2)
    return [item for item in dict.fromkeys(lst1) if item in members]

# For every method, measure how far each counterfactual lies from the factual
# with the same index, and record the method's mean runtime.
results = []

# Method-independent inputs, prepared once rather than per loop iteration.
cfs = pd.read_csv(f"Results/{data_name}_manifold_results.csv")
cfs.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
cfs.set_index(['index'], inplace=True)

factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.iloc[:100]

for method in ['cchvae', 'cem-vae', 'revise', 'clue', 'crud', 'face']:
    df_cfs = cfs[cfs['method'] == method].drop(['method', 'data'], axis=1)

    # Drop counterfactual rows containing NaN and the factual rows carrying
    # the same index labels.
    nan_idx = df_cfs.index[df_cfs.isnull().any(axis=1)]
    factual_without_nans = test_factual.drop(index=nan_idx)
    counterfactuals_without_nans = df_cfs.drop(index=nan_idx)

    arr_f = ml_model.get_ordered_features(factual_without_nans).to_numpy()
    arr_cf = ml_model.get_ordered_features(
        counterfactuals_without_nans
    ).to_numpy()

    delta = arr_f - arr_cf

    # `dtype=float` replaces the `np.float` alias, which no longer exists in
    # current NumPy releases; both name the same builtin float type.
    # L0: number of features whose value is not numerically close.
    d1 = np.sum(
        np.invert(np.isclose(delta, np.zeros_like(delta))), axis=1, dtype=float
    ).tolist()
    # L1: sum of absolute differences.
    d2 = np.sum(np.abs(delta), axis=1, dtype=float).tolist()
    # L2: sum of squared differences (no square root is taken).
    d3 = np.sum(np.square(np.abs(delta)), axis=1, dtype=float).tolist()

    # The scalar mean runtime is broadcast to every row of this method.
    results.append(
        pd.DataFrame(
            {
                'L0': d1,
                'L1': d2,
                'L2': d3,
                'time': df_cfs['time (seconds)'].mean(),
            }
        )
    )

# Re-label the stacked distances with the index of `final_results` and attach
# them as new columns.
temp = pd.concat(results)
temp.index = final_results.index
final_results = pd.concat([final_results, temp], axis=1)


## Validity

In [None]:
# Collect, per method, the target column stored in the counterfactual CSV.
# NOTE(review): `results` from this cell is not read by the following cell,
# which recomputes validity from model predictions — confirm whether this
# cell is still needed.
target_column_by_dataset = {
    'adult': 'income',
    'give_me_some_credit': 'SeriousDlqin2yrs',
    'compas': 'score',
}
y_col = target_column_by_dataset[data_name]

# The CSV does not depend on the method, so it is read once.
cfs = pd.read_csv(f"Results/{data_name}_manifold_results.csv")
cfs.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
cfs.set_index(['index'], inplace=True)

results = []
for method in ['cchvae', 'cem-vae', 'revise', 'clue', 'crud', 'face']:
    df_cfs = cfs[cfs['method'] == method].drop(['method', 'data'], axis=1)
    results.append(pd.DataFrame({'validity': df_cfs[y_col]}))

In [None]:
# Score every row of `final_results` with the classifier. The second entry of
# each probability row is stored as `prediction`; `validity` is 1 when that
# value is at least 0.5 and 0 otherwise.
temp = ml_model.predict_proba(final_results)
temp3 = [class_probs[1] for class_probs in temp]
temp2 = [score >= 0.5 for score in temp3]

final_results['validity'] = temp2
final_results['prediction'] = temp3
final_results['validity'] = final_results['validity'].astype(int)

## Feasibility

In [None]:
from sklearn.neighbors import NearestNeighbors

# Feasibility: for each counterfactual, the mean distance to its 5 nearest
# neighbours among the (NaN-filtered) factuals of the same method.
results = []

# Method-independent inputs, prepared once rather than per loop iteration.
cfs = pd.read_csv(f"Results/{data_name}_manifold_results.csv")
cfs.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
cfs.set_index(['index'], inplace=True)

factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.iloc[:100]

# `Index.drop` returns a new Index, so its result has to be assigned;
# otherwise the target column stays among the neighbour-search features.
# NOTE: this changes the feasibility values relative to runs that kept the
# target column in `cols`.
cols = dataset.df.columns.drop(dataset.target)

for method in ['cchvae', 'cem-vae', 'revise', 'clue', 'crud', 'face']:
    df_cfs = cfs[cfs['method'] == method].drop(['method', 'data'], axis=1)

    # Drop counterfactual rows containing NaN and the factual rows carrying
    # the same index labels.
    nan_idx = df_cfs.index[df_cfs.isnull().any(axis=1)]
    factual_without_nans = test_factual.drop(index=nan_idx)
    counterfactuals_without_nans = df_cfs.drop(index=nan_idx)

    nbrs = NearestNeighbors(n_neighbors=5).fit(factual_without_nans[cols].values)

    for _, row in counterfactuals_without_nans[cols].iterrows():
        # First element of the returned pair holds the neighbour distances.
        knn = nbrs.kneighbors(row.values.reshape((1, -1)), 5, return_distance=True)[0]
        results.append(np.mean(knn))

final_results['feasibility'] = results

In [None]:
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Metric columns computed in the cells above.
metric_columns = ['L0', 'L1', 'L2', 'feasibility', 'violations', 'validity', 'prediction']
temp = final_results[metric_columns]

# Place the metrics next to the NaN-free counterfactuals after they have been
# passed through `dataset.inverse_transform`.
cfs.dropna(inplace=True)
inverse_transformed_cfs = dataset.inverse_transform(cfs)
temp = pd.concat([temp, inverse_transformed_cfs], axis=1)


In [None]:
import pandas as pd

# Read the MCCE counterfactuals for the selected dataset and tag them so they
# can be stacked with the CARLA results.
results = pd.read_csv(
    f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_10000_n_100.csv",
    index_col=0,
)
results['data'] = data_name
results['method'] = 'mcce'
results = results.rename(columns={'violation': 'violations'})

# Second entry of every probability row, stored before the frame is passed
# through `dataset.inverse_transform`.
preds = ml_model.predict_proba(results)
new_preds = [class_probs[1] for class_probs in preds]
results['prediction'] = new_preds
results = dataset.inverse_transform(results)

# 1 where the stored prediction is at least 0.5, else 0.
results['validity'] = (np.asarray(new_preds) >= 0.5).astype(int)

# results.loc[263]

In [None]:
# Stack the MCCE rows under the CARLA rows, using the same column layout.
temp = pd.concat([temp, results[temp.columns]])

# Build matching rows for the factuals themselves: they get a model
# prediction, while every counterfactual metric is left as NaN.
temp2 = factuals.copy()
preds = ml_model.predict_proba(temp2)
new_preds = [class_probs[1] for class_probs in preds]
temp2['prediction'] = new_preds
temp2 = dataset.inverse_transform(temp2)

for metric in ('L0', 'L1', 'L2', 'validity', 'violations', 'feasibility', 'time (seconds)'):
    temp2[metric] = np.nan
temp2['method'] = 'original'
temp2['data'] = data_name

# Only the first 100 factual rows are appended.
temp = pd.concat([temp, temp2.iloc[0:100][temp.columns]], axis=0)
temp

In [None]:
# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_baseline_results_n_100.csv", index_col=0)
# results['data'] = data_name
# results['method'] = 'baseline'
# results.rename(columns={'violation': 'violations'}, inplace=True)

# preds = ml_model.predict_proba(results)
# new_preds = []
# for x in preds:
#     new_preds.append(x[1])
# results['prediction'] = new_preds
# results = dataset.inverse_transform(results)
# results.head(1)

# results['validity'] = np.where(np.asarray(new_preds) >= 0.5, 1, 0)
# results[temp.columns]
# temp = pd.concat([temp, results[temp.columns]])
# temp

# temp.columns[9:13].to_list() + temp.columns[15:].to_list()

In [None]:
# Columns placed first in the exported table, identical for every dataset.
leading_cols = ['method', 'data', 'prediction', 'L0', 'L1', 'L2', 'feasibility', 'violations', 'validity', 'time (seconds)']
current_cols = temp.columns

# The remaining columns are picked by position; the slices differ per dataset.
if data_name == 'give_me_some_credit':
    trailing_cols = current_cols[9:-1].to_list()
elif data_name == 'adult':
    trailing_cols = current_cols[9:16].to_list() + current_cols[17:].to_list()
elif data_name == 'compas':
    trailing_cols = current_cols[9:13].to_list() + current_cols[15:].to_list()
else:
    trailing_cols = None

# For any other dataset name the table is written with its columns untouched.
if trailing_cols is not None:
    cols = leading_cols + trailing_cols
    temp = temp[cols]

temp.to_csv(f"Final_results/{data_name}_results_mcce_and_carla_K_10000_n_100.csv")



## To get Table 1

In [None]:
# Table 1, Adult: per-method mean of each metric plus the number of rows per
# method, printed as LaTeX.
temp = pd.read_csv("Final_results/adult_results_mcce_and_carla_K_10000_n_100.csv", index_col=0)

metric_cols = ['L0', 'L2', 'feasibility', 'violations', 'validity', 'time (seconds)']
to_write = temp[['method'] + metric_cols].groupby(['method']).mean().reset_index()

# Row count per method, in the same (grouped) method order as the means.
CE_N = temp.groupby(['method']).size().reset_index(name='CE_N')
to_write = pd.concat([to_write, CE_N['CE_N']], axis=1)

to_write = to_write[['method', 'L0', 'L2', 'feasibility', 'violations', 'validity', 'CE_N', 'time (seconds)']]

print(to_write.to_latex(index=False, float_format="%.2f"))

In [None]:
# Table 1, Give Me Some Credit: per-method mean of each metric plus the number
# of rows per method, printed as LaTeX.
temp = pd.read_csv("Final_results/give_me_some_credit_results_mcce_and_carla_K_10000_n_100.csv", index_col=0)

metric_cols = ['L0', 'L2', 'feasibility', 'violations', 'validity', 'time (seconds)']
to_write = temp[['method'] + metric_cols].groupby(['method']).mean().reset_index()

# Row count per method, in the same (grouped) method order as the means.
CE_N = temp.groupby(['method']).size().reset_index(name='CE_N')
to_write = pd.concat([to_write, CE_N['CE_N']], axis=1)

to_write = to_write[['method', 'L0', 'L2', 'feasibility', 'violations', 'validity', 'CE_N', 'time (seconds)']]

print(to_write.to_latex(index=False, float_format="%.2f"))

In [None]:
# Table 1, COMPAS: per-method mean of each metric plus the number of rows per
# method, printed as LaTeX.
temp = pd.read_csv("Final_results/compas_results_mcce_and_carla_K_10000_n_100.csv", index_col=0)

metric_cols = ['L0', 'L2', 'feasibility', 'violations', 'validity', 'time (seconds)']
to_write = temp[['method'] + metric_cols].groupby(['method']).mean().reset_index()

# Row count per method, in the same (grouped) method order as the means.
CE_N = temp.groupby(['method']).size().reset_index(name='CE_N')
to_write = pd.concat([to_write, CE_N['CE_N']], axis=1)

to_write = to_write[['method', 'L0', 'L2', 'feasibility', 'violations', 'validity', 'CE_N', 'time (seconds)']]

print(to_write.to_latex(index=False, float_format="%.2f"))

## To get Adult examples in Table 2

In [None]:
import pandas as pd

temp = pd.read_csv("Final_results/adult_results_mcce_and_carla_K_10000_n_100.csv", index_col=0)

# All rows carrying index label 31. The list selector `[[31]]` always yields a
# DataFrame (a bare label would yield a Series if it matched a single row),
# and `.copy()` makes the selection independent of `temp`, so the column
# assignments in the next cell act on a standalone frame.
to_write = temp.loc[[31]].copy()

pd.set_option('display.max_columns', None)
to_write


In [None]:
# print(to_write.Pred.to_latex(index=False, float_format="%.2f", ))
# Short codes used in the printed table, per categorical feature. A value
# missing from a feature's mapping raises KeyError, so unexpected categories
# are not silently passed through.
abbreviations = {
    'marital-status': {'Married': 'M', 'Non-Married': 'NM'},
    'native-country': {'Non-US': 'NUS', 'US': 'US'},
    'occupation': {'Managerial-Specialist': 'MS', 'Other': 'O'},
    'race': {'White': 'W', 'Non-White': 'NW'},
    'relationship': {'Husband': 'H', 'Non-Husband': 'NH'},
    # 'Female' is included so rows of either sex can be abbreviated.
    'sex': {'Male': 'M', 'Female': 'F'},
    'workclass': {'Self-emp-not-inc': 'SENI', 'Private': 'P', 'Non-Private': 'NP'},
}

for feature, dct in abbreviations.items():
    to_write[feature] = [dct[item] for item in to_write[feature]]

In [None]:
# Columns shown in the Adult example table, in print order.
cols = [
    'method', 'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
    'hours-per-week', 'marital-status', 'native-country',
    'occupation', 'race', 'relationship', 'sex', 'workclass',
]

# Emit the selected columns as LaTeX, floats printed without decimals.
print(to_write[cols].to_latex(index=False, float_format="%.0f"))

## To get GMC examples in Table 3

In [None]:
temp = pd.read_csv("Final_results/give_me_some_credit_results_mcce_and_carla_K_10000_n_100.csv", index_col=0)

# Columns taken from the results file ...
source_cols = [
    'method', 'prediction', 'age', 'RevolvingUtilizationOfUnsecuredLines',
    'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
    'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]
# ... and the headers they are printed under, in the same order.
cols = [
    'Method', 'Pred', 'Age', 'Unsec. Lines', 'Nb Days Past 30', 'Debt Ratio',
    'Month Inc.', 'Nb Credit Lines', 'Nb Times 90 Days Late',
    'Nb Real Estate Loans', 'Nb Times 60 Days Past', 'Nb Dep.',
]

# Rows carrying index label 263, relabelled with the print headers.
to_write = temp[source_cols].loc[263]
to_write.columns = cols

print(to_write.to_latex(index=False, float_format="%.2f"))

In [None]:
if data_name == 'give_me_some_credit':
    # Feature columns passed through `dataset.inverse_transform`.
    features = [
        'age', 'RevolvingUtilizationOfUnsecuredLines',
        'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
        'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
        'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
        'NumberOfDependents',
    ]
    # Columns copied unchanged from `final_results`.
    metric_names = ['method', 'L0', 'L1', 'violations', 'validity', 'prediction']

    # Only rows without NaN are inverse-transformed; the metric columns are
    # then placed to their left.
    inverse_transformed = dataset.inverse_transform(final_results.dropna()[features])
    temp = pd.concat([final_results[metric_names], inverse_transformed], axis=1)

In [None]:
if data_name == 'give_me_some_credit':
    # Read the MCCE results and use the unnamed first CSV column as the
    # index, named 'index'.
    mcce_results = pd.read_csv("/nr/samba/user/anr/pkg/MCCE_Python/give_me_some_credit_mcce_results_k_10000.csv")
    mcce_results = mcce_results.rename(columns={'Unnamed: 0': 'index'}).set_index(['index'])

    # Second entry of every probability row returned by the classifier.
    predictions = ml_model.predict_proba(mcce_results)
    temp3 = [class_probs[1] for class_probs in predictions]

    mcce_results['prediction'] = temp3


In [None]:
if data_name == 'give_me_some_credit':
    # Order the MCCE rows by their index and give them the same column names
    # as the CARLA results.
    mcce_results = mcce_results.sort_values(mcce_results.index.name)
    mcce_results['method'] = 'mcce'
    mcce_results = mcce_results.rename(
        columns={'success': 'validity', 'violation': 'violations'}
    )

    # Only rows without NaN are inverse-transformed; the metric columns are
    # then placed to their left.
    inverse_transformed_mcce = dataset.inverse_transform(mcce_results.dropna()[features])
    temp_mcce = pd.concat([mcce_results[metric_names], inverse_transformed_mcce], axis=1)

In [None]:
if data_name == 'give_me_some_credit':
    pd.set_option('display.max_columns', None)
    temp2 = pd.concat([temp, temp_mcce], axis=0)

    # `sort_values` returns a new frame unless `inplace=True`, so the result
    # is assigned. A stable sort keeps rows sharing an index label in their
    # concatenation order.
    temp2 = temp2.sort_values(temp2.index.name, kind='mergesort')

    features = ['method', 'prediction', 'age', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse','DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

    # A bare expression nested inside `if` is not echoed by the notebook, so
    # the selected rows are printed explicitly.
    print(temp2.loc[263][features])

In [None]:
if data_name == 'give_me_some_credit':
    factuals = predict_negative_instances(ml_model, dataset.df)

    # Single-row slice holding the fourth factual.
    example_factual = factuals.iloc[3:4]
    feature_names = ['age', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse','DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

    # Bare expressions nested inside `if` are not echoed by the notebook, so
    # both results are printed explicitly.
    print(dataset.inverse_transform(example_factual)[feature_names])
    print(ml_model.predict_proba(example_factual))


In [None]:
# Load the combined COMPAS results, using the first CSV column as the index,
# and display them.
compas_results_path = "Final_results/compas_results_mcce_and_carla_K_10000_n_100.csv"
temp = pd.read_csv(compas_results_path, index_col=0)
temp

In [None]:

# Columns taken from the results frame ...
source_cols = [
    'method', 'prediction', 'age', 'two_year_recid', 'priors_count',
    'length_of_stay', 'c_charge_degree', 'race', 'sex',
]
# ... and the headers they are printed under, in the same order.
cols = [
    'Method', 'Pred', 'Age', 'Two Year Recid', 'Priors Count',
    'Length of Stay', 'C Charge Degree', 'Race', 'Sex',
]

# Rows carrying index label 40, relabelled with the print headers.
to_write = temp[source_cols].loc[40]
to_write.columns = cols

print(to_write.to_latex(index=False, float_format="%.2f"))

# Method and prediction only, rounded to two decimals.
to_write[['Method', 'Pred']].round(2)