## Results for Tables 1, 2, 3 for CARLA methods

In [4]:
from carla.data.catalog import OnlineCatalog
import pandas as pd
import os
import numpy as np

# load catalog dataset
data_name = "adult"
# data_name = "give_me_some_credit"
# data_name = 'compas'
dataset = OnlineCatalog(data_name)


In [5]:
from carla.models.catalog import MLModelCatalog
import torch
torch.manual_seed(0)

ml_model = MLModelCatalog(
        dataset, 
        model_type="ann", 
        load_online=False, 
        backend="pytorch"
    )
if data_name == 'adult':
    ml_model.train(
    learning_rate=0.002,
    epochs=20,
    batch_size=1024,
    hidden_size=[18, 9, 3],
    force_train=True, # don't forget to add this or it might load an older model from disk
    )
elif data_name == 'give_me_some_credit':
    ml_model.train(
    learning_rate=0.002,
    epochs=20,
    batch_size=2048,
    hidden_size=[18, 9, 3],
    force_train=True, # don't forget to add this or it might load an older model from disk
    )
elif data_name == 'compas':
    ml_model.train(
    learning_rate=0.002,
    epochs=25,
    batch_size=25,
    hidden_size=[18, 9, 3],
    force_train=True, # don't forget to add this or it might load an older model from disk
    )


balance on test set 0.23883245958934032, balance on test set 0.2408256880733945
Epoch 0/19
----------


  x = self.softmax(x)


train Loss: 0.4668 Acc: 0.7734

test Loss: 0.4055 Acc: 0.8005

Epoch 1/19
----------
train Loss: 0.3946 Acc: 0.8121

test Loss: 0.3910 Acc: 0.8189

Epoch 2/19
----------
train Loss: 0.3784 Acc: 0.8222

test Loss: 0.3747 Acc: 0.8226

Epoch 3/19
----------
train Loss: 0.3655 Acc: 0.8290

test Loss: 0.3600 Acc: 0.8324

Epoch 4/19
----------
train Loss: 0.3535 Acc: 0.8343

test Loss: 0.3505 Acc: 0.8373

Epoch 5/19
----------
train Loss: 0.3460 Acc: 0.8372

test Loss: 0.3472 Acc: 0.8389

Epoch 6/19
----------
train Loss: 0.3431 Acc: 0.8387

test Loss: 0.3450 Acc: 0.8402

Epoch 7/19
----------
train Loss: 0.3405 Acc: 0.8402

test Loss: 0.3435 Acc: 0.8384

Epoch 8/19
----------
train Loss: 0.3404 Acc: 0.8389

test Loss: 0.3376 Acc: 0.8396

Epoch 9/19
----------
train Loss: 0.3348 Acc: 0.8421

test Loss: 0.3421 Acc: 0.8400

Epoch 10/19
----------
train Loss: 0.3348 Acc: 0.8411

test Loss: 0.3362 Acc: 0.8426

Epoch 11/19
----------
train Loss: 0.3345 Acc: 0.8401

test Loss: 0.3339 Acc: 0.8435



In [6]:
from sklearn import metrics

if data_name == 'adult':
    y = dataset.df_test['income']
elif data_name == 'give_me_some_credit':
    y = dataset.df_test['SeriousDlqin2yrs']
elif data_name == 'compas':
    y = dataset.df_test['score']

pred = ml_model.predict_proba(dataset.df_test)
pred = [row[1] for row in pred]
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1)
metrics.auc(fpr, tpr)

0.8999147090860513

In [7]:
from carla.models.negative_instances import predict_negative_instances
import carla.recourse_methods.catalog as recourse_catalog

factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.iloc[:100]

## Violation

In [8]:
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

results = []
for method in ['cchvae', 'cem-vae', 'revise', 'clue', 'crud', 'face']:
    if data_name == 'adult':
        cfs = pd.read_csv("Results/adult_manifold_results.csv")
    elif data_name == 'give_me_some_credit':
        cfs = pd.read_csv("Results/give_me_some_credit_manifold_results.csv")
    elif data_name == 'compas':
        cfs = pd.read_csv("Results/compas_manifold_results.csv")
    factuals = predict_negative_instances(ml_model, dataset.df)
    test_factual = factuals.iloc[:100]

    cfs.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
    cfs.set_index(['index'], inplace=True)

    df_cfs = cfs[cfs['method'] == method].drop(['method',	'data'], axis=1)

    nan_idx = df_cfs.index[df_cfs.isnull().any(axis=1)]

    output_factuals = test_factual.copy()
    output_counterfactuals = df_cfs.copy()

    output_factuals = output_factuals.drop(index=nan_idx)
    output_counterfactuals = output_counterfactuals.drop(index=nan_idx)

    test_factual = output_factuals
    df_cfs = output_counterfactuals

    df_decoded_cfs = dataset.inverse_transform(df_cfs.copy())

    df_factuals = dataset.inverse_transform(test_factual.copy())

    cfs_continuous_immutable = df_decoded_cfs[
        intersection(dataset.continuous, dataset.immutables)
    ]
    factual_continuous_immutable = df_factuals[
        intersection(dataset.continuous, dataset.immutables)
    ]

    continuous_violations = np.invert(
        np.isclose(cfs_continuous_immutable, factual_continuous_immutable)
    )
    continuous_violations = np.sum(continuous_violations, axis=1).reshape(
        (-1, 1)
    )  # sum over features

    # check categorical by boolean comparison
    cfs_categorical_immutable = df_decoded_cfs[
        intersection(dataset.categorical, dataset.immutables)
    ]
    factual_categorical_immutable = df_factuals[
        intersection(dataset.categorical, dataset.immutables)
    ]

    categorical_violations = cfs_categorical_immutable != factual_categorical_immutable
    categorical_violations = np.sum(categorical_violations.values, axis=1).reshape(
        (-1, 1)
    )  # sum over features

    total_violations = continuous_violations + categorical_violations

    for x in total_violations:
        results.append(x[0])
        
final_results = cfs.copy()
final_results.dropna(inplace=True)
final_results['violations'] = results

  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)


## Distance

In [9]:
def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

results = []
for method in ['cchvae', 'cem-vae', 'revise', 'clue', 'crud', 'face']:
    if data_name == 'adult':
        cfs = pd.read_csv("Results/adult_manifold_results.csv")
    elif data_name == 'give_me_some_credit':
        cfs = pd.read_csv("Results/give_me_some_credit_manifold_results.csv")
    elif data_name == 'compas':
        cfs = pd.read_csv("Results/compas_manifold_results.csv")
    factuals = predict_negative_instances(ml_model, dataset.df)
    test_factual = factuals.iloc[:100]

    cfs.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
    cfs.set_index(['index'], inplace=True)

    df_cfs = cfs[cfs['method'] == method].drop(['method',	'data'], axis=1)

    nan_idx = df_cfs.index[df_cfs.isnull().any(axis=1)]

    output_factuals = test_factual.copy()
    output_counterfactuals = df_cfs.copy()

    output_factuals = output_factuals.drop(index=nan_idx)
    output_counterfactuals = output_counterfactuals.drop(index=nan_idx)

    factual_without_nans = output_factuals
    counterfactuals_without_nans = output_counterfactuals

    columns = ["Distance_1", "Distance_2", "Distance_3", "Distance_4"]
        
    arr_f = ml_model.get_ordered_features(factual_without_nans).to_numpy()
    arr_cf = ml_model.get_ordered_features(
        counterfactuals_without_nans
    ).to_numpy()

    delta = arr_f - arr_cf 

    d1 = np.sum(np.invert(np.isclose(delta, np.zeros_like(delta))), axis=1, dtype=np.float).tolist()
    d1_old = np.sum(delta.round(2) != 0, axis=1, dtype=np.float).tolist()

    d2 = np.sum(np.abs(delta), axis=1, dtype=np.float).tolist()
    d3 = np.sum(np.square(np.abs(delta)), axis=1, dtype=np.float).tolist()

    results.append(pd.DataFrame({'L0': d1, 'L1': d2, 'L2': d3, 'time': df_cfs['time (seconds)'].mean()}))

temp = pd.concat(results)
temp.index = final_results.index
final_results = pd.concat([final_results, temp], axis=1)


  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)


## Validity

In [10]:
results = []
for method in ['cchvae', 'cem-vae', 'revise', 'clue', 'crud', 'face']:
    if data_name == 'adult':
        cfs = pd.read_csv("Results/adult_manifold_results.csv")
        y_col = 'income'
    elif data_name == 'give_me_some_credit':
        cfs = pd.read_csv("Results/give_me_some_credit_manifold_results.csv")
        y_col = "SeriousDlqin2yrs"
    elif data_name == 'compas':
        cfs = pd.read_csv("Results/compas_manifold_results.csv")
        y_col = "score"

    cfs.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
    cfs.set_index(['index'], inplace=True)

    df_cfs = cfs[cfs['method'] == method].drop(['method',	'data'], axis=1)
    results.append(pd.DataFrame({'validity': df_cfs[y_col]}))

In [11]:
temp = ml_model.predict_proba(final_results)
temp2 = []
temp3 = []
for x in temp:
    temp2.append(x[1]>= 0.5) #  >= 0.5

    temp3.append(x[1]) #  >= 0.5
    
final_results['validity'] = temp2
final_results['prediction'] = temp3
final_results['validity'] = final_results['validity'].astype(int)

  x = self.softmax(x)


## Feasibility

In [12]:
from sklearn.neighbors import NearestNeighbors

results = []
for method in ['cchvae', 'cem-vae', 'revise', 'clue', 'crud', 'face']:
    if data_name == 'adult':
        cfs = pd.read_csv("Results/adult_manifold_results.csv")
    elif data_name == 'give_me_some_credit':
        cfs = pd.read_csv("Results/give_me_some_credit_manifold_results.csv")
    elif data_name == 'compas':
        cfs = pd.read_csv("Results/compas_manifold_results.csv")
    factuals = predict_negative_instances(ml_model, dataset.df)
    test_factual = factuals.iloc[:100]

    cfs.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
    cfs.set_index(['index'], inplace=True)

    df_cfs = cfs[cfs['method'] == method].drop(['method',	'data'], axis=1)

    nan_idx = df_cfs.index[df_cfs.isnull().any(axis=1)]

    output_factuals = test_factual.copy()
    output_counterfactuals = df_cfs.copy()

    output_factuals = output_factuals.drop(index=nan_idx)
    output_counterfactuals = output_counterfactuals.drop(index=nan_idx)

    factual_without_nans = output_factuals
    counterfactuals_without_nans = output_counterfactuals


    cols = dataset.df.columns
    cols.drop(dataset.target)

    nbrs = NearestNeighbors(n_neighbors=5).fit(factual_without_nans[cols].values)

    for i, row in counterfactuals_without_nans[cols].iterrows():
        knn = nbrs.kneighbors(row.values.reshape((1, -1)), 5, return_distance=True)[0]
        
        results.append(np.mean(knn))
final_results['feasibility'] = results

  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)
  x = self.softmax(x)


In [13]:
pd.set_option('display.max_columns', None)

temp = final_results[['L0', 'L1', 'L2',  'feasibility', 'violations', 'validity', 'prediction']]
cfs.dropna(inplace=True)
temp = pd.concat([temp, dataset.inverse_transform(cfs)], axis=1)


In [14]:
import pandas as pd

results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_10000_n_100.csv", index_col=0)

results['data'] = data_name
results['method'] = 'mcce'
results.rename(columns={'violation': 'violations'}, inplace=True)

preds = ml_model.predict_proba(results)
new_preds = []
for x in preds:
    new_preds.append(x[1])
results['prediction'] = new_preds
results = dataset.inverse_transform(results)
results.head(1)

results['validity'] = np.where(np.asarray(new_preds) >= 0.5, 1, 0)

# results.loc[263]

  x = self.softmax(x)


In [15]:
temp = pd.concat([temp, results[temp.columns]])

temp2 = factuals.copy()
preds = ml_model.predict_proba(temp2)
new_preds = []
for x in preds:
    new_preds.append(x[1])
temp2['prediction'] = new_preds
temp2 = dataset.inverse_transform(temp2)
temp2.head(1)
temp2['L0'] = np.nan
temp2['L1'] = np.nan
temp2['L2'] = np.nan
temp2['validity'] = np.nan
temp2['violations'] = np.nan
temp2['feasibility'] = np.nan
temp2['time (seconds)'] = np.nan
temp2['method'] = 'original'
temp2['data'] = data_name

temp = pd.concat([temp, temp2.iloc[0:100][temp.columns]], axis=0)
temp

  x = self.softmax(x)


Unnamed: 0,L0,L1,L2,feasibility,violations,validity,prediction,method,data,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,time (seconds),marital-status,native-country,occupation,race,relationship,sex,workclass
0,9.0,3.067301,2.590808,0.929125,1.0,1.0,0.693481,cchvae,adult,38.765888,189643.930163,10.111852,3653.572464,172.417493,40.222952,1.0,85.685519,Married,US,Managerial-Specialist,White,Husband,Male,Private
1,7.0,1.511288,0.692132,0.929117,1.0,1.0,0.693502,cchvae,adult,38.766169,189644.580534,10.111858,3653.939460,172.424236,40.223089,1.0,85.685519,Married,US,Managerial-Specialist,White,Husband,Male,Private
2,10.0,3.441477,3.076889,0.929140,1.0,1.0,0.693444,cchvae,adult,38.764629,189644.255348,10.111844,3652.894471,172.413507,40.223213,1.0,85.685519,Married,US,Managerial-Specialist,White,Husband,Male,Private
3,9.0,2.772269,2.152981,0.929135,1.0,1.0,0.693467,cchvae,adult,38.765008,189643.471947,10.111861,3653.306067,172.419401,40.223155,1.0,85.685519,Married,US,Managerial-Specialist,White,Husband,Male,Private
4,11.0,4.779930,4.139920,0.929141,2.0,1.0,0.693440,cchvae,adult,38.764654,189643.856257,10.111830,3652.854971,172.413703,40.222983,1.0,85.685519,Married,US,Managerial-Specialist,White,Husband,Male,Private
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,,,,,,,0.005838,original,adult,22.000000,102632.000000,9.000000,0.000000,0.000000,41.000000,0.0,,Non-Married,US,Other,White,Non-Husband,Male,Private
120,,,,,,,0.003653,original,adult,21.000000,199915.000000,10.000000,0.000000,0.000000,40.000000,0.0,,Non-Married,US,Other,White,Non-Husband,Female,Private
122,,,,,,,0.094945,original,adult,30.000000,77143.000000,13.000000,0.000000,0.000000,40.000000,0.0,,Non-Married,Non-US,Managerial-Specialist,Non-White,Non-Husband,Male,Private
124,,,,,,,0.009554,original,adult,19.000000,301606.000000,10.000000,0.000000,0.000000,35.000000,0.0,,Non-Married,US,Other,Non-White,Non-Husband,Male,Private


In [16]:
# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_baseline_results_n_100.csv", index_col=0)
# results['data'] = data_name
# results['method'] = 'baseline'
# results.rename(columns={'violation': 'violations'}, inplace=True)

# preds = ml_model.predict_proba(results)
# new_preds = []
# for x in preds:
#     new_preds.append(x[1])
# results['prediction'] = new_preds
# results = dataset.inverse_transform(results)
# results.head(1)

# results['validity'] = np.where(np.asarray(new_preds) >= 0.5, 1, 0)
# results[temp.columns]
# temp = pd.concat([temp, results[temp.columns]])
# temp

In [17]:
if data_name == 'give_me_some_credit':
    cols = ['method', 'data', 'prediction', 'L0', 'L1', 'L2', 'feasibility', 'violations', 'validity', 'time (seconds)'] + temp.columns[9:-1].to_list()
    temp = temp[cols]
elif data_name == 'adult':
    cols = ['method', 'data', 'prediction', 'L0', 'L1', 'L2', 'feasibility', 'violations', 'validity', 'time (seconds)'] + temp.columns[9:16].to_list() + temp.columns[17:].to_list()
    temp = temp[cols]
temp.to_csv(f"Final_results/{data_name}_results_mcce_and_carla_K_10000_n_100.csv")



## To get Table 1

In [18]:
temp = pd.read_csv("Final_results/adult_results_mcce_and_carla_K_10000_n_100.csv", index_col=0)

to_write = temp[['method', 'L0', 'L2', 'feasibility', 'violations', 'validity', 'time (seconds)']].groupby(['method']).mean()

to_write.reset_index(inplace=True)

CE_N = temp.groupby(['method']).size().reset_index().rename(columns={0: 'CE_N'})
to_write = pd.concat([to_write, CE_N.CE_N], axis=1)

# to_write.sort_values(['method'], inplace=True, ascending=False)
to_write = to_write[['method', 'L0', 'L2', 'feasibility', 'violations', 'validity', 'CE_N', 'time (seconds)']]

print(to_write.to_latex(index=False, float_format="%.2f", ))

\begin{tabular}{lrrrrrrr}
\toprule
   method &    L0 &   L2 &  feasibility &  violations &  validity &  CE\_N &  time (seconds) \\
\midrule
   cchvae &  9.39 & 2.67 &         0.93 &        1.29 &      1.00 &   100 &           85.69 \\
  cem-vae &  6.52 & 2.19 &         0.98 &        1.04 &      0.52 &   100 &           81.22 \\
     clue & 11.34 & 2.00 &         1.26 &        1.29 &      1.00 &   100 &          354.79 \\
     crud & 10.66 & 2.33 &         1.09 &        1.29 &      1.00 &   100 &         1145.26 \\
     face &  6.96 & 2.72 &         1.03 &        1.36 &      1.00 &   100 &          955.89 \\
     mcce &  3.04 & 0.74 &         0.05 &        0.00 &      1.00 &   100 &          106.21 \\
 original &   NaN &  NaN &          NaN &         NaN &       NaN &   100 &             NaN \\
   revise &  9.66 & 3.22 &         1.06 &        1.39 &      1.00 &    38 &          907.06 \\
\bottomrule
\end{tabular}



In [19]:
temp = pd.read_csv("Final_results/give_me_some_credit_results_mcce_and_carla_K_10000_n_100.csv", index_col=0)

to_write = temp[['method', 'L0', 'L2', 'feasibility', 'violations', 'validity', 'time (seconds)']].groupby(['method']).mean()

to_write.reset_index(inplace=True)

CE_N = temp.groupby(['method']).size().reset_index().rename(columns={0: 'CE_N'})
to_write = pd.concat([to_write, CE_N.CE_N], axis=1)

# to_write.sort_values(['method'], inplace=True, ascending=False)
to_write = to_write[['method', 'L0', 'L2', 'feasibility', 'violations', 'validity', 'CE_N', 'time (seconds)']]

print(to_write.to_latex(index=False, float_format="%.2f", ))

\begin{tabular}{lrrrrrrr}
\toprule
   method &    L0 &   L2 &  feasibility &  violations &  validity &  CE\_N &  time (seconds) \\
\midrule
   cchvae & 10.00 & 0.41 &         0.44 &        1.00 &      1.00 &   100 &          202.02 \\
  cem-vae &  8.43 & 0.58 &         0.42 &        0.93 &      0.96 &   100 &           86.35 \\
     clue & 10.00 & 0.37 &         0.40 &        0.81 &      1.00 &   100 &          341.87 \\
     crud & 10.00 & 0.38 &         0.49 &        1.00 &      1.00 &   100 &         1153.13 \\
     face &  8.51 & 0.60 &         0.46 &        0.98 &      1.00 &   100 &         3052.45 \\
     mcce &  3.48 & 0.18 &         0.03 &        0.00 &      1.00 &   100 &         1242.18 \\
 original &   NaN &  NaN &          NaN &         NaN &       NaN &   100 &             NaN \\
   revise & 10.00 & 1.00 &         0.79 &        1.00 &      1.00 &   100 &          805.43 \\
\bottomrule
\end{tabular}



## To get Adult examples in table 2

In [20]:
import pandas as pd

temp = pd.read_csv("Final_results/adult_results_mcce_and_carla_K_10000_n_100.csv", index_col=0)

# cols = ['Method', 'Pred', 'Age', 'Work Class', 'FNLWGT', 'Educat.', 'Mar. Stat.', 'Relat.', 'Cap. Gain', 'Cap. Loss', 'Hr.', 'Co.']

# cols = ['method', 'prediction', 'age', 'workclass', 'fnlwgt', 'education-num', 'marital-status', 'relationship', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']

to_write = temp.loc[31]
# to_write.columns = cols
# to_write.sort_values(['Method'], inplace=True, ascending=False)
pd.set_option('display.max_columns', None)
to_write


Unnamed: 0,method,data,prediction,L0,L1,L2,feasibility,violations,validity,time (seconds),age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,marital-status,native-country,occupation,race,relationship,sex,workclass
31,cchvae,adult,0.693449,11.0,4.691737,4.141263,0.929137,1.0,1.0,85.685519,38.764985,189644.462285,10.111844,3652.98447,172.409726,40.223185,1.0,Married,US,Managerial-Specialist,White,Husband,Male,Private
31,cem-vae,adult,0.500259,7.0,2.410045,2.039343,1.11354,1.0,1.0,81.218853,23.0,190709.000094,12.0,10303.477725,0.0,51.999999,1.0,Non-Married,US,Other,White,Non-Husband,Male,Non-Private
31,clue,adult,0.773839,8.0,2.803834,2.141725,1.477626,1.0,1.0,354.790007,10.527289,398961.583832,9.570491,10724.602753,-62.028129,49.267808,1.0,Non-Married,US,Other,White,Non-Husband,Male,Private
31,crud,adult,0.945782,13.0,3.415565,1.386162,1.207662,1.0,1.0,1145.259907,50.366662,147878.430972,16.521214,8414.476454,-30.494723,41.116929,1.0,Married,US,Managerial-Specialist,White,Husband,Male,Private
31,face,adult,0.509658,8.0,3.497612,3.072894,0.492528,1.0,1.0,955.889991,35.0,38948.0,11.0,3103.0,0.0,40.0,1.0,Married,US,Other,White,Husband,Male,Private
31,mcce,adult,0.928255,3.0,0.638007,0.178944,0.037504,0.0,1.0,106.208441,20.0,188923.0,10.0,34095.0,0.0,20.0,0.928255,Non-Married,US,Other,Non-White,Non-Husband,Male,Private
31,original,adult,0.015605,,,,,,,,20.0,266015.0,10.0,0.0,0.0,44.0,0.0,Non-Married,US,Other,Non-White,Non-Husband,Male,Private


In [21]:
# print(to_write.Pred.to_latex(index=False, float_format="%.2f", ))
feature = 'marital-status'
dct = {'Married': 'M', 'Non-Married': 'NM'}
to_write[feature] = [dct[item] for item in to_write[feature]]

feature = 'native-country'
dct = {'Non-US': 'NUS', 'US': 'US'}
to_write[feature] = [dct[item] for item in to_write[feature]]

feature = 'occupation'
dct = {'Managerial-Specialist': 'MS', 'Other': 'O'}
to_write[feature] = [dct[item] for item in to_write[feature]]

feature = 'race'
dct = {'White': 'W', 'Non-White': 'NW'}
to_write[feature] = [dct[item] for item in to_write[feature]]

feature = 'relationship'
dct = {'Husband': 'H', 'Non-Husband': 'NH'}
to_write[feature] = [dct[item] for item in to_write[feature]]

feature = 'sex'
dct = {'Male': 'M'}
to_write[feature] = [dct[item] for item in to_write[feature]]


feature = 'workclass'
dct = {'Self-emp-not-inc': 'SENI', 'Private': 'P', 'Non-Private': 'NP'}
to_write[feature] = [dct[item] for item in to_write[feature]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [22]:
cols = ['method', 'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', \
       'hours-per-week', 'marital-status', 'native-country', \
       'occupation', 'race', 'relationship', 'sex', 'workclass']

print(to_write[cols].to_latex(index=False, float_format="%.0f", ))

\begin{tabular}{lrrrrrrlllllll}
\toprule
   method &  age &  fnlwgt &  education-num &  capital-gain &  capital-loss &  hours-per-week & marital-status & native-country & occupation & race & relationship & sex & workclass \\
\midrule
   cchvae &   39 &  189644 &             10 &          3653 &           172 &              40 &              M &             US &         MS &    W &            H &   M &         P \\
  cem-vae &   23 &  190709 &             12 &         10303 &             0 &              52 &             NM &             US &          O &    W &           NH &   M &        NP \\
     clue &   11 &  398962 &             10 &         10725 &           -62 &              49 &             NM &             US &          O &    W &           NH &   M &         P \\
     crud &   50 &  147878 &             17 &          8414 &           -30 &              41 &              M &             US &         MS &    W &            H &   M &         P \\
     face &   35 &   38948 &  

## To get GMC examples in table 3

In [23]:
temp = pd.read_csv("Final_results/give_me_some_credit_results_mcce_and_carla_K_10000_n_100.csv", index_col=0)
temp.loc[263]

Unnamed: 0,method,data,prediction,L0,L1,L2,feasibility,violations,validity,time (seconds),RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
263,cchvae,give_me_some_credit,0.958915,10.0,1.683243,0.780067,0.439565,1.0,1.0,202.023006,4.180795,51.233605,0.267863,9.326275,6247.850267,8.485077,0.111863,0.979294,0.086688,0.817727,1.0
263,cem-vae,give_me_some_credit,0.60186,8.0,1.745561,1.213027,0.381716,1.0,1.0,86.349849,0.0,69.0,2.0,0.193935,3000.000061,10.0,0.0,0.0,2.0,0.0,1.0
263,clue,give_me_some_credit,0.996728,10.0,1.376965,0.746909,0.374789,1.0,1.0,341.871374,-2.746603,37.309792,1.986658,-148.846837,627.710725,8.414357,-1.905691,0.269575,-1.185221,2.13745,1.0
263,crud,give_me_some_credit,0.996066,10.0,1.303359,0.39748,0.566557,1.0,1.0,1153.126735,0.017851,82.7968,0.5215,88.250007,9245.43453,17.240654,-1.007147,1.144465,0.266708,2.025982,1.0
263,revise,give_me_some_credit,0.952792,10.0,2.40186,0.938449,0.782334,1.0,1.0,805.431304,0.972036,37.999129,0.155846,10.144193,3968.102287,7.551169,0.026027,0.498095,0.080697,0.715548,1.0
263,face,give_me_some_credit,0.520177,7.0,0.908533,0.322813,0.319428,1.0,1.0,3052.451393,0.0,42.0,2.0,0.460208,2600.0,6.0,1.0,1.0,1.0,2.0,1.0
263,mcce,give_me_some_credit,0.583052,4.0,0.627327,0.265629,0.034067,0.0,1.0,1242.184716,0.763765,38.0,2.0,0.463105,3650.0,8.0,1.0,0.0,0.0,2.0,0.583052
263,original,give_me_some_credit,0.495998,,,,,,,,1.002647,38.0,2.0,0.472543,3550.0,8.0,1.0,0.0,1.0,4.0,1.0


In [24]:
cols = ['method', 'prediction', 'age', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

to_write = temp[cols].loc[263]

cols = ['Method', 'Pred', 'Age', 'Unsec. Lines', 'Nb Days Past 30', 'Debt Ratio', 'Month Inc.', 'Nb Credit Lines', 'Nb Times 90 Days Late', 'Nb Real Estate Loans', 'Nb Times 60 Days Past', 'Nb Dep.']

to_write.columns = cols
# to_write.sort_values(['Method'], inplace=True, ascending=False)

# print(to_write.to_latex(index=False, float_format="%.0f", ))

print(to_write.to_latex(index=False, float_format="%.2f", ))

\begin{tabular}{lrrrrrrrrrrr}
\toprule
   Method &  Pred &   Age &  Unsec. Lines &  Nb Days Past 30 &  Debt Ratio &  Month Inc. &  Nb Credit Lines &  Nb Times 90 Days Late &  Nb Real Estate Loans &  Nb Times 60 Days Past &  Nb Dep. \\
\midrule
   cchvae &  0.96 & 51.23 &          4.18 &             0.27 &        9.33 &     6247.85 &             8.49 &                   0.11 &                  0.98 &                   0.09 &     0.82 \\
  cem-vae &  0.60 & 69.00 &          0.00 &             2.00 &        0.19 &     3000.00 &            10.00 &                   0.00 &                  0.00 &                   2.00 &     0.00 \\
     clue &  1.00 & 37.31 &         -2.75 &             1.99 &     -148.85 &      627.71 &             8.41 &                  -1.91 &                  0.27 &                  -1.19 &     2.14 \\
     crud &  1.00 & 82.80 &          0.02 &             0.52 &       88.25 &     9245.43 &            17.24 &                  -1.01 &                  1.14 &          

In [29]:
if data_name == 'give_me_some_credit':
    features = [ 'age', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse','DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']
    metric_names = ['method', 'L0', 'L1', 'violations', 'validity', 'prediction']

    temp = dataset.inverse_transform(final_results.dropna()[features])
    temp = pd.concat([final_results[metric_names], temp], axis=1)
    # temp.sort_values(temp.index.name)

In [31]:
if data_name == 'give_me_some_credit':
    mcce_results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/give_me_some_credit_mcce_results_k_10000.csv")
    mcce_results.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
    mcce_results.set_index(['index'], inplace=True)

    predictions = ml_model.predict_proba(mcce_results)
    temp3 = []
    for x in predictions:
        temp3.append(x[1]) #  >= 0.5
        
    # temp.index = final_results.index
    mcce_results['prediction'] = temp3


In [33]:
if data_name == 'give_me_some_credit':
    mcce_results.sort_values(mcce_results.index.name, inplace=True)
    mcce_results['method'] = 'mcce'
    mcce_results.rename(columns={'success': 'validity', 'violation': 'violations'}, inplace=True)
    temp_mcce = dataset.inverse_transform(mcce_results.dropna()[features])
    temp_mcce = pd.concat([mcce_results[metric_names], temp_mcce], axis=1)
    # temp_mcce.sort_values(temp_mcce.index.name)

In [None]:
temp.index

In [34]:
if data_name == 'give_me_some_credit':
    pd.set_option('display.max_columns', None)
    temp2 = pd.concat([temp, temp_mcce], axis=0)

    temp2.sort_values(temp2.index.name)

    features = ['method', 'prediction', 'age', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse','DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']

    temp2.loc[263][features]

In [35]:
if data_name == 'give_me_some_credit':
    factuals = predict_negative_instances(ml_model, dataset.df)
    dataset.inverse_transform(factuals.iloc[3:4])[['age', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse','DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']]

    ml_model.predict_proba(factuals.iloc[3:4])
