In [11]:
import sys
import yaml
import argparse
import pandas as pd
import numpy as np

from carla.data.catalog import OnlineCatalog
from carla.models.catalog import MLModelCatalog
from carla.models.negative_instances import predict_negative_instances
import carla.recourse_methods.catalog as recourse_catalog

import torch

from mcce import MCCE

# --- Experiment configuration -------------------------------------------------
# Dataset to run on. This notebook knows how to handle exactly the names listed
# in train_params below.
data_name = 'give_me_some_credit'
K = 100        # used as k in mcce.generate() further down
n_test = 100   # number of negative instances kept as test factuals
seed = 1       # NOTE(review): not used in this cell; torch is seeded with 0 below -- confirm intent

# Per-dataset training settings. Learning rate and hidden layer sizes are the
# same for every dataset; only epochs and batch size differ.
train_params = {
    'adult': dict(epochs=20, batch_size=1024),
    'give_me_some_credit': dict(epochs=20, batch_size=2048),
    'compas': dict(epochs=25, batch_size=25),
}
if data_name not in train_params:
    # Fail fast rather than silently continuing with an untrained model.
    raise ValueError(
        f"Unsupported data_name {data_name!r}; expected one of {sorted(train_params)}"
    )

# (1) Load the dataset and train the classifier
dataset = OnlineCatalog(data_name)

# Seed torch before the model is created (presumably for reproducible training).
torch.manual_seed(0)
ml_model = MLModelCatalog(
    dataset,
    model_type="ann",
    load_online=False,
    backend="pytorch",
)

ml_model.train(
    learning_rate=0.002,
    hidden_size=[18, 9, 3],
    force_train=True,  # don't forget to add this or it might load an older model from disk
    **train_params[data_name],
)

# (2) Find unhappy customers and choose which ones to make counterfactuals for
factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.iloc[:n_test]  # first n_test rows of the negative instances



balance on test set 0.9320676322926885, balance on test set 0.9323107818018143
Epoch 0/19
----------


  x = self.softmax(x)


train Loss: 0.2694 Acc: 0.9321

test Loss: 0.2232 Acc: 0.9323

Epoch 1/19
----------
train Loss: 0.2133 Acc: 0.9321

test Loss: 0.2077 Acc: 0.9323

Epoch 2/19
----------
train Loss: 0.2024 Acc: 0.9321

test Loss: 0.1995 Acc: 0.9330

Epoch 3/19
----------
train Loss: 0.1978 Acc: 0.9337

test Loss: 0.1992 Acc: 0.9344

Epoch 4/19
----------
train Loss: 0.1968 Acc: 0.9349

test Loss: 0.2000 Acc: 0.9346

Epoch 5/19
----------
train Loss: 0.1960 Acc: 0.9350

test Loss: 0.2025 Acc: 0.9350

Epoch 6/19
----------
train Loss: 0.1954 Acc: 0.9346

test Loss: 0.1959 Acc: 0.9353

Epoch 7/19
----------
train Loss: 0.1950 Acc: 0.9349

test Loss: 0.1961 Acc: 0.9351

Epoch 8/19
----------
train Loss: 0.1946 Acc: 0.9351

test Loss: 0.1965 Acc: 0.9355

Epoch 9/19
----------
train Loss: 0.1945 Acc: 0.9352

test Loss: 0.1952 Acc: 0.9355

Epoch 10/19
----------
train Loss: 0.1945 Acc: 0.9349

test Loss: 0.1954 Acc: 0.9356

Epoch 11/19
----------
train Loss: 0.1944 Acc: 0.9349

test Loss: 0.1965 Acc: 0.9356



In [12]:
# Show the test factuals after dataset.inverse_transform (raw-scale feature values).
# Only the last expression of a cell is rendered, so this cell evaluates just the
# one frame it displays.
dataset.inverse_transform(test_factual)


Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,...,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,SeriousDlqin2yrs
9,0.964673,40.0,3.0,0.382965,13700.0,...,3.0,1.0,1.0,2.0,0
151,0.786485,44.0,0.0,0.291710,3883.0,...,2.0,0.0,2.0,1.0,0
155,1.095083,53.0,5.0,0.536704,3500.0,...,1.0,1.0,2.0,0.0,1
263,1.002647,38.0,2.0,0.472543,3550.0,...,1.0,0.0,1.0,4.0,1
287,0.507973,69.0,2.0,0.193935,3000.0,...,2.0,0.0,2.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
5082,0.858949,32.0,2.0,0.157109,3417.0,...,2.0,0.0,0.0,3.0,1
5125,0.000000,28.0,1.0,0.424144,4000.0,...,6.0,1.0,1.0,0.0,0
5277,0.955684,44.0,1.0,0.660843,4578.0,...,6.0,1.0,0.0,0.0,0
5288,0.991332,48.0,1.0,0.281100,5200.0,...,1.0,0.0,5.0,3.0,1


In [13]:
# Column roles taken from the CARLA dataset object.
y_col = dataset.target
cont_feat = dataset.continuous
features_and_response = dataset.df.columns

# Every column of dataset.df that is not continuous is treated as categorical.
# These have new names since encode_normalize_order_factuals().
# NOTE(review): features_and_response is built from all of dataset.df.columns, so
# a target column that is not in cont_feat also ends up in cat_feat -- confirm
# this is what MCCE expects.
cat_feat = [col for col in features_and_response if col not in cont_feat]

# data_name -> (fixed_features, immutables)
immutable_spec = {
    'adult': (['age', 'sex_Male'], ['age', 'sex']),
    'give_me_some_credit': (['age'], ['age']),
    'compas': (['age', 'sex_Male', 'race_Other'], ['age', 'sex', 'race']),
}
if data_name in immutable_spec:
    fixed_features, immutables = immutable_spec[data_name]

#  Create dtypes for MCCE(): continuous columns first, then categorical ones.
dtypes = {
    **{col: "float" for col in cont_feat},
    **{col: "category" for col in cat_feat},
}
df = dataset.df.astype(dtypes)

In [14]:
# Only the final expression of a cell is rendered, so evaluate just the list
# that is actually displayed.
cont_feat

['RevolvingUtilizationOfUnsecuredLines',
 'age',
 'NumberOfTime30-59DaysPastDueNotWorse',
 'DebtRatio',
 'MonthlyIncome',
 'NumberOfOpenCreditLinesAndLoans',
 'NumberOfTimes90DaysLate',
 'NumberRealEstateLoansOrLines',
 'NumberOfTime60-89DaysPastDueNotWorse',
 'NumberOfDependents']

In [15]:
# dataset.inverse_transform(test_factual).loc[263]
# dataset.inverse_transform(factuals).loc[263]

In [16]:
import time

# Wall-clock reference point; a later cell computes time.time() - start.
start = time.time()

# (3) Build MCCE, fit it on the features, and sample K candidates per test factual
mcce = MCCE(
    fixed_features=fixed_features,
    immutables=immutables,
    model=ml_model,
    seed=1,
    continuous=cont_feat,
    categorical=cat_feat,
)

# Fit on every column except the response.
mcce.fit(df.drop(columns=[y_col]), dtypes)

# Generate from the test factuals, again without the response column.
synth_df = mcce.generate(test_factual.drop(columns=[y_col]), k=K)

In [17]:
# Rows of synth_df labelled with this index value, after dataset.inverse_transform.
# Only the last expression of a cell is rendered, so only the displayed frame is
# evaluated here.
instance_id = 263
dataset.inverse_transform(synth_df).loc[instance_id]

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
263,0.932107,38.0,0.0,0.423528,3300.0,5.0,0.0,0.0,0.0,3.0
263,0.499873,38.0,0.0,0.356301,3900.0,8.0,0.0,1.0,0.0,1.0
263,0.000000,38.0,0.0,0.155077,2200.0,2.0,0.0,0.0,0.0,0.0
263,0.983494,38.0,2.0,0.406785,3900.0,8.0,0.0,0.0,0.0,2.0
263,0.352600,38.0,1.0,0.282932,4000.0,7.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
263,0.388046,38.0,0.0,0.322390,12000.0,4.0,0.0,1.0,0.0,0.0
263,0.021460,38.0,0.0,0.241752,4583.0,11.0,0.0,1.0,0.0,0.0
263,0.695321,38.0,0.0,0.618575,3600.0,6.0,0.0,1.0,0.0,0.0
263,1.386017,38.0,0.0,0.438106,3974.0,5.0,3.0,2.0,5.0,1.0


In [20]:
# (4) Postprocess generated counterfactuals
print("Postprocessing counterfactuals with MCCE...")
mcce.postprocess(
    data=df,
    synth=synth_df,
    test=test_factual,
    response=y_col,
    inverse_transform=dataset.inverse_transform,
    cutoff=0.5,
)

# Seconds elapsed since `start` was set in an earlier cell.
# NOTE(review): when cells are run by hand this also counts any idle time
# between those cells.
timing = time.time() - start
print(timing)

# Store the same elapsed time on every row of the sparse results.
mcce.results_sparse['time (seconds)'] = timing

Postprocessing counterfactuals with MCCE...


  x = self.softmax(x)


173.34104776382446


In [21]:
# Inverse-transform the sparse MCCE results, then show the entry labelled 263.
results_sparse_inverse = dataset.inverse_transform(mcce.results_sparse)
results_sparse_inverse.loc[263]

RevolvingUtilizationOfUnsecuredLines    0.983494
age                                           38
NumberOfTime30-59DaysPastDueNotWorse           2
DebtRatio                               0.406785
MonthlyIncome                               3900
NumberOfOpenCreditLinesAndLoans                8
NumberOfTimes90DaysLate                        0
NumberRealEstateLoansOrLines                   0
NumberOfTime60-89DaysPastDueNotWorse           0
NumberOfDependents                             2
SeriousDlqin2yrs                         0.78222
L0                                             6
L1                                      0.732111
L2                                      0.275674
feasibility                             0.107195
success                                        1
violation                                      0
time (seconds)                           173.341
Name: 263, dtype: object

In [None]:
# Persist the sparse results; the file name encodes dataset, K and n_test.
# NOTE(review): absolute, user-specific path -- will not exist on other machines.
results_csv_path = f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}.csv"
mcce.results_sparse.to_csv(results_csv_path)

In [32]:
# Load a previously saved results file, using its first column as the index.
temp_csv_path = "/nr/samba/user/anr/pkg/MCCE_Python/Results/give_me_some_credit_mcce_results_k_10000_n_100_inverse_transform.csv"
temp = pd.read_csv(temp_csv_path, index_col=0)
# temp.loc[263]

In [None]:
# dataset = OnlineCatalog("adult")

# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/adult_mcce_results_k_{K}_n_{n_test}.csv")

# dataset.inverse_transform(results.iloc[0:1])[['age', 'workclass', 'fnlwgt', 'education-num', 'marital-status', 'relationship', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']].iloc[0]


In [None]:
# NOTE(review): this cell rebinds the notebook-wide data_name / K / n_test, so
# every cell executed afterwards sees these values -- confirm that is intended.
data_name = 'adult'
K = 50000
n_test = 100
import pandas as pd

saved_results_path = f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}_inverse_transform.csv"
results = pd.read_csv(saved_results_path)

# Print the mean of the L0, L1 and L2 columns, one value per line.
for metric in ("L0", "L1", "L2"):
    print(results[metric].mean())



In [None]:
# Summary of the currently loaded `results` frame: the mean of each metric
# column (one per line, in this order), followed by the number of rows.
summary_columns = ("L0", "L1", "L2", "feasibility", "violation", "success")
for metric in summary_columns:
    print(results[metric].mean())
print(results.shape[0])

In [None]:
# Reload the saved MCCE results for the current data_name / K / n_test, tag them,
# score them with the classifier, and map them back with dataset.inverse_transform.
# NOTE(review): ml_model and dataset come from earlier cells; if data_name was
# rebound since they were built, they may not match the file loaded here -- confirm.
results_path = f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}.csv"
# NOTE(review): read without index_col, so an index column saved by to_csv would
# come back as an ordinary column -- confirm this is wanted.
results = pd.read_csv(results_path)

results['data'] = data_name
results['method'] = 'mcce'
results = results.rename(columns={'violation': 'violations'})

# Keep element 1 of each predict_proba row as the prediction score.
preds = ml_model.predict_proba(results)
new_preds = [row[1] for row in preds]
results['prediction'] = new_preds
results = dataset.inverse_transform(results)

# validity is 1 where the score reaches the 0.5 cutoff, else 0.
results['validity'] = np.where(np.asarray(new_preds) >= 0.5, 1, 0)

results

In [None]:
# pd.set_option('display.max_columns', None)

# dataset = OnlineCatalog("give_me_some_credit")

# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/give_me_some_credit_mcce_results_k_{K}_n_{n_test}.csv")

# results.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
# results.set_index(['index'], inplace=True)
# temp = results.sort_values([results.index.name]).iloc[0:1]
# dataset.inverse_transform(temp)[['age', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse','DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']].iloc[0]


In [None]:
# print(mcce.results_sparse.)
# print(mcce.results_sparse.L0.mean())
# print(mcce.results_sparse.L1.mean())
# print(mcce.results_sparse.L2.mean())
# results_sparse = mcce.results_sparse
# results_sparse.index.rename('index', inplace=True)
# results_sparse.groupby('index').size().sort_values(ascending=False)