In [42]:
import sys
import yaml
import argparse
import pandas as pd
import numpy as np

from carla.data.catalog import OnlineCatalog
from carla.models.catalog import MLModelCatalog
from carla.models.negative_instances import predict_negative_instances, predict_label
import carla.recourse_methods.catalog as recourse_catalog

import torch

from mcce import MCCE

# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
# Names of the CARLA online data sets to process. Other names this notebook
# has settings for: "adult", "compas".
# NOTE: the loop below rebinds `dataset`, `ml_model`, `test_factual`, etc. on
# every iteration, so later cells only see the LAST name in this list.
dataset_names = ["give_me_some_credit"]
n_test = 100  # number of negatively predicted rows kept as test factuals
seed = 1  # NOTE(review): not consumed in this cell; torch is seeded with 0 below
results_all = None

# Per-dataset training settings; learning rate, hidden layer sizes and
# force_train are shared by all data sets and passed directly in the loop.
TRAIN_PARAMS = {
    "adult": {"epochs": 20, "batch_size": 1024},
    "give_me_some_credit": {"epochs": 20, "batch_size": 2048},
    "compas": {"epochs": 25, "batch_size": 25},
}

# Per-dataset (fixed_features, immutables) pairs handed to MCCE later on.
# The first list uses encoded column names (e.g. 'sex_Male'), the second the
# original feature names (e.g. 'sex').
FIXED_AND_IMMUTABLE = {
    "adult": (["age", "sex_Male"], ["age", "sex"]),
    "give_me_some_credit": (["age"], ["age"]),
    "compas": (["age", "sex_Male", "race_Other"], ["age", "sex", "race"]),
}

# Fail fast on names without settings instead of silently skipping training
# and leaving `fixed_features` / `immutables` undefined (or stale).
unsupported = [
    name
    for name in dataset_names
    if name not in TRAIN_PARAMS or name not in FIXED_AND_IMMUTABLE
]
if unsupported:
    raise ValueError(
        f"No settings defined for data set(s) {unsupported}; "
        f"supported names are {sorted(TRAIN_PARAMS)}"
    )

# Use CARLA to load dataset and predictive model
print("Loading data from Carla...")

for data_name in dataset_names:
    # `dataset` is deliberately left bound to the catalog object after the
    # loop: the following cells read dataset.df, dataset.immutables, etc.
    dataset = OnlineCatalog(data_name)

    # (1) Load predictive model and predict probabilities
    torch.manual_seed(0)
    ml_model = MLModelCatalog(
        dataset,
        model_type="ann",
        load_online=False,
        backend="pytorch",
    )
    ml_model.train(
        learning_rate=0.002,
        hidden_size=[18, 9, 3],
        force_train=True,  # don't forget to add this or it might load an older model from disk
        **TRAIN_PARAMS[data_name],
    )

    # (2) Find unhappy customers and choose which ones to make counterfactuals for
    factuals = predict_negative_instances(ml_model, dataset.df)
    test_factual = factuals.iloc[:n_test]  # first n_test negatively predicted rows

    y_col = dataset.target
    features_and_response = dataset.df.columns
    cont_feat = dataset.continuous
    # Every column of dataset.df that is not listed as continuous; these have
    # new names since encode_normalize_order_factuals().
    cat_feat = [col for col in features_and_response if col not in cont_feat]

    # Copy the lists so downstream code cannot mutate the lookup table.
    fixed_cols, immutable_names = FIXED_AND_IMMUTABLE[data_name]
    fixed_features = list(fixed_cols)
    immutables = list(immutable_names)

    # Create dtypes for MCCE(): continuous -> "float", everything else -> "category"
    dtypes = {col: "float" for col in cont_feat}
    dtypes.update({col: "category" for col in cat_feat})
    df = dataset.df.astype(dtypes)

Loading data from Carla...
balance on test set 0.9320676322926885, balance on test set 0.9323107818018143
Epoch 0/19
----------


  x = self.softmax(x)


train Loss: 0.2694 Acc: 0.9321

test Loss: 0.2232 Acc: 0.9323

Epoch 1/19
----------
train Loss: 0.2133 Acc: 0.9321

test Loss: 0.2077 Acc: 0.9323

Epoch 2/19
----------
train Loss: 0.2024 Acc: 0.9321

test Loss: 0.1995 Acc: 0.9330

Epoch 3/19
----------
train Loss: 0.1978 Acc: 0.9337

test Loss: 0.1992 Acc: 0.9344

Epoch 4/19
----------
train Loss: 0.1968 Acc: 0.9349

test Loss: 0.2000 Acc: 0.9346

Epoch 5/19
----------
train Loss: 0.1960 Acc: 0.9350

test Loss: 0.2025 Acc: 0.9350

Epoch 6/19
----------
train Loss: 0.1954 Acc: 0.9346

test Loss: 0.1959 Acc: 0.9353

Epoch 7/19
----------
train Loss: 0.1950 Acc: 0.9349

test Loss: 0.1961 Acc: 0.9351

Epoch 8/19
----------
train Loss: 0.1946 Acc: 0.9351

test Loss: 0.1965 Acc: 0.9355

Epoch 9/19
----------
train Loss: 0.1945 Acc: 0.9352

test Loss: 0.1952 Acc: 0.9355

Epoch 10/19
----------
train Loss: 0.1945 Acc: 0.9349

test Loss: 0.1954 Acc: 0.9356

Epoch 11/19
----------
train Loss: 0.1944 Acc: 0.9349

test Loss: 0.1965 Acc: 0.9356



In [43]:
# Label every row of the encoded data with the trained model, working on a
# copy so dataset.df itself does not gain the helper column.
scored = dataset.df.copy()
scored["y"] = predict_label(ml_model, scored)

# Keep only the rows labelled 1, then discard the helper column again.
is_positive = scored["y"] == 1
positives = scored.loc[is_positive].drop(columns="y")

# Map both frames through dataset.inverse_transform; the next cell merges
# them on dataset.immutables.
positives = dataset.inverse_transform(positives)
test_factual_inverse = dataset.inverse_transform(test_factual)
# Name the index so that reset_index() exposes it as a 'test' column later.
test_factual_inverse.index.name = 'test'

  x = self.softmax(x)


In [44]:
import time

# Start the wall-clock timer; the elapsed time is read after mcce.postprocess().
start = time.time()

# For every test factual, collect all positively predicted rows that share
# its values on the immutable features. The 'test' column carries the id of
# the originating factual and becomes the index of the result.
merge_keys = dataset.immutables
factual_keys = test_factual_inverse.reset_index()[merge_keys + ['test']]
matched = pd.merge(factual_keys, positives, on=merge_keys)
synth = matched.set_index(['test'])  # 'train',

# go from normal to one-hot encoded
synth = dataset.transform(synth)

In [45]:
from mcce import MCCE

# Build the MCCE object from the per-dataset feature lists chosen in the
# first cell and the catalog's continuous/categorical column lists.
mcce = MCCE(
    fixed_features=fixed_features,
    immutables=immutables,
    model=ml_model,
    continuous=dataset.continuous,
    categorical=dataset.categorical,
)

# Fit on the feature columns only (target column removed).
feature_frame = dataset.df.drop(columns=dataset.target)
mcce.fit(feature_frame, dtypes)


In [46]:
# Post-process the candidate rows in `synth` against the test factuals.
mcce.postprocess(
    data=dataset.df,
    synth=synth,
    test=test_factual,
    response=y_col,
    transform=None,
    inverse_transform=dataset.inverse_transform,
    cutoff=0.5,
)

# Seconds elapsed since `start` was set in the cell that built `synth`.
# NOTE(review): when cells are run by hand this also counts time spent
# between cell executions.
timing = time.time() - start
print(timing)

  x = self.softmax(x)


724.3012878894806


In [48]:
# Show the results table stored on the MCCE object (presumably populated by
# the postprocess() call in the preceding cell -- confirm in mcce).
mcce.results_sparse

Unnamed: 0,age,RevolvingUtilizationOfUnsecuredLines,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,...,L1,L2,feasibility,success,violation
2049,0.273973,0.000000,0.0,0.000054,0.104523,...,0.413841,0.063331,0.162537,1,0.0
5125,0.095890,0.000080,0.0,0.000007,0.080402,...,0.314548,0.090194,0.003420,1,0.0
9,0.260274,0.000000,0.0,0.000292,0.198995,...,0.665299,0.157167,0.000000,1,0.0
3089,0.410959,0.000266,0.0,0.000278,0.160804,...,0.401324,0.100002,0.003135,1,0.0
2579,0.246575,0.000034,0.0,0.000144,0.195920,...,0.522753,0.094365,0.007929,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1509,0.315068,0.000071,0.0,0.000116,0.233487,...,0.104948,0.010014,0.022517,1,0.0
3565,0.068493,0.001297,0.0,0.000096,0.058633,...,0.671660,0.142932,0.005759,1,0.0
1525,0.260274,0.000000,0.0,0.000278,0.125628,...,0.367955,0.057436,0.000000,1,0.0
3065,0.534247,0.000172,0.0,0.000332,0.123055,...,0.436871,0.090637,0.008489,1,0.0


In [49]:

# Print the column means of the results table, one value per line and in a
# fixed order, followed by the number of rows in the table.
sparse_results = mcce.results_sparse
for metric in ("violation", "L0", "L1", "L2", "feasibility"):
    print(sparse_results[metric].mean())

print(sparse_results.shape[0])

0.0
4.38
0.46854236051594694
0.1491814634404985
0.020818667484521835
100


In [None]:
# synth.index.name = None
# synth
# test_factual

In [None]:
# data=dataset.df
# test=test_factual
# response=y_col
# transform=None
# inverse_transform=dataset.inverse_transform
# cutoff=0.5

In [None]:
# Predict response of generated data
# synth[response] = ml_model.predict(synth)
# synth_positive = synth[synth[response]>=cutoff] # drop negative responses


# # Duplicate original test observations N times where N is number of positive counterfactuals
# n_counterfactuals = synth_positive.groupby(synth_positive.index).size()
# n_counterfactuals = pd.DataFrame(n_counterfactuals, columns = ['N'])

# test_repeated = test.copy()

# test_repeated = test_repeated.join(n_counterfactuals)
# test_repeated.dropna(inplace = True)

# test_repeated = test_repeated.reindex(test_repeated.index.repeat(test_repeated.N))
# test_repeated.drop(['N'], axis=1, inplace=True)

# test = test_repeated


In [None]:
# test_repeated.sort_index(inplace=True)
# test_repeated.iloc[804]

In [None]:
# features = synth.columns.to_list()
# features.remove(response)

# synth_metrics = synth.copy()

In [None]:
# synth.sort_index(inplace=True)
# synth.iloc[804:806]
# .iloc[804:806]



In [None]:
# def intersection(lst1, lst2):
#     return list(set(lst1) & set(lst2))

# df_decoded_cfs = inverse_transform(synth.copy())

# df_factuals = inverse_transform(test.copy())

# # check continuous using np.isclose to allow for very small numerical differences
# cfs_continuous_immutable = df_decoded_cfs[
#     intersection(dataset.continuous, fixed_features)
# ]
# factual_continuous_immutable = df_factuals[
#     intersection(dataset.continuous, dataset.immutables)
# ]
# # print(cfs_continuous_immutable)
# print(factual_continuous_immutable.shape)

# continuous_violations = np.invert(
#     np.isclose(cfs_continuous_immutable, factual_continuous_immutable)
# )
# continuous_violations = np.sum(continuous_violations, axis=1).reshape(
#     (-1, 1)
# )  # sum over features

# cfs_categorical_immutable = df_decoded_cfs[
#     intersection(dataset.categorical, dataset.immutables)
# ]
# factual_categorical_immutable = df_factuals[
#     intersection(dataset.categorical, dataset.immutables)
# ]

# cfs_categorical_immutable.sort_index(inplace=True)
# factual_categorical_immutable.sort_index(inplace=True)
# cfs_categorical_immutable.index.name = None

# categorical_violations = cfs_categorical_immutable != factual_categorical_immutable

# categorical_violations = np.sum(categorical_violations.values, axis=1).reshape(
#             (-1, 1)
#         )


# factual_categorical_immutable


In [None]:
# blah = []
# for x in (continuous_violations + categorical_violations):
#     blah.append(x[0])
# np.mean(blah)

In [None]:
# for i, x in enumerate(blah):
#     if x == 1:
#         print(i)

In [None]:
# mcce.results_sparse.iloc[1]

In [None]:
# test_factual.iloc[1]

In [None]:
## To find out which training data point a test observation was matched to ("found"):

# idx = 1
# temp = mcce.results_sparse.iloc[idx:(idx + 1)]

# feat = ['age', 'fnlwgt', 'education-num', 'capital-gain']

# to_show = pd.merge(temp[feat], dataset.df.reset_index(), on = feat).set_index('index')

# to_show.iloc[0]

In [None]:

# print(mcce.results_sparse.violation.mean())
# print(mcce.results_sparse.L0.mean())
# print(mcce.results_sparse.L1.mean())
# print(mcce.results_sparse.L2.mean())

# print(mcce.results_sparse.feasibility.mean())
# print(mcce.results_sparse.shape[0])

In [None]:
# mcce.results_sparse