In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

from carla.data.catalog import OnlineCatalog
from carla.models.catalog import MLModelCatalog
from carla.models.negative_instances import predict_negative_instances, predict_label

import torch
import time
            
from mcce import MCCE

## The training-subset sizes (`n_list`, built further down) depend on the data set:
## for adult and give_me_some_credit: 100, 1000, 10000 and all available training rows;
## for compas: 100, 1000, 5000 and all available training rows.

# Data set to run on. Supported alternatives: "adult", 'give_me_some_credit'.
data_name = 'compas'
# Number of factuals (taken from the top of the negative-instance list) to explain.
n_test = 100
seed = 1

dataset = OnlineCatalog(data_name)

# Fix torch's RNG before the model is built and trained.
torch.manual_seed(0)
ml_model = MLModelCatalog(dataset, model_type="ann", load_online=False, backend="pytorch")

# Per-data-set (epochs, batch_size); every other hyper-parameter is shared.
_train_schedule = {
    'adult': (20, 1024),
    'give_me_some_credit': (20, 2048),
    'compas': (25, 25),
}

# Any other data_name is left untrained, exactly like an unmatched if/elif chain.
if data_name in _train_schedule:
    n_epochs, n_batch = _train_schedule[data_name]
    ml_model.train(
        learning_rate=0.002,
        epochs=n_epochs,
        batch_size=n_batch,
        hidden_size=[18, 9, 3],
        force_train=True,  # without this an older model might be loaded from disk
    )

# (2) Find unhappy customers and choose which ones to make counterfactuals for

# Instances returned by predict_negative_instances; the first n_test of them
# are the test factuals.
factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.iloc[:n_test]

y_col = dataset.target
cont_feat = dataset.continuous

cat_feat = dataset.categorical
cat_feat_encoded = dataset.encoder.get_feature_names(dataset.categorical)

# Features kept fixed, per data set: (encoded column names, original feature names).
_fixed_by_dataset = {
    'adult': (['age', 'sex_Male'], ['age', 'sex']),
    'give_me_some_credit': (['age'], ['age']),
    'compas': (['age', 'sex_Male', 'race_Other'], ['age', 'sex', 'race']),
}
if data_name in _fixed_by_dataset:
    fixed_features_encoded, fixed_features = _fixed_by_dataset[data_name]

# Create dtypes for MCCE(): continuous columns first, then the encoded
# categorical columns (which win if a name appears in both).
dtypes = {
    **{col: "float" for col in cont_feat},
    **{col: "category" for col in cat_feat_encoded},
}
df = dataset.df.astype(dtypes)

## Subset full data set

In [None]:
# Row labels of the test factuals; these must not end up in the sampling pool.
factual_indices = test_factual.index.to_list()
all_indices = dataset.df.index.to_list()
# Plain set difference: every row of the data set that is not a test factual.
# (A symmetric difference `^` yields the same set only while all factual labels
# also occur in dataset.df; otherwise it would add the stray labels to the pool.)
possible_train_indices = set(all_indices) - set(factual_indices)

In [None]:
# Training-subset sizes to try; the last entry is always the whole sampling pool.
n_pool = len(possible_train_indices)
if data_name in ('adult', 'give_me_some_credit'):
    n_list = [100, 1000, 10000, n_pool]
elif data_name == 'compas':
    n_list = [100, 1000, 5000, n_pool]

## Fit the "MCCE-light" method

In [None]:
import random 
import numpy as np
import time

# Number of differently seeded subsamples drawn for each subset size that is
# smaller than the whole sampling pool.
N_REPEATS = 5
# Seed recorded for the single run that uses the whole pool. Drawing every row
# and sorting gives the same subset for any seed, so this value only labels the
# result row. It is set explicitly so the run never relies on a loop variable
# left over from an earlier iteration.
FULL_DATA_SEED = 0

# random.sample() needs a sequence: sets are deprecated as population since
# Python 3.9 and raise TypeError from 3.11. Sorting also makes the draw
# independent of set iteration order.
train_pool = sorted(possible_train_indices)

# One row per (subset size, seed): L0, L2, feasibility, violation,
# number of rows in mcce.results_sparse, timing, n, seed.
results = []
for n in n_list:
    print(n)

    uses_whole_pool = n == len(train_pool)
    seeds = [FULL_DATA_SEED] if uses_whole_pool else range(N_REPEATS)

    for s in seeds:
        if not uses_whole_pool:
            print(s)

        # Draw the training subset for this (n, s) pair.
        random.seed(s)
        rows = np.sort(random.sample(train_pool, n))

        # Keep only the sampled rows that predict_label assigns label 1.
        positives = (df.loc[rows]).copy()
        positives["y"] = predict_label(ml_model, positives)
        positives = positives[positives["y"] == 1]
        positives = positives.drop("y", axis="columns")

        positives = dataset.inverse_transform(positives)
        test_factual_inverse = dataset.inverse_transform(test_factual)
        test_factual_inverse.index.name = 'test'

        # Timing starts here: it covers the merge, the transform, and the MCCE
        # fit and postprocess, but not the sampling/labelling above.
        start = time.time()

        # Pair every test factual with each positive that shares its immutable
        # features (inner join), indexed by the factual's label.
        synth = pd.merge(
            test_factual_inverse.reset_index()[dataset.immutables + ['test']],
            positives,
            on=dataset.immutables,
        ).set_index(['test'])
        synth = dataset.transform(synth)  # go from normal to one-hot encoded

        mcce = MCCE(
            fixed_features=fixed_features,
            fixed_features_encoded=fixed_features_encoded,
            continuous=dataset.continuous,
            categorical=dataset.categorical,
            model=ml_model,
            seed=seed,  # the notebook-level seed from the configuration (1)
        )

        mcce.fit(df.drop(dataset.target, axis=1), dtypes)

        mcce.postprocess(data=df, synth=synth, test=test_factual, response=y_col,
                         inverse_transform=dataset.inverse_transform, cutoff=0.5)

        timing = time.time() - start

        mcce.results_sparse['time (seconds)'] = timing

        results.append([
            mcce.results_sparse.L0.mean(),
            mcce.results_sparse.L2.mean(),
            mcce.results_sparse.feasibility.mean(),
            mcce.results_sparse.violation.mean(),
            mcce.results_sparse.shape[0],
            timing,
            n,
            s,
        ])


In [None]:
# Column order mirrors the lists appended to `results`. Note that 'Ntest' holds
# the training-subset size n and 'NCE' the number of rows in mcce.results_sparse.
result_columns = ['L0', 'L2', 'feasibility', 'violation', 'NCE', 'timing', 'Ntest', 'seed']
results2 = pd.DataFrame(data=results, columns=result_columns)
# results2

In [None]:
import os

# to_csv() does not create missing directories, so make sure the target exists.
os.makedirs("Results", exist_ok=True)
results2.to_csv(f"Results/{data_name}_mcce_light_n_{n_test}.csv")

# Average every column over the runs, per training-subset size (column 'Ntest').
results2.groupby(['Ntest']).mean()

## Load results

In [None]:
# The file was saved with the DataFrame index as its first (unnamed) column.
# Read it back as the index so it does not show up as a spurious 'Unnamed: 0'
# column in the averages below.
results2 = pd.read_csv(f"Results/{data_name}_mcce_light_n_{n_test}.csv", index_col=0)

# Average every column over the runs, per training-subset size (column 'Ntest').
results2.groupby(['Ntest']).mean()

In [None]:
# positives = dataset.df.copy()
# positives["y"] = predict_label(ml_model, positives)
# positives = positives[positives["y"] == 1]
# positives = positives.drop("y", axis="columns")

# positives = dataset.inverse_transform(positives)
# test_factual_inverse = dataset.inverse_transform(test_factual)
# test_factual_inverse.index.name = 'test'

# import time
# start = time.time()

# synth = pd.merge(test_factual_inverse.reset_index()[dataset.immutables + ['test']], positives, on = dataset.immutables).set_index(['test']) # 'train',
# synth = dataset.transform(synth) # go from normal to one-hot encoded

# from mcce import MCCE

# mcce = MCCE(fixed_features=fixed_features, immutables=immutables, model=ml_model, continuous=dataset.continuous, categorical=dataset.categorical)

# mcce.fit(df.drop(dataset.target, axis=1), dtypes)

# mcce.postprocess(data=dataset.df, synth=synth, test=test_factual, response=y_col, \
#     transform=None, inverse_transform=dataset.inverse_transform, cutoff=0.5)

# timing = time.time() - start
# print(timing)

# mcce.results_sparse['time (seconds)'] = timing

# results.append([mcce.results_sparse.L0.mean(), mcce.results_sparse.L2.mean(), mcce.results_sparse.feasibility.mean(), mcce.results_sparse.violation.mean(), mcce.results_sparse.shape[0], timing, 48000, 1])


In [None]:
# synth.index.name = None
# synth
# test_factual

In [None]:
# data=dataset.df
# test=test_factual
# response=y_col
# transform=None
# inverse_transform=dataset.inverse_transform
# cutoff=0.5

In [None]:
# Predict response of generated data
# synth[response] = ml_model.predict(synth)
# synth_positive = synth[synth[response]>=cutoff] # drop negative responses


# # Duplicate original test observations N times where N is number of positive counterfactuals
# n_counterfactuals = synth_positive.groupby(synth_positive.index).size()
# n_counterfactuals = pd.DataFrame(n_counterfactuals, columns = ['N'])

# test_repeated = test.copy()

# test_repeated = test_repeated.join(n_counterfactuals)
# test_repeated.dropna(inplace = True)

# test_repeated = test_repeated.reindex(test_repeated.index.repeat(test_repeated.N))
# test_repeated.drop(['N'], axis=1, inplace=True)

# test = test_repeated


In [None]:
# test_repeated.sort_index(inplace=True)
# test_repeated.iloc[804]

In [None]:
# features = synth.columns.to_list()
# features.remove(response)

# synth_metrics = synth.copy()

In [None]:
# synth.sort_index(inplace=True)
# synth.iloc[804:806]
# .iloc[804:806]



In [None]:
# def intersection(lst1, lst2):
#     return list(set(lst1) & set(lst2))

# df_decoded_cfs = inverse_transform(synth.copy())

# df_factuals = inverse_transform(test.copy())

# # check continuous using np.isclose to allow for very small numerical differences
# cfs_continuous_immutable = df_decoded_cfs[
#     intersection(dataset.continuous, fixed_features)
# ]
# factual_continuous_immutable = df_factuals[
#     intersection(dataset.continuous, dataset.immutables)
# ]
# # print(cfs_continuous_immutable)
# print(factual_continuous_immutable.shape)

# continuous_violations = np.invert(
#     np.isclose(cfs_continuous_immutable, factual_continuous_immutable)
# )
# continuous_violations = np.sum(continuous_violations, axis=1).reshape(
#     (-1, 1)
# )  # sum over features

# cfs_categorical_immutable = df_decoded_cfs[
#     intersection(dataset.categorical, dataset.immutables)
# ]
# factual_categorical_immutable = df_factuals[
#     intersection(dataset.categorical, dataset.immutables)
# ]

# cfs_categorical_immutable.sort_index(inplace=True)
# factual_categorical_immutable.sort_index(inplace=True)
# cfs_categorical_immutable.index.name = None

# categorical_violations = cfs_categorical_immutable != factual_categorical_immutable

# categorical_violations = np.sum(categorical_violations.values, axis=1).reshape(
#             (-1, 1)
#         )


# factual_categorical_immutable


In [None]:
# blah = []
# for x in (continuous_violations + categorical_violations):
#     blah.append(x[0])
# np.mean(blah)

In [None]:
# for i, x in enumerate(blah):
#     if x == 1:
#         print(i)

In [None]:
# mcce.results_sparse.iloc[1]

In [None]:
# test_factual.iloc[1]

In [None]:
## if you want to find out which data point the test observation "found" in the training data

# idx = 1
# temp = mcce.results_sparse.iloc[idx:(idx + 1)]

# feat = ['age', 'fnlwgt', 'education-num', 'capital-gain']

# to_show = pd.merge(temp[feat], dataset.df.reset_index(), on = feat).set_index('index')

# to_show.iloc[0]

In [None]:

# print(mcce.results_sparse.violation.mean())
# print(mcce.results_sparse.L0.mean())
# print(mcce.results_sparse.L1.mean())
# print(mcce.results_sparse.L2.mean())

# print(mcce.results_sparse.feasibility.mean())
# print(mcce.results_sparse.shape[0])

In [None]:
# mcce.results_sparse