In [1]:
import pandas as pd

from carla.data.catalog import OnlineCatalog
from carla.models.catalog import MLModelCatalog
from carla.models.negative_instances import predict_negative_instances, predict_label

import torch

from mcce import MCCE

## Run configuration.
## The training-subset sizes `n` used in the experiment loop further down have
## to be adjusted by hand for each data set:
##   - adult and give_me_some_credit: 100, 1000, 10000 and the size of the data set
##   - compas: 100, 1000, 5000 and the size of the data set

# Select exactly one data set by leaving a single assignment uncommented.
# data_name = "adult"
# data_name = 'give_me_some_credit'
data_name = 'compas'

n_test = 100  # number of negatively predicted rows to build counterfactuals for
seed = 1

dataset = OnlineCatalog(data_name)

# Per-data-set training settings for the ANN. Only the number of epochs and
# the batch size differ; learning rate and layer sizes are shared.
TRAIN_PARAMS = {
    'adult': dict(epochs=20, batch_size=1024),
    'give_me_some_credit': dict(epochs=20, batch_size=2048),
    'compas': dict(epochs=25, batch_size=25),
}
if data_name not in TRAIN_PARAMS:
    # Fail loudly: with the former if/elif chain an unlisted name meant that
    # train() was never called, and the later cells ran on that model anyway.
    raise ValueError(
        f"No training configuration for data set {data_name!r}; "
        f"expected one of {sorted(TRAIN_PARAMS)}"
    )

torch.manual_seed(0)
ml_model = MLModelCatalog(
        dataset,
        model_type="ann",
        load_online=False,
        backend="pytorch"
        )
ml_model.train(
    learning_rate=0.002,
    hidden_size=[18, 9, 3],
    force_train=True, # don't forget to add this or it might load an older model from disk
    **TRAIN_PARAMS[data_name],
)

# (2) Find unhappy customers and choose which ones to make counterfactuals for

factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.iloc[:n_test]  # keep the first n_test negatively predicted rows

y_col = dataset.target
features_and_response = dataset.df.columns
cont_feat = dataset.continuous
# Every column of dataset.df that is not listed as continuous is treated as
# categorical (these have new names since encode_normalize_order_factuals()).
cat_feat = [col for col in features_and_response if col not in cont_feat]

# data set -> (fixed_features, immutables).
# NOTE(review): fixed_features look like encoded column names and immutables
# like the original feature names -- confirm against MCCE.
FEATURE_CONSTRAINTS = {
    'adult': (['age', 'sex_Male'], ['age', 'sex']),
    'give_me_some_credit': (['age'], ['age']),
    'compas': (['age', 'sex_Male', 'race_Other'], ['age', 'sex', 'race']),
}
if data_name in FEATURE_CONSTRAINTS:
    fixed_features, immutables = FEATURE_CONSTRAINTS[data_name]

#  Create dtypes for MCCE(): continuous columns first, then the categorical ones
dtypes = {col: "float" for col in cont_feat}
dtypes.update({col: "category" for col in cat_feat})
df = dataset.df.astype(dtypes)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from .autonotebook import tqdm as notebook_tqdm
Using TensorFlow backend.


[INFO] Using Python-MIP package version 1.12.0 [model.py <module>]
balance on test set 0.8131345863037374, balance on test set 0.8191834089436163
Epoch 0/24
----------


  x = self.softmax(x)


train Loss: 0.5018 Acc: 0.7280

test Loss: 0.4022 Acc: 0.8192

Epoch 1/24
----------
train Loss: 0.4002 Acc: 0.8283

test Loss: 0.3704 Acc: 0.8471

Epoch 2/24
----------
train Loss: 0.3796 Acc: 0.8356

test Loss: 0.3557 Acc: 0.8419

Epoch 3/24
----------
train Loss: 0.3665 Acc: 0.8423

test Loss: 0.3405 Acc: 0.8477

Epoch 4/24
----------
train Loss: 0.3620 Acc: 0.8438

test Loss: 0.3483 Acc: 0.8464

Epoch 5/24
----------
train Loss: 0.3589 Acc: 0.8445

test Loss: 0.3409 Acc: 0.8542

Epoch 6/24
----------
train Loss: 0.3592 Acc: 0.8434

test Loss: 0.3361 Acc: 0.8555

Epoch 7/24
----------
train Loss: 0.3579 Acc: 0.8451

test Loss: 0.3358 Acc: 0.8529

Epoch 8/24
----------
train Loss: 0.3564 Acc: 0.8445

test Loss: 0.3367 Acc: 0.8548

Epoch 9/24
----------
train Loss: 0.3559 Acc: 0.8440

test Loss: 0.3383 Acc: 0.8516

Epoch 10/24
----------
train Loss: 0.3564 Acc: 0.8471

test Loss: 0.3534 Acc: 0.8542

Epoch 11/24
----------
train Loss: 0.3556 Acc: 0.8447

test Loss: 0.3376 Acc: 0.8535



## Subset of the selected data set

In [2]:
# Index labels of the selected test factuals and of the complete data set.
factual_indices = list(test_factual.index)
all_indices = list(dataset.df.index)

# Labels that occur in exactly one of the two collections (symmetric difference).
possible_train_indices = set(factual_indices).symmetric_difference(all_indices)

In [3]:
# Show both sizes. A bare `dataset.df.shape` on a non-final line of a cell is
# evaluated and then discarded, so it has to be printed explicitly.
print(dataset.df.shape)
print(len(possible_train_indices))

6072


In [6]:
import random
import time

import numpy as np

from mcce import MCCE

# random.sample() needs a sequence: passing a set has been deprecated since
# Python 3.9 and raises TypeError from Python 3.11 on. Drawing from a sorted
# list also means the rows chosen for a given seed no longer depend on the
# iteration order of the set.
train_index_pool = sorted(possible_train_indices)

# Loop-invariant: the decoded test factuals. The index is named so that it
# comes out of reset_index() as the 'test' column used in the merge below.
test_factual_inverse = dataset.inverse_transform(test_factual)
test_factual_inverse.index.name = 'test'

dim = dataset.df.shape[0]
print(dim)

# One row per (n, s) run:
# [mean L0, mean L2, mean feasibility, mean violation, #counterfactuals, seconds, n, s]
results = []
for n in [100, 1000, 5000]: # [len(possible_train_indices)]:

    for s in range(1): # range(5):

        random.seed(s)
        rows = np.sort(random.sample(train_index_pool, n))

        # Keep only the sampled rows that the model predicts as label 1.
        positives = (df.loc[rows]).copy()
        positives["y"] = predict_label(ml_model, positives)
        positives = positives[positives["y"] == 1]
        positives = positives.drop("y", axis="columns")

        positives = dataset.inverse_transform(positives)

        start = time.time()

        # Pair every test factual with the positives that share its immutable
        # feature values; the result is indexed by the test row's label.
        synth = pd.merge(
            test_factual_inverse.reset_index()[dataset.immutables + ['test']],
            positives,
            on=dataset.immutables,
        ).set_index(['test']) # 'train',
        synth = dataset.transform(synth) # go from normal to one-hot encoded

        mcce = MCCE(fixed_features=fixed_features, immutables=immutables,
                    model=ml_model, continuous=dataset.continuous, categorical=dataset.categorical)

        mcce.fit(df.drop(dataset.target, axis=1), dtypes)

        mcce.postprocess(data=df, synth=synth, test=test_factual, response=y_col,
                         transform=None, inverse_transform=dataset.inverse_transform, cutoff=0.5)

        timing = time.time() - start

        mcce.results_sparse['time (seconds)'] = timing

        results.append([
            mcce.results_sparse.L0.mean(),
            mcce.results_sparse.L2.mean(),
            mcce.results_sparse.feasibility.mean(),
            mcce.results_sparse.violation.mean(),
            mcce.results_sparse.shape[0],
            timing,
            n,
            s,
        ])


6172


  x = self.softmax(x)
  x = self.softmax(x)


In [7]:
# Turn the collected per-run rows into a table and average the metrics for
# each value of the 'Ntest' column (the `n` recorded for each run).
result_columns = ['L0', 'L2', 'feasibility', 'violation', 'NCE', 'timing', 'Ntest', 'seed']
results2 = pd.DataFrame(results, columns=result_columns)
results2.groupby(['Ntest']).mean()

Unnamed: 0_level_0,L0,L2,feasibility,violation,NCE,timing,seed
Ntest,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
100,2.56251,0.80111,0.072443,0.0,59.4,0.495268,2.0
1000,1.816698,0.213898,0.04773,0.0,98.2,1.378713,2.0
5000,1.514,0.096621,0.02243,0.0,100.0,4.562577,2.0
6072,1.49,0.080221,0.020698,0.0,100.0,5.47787,0.0


## All of the selected data set

In [None]:
import time

from mcce import MCCE

# Candidate pool: every row of the data set that the model predicts as label 1.
positives = dataset.df.copy()
positives["y"] = predict_label(ml_model, positives)
positives = positives.loc[positives["y"] == 1].drop(columns="y")

positives = dataset.inverse_transform(positives)
test_factual_inverse = dataset.inverse_transform(test_factual)
test_factual_inverse.index.name = 'test'

start = time.time()

# Pair every test factual with the positives that share its immutable feature
# values; the result is indexed by the test row's label.
merge_keys = dataset.immutables
factual_keys = test_factual_inverse.reset_index()[merge_keys + ['test']]
synth = pd.merge(factual_keys, positives, on=merge_keys).set_index(['test']) # 'train',
synth = dataset.transform(synth) # go from normal to one-hot encoded

mcce = MCCE(
    fixed_features=fixed_features,
    immutables=immutables,
    model=ml_model,
    continuous=dataset.continuous,
    categorical=dataset.categorical,
)

mcce.fit(df.drop(dataset.target, axis=1), dtypes)

# NOTE(review): the subset loop passes data=df (the dtype-converted copy) while
# this cell passes data=dataset.df -- confirm which one postprocess() should get.
mcce.postprocess(
    data=dataset.df,
    synth=synth,
    test=test_factual,
    response=y_col,
    transform=None,
    inverse_transform=dataset.inverse_transform,
    cutoff=0.5,
)

timing = time.time() - start
print(timing)

mcce.results_sparse['time (seconds)'] = timing

In [None]:
# Add the full-data run to the collected results. The size recorded in the
# 'Ntest' column is read from the loaded data set instead of a hard-coded
# 48000, which does not match the data set selected by `data_name`.
# NOTE: re-running this cell appends the same row again.
n_all = dataset.df.shape[0]
results.append([
    mcce.results_sparse.L0.mean(),
    mcce.results_sparse.L2.mean(),
    mcce.results_sparse.feasibility.mean(),
    mcce.results_sparse.violation.mean(),
    mcce.results_sparse.shape[0],
    timing,
    n_all,
    1,
])

results2 = pd.DataFrame(results, columns=['L0', 'L2', 'feasibility', 'violation', 'NCE', 'timing', 'Ntest', 'seed'])
results2.groupby(['Ntest']).mean()

In [None]:
# test_factual.index.to_list()

In [None]:
# Write the counterfactual results of the most recent MCCE run to CSV.
# NOTE(review): the destination is an absolute, user-specific path.
temp = mcce.results_sparse
output_csv = f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_baseline_results_n_{n_test}.csv"
temp.to_csv(output_csv)

In [None]:
# synth.index.name = None
# synth
# test_factual

In [None]:
# data=dataset.df
# test=test_factual
# response=y_col
# transform=None
# inverse_transform=dataset.inverse_transform
# cutoff=0.5

In [None]:
# Predict response of generated data
# synth[response] = ml_model.predict(synth)
# synth_positive = synth[synth[response]>=cutoff] # drop negative responses


# # Duplicate original test observations N times where N is number of positive counterfactuals
# n_counterfactuals = synth_positive.groupby(synth_positive.index).size()
# n_counterfactuals = pd.DataFrame(n_counterfactuals, columns = ['N'])

# test_repeated = test.copy()

# test_repeated = test_repeated.join(n_counterfactuals)
# test_repeated.dropna(inplace = True)

# test_repeated = test_repeated.reindex(test_repeated.index.repeat(test_repeated.N))
# test_repeated.drop(['N'], axis=1, inplace=True)

# test = test_repeated


In [None]:
# test_repeated.sort_index(inplace=True)
# test_repeated.iloc[804]

In [None]:
# features = synth.columns.to_list()
# features.remove(response)

# synth_metrics = synth.copy()

In [None]:
# synth.sort_index(inplace=True)
# synth.iloc[804:806]
# .iloc[804:806]



In [None]:
# def intersection(lst1, lst2):
#     return list(set(lst1) & set(lst2))

# df_decoded_cfs = inverse_transform(synth.copy())

# df_factuals = inverse_transform(test.copy())

# # check continuous using np.isclose to allow for very small numerical differences
# cfs_continuous_immutable = df_decoded_cfs[
#     intersection(dataset.continuous, fixed_features)
# ]
# factual_continuous_immutable = df_factuals[
#     intersection(dataset.continuous, dataset.immutables)
# ]
# # print(cfs_continuous_immutable)
# print(factual_continuous_immutable.shape)

# continuous_violations = np.invert(
#     np.isclose(cfs_continuous_immutable, factual_continuous_immutable)
# )
# continuous_violations = np.sum(continuous_violations, axis=1).reshape(
#     (-1, 1)
# )  # sum over features

# cfs_categorical_immutable = df_decoded_cfs[
#     intersection(dataset.categorical, dataset.immutables)
# ]
# factual_categorical_immutable = df_factuals[
#     intersection(dataset.categorical, dataset.immutables)
# ]

# cfs_categorical_immutable.sort_index(inplace=True)
# factual_categorical_immutable.sort_index(inplace=True)
# cfs_categorical_immutable.index.name = None

# categorical_violations = cfs_categorical_immutable != factual_categorical_immutable

# categorical_violations = np.sum(categorical_violations.values, axis=1).reshape(
#             (-1, 1)
#         )


# factual_categorical_immutable


In [None]:
# blah = []
# for x in (continuous_violations + categorical_violations):
#     blah.append(x[0])
# np.mean(blah)

In [None]:
# for i, x in enumerate(blah):
#     if x == 1:
#         print(i)

In [None]:
# mcce.results_sparse.iloc[1]

In [None]:
# test_factual.iloc[1]

In [None]:
## if you want to find out which data point the test observation "found" in the training data

# idx = 1
# temp = mcce.results_sparse.iloc[idx:(idx + 1)]

# feat = ['age', 'fnlwgt', 'education-num', 'capital-gain']

# to_show = pd.merge(temp[feat], dataset.df.reset_index(), on = feat).set_index('index')

# to_show.iloc[0]

In [None]:

# print(mcce.results_sparse.violation.mean())
# print(mcce.results_sparse.L0.mean())
# print(mcce.results_sparse.L1.mean())
# print(mcce.results_sparse.L2.mean())

# print(mcce.results_sparse.feasibility.mean())
# print(mcce.results_sparse.shape[0])

In [None]:
# mcce.results_sparse