In [1]:
import sys
import yaml
import argparse
import pandas as pd
import numpy as np

from carla.data.catalog import OnlineCatalog
from carla.models.catalog import MLModelCatalog
from carla.models.negative_instances import predict_negative_instances, predict_label
import carla.recourse_methods.catalog as recourse_catalog

import torch

from mcce import MCCE

# Names of the CARLA datasets to process in this run.
# Other names with a configuration below: 'adult', 'compas'.
dataset_names = ['give_me_some_credit']
n_test = 100
# NOTE(review): `seed` is not used by the torch seeding inside the loop, which is fixed to 0.
seed = 1
results_all = None

# Per-dataset training settings; learning rate, hidden sizes and force_train are shared.
TRAIN_SETTINGS = {
    'adult': {'epochs': 20, 'batch_size': 1024},
    'give_me_some_credit': {'epochs': 20, 'batch_size': 2048},
    'compas': {'epochs': 25, 'batch_size': 25},
}

# Per-dataset (fixed_features, immutables): encoded column names and raw feature names.
IMMUTABLE_SETTINGS = {
    'adult': (['age', 'sex_Male'], ['age', 'sex']),
    'give_me_some_credit': (['age'], ['age']),
    'compas': (['age', 'sex_Male', 'race_Other'], ['age', 'sex', 'race']),
}

# Use CARLA to load dataset and predictive model
print("Loading data from Carla...")

for data_name in dataset_names:
    # Fail early instead of silently skipping training and leaving
    # fixed_features / immutables undefined for the cells below.
    if data_name not in TRAIN_SETTINGS or data_name not in IMMUTABLE_SETTINGS:
        raise ValueError(
            f"No training/immutable configuration for dataset '{data_name}'; "
            f"expected one of {sorted(TRAIN_SETTINGS)}"
        )

    dataset = OnlineCatalog(data_name)

    # (1) Load predictive model and predict probabilities

    torch.manual_seed(0)
    ml_model = MLModelCatalog(
        dataset,
        model_type="ann",
        load_online=False,
        backend="pytorch",
    )

    ml_model.train(
        learning_rate=0.002,
        hidden_size=[18, 9, 3],
        force_train=True,  # don't forget to add this or it might load an older model from disk
        **TRAIN_SETTINGS[data_name],
    )

    # (2) Find unhappy customers and choose which ones to make counterfactuals for

    factuals = predict_negative_instances(ml_model, dataset.df)
    test_factual = factuals.iloc[:n_test]  # first n_test rows of the returned frame

    y_col = dataset.target
    features_and_response = dataset.df.columns
    cont_feat = dataset.continuous
    # Every column that is not continuous (this includes the target column);
    # these have new names since encode_normalize_order_factuals()
    cat_feat = [col for col in features_and_response if col not in cont_feat]

    fixed_features, immutables = IMMUTABLE_SETTINGS[data_name]

    # Create dtypes for MCCE(): continuous columns as float, all others as category
    dtypes = {col: "float" for col in cont_feat}
    dtypes.update({col: "category" for col in cat_feat})
    df = (dataset.df).astype(dtypes)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from .autonotebook import tqdm as notebook_tqdm
Using TensorFlow backend.


[INFO] Using Python-MIP package version 1.12.0 [model.py <module>]
Loading data from Carla...
balance on test set 0.9320676322926885, balance on test set 0.9323107818018143
Epoch 0/19
----------


  x = self.softmax(x)


train Loss: 0.2694 Acc: 0.9321

test Loss: 0.2232 Acc: 0.9323

Epoch 1/19
----------
train Loss: 0.2133 Acc: 0.9321

test Loss: 0.2077 Acc: 0.9323

Epoch 2/19
----------
train Loss: 0.2024 Acc: 0.9321

test Loss: 0.1995 Acc: 0.9330

Epoch 3/19
----------
train Loss: 0.1978 Acc: 0.9337

test Loss: 0.1992 Acc: 0.9344

Epoch 4/19
----------
train Loss: 0.1968 Acc: 0.9349

test Loss: 0.2000 Acc: 0.9346

Epoch 5/19
----------
train Loss: 0.1960 Acc: 0.9350

test Loss: 0.2025 Acc: 0.9350

Epoch 6/19
----------
train Loss: 0.1954 Acc: 0.9346

test Loss: 0.1959 Acc: 0.9353

Epoch 7/19
----------
train Loss: 0.1950 Acc: 0.9349

test Loss: 0.1961 Acc: 0.9351

Epoch 8/19
----------
train Loss: 0.1946 Acc: 0.9351

test Loss: 0.1965 Acc: 0.9355

Epoch 9/19
----------
train Loss: 0.1945 Acc: 0.9352

test Loss: 0.1952 Acc: 0.9355

Epoch 10/19
----------
train Loss: 0.1945 Acc: 0.9349

test Loss: 0.1954 Acc: 0.9356

Epoch 11/19
----------
train Loss: 0.1944 Acc: 0.9349

test Loss: 0.1965 Acc: 0.9356



In [2]:
# Label a copy of the full dataset with the model's predicted class and keep
# only the rows predicted as class 1; the helper column is removed again.
labelled = dataset.df.copy()
labelled["y"] = predict_label(ml_model, labelled)
positives = labelled.loc[labelled["y"] == 1].drop(columns="y")

# Map both frames back through the dataset's inverse transform.
positives = dataset.inverse_transform(positives)
test_factual_inverse = dataset.inverse_transform(test_factual)

# Name the index so it survives reset_index() as a 'test' column.
test_factual_inverse.index.name = 'test'

  x = self.softmax(x)


In [3]:
import time

# Start the wall-clock timer; the elapsed time is computed in a later cell.
start = time.time()

# Join each test factual (keyed by its 'test' index) to the rows of `positives`
# that share the same values on the immutable features.
immutable_cols = dataset.immutables
factual_keys = test_factual_inverse.reset_index()[immutable_cols + ['test']]
synth = pd.merge(factual_keys, positives, on=immutable_cols)
synth = synth.set_index(['test'])

synth = dataset.transform(synth)  # go from normal to one-hot encoded

In [4]:
# MCCE is already imported in the first cell, so it is not re-imported here.
mcce = MCCE(
    fixed_features=fixed_features,
    immutables=immutables,
    model=ml_model,
    continuous=dataset.continuous,
    categorical=dataset.categorical,
)

# Fit on the dataset with the target column dropped, passing the dtype map built earlier.
mcce.fit((dataset.df).drop(dataset.target, axis=1), dtypes)


In [5]:
mcce.postprocess(
    data=dataset.df,
    synth=synth,
    test=test_factual,
    response=y_col,
    transform=None,
    inverse_transform=dataset.inverse_transform,
    cutoff=0.5,
)

# Wall-clock seconds elapsed since `start` was set.
timing = time.time() - start
print(timing)

# Store the same elapsed time on every row of the results.
mcce.results_sparse['time (seconds)'] = timing

  x = self.softmax(x)


216.89076375961304


In [6]:
# mcce.results_sparse

In [7]:
# Column means of the result metrics, followed by the row count of results_sparse.
for metric in ("L0", "L1", "L2", "feasibility", "violation"):
    print(mcce.results_sparse[metric].mean())
print(mcce.results_sparse.shape[0])

4.24
0.5422250478910271
0.18016155865529665
0.024267278920193123
0.0
100


In [8]:
# test_factual.index.to_list()

In [9]:
temp = mcce.results_sparse

# Print every index label of test_factual that is absent from the results index.
# The results index is collected into a set once, rather than rebuilt per iteration.
result_indices = set(temp.index.to_list())
for x in test_factual.index.to_list():
    if x not in result_indices:
        print(x)

In [10]:
# Write the results to a CSV named after the dataset and the number of test points.
# NOTE(review): this is an absolute, machine-specific path; it will fail on other hosts.
output_path = f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_baseline_results_n_{n_test}.csv"
temp = mcce.results_sparse
temp.to_csv(output_path)

In [11]:
# Bare last expression: show the results table as this cell's output.
temp

Unnamed: 0,age,RevolvingUtilizationOfUnsecuredLines,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,...,L2,feasibility,success,violation,time (seconds)
2049,0.273973,0.001520,0.1,0.000239,0.056945,...,0.340585,0.026638,1,0.0,216.890764
5125,0.095890,0.000001,0.1,0.000045,0.120603,...,0.329741,0.024507,1,0.0,216.890764
9,0.260274,0.000819,0.1,0.000338,0.110553,...,0.157167,0.000000,1,0.0,216.890764
3089,0.410959,0.001297,0.1,0.000000,0.124201,...,0.071363,0.000000,1,0.0,216.890764
2579,0.246575,0.000738,0.1,0.000164,0.060101,...,0.296497,0.012618,1,0.0,216.890764
...,...,...,...,...,...,...,...,...,...,...,...
1509,0.315068,0.000423,0.2,0.000325,0.044985,...,0.116149,0.000000,1,0.0,216.890764
3565,0.068493,0.000872,0.2,0.000103,0.106533,...,0.055641,0.026565,1,0.0,216.890764
1525,0.260274,0.001259,0.2,0.000284,0.096482,...,0.122813,0.000000,1,0.0,216.890764
3065,0.534247,0.001187,0.2,0.000374,0.135055,...,0.038874,0.043832,1,0.0,216.890764


In [12]:
# synth.index.name = None
# synth
# test_factual

In [13]:
# data=dataset.df
# test=test_factual
# response=y_col
# transform=None
# inverse_transform=dataset.inverse_transform
# cutoff=0.5

In [14]:
# Predict response of generated data
# synth[response] = ml_model.predict(synth)
# synth_positive = synth[synth[response]>=cutoff] # drop negative responses


# # Duplicate original test observations N times where N is number of positive counterfactuals
# n_counterfactuals = synth_positive.groupby(synth_positive.index).size()
# n_counterfactuals = pd.DataFrame(n_counterfactuals, columns = ['N'])

# test_repeated = test.copy()

# test_repeated = test_repeated.join(n_counterfactuals)
# test_repeated.dropna(inplace = True)

# test_repeated = test_repeated.reindex(test_repeated.index.repeat(test_repeated.N))
# test_repeated.drop(['N'], axis=1, inplace=True)

# test = test_repeated


In [15]:
# test_repeated.sort_index(inplace=True)
# test_repeated.iloc[804]

In [16]:
# features = synth.columns.to_list()
# features.remove(response)

# synth_metrics = synth.copy()

In [17]:
# synth.sort_index(inplace=True)
# synth.iloc[804:806]
# .iloc[804:806]



In [18]:
# def intersection(lst1, lst2):
#     return list(set(lst1) & set(lst2))

# df_decoded_cfs = inverse_transform(synth.copy())

# df_factuals = inverse_transform(test.copy())

# # check continuous using np.isclose to allow for very small numerical differences
# cfs_continuous_immutable = df_decoded_cfs[
#     intersection(dataset.continuous, fixed_features)
# ]
# factual_continuous_immutable = df_factuals[
#     intersection(dataset.continuous, dataset.immutables)
# ]
# # print(cfs_continuous_immutable)
# print(factual_continuous_immutable.shape)

# continuous_violations = np.invert(
#     np.isclose(cfs_continuous_immutable, factual_continuous_immutable)
# )
# continuous_violations = np.sum(continuous_violations, axis=1).reshape(
#     (-1, 1)
# )  # sum over features

# cfs_categorical_immutable = df_decoded_cfs[
#     intersection(dataset.categorical, dataset.immutables)
# ]
# factual_categorical_immutable = df_factuals[
#     intersection(dataset.categorical, dataset.immutables)
# ]

# cfs_categorical_immutable.sort_index(inplace=True)
# factual_categorical_immutable.sort_index(inplace=True)
# cfs_categorical_immutable.index.name = None

# categorical_violations = cfs_categorical_immutable != factual_categorical_immutable

# categorical_violations = np.sum(categorical_violations.values, axis=1).reshape(
#             (-1, 1)
#         )


# factual_categorical_immutable


In [19]:
# blah = []
# for x in (continuous_violations + categorical_violations):
#     blah.append(x[0])
# np.mean(blah)

In [20]:
# for i, x in enumerate(blah):
#     if x == 1:
#         print(i)

In [21]:
# mcce.results_sparse.iloc[1]

In [22]:
# test_factual.iloc[1]

In [23]:
## if you want to find out which data point the test observation "found" in the training data

# idx = 1
# temp = mcce.results_sparse.iloc[idx:(idx + 1)]

# feat = ['age', 'fnlwgt', 'education-num', 'capital-gain']

# to_show = pd.merge(temp[feat], dataset.df.reset_index(), on = feat).set_index('index')

# to_show.iloc[0]

In [24]:

# print(mcce.results_sparse.violation.mean())
# print(mcce.results_sparse.L0.mean())
# print(mcce.results_sparse.L1.mean())
# print(mcce.results_sparse.L2.mean())

# print(mcce.results_sparse.feasibility.mean())
# print(mcce.results_sparse.shape[0])

In [25]:
# mcce.results_sparse