In [1]:
import pandas as pd

from carla.data.catalog import OnlineCatalog
from carla.models.catalog import MLModelCatalog
from carla.models.negative_instances import predict_negative_instances

import torch

from mcce import MCCE

# Experiment settings shared by the cells below.
seed = 1        # random seed setting
n_test = 100    # number of factual rows kept as test factuals
K = 1000        # passed as `k` to mcce.generate

# Dataset selector; the cells below also handle
# 'give_me_some_credit' and 'compas'.
data_name = "adult"

# Build the CARLA OnlineCatalog for the selected dataset.
dataset = OnlineCatalog(data_name)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from .autonotebook import tqdm as notebook_tqdm
Using TensorFlow backend.


[INFO] Using Python-MIP package version 1.12.0 [model.py <module>]


In [2]:
# Seed PyTorch's global RNG before building the model wrapper.
# NOTE(review): the config cell defines `seed = 1`, but the literal 0 is used
# here -- confirm which value is intended.
torch.manual_seed(0)

# ANN classifier on the PyTorch backend; load_online=False, so no pretrained
# model is requested from the catalog.
ml_model = MLModelCatalog(dataset, model_type="ann", backend="pytorch", load_online=False)


In [4]:
# Display the feature order exposed by the model wrapper.
ml_model.feature_input_order

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'workclass_Private',
 'marital-status_Non-Married',
 'occupation_Other',
 'relationship_Non-Husband',
 'race_White',
 'sex_Male',
 'native-country_US']

In [None]:

# Per-dataset training settings. Every dataset shares the same learning rate
# and hidden-layer sizes; only the epoch count and batch size differ.
train_settings = {
    'adult': dict(epochs=20, batch_size=1024),
    'give_me_some_credit': dict(epochs=20, batch_size=2048),
    'compas': dict(epochs=25, batch_size=25),
}

if data_name not in train_settings:
    # Fail loudly rather than silently skipping training and then scoring
    # the data with a model that was never fitted in this session.
    raise ValueError(
        f"No training configuration for data_name={data_name!r}; "
        f"expected one of {sorted(train_settings)}"
    )

ml_model.train(
    learning_rate=0.002,
    hidden_size=[18, 9, 3],
    force_train=True,  # don't forget to add this or it might load an older model from disk
    **train_settings[data_name],
)

# predict_negative_instances receives the trained model and the full data
# frame; the first n_test rows of its result are used as test factuals.
factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.iloc[:n_test]

In [None]:
# Show the model's feature input order again, this time after the training cell.
ml_model.feature_input_order

In [None]:
# Fraction of dataset rows that are not in `factuals`
# (one minus the ratio of the two row counts).
1 - len(factuals) / len(dataset.df)

In [None]:
# Column groups taken from the dataset object.
y_col = dataset.target
cont_feat = dataset.continuous

cat_feat = dataset.categorical
# Encoded column names the dataset's encoder reports for the categorical features.
cat_feat_encoded = dataset.encoder.get_feature_names(dataset.categorical)

# Features held fixed when generating counterfactuals, per dataset.
fixed_features_by_dataset = {
    'adult': ['age', 'sex_Male'],
    'give_me_some_credit': ['age'],
    'compas': ['age', 'sex_Male', 'race_Other'],
}
if data_name not in fixed_features_by_dataset:
    # Without this check an unknown dataset would leave `fixed_features`
    # undefined and only fail later with a NameError.
    raise ValueError(
        f"No fixed features defined for data_name={data_name!r}; "
        f"expected one of {sorted(fixed_features_by_dataset)}"
    )
fixed_features = fixed_features_by_dataset[data_name]

#  Create dtypes for MCCE(): continuous columns as float, encoded
#  categorical columns as pandas categories (added second, so they win on
#  any name clash, as before).
dtypes = {col: "float" for col in cont_feat}
dtypes.update({col: "category" for col in cat_feat_encoded})
df = dataset.df.astype(dtypes)

In [None]:
import time

results = []

# perf_counter() is used for all elapsed-time measurements: unlike
# time.time() it is not affected by system clock adjustments.
start = time.perf_counter()

# fixed_features = names in dataset
# categorical = original feature names
mcce = MCCE(
    fixed_features=fixed_features,
    continuous=dataset.continuous,
    categorical=dataset.categorical,
    model=ml_model,
    seed=seed,  # use the configured seed instead of a hard-coded literal
    catalog=dataset.catalog,
)

# Fit on the feature columns only (target dropped).
mcce.fit(df.drop(y_col, axis=1), dtypes)

fit_done = time.perf_counter()
print(fit_done - start) # 0.24

# Generate samples for the test factuals, with k=K.
synth_df = mcce.generate(test_factual.drop(y_col, axis=1), k=K)

generate_done = time.perf_counter()
print(generate_done - fit_done) # 2.41

mcce.postprocess(
    data=df,
    synth=synth_df,
    test=test_factual,
    response=y_col,
    inverse_transform=dataset.inverse_transform,
    cutoff=0.5,
)

postprocess_done = time.perf_counter()
print(postprocess_done - generate_done) #

# Total elapsed time across fit, generate and postprocess.
timing = time.perf_counter() - start

mcce.results_sparse['time (seconds)'] = timing

# One summary row: mean L0, mean L2, mean feasibility, mean violation,
# number of rows in results_sparse, total time.
sparse = mcce.results_sparse
results.append([
    sparse.L0.mean(),
    sparse.L2.mean(),
    sparse.feasibility.mean(),
    sparse.violation.mean(),
    sparse.shape[0],
    timing,
])

In [None]:
# mcce.results_sparse.to_csv(f"Results/{data_name}_mcce_results_k_{K}_n_{n_test}.csv")

In [None]:
# data=df
# synth=synth_df
# test=test_factual
# response=y_col
# inverse_transform=dataset.inverse_transform
# cutoff=0.5

# # Predict response of generated data
# synth[response] = mcce.model.predict(synth)
# synth_positive = synth[synth[response]>=cutoff] # drop negative responses

# # Duplicate original test observations N times where N is number of positive counterfactuals
# n_counterfactuals = synth_positive.groupby(synth_positive.index).size()
# n_counterfactuals = pd.DataFrame(n_counterfactuals, columns = ['N'])

# test_repeated = test.copy()

# test_repeated = test_repeated.join(n_counterfactuals)
# test_repeated.dropna(inplace = True)

# test_repeated = test_repeated.reindex(test_repeated.index.repeat(test_repeated.N))
# test_repeated.drop(['N'], axis=1, inplace=True)

# mcce.test_repeated = test_repeated

In [None]:
# synth=synth_positive
# test=mcce.test_repeated
# data=data
# model=mcce.model
# response=response
# inverse_transform=inverse_transform

In [None]:
# import metrics

# features = synth.columns.to_list()
# features.remove(response)
# synth.sort_index(inplace=True)

# if inverse_transform:  # necessary for violation rate
#     df_decoded_cfs = inverse_transform(synth.copy())
#     df_decoded_factuals = inverse_transform(test.copy())

# else:
#     df_decoded_cfs = synth.copy()
#     df_decoded_factuals = test.copy()


# synth_metrics = synth.copy()

# # 1) Distance: Sparsity and Euclidean distance
# factual = test[features]#.sort_index().to_numpy()
# counterfactuals = synth[features]#.sort_index().to_numpy()

# time1 = time.time()
# distances = pd.DataFrame(metrics.distance(counterfactuals, factual, mcce.model), index=factual.index)

# time2 = time.time()
# mcce.distance_cpu_time = time2 - time1
# synth_metrics = pd.concat([synth_metrics, distances], axis=1)

In [None]:
# counterfactuals.shape

In [None]:
# cols = data.columns.to_list()
# cols.remove(response)

# time1 = time.time()
# synth_metrics['feasibility'] = metrics.feasibility(counterfactuals, factual, cols, response)

# time2 = time.time()
# mcce.feasibility_cpu_time = time2 - time1

# # 3) Success
# synth_metrics['success'] = 1

In [None]:
# time1 = time.time()
# violations = metrics.constraint_violation(df_decoded_cfs, df_decoded_factuals, \
#     mcce.continuous, mcce.categorical, mcce.catalog['immutable'])

# synth_metrics['violation'] = violations
# time2 = time.time()
# mcce.violation_cpu_time = time2 - time1


In [None]:
# results2 = pd.DataFrame(results, columns=['L0', 'L2', 'feasibility', 'violation', 'NCE', 'timing'])