In [1]:
import pandas as pd
import numpy as np

from carla.data.catalog import OnlineCatalog
from carla.models.catalog import MLModelCatalog
from carla.models.negative_instances import predict_negative_instances

import torch

from mcce import MCCE

# --- Experiment configuration -------------------------------------------
# Name of the CARLA online-catalog dataset to use; the alternatives are
# kept below so they can be switched in by (un)commenting.
data_name = "adult"
# data_name = 'give_me_some_credit'
# data_name = 'compas'

K = 10000       # forwarded as `k` to mcce.generate() further down
n_test = 100    # number of negative instances kept as test factuals
seed = 1

# for data_name in dataset:
dataset = OnlineCatalog(data_name)

# Fix torch's RNG before the classifier is created and trained.
torch.manual_seed(0)

# ANN classifier on the pytorch backend; load_online=False means it is
# built locally instead of being fetched as a pretrained catalog model.
ml_model = MLModelCatalog(dataset, model_type="ann", load_online=False, backend="pytorch")

# Train the classifier with dataset-specific hyperparameters.
# Only `epochs` and `batch_size` differ between datasets; the learning rate
# and hidden-layer sizes are shared, so they are passed once below.
TRAIN_PARAMS = {
    'adult': {'epochs': 20, 'batch_size': 1024},
    'give_me_some_credit': {'epochs': 20, 'batch_size': 2048},
    'compas': {'epochs': 25, 'batch_size': 25},
}

# Fail loudly on an unknown dataset instead of silently continuing with an
# untrained model (the original if/elif chain had no fallback branch).
if data_name not in TRAIN_PARAMS:
    raise ValueError(
        f"No training configuration for data_name={data_name!r}; "
        f"expected one of {sorted(TRAIN_PARAMS)}"
    )

ml_model.train(
    learning_rate=0.002,
    hidden_size=[18, 9, 3],
    force_train=True,  # don't forget to add this or it might load an older model from disk
    **TRAIN_PARAMS[data_name],
)

# (2) Identify the "unhappy customers" (rows flagged by
#     predict_negative_instances) and keep the first n_test of them as the
#     factuals to generate counterfactuals for.

negative_instances = predict_negative_instances(ml_model, dataset.df)
factuals = negative_instances
test_factual = factuals.head(n_test)
# test_factual_inverse = dataset.inverse_transform(test_factual)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from .autonotebook import tqdm as notebook_tqdm
Using TensorFlow backend.


[INFO] Using Python-MIP package version 1.12.0 [model.py <module>]
balance on test set 0.23883245958934032, balance on test set 0.2408256880733945
Epoch 0/19
----------


  x = self.softmax(x)


train Loss: 0.4668 Acc: 0.7734

test Loss: 0.4055 Acc: 0.8005

Epoch 1/19
----------
train Loss: 0.3946 Acc: 0.8121

test Loss: 0.3910 Acc: 0.8189

Epoch 2/19
----------
train Loss: 0.3784 Acc: 0.8222

test Loss: 0.3747 Acc: 0.8226

Epoch 3/19
----------
train Loss: 0.3655 Acc: 0.8290

test Loss: 0.3600 Acc: 0.8324

Epoch 4/19
----------
train Loss: 0.3535 Acc: 0.8343

test Loss: 0.3505 Acc: 0.8373

Epoch 5/19
----------
train Loss: 0.3460 Acc: 0.8372

test Loss: 0.3472 Acc: 0.8389

Epoch 6/19
----------
train Loss: 0.3431 Acc: 0.8387

test Loss: 0.3450 Acc: 0.8402

Epoch 7/19
----------
train Loss: 0.3405 Acc: 0.8402

test Loss: 0.3435 Acc: 0.8384

Epoch 8/19
----------
train Loss: 0.3404 Acc: 0.8389

test Loss: 0.3376 Acc: 0.8396

Epoch 9/19
----------
train Loss: 0.3348 Acc: 0.8421

test Loss: 0.3421 Acc: 0.8400

Epoch 10/19
----------
train Loss: 0.3348 Acc: 0.8411

test Loss: 0.3362 Acc: 0.8426

Epoch 11/19
----------
train Loss: 0.3345 Acc: 0.8401

test Loss: 0.3339 Acc: 0.8435



In [4]:
# Sanity check on class sizes: rows returned by predict_negative_instances
# versus all rows in the dataset.
n_negative = factuals.shape[0]
n_total = dataset.df.shape[0]
print(n_negative)
print(n_total)

# Remaining rows (total minus negative instances), computed from the live
# objects rather than from numbers copied out of an earlier output.
n_total - n_negative

39476
48832


9356

In [None]:
# Column bookkeeping for MCCE.
y_col = dataset.target
features_and_response = dataset.df.columns
cont_feat = dataset.continuous
# Everything in dataset.df that is not continuous is treated as categorical;
# these have new names since encode_normalize_order_factuals().
# NOTE(review): because this list is derived from *all* columns of dataset.df,
# it presumably also contains the response column y_col -- confirm that MCCE
# expects that.
cat_feat = [col for col in features_and_response if col not in cont_feat]

# fixed_features uses the encoded column names, immutables the raw feature names.
if data_name == 'adult':
    fixed_features = ['age', 'sex_Male']
    immutables = ['age', 'sex']
elif data_name == 'give_me_some_credit':
    fixed_features = ['age']
    immutables = ['age']
elif data_name == 'compas':
    fixed_features = ['age', 'sex_Male', 'race_Other']
    immutables = ['age', 'sex', 'race']
else:
    # Without this branch an unknown dataset only fails later with a
    # NameError on fixed_features / immutables.
    raise ValueError(
        f"No fixed/immutable feature configuration for data_name={data_name!r}"
    )

#  Create dtypes for MCCE(): continuous columns as float, the rest as category.
dtypes = {col: "float" for col in cont_feat}
dtypes.update({col: "category" for col in cat_feat})
df = (dataset.df).astype(dtypes)

In [None]:
# test_factual.drop(y_col, axis=1)

In [None]:
import random
import time

# Run the full MCCE pipeline (fit -> generate -> postprocess) on the test
# factuals and record summary metrics together with the elapsed time.
results = []

# perf_counter() is a monotonic clock intended for measuring intervals,
# unlike time.time(), which can jump if the system clock is adjusted.
start = time.perf_counter()

# Use the notebook-level `seed` instead of repeating the literal value.
mcce = MCCE(
    fixed_features=fixed_features,
    immutables=immutables,
    model=ml_model,
    seed=seed,
    continuous=cont_feat,
    categorical=cat_feat,
)

# Fit on the training frame without the response column.
mcce.fit(df.drop(y_col, axis=1), dtypes)

# Generate K synthetic candidates for the test factuals (response dropped).
synth_df = mcce.generate(test_factual.drop(y_col, axis=1), k=K)

# Postprocess the candidates; afterwards the result is read from
# mcce.results_sparse.
mcce.postprocess(
    data=df,
    synth=synth_df,
    test=test_factual,
    response=y_col,
    inverse_transform=dataset.inverse_transform,
    cutoff=0.5,
)

timing = time.perf_counter() - start
# print(timing)

mcce.results_sparse['time (seconds)'] = timing

# One summary row: mean L0, mean L2, mean feasibility, mean violation,
# number of rows in results_sparse, and elapsed seconds.
results.append([
    mcce.results_sparse.L0.mean(),
    mcce.results_sparse.L2.mean(),
    mcce.results_sparse.feasibility.mean(),
    mcce.results_sparse.violation.mean(),
    mcce.results_sparse.shape[0],
    timing,
])

In [None]:
# f"{data_name}_mcce_results_k_{K}_n_{n_test}.csv"

In [None]:
import os
from pathlib import Path

# Directory the result CSV is written to. The MCCE_RESULTS_DIR environment
# variable overrides it; the default is the original hard-coded location, so
# behaviour is unchanged where that path exists.
results_dir = Path(os.environ.get("MCCE_RESULTS_DIR", "/nr/samba/user/anr/pkg/MCCE_Python/Results"))
results_dir.mkdir(parents=True, exist_ok=True)

# File name encodes the dataset, the K passed to generate(), and n_test.
mcce.results_sparse.to_csv(results_dir / f"{data_name}_mcce_results_k_{K}_n_{n_test}.csv")

In [None]:
# Tabulate the collected summary rows; column order matches the order in
# which the values were appended to `results`.
summary_columns = ['L0', 'L2', 'feasibility', 'violation', 'NCE', 'timing']
results2 = pd.DataFrame(data=results, columns=summary_columns)
results2

In [None]:
# mcce.df_columns = df.columns.tolist()
# mcce.n_df_rows, mcce.n_df_columns = np.shape(df)
# mcce.df_dtypes = dtypes
# mcce.mutable_features = [col for col in mcce.df_columns if (col not in mcce.fixed_features)]
# mcce.cont_feat = [feat for feat in dtypes.keys() if dtypes[feat] != 'category']

# mcce.n_fixed, mcce.n_mutable = len(mcce.fixed_features), len(mcce.mutable_features)

# # column indices of mutable features
# mcce.visit_sequence = [index for index, col in enumerate(mcce.df_columns) if (col in mcce.fixed_features)] # if (col in mcce.mutable_features)
# for index, col in enumerate(mcce.df_columns):
#     if col in mcce.mutable_features:
#         mcce.visit_sequence.append(index)

# # convert indices to column names
# mcce.visit_sequence = [mcce.df_columns[i] for i in mcce.visit_sequence]

# mcce.visited_columns = [col for col in mcce.df_columns if col in mcce.visit_sequence]
# mcce.visit_sequence = pd.Series([mcce.visit_sequence.index(col) for col in mcce.visited_columns], index=mcce.visited_columns)

# # create list of methods to use - currently only cart implemented
# mcce.method = []
# for col in mcce.visited_columns:
#     if col in mcce.fixed_features:
#         mcce.method.append('sample') # these will be fit but not sampled 
#     else:
#         mcce.method.append('cart')
# mcce.method = pd.Series(mcce.method, index=mcce.df_columns)

# # predictor_matrix_validator:
# mcce.predictor_matrix = np.zeros([len(mcce.visit_sequence), len(mcce.visit_sequence)], dtype=int)
# mcce.predictor_matrix = pd.DataFrame(mcce.predictor_matrix, index=mcce.visit_sequence.index, columns=mcce.visit_sequence.index)
# visited_columns = []
# for col, _ in mcce.visit_sequence.sort_values().iteritems():
#     mcce.predictor_matrix.loc[col, visited_columns] = 1
#     visited_columns.append(col)

In [None]:
# from cart import CARTMethod
# from sample import SampleMethod

# METHODS_MAP = {'cart': CARTMethod, 'sample': SampleMethod}

# mcce.saved_methods = {}
# mcce.trees = {}

# # train
# mcce.predictor_matrix_columns = mcce.predictor_matrix.columns.to_numpy()
# for col, _ in mcce.visit_sequence.sort_values().iteritems():
#     # initialise the method
#     col_method = METHODS_MAP[mcce.method[col]](dtype=mcce.df_dtypes[col], random_state=mcce.seed)
    
#     # fit the method
#     col_predictors = mcce.predictor_matrix_columns[mcce.predictor_matrix.loc[col].to_numpy() == 1]
    
#     # print(df[col_predictors])
#     # print(df[col])

#     col_method.fit(X_df=df[col_predictors], y_df=df[col])
    
#     if col == 'capital-gain':
#         print(col)
#         print(col_method.leaves_y_dict)

#     # save the method
#     if mcce.method[col] == 'cart':
#         mcce.trees[col] = col_method.leaves_y_dict
#     mcce.saved_methods[col] = col_method

In [None]:
# k = 100
# test = test_factual.drop(y_col, axis=1)
# n_test = test.shape[0]

# # create data set with the fixed features repeated k times
# synth_df = test[mcce.fixed_features]
# synth_df = pd.concat([synth_df] * k)
# synth_df.sort_index(inplace=True)

# # repeat 0 for mutable features k times
# synth_df_mutable = pd.DataFrame(data=np.zeros([k * n_test, mcce.n_mutable]), columns=mcce.mutable_features, index=synth_df.index)

# synth_df = pd.concat([synth_df, synth_df_mutable], axis=1)
# # print(synth_df.head(10))
# start_time = time.time()
# for col in mcce.mutable_features:
#     print(col)
#     # reload the method
#     col_method = mcce.saved_methods[col]
#     # print(col_method)
#     # predict with the method
#     col_predictors = mcce.predictor_matrix_columns[mcce.predictor_matrix.loc[col].to_numpy() == 1]
#     # print(col_predictors)
#     # print(col_predictors)

#     # print(synth_df[col_predictors])
#     synth_df[col] = col_method.predict(synth_df[col_predictors])
#     # print(synth_df[col][0:10])
#     # if col == 'education-num':
#     #     print(synth_df)
#     X_test_df = synth_df[col_predictors]
#     # print(X_test_df)
#     X_test_df, _ = col_method.prepare_dfs(X_df=X_test_df, normalise_num_cols=False, one_hot_cat_cols=False, fit=False)
#     # if col == 'education-num':
#     #     print(X_test_df)
#     # print(col_method.cart.get_params())
#     # print(col_method.cart.tree_)
    
#     # predict the leaves and for each leaf randomly sample from the observed values
#     X_test = X_test_df.to_numpy()
#     # if col == 'education-num':
#     #     print(X_test[0])
#     leaves_pred = col_method.cart.apply(X_test)
#     # print(leaves_pred)
#     y_pred = np.zeros(len(leaves_pred), dtype=object)

#     leaves_pred_index_df = pd.DataFrame({'leaves_pred': leaves_pred, 'index': range(len(leaves_pred))})
#     # print(leaves_pred_index_df)
#     leaves_pred_index_dict = leaves_pred_index_df.groupby('leaves_pred').apply(lambda x: x.to_numpy()[:, -1]).to_dict()
#     # print(leaves_pred_index_dict.items())
#     for leaf, indices in leaves_pred_index_dict.items():
#         np.random.seed(0)
#         y_pred[indices] = np.random.choice(col_method.leaves_y_dict[leaf], size=len(indices), replace=True)
    
#     # map dtype to original dtype
#     # synth_df[col] = synth_df[col].astype(mcce.df_dtypes[col])

# synth_df = synth_df[test.columns]
# print(synth_df)

In [None]:
# temp = pd.read_csv("/nr/samba/user/anr/pkg/MCCE_Python/Results/give_me_some_credit_mcce_results_k_10000_n_100_inverse_transform.csv", index_col=0)
# temp.loc[263]

In [None]:
# dataset = OnlineCatalog("adult")

# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/adult_mcce_results_k_{K}_n_{n_test}.csv")

# dataset.inverse_transform(results.iloc[0:1])[['age', 'workclass', 'fnlwgt', 'education-num', 'marital-status', 'relationship', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']].iloc[0]


In [None]:
# data_name = 'adult'
# K = 50000
# n_test = 100
# import pandas as pd
# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}_inverse_transform.csv")
# print(results.L0.mean())
# print(results.L1.mean())
# print(results.L2.mean())


In [None]:
# # print(mcce.results_sparse.)
# print(results.L0.mean())
# print(results.L1.mean())
# print(results.L2.mean())
# print(results.feasibility.mean())
# print(results.violation.mean())
# print(results.success.mean())
# print(results.shape[0])

In [None]:
# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}.csv")

# results['data'] = data_name
# results['method'] = 'mcce'
# results.rename(columns={'violation': 'violations'}, inplace=True)

# preds = ml_model.predict_proba(results)
# new_preds = []
# for x in preds:
#     new_preds.append(x[1])
# results['prediction'] = new_preds
# results = dataset.inverse_transform(results)
# results.head(1)

# results['validity'] = np.where(np.asarray(new_preds) >= 0.5, 1, 0)

# results

In [None]:
# pd.set_option('display.max_columns', None)

# dataset = OnlineCatalog("give_me_some_credit")

# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/give_me_some_credit_mcce_results_k_{K}_n_{n_test}.csv")

# results.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
# results.set_index(['index'], inplace=True)
# temp = results.sort_values([results.index.name]).iloc[0:1]
# dataset.inverse_transform(temp)[['age', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse','DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']].iloc[0]


In [None]:
# print(mcce.results_sparse.)
# print(mcce.results_sparse.L0.mean())
# print(mcce.results_sparse.L1.mean())
# print(mcce.results_sparse.L2.mean())
# results_sparse = mcce.results_sparse
# results_sparse.index.rename('index', inplace=True)
# results_sparse.groupby('index').size().sort_values(ascending=False)