In [6]:
import pandas as pd
import numpy as np

from carla.data.catalog import OnlineCatalog
from carla.models.catalog import MLModelCatalog
from carla.models.negative_instances import predict_negative_instances

import torch

from mcce import MCCE

# --- Experiment configuration ------------------------------------------------
data_name = "adult"  # other configured options: 'give_me_some_credit', 'compas'
K = 10000      # passed as k= when generating counterfactual candidates
n_test = 100   # number of negatively-predicted rows kept as test factuals
seed = 1       # experiment seed (torch is seeded separately just below)

# Training arguments shared by every dataset.
_COMMON_TRAIN_KWARGS = dict(
    learning_rate=0.002,
    hidden_size=[18, 9, 3],
    force_train=True,  # don't forget to add this or it might load an older model from disk
)

# Only the number of epochs and the batch size differ between datasets.
_TRAIN_KWARGS_BY_DATASET = {
    'adult': dict(epochs=20, batch_size=1024),
    'give_me_some_credit': dict(epochs=20, batch_size=2048),
    'compas': dict(epochs=25, batch_size=25),
}

# Fail fast: without this check an unsupported name would skip training
# silently and only surface later as an error in an unrelated cell.
if data_name not in _TRAIN_KWARGS_BY_DATASET:
    raise ValueError(
        f"No training configuration for data_name={data_name!r}; "
        f"expected one of {sorted(_TRAIN_KWARGS_BY_DATASET)}"
    )

# (1) Load the dataset and train the classifier
dataset = OnlineCatalog(data_name)

torch.manual_seed(0)
ml_model = MLModelCatalog(
    dataset,
    model_type="ann",
    load_online=False,
    backend="pytorch",
)
ml_model.train(**_COMMON_TRAIN_KWARGS, **_TRAIN_KWARGS_BY_DATASET[data_name])

# (2) Find unhappy customers and choose which ones to make counterfactuals for
factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.iloc[:n_test]  # first n_test rows of the negative instances

balance on test set 0.23883245958934032, balance on test set 0.2408256880733945
Epoch 0/19
----------


  x = self.softmax(x)


train Loss: 0.4668 Acc: 0.7734

test Loss: 0.4055 Acc: 0.8005

Epoch 1/19
----------
train Loss: 0.3946 Acc: 0.8121

test Loss: 0.3910 Acc: 0.8189

Epoch 2/19
----------
train Loss: 0.3784 Acc: 0.8222

test Loss: 0.3747 Acc: 0.8226

Epoch 3/19
----------
train Loss: 0.3655 Acc: 0.8290

test Loss: 0.3600 Acc: 0.8324

Epoch 4/19
----------
train Loss: 0.3535 Acc: 0.8343

test Loss: 0.3505 Acc: 0.8373

Epoch 5/19
----------
train Loss: 0.3460 Acc: 0.8372

test Loss: 0.3472 Acc: 0.8389

Epoch 6/19
----------
train Loss: 0.3431 Acc: 0.8387

test Loss: 0.3450 Acc: 0.8402

Epoch 7/19
----------
train Loss: 0.3405 Acc: 0.8402

test Loss: 0.3435 Acc: 0.8384

Epoch 8/19
----------
train Loss: 0.3404 Acc: 0.8389

test Loss: 0.3376 Acc: 0.8396

Epoch 9/19
----------
train Loss: 0.3348 Acc: 0.8421

test Loss: 0.3421 Acc: 0.8400

Epoch 10/19
----------
train Loss: 0.3348 Acc: 0.8411

test Loss: 0.3362 Acc: 0.8426

Epoch 11/19
----------
train Loss: 0.3345 Acc: 0.8401

test Loss: 0.3339 Acc: 0.8435



In [7]:
# Column bookkeeping taken from the dataset object.
y_col = dataset.target
cont_feat = dataset.continuous
features_and_response = dataset.df.columns

# Everything that is not continuous is treated as categorical. These columns
# carry their encoded names (new since encode_normalize_order_factuals()).
cat_feat = [col for col in features_and_response if col not in cont_feat]

# Per dataset: (encoded column names held fixed, original immutable feature names).
_feature_constraints = {
    'adult': (['age', 'sex_Male'], ['age', 'sex']),
    'give_me_some_credit': (['age'], ['age']),
    'compas': (['age', 'sex_Male', 'race_Other'], ['age', 'sex', 'race']),
}
if data_name in _feature_constraints:
    fixed_features, immutables = _feature_constraints[data_name]

# Create dtypes for MCCE(): continuous -> float, the rest -> category.
dtypes = {col: "float" for col in cont_feat}
dtypes.update({col: "category" for col in cat_feat})
df = dataset.df.astype(dtypes)

In [11]:
# Print the dtype of every column of df to check the float / category casts.
print(df.dtypes)

age                            float64
fnlwgt                         float64
education-num                  float64
capital-gain                   float64
capital-loss                   float64
hours-per-week                 float64
income                        category
marital-status_Non-Married    category
native-country_US             category
occupation_Other              category
race_White                    category
relationship_Non-Husband      category
sex_Male                      category
workclass_Private             category
dtype: object

In [14]:
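# NOTE: disabled full-dataset baseline (it samples all df.shape[0] rows). A cell made
# only of comments prints nothing, so the output kept under this cell predates the
# commenting-out and is stale.
# import random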
# results = []

# dim = df.shape[0]
# random.seed(0)
# rows = random.sample(df.index.to_list(), dim)
# rows = np.sort(rows)

# df_subset = df.loc[rows]

# print(df.equals(df_subset))

# import time
# start = time.time()

# mcce = MCCE(fixed_features=fixed_features, immutables=immutables, model=ml_model, seed=1, continuous=cont_feat, categorical=cat_feat)

# mcce.fit(df_subset.drop(y_col, axis=1), dtypes)

# synth_df = mcce.generate(test_factual.drop(y_col, axis=1), k=K)

# mcce.postprocess(data=df_subset, synth=synth_df, test=test_factual, response=y_col, \
#     inverse_transform=dataset.inverse_transform, cutoff=0.5)

# timing = time.time() - start

# mcce.results_sparse['time (seconds)'] = timing

# results.append([mcce.results_sparse.L0.mean(), mcce.results_sparse.L2.mean(), mcce.results_sparse.feasibility.mean(), mcce.results_sparse.violation.mean(), mcce.results_sparse.shape[0], timing, n, 0])

True


  x = self.softmax(x)


In [15]:
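# NOTE: the rows appended in the disabled baseline cell end with (n, 0), so in the
# stale output below 's' holds the number of sampled training rows and 'n' the constant 0.
# results2 = pd.DataFrame(results, columns=['L0', 'L2', 'feasibility', 'violation', 'NCE', 'timing', 's', 'n'])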
# results2.mean()

L0                 3.040000
L2                 0.738469
feasibility        0.047668
violation          0.000000
NCE              100.000000
timing           107.087060
s              48831.000000
n                  0.000000
dtype: float64

In [16]:
import random
import time

N_SEEDS = 5  # number of different random training subsets drawn per size


def run_mcce_once(n_train, sample_seed, mcce_seed=1):
    """Fit MCCE on a random subset of ``df`` and summarise the counterfactuals.

    Parameters
    ----------
    n_train : int
        Number of rows of ``df`` sampled (without replacement) for fitting.
    sample_seed : int
        Seed given to ``random.seed`` before sampling the row labels.
    mcce_seed : int, default 1
        Value forwarded as ``seed=`` to the ``MCCE`` constructor.

    Returns
    -------
    (mcce, summary) : tuple
        ``mcce`` is the fitted and post-processed MCCE object. ``summary`` is
        the list ``[mean L0, mean L2, mean feasibility, mean violation,
        number of rows in results_sparse, elapsed seconds, n_train, sample_seed]``.

    Raises
    ------
    ValueError
        From ``random.sample`` when ``n_train`` exceeds the number of rows.
    """
    random.seed(sample_seed)
    rows = np.sort(random.sample(df.index.to_list(), n_train))
    df_subset = df.loc[rows]
    print(df.equals(df_subset))  # sanity check: is the subset the whole frame?

    # perf_counter is a monotonic clock intended for measuring elapsed time.
    start = time.perf_counter()

    mcce = MCCE(fixed_features=fixed_features, immutables=immutables, model=ml_model,
                seed=mcce_seed, continuous=cont_feat, categorical=cat_feat)

    mcce.fit(df_subset.drop(y_col, axis=1), dtypes)

    synth_df = mcce.generate(test_factual.drop(y_col, axis=1), k=K)

    # (4) Postprocess generated counterfactuals
    # NOTE(review): postprocess receives the full df here, whereas the disabled
    # baseline cell passed df_subset; confirm which one is intended.
    print("Postprocessing counterfactuals with MCCE...")
    mcce.postprocess(data=df, synth=synth_df, test=test_factual, response=y_col,
                     inverse_transform=dataset.inverse_transform, cutoff=0.5)

    # The measured time covers fit + generate + postprocess.
    timing = time.perf_counter() - start
    mcce.results_sparse['time (seconds)'] = timing

    sparse = mcce.results_sparse
    summary = [
        sparse.L0.mean(),
        sparse.L2.mean(),
        sparse.feasibility.mean(),
        sparse.violation.mean(),
        sparse.shape[0],
        timing,
        n_train,
        sample_seed,
    ]
    return mcce, summary


results = []
for n in [100, 1000, 10000]:  # 20000, 30000, 40000
    # Sampling every row and sorting gives the same subset whatever the seed,
    # so a single run (seed 0) is enough when n covers the whole frame.
    seeds = [0] if n == len(df) else range(N_SEEDS)
    for s in seeds:
        # mcce is kept at notebook level so later cells can inspect the last run.
        mcce, summary = run_mcce_once(n, s)
        results.append(summary)


False
Postprocessing counterfactuals with MCCE...


  x = self.softmax(x)


In [None]:
# One row per MCCE run; the trailing two entries of each row are the training
# subset size (labelled 'Ntest') and the sampling seed.
result_columns = [
    'L0', 'L2', 'feasibility', 'violation',
    'NCE', 'timing', 'Ntest', 'seed',
]
results2 = pd.DataFrame(data=results, columns=result_columns)

# Average every metric over the seeds, per training subset size.
results2.groupby(by='Ntest').mean()

In [None]:
# results2.to_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}_with_various_training_amounts.csv")

In [None]:
# results3 = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}_with_various_training_amounts.csv", index_col=0)


In [None]:
# results3.groupby('Ntest').mean()

In [None]:
# mcce.results_sparse.to_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}.csv")

In [None]:
# temp =pd.read_csv("/nr/samba/user/anr/pkg/MCCE_Python/Results/give_me_some_credit_mcce_results_k_10000_n_100_inverse_transform.csv", index_col=0)
# temp.loc[263]

In [None]:
# dataset = OnlineCatalog("adult")

# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/adult_mcce_results_k_{K}_n_{n_test}.csv")

# dataset.inverse_transform(results.iloc[0:1])[['age', 'workclass', 'fnlwgt', 'education-num', 'marital-status', 'relationship', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']].iloc[0]


In [None]:
# data_name = 'adult'
# K = 50000
# n_test = 100
# import pandas as pd
# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}_inverse_transform.csv")
# print(results.L0.mean())
# print(results.L1.mean())
# print(results.L2.mean())


In [None]:
# # print(mcce.results_sparse.)
# print(results.L0.mean())
# print(results.L1.mean())
# print(results.L2.mean())
# print(results.feasibility.mean())
# print(results.violation.mean())
# print(results.success.mean())
# print(results.shape[0])

In [None]:
# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/Results/{data_name}_mcce_results_k_{K}_n_{n_test}.csv")

# results['data'] = data_name
# results['method'] = 'mcce'
# results.rename(columns={'violation': 'violations'}, inplace=True)

# preds = ml_model.predict_proba(results)
# new_preds = []
# for x in preds:
#     new_preds.append(x[1])
# results['prediction'] = new_preds
# results = dataset.inverse_transform(results)
# results.head(1)

# results['validity'] = np.where(np.asarray(new_preds) >= 0.5, 1, 0)

# results

In [None]:
# pd.set_option('display.max_columns', None)

# dataset = OnlineCatalog("give_me_some_credit")

# results = pd.read_csv(f"/nr/samba/user/anr/pkg/MCCE_Python/give_me_some_credit_mcce_results_k_{K}_n_{n_test}.csv")

# results.rename(columns={'Unnamed: 0': 'index'}, inplace=True)
# results.set_index(['index'], inplace=True)
# temp = results.sort_values([results.index.name]).iloc[0:1]
# dataset.inverse_transform(temp)[['age', 'RevolvingUtilizationOfUnsecuredLines', 'NumberOfTime30-59DaysPastDueNotWorse','DebtRatio', 'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents']].iloc[0]


In [None]:
# print(mcce.results_sparse)
# print(mcce.results_sparse.L0.mean())
# print(mcce.results_sparse.L1.mean())
# print(mcce.results_sparse.L2.mean())
# results_sparse = mcce.results_sparse
# results_sparse.index.rename('index', inplace=True)
# results_sparse.groupby('index').size().sort_values(ascending=False)