In [1]:
import torch
import pandas as pd

from mcce import MCCE
# Number of rows taken from the head of `factuals` to form `test_factual` further down.
n_test = 100
# Sample count `k` for the (currently commented-out) mcce.generate call;
# reassigned to 10000 in the "Load results" cell before the saved results file is read.
K = 100

  from .autonotebook import tqdm as notebook_tqdm


## Load the raw train/test splits of the Adult dataset

In [2]:
# Column names for the header-less Adult files, in file order (shared by both reads).
adult_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                 'hours-per-week', 'native-country', 'income']

train_path = "Data/adult.data"
test_path = "Data/adult.test"

# ", " is a multi-character separator, which only pandas' Python parser supports. Naming the
# engine explicitly gives the same parse as before without the ParserWarning that pandas
# emits when it has to fall back from the C engine on its own.
train = pd.read_csv(train_path, sep=", ", header=None, names=adult_columns, engine="python")
# skiprows=1: the first line of the test file is skipped rather than parsed as data.
test = pd.read_csv(test_path, skiprows=1, sep=", ", header=None, names=adult_columns,
                   engine="python")

# Stack both splits into one frame with a fresh 0..n-1 index.
df = pd.concat([train, test], axis=0, ignore_index=True)

# 'education' is dropped while 'education-num' is kept.
# NOTE(review): presumably because the two columns carry the same information - confirm.
df = df.drop(['education'], axis=1)

  """
  


## Preprocess categorical features to have at most 4 levels

In [3]:
# The income label is accepted both with and without a trailing period; both spellings
# are folded into the period-free form.
mapping = {}
for label in (">50K", "<=50K"):
    mapping[label] = label
    mapping[label + "."] = label

# Direct dict lookup: any label outside the four spellings above raises KeyError.
df["income"] = list(map(mapping.__getitem__, df["income"]))

In [4]:
# Re-code each listed column (the categorical features plus the target) as integer levels
# ranked by frequency: the three most frequent categories become 0, 1 and 2, and every
# category from the fourth most frequent onwards shares level 3.
for feature in ["workclass", "marital-status", "occupation", "relationship", \
    "sex", "race", "native-country", "income"]:
    # Category counts, most frequent first.
    d = df.groupby([feature]).size().sort_values(ascending=False)
    # Build the category -> level table straight from the ranked labels. The earlier form
    # overwrote the count Series in place with integer keys (d[i] = ...) while enumerating
    # it; on a label-indexed Series that depends on pandas treating integer keys as
    # positions, which is deprecated behaviour.
    mapping = {category: min(rank, 3) for rank, category in enumerate(d.index)}
    # Direct lookup, so a value without a table entry raises KeyError.
    df[feature] = [mapping[item] for item in df[feature]]

In [5]:
# Write the current `df` to disk without its index column. The same path is passed to
# CsvCatalog in the "Read data in using CARLA" section.
# NOTE(review): despite the "train_" prefix in the file name, `df` was built by concatenating
# the train and test files - confirm the name is intentional.
df.to_csv("Data/train_not_normalized_data_from_carla.csv", index=False)

## Read data in using CARLA

In [6]:
from carla.data.catalog import CsvCatalog

# Column roles handed to CARLA. "income" appears in neither list because it is passed
# separately as the target.
continuous = ["age", "fnlwgt", "education-num", "capital-gain", "hours-per-week", "capital-loss"]
categorical = ["marital-status", "native-country", "occupation", "race", "relationship", "sex", "workclass"]
# Features declared immutable to CARLA (passed as `immutables` below).
immutable = ["age", "sex"]

# Build the CARLA data catalog from the CSV written in the previous section.
# NOTE(review): "OneHot_drop_first" presumably yields encoded column names such as "sex_1"
# (the name used in `fixed_features` later on) - confirm against the CARLA encoder.
dataset = CsvCatalog(file_path="Data/train_not_normalized_data_from_carla.csv",
                     continuous=continuous,
                     categorical=categorical,
                     immutables=immutable,
                     target="income",
                     encoding_method="OneHot_drop_first", # This is important for non-binarized data
                     )

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


[INFO] Using Python-MIP package version 1.12.0 [model.py <module>]


In [7]:
# Attach a plain dict of the column roles to the dataset object; the commented-out MCCE
# constructor call further down passes it as `catalog=dataset.catalog`.
dataset.catalog = {
    'target': dataset.target,
    'continuous': dataset.continuous,
    'categorical': dataset.categorical,
    'immutable': dataset.immutables,
}

## Fit predictive model

In [8]:
from carla.models.catalog import MLModelCatalog
import torch
# Seed PyTorch's global RNG before the model is built and trained.
torch.manual_seed(0)

# ANN classifier from the CARLA model catalog, using the PyTorch backend.
ml_model = MLModelCatalog(dataset, model_type="ann", load_online=False, backend="pytorch")

training_options = dict(
    learning_rate=0.002,
    epochs=20,
    batch_size=1024,
    hidden_size=[18, 9, 3],
    # Keep force_train=True: without it an older model may be loaded from disk
    # instead of training a new one.
    force_train=True,
)
ml_model.train(**training_options)

balance on test set 0.23911441129098304, balance on test set 0.23978380149045941
Epoch 0/19
----------


  x = self.softmax(x)


train Loss: 0.4603 Acc: 0.7729

test Loss: 0.3871 Acc: 0.8240

Epoch 1/19
----------
train Loss: 0.3732 Acc: 0.8264

test Loss: 0.3602 Acc: 0.8327

Epoch 2/19
----------
train Loss: 0.3576 Acc: 0.8316

test Loss: 0.3496 Acc: 0.8386

Epoch 3/19
----------
train Loss: 0.3496 Acc: 0.8345

test Loss: 0.3426 Acc: 0.8386

Epoch 4/19
----------
train Loss: 0.3419 Acc: 0.8390

test Loss: 0.3364 Acc: 0.8446

Epoch 5/19
----------
train Loss: 0.3385 Acc: 0.8418

test Loss: 0.3327 Acc: 0.8464

Epoch 6/19
----------
train Loss: 0.3338 Acc: 0.8432

test Loss: 0.3288 Acc: 0.8491

Epoch 7/19
----------
train Loss: 0.3310 Acc: 0.8446

test Loss: 0.3269 Acc: 0.8494

Epoch 8/19
----------
train Loss: 0.3290 Acc: 0.8454

test Loss: 0.3405 Acc: 0.8419

Epoch 9/19
----------
train Loss: 0.3267 Acc: 0.8474

test Loss: 0.3236 Acc: 0.8499

Epoch 10/19
----------
train Loss: 0.3256 Acc: 0.8476

test Loss: 0.3213 Acc: 0.8533

Epoch 11/19
----------
train Loss: 0.3244 Acc: 0.8476

test Loss: 0.3205 Acc: 0.8529



## Performance of predictive model

In [9]:
from sklearn import metrics

# Score the held-out rows; column 1 of each probability row is used as the score for
# the class labelled 1.
class_probabilities = ml_model.predict_proba(dataset.df_test)
pred = [row[1] for row in class_probabilities]

# ROC curve against the true target column, then the area under it (displayed as cell output).
fpr, tpr, thresholds = metrics.roc_curve(
    dataset.df_test[dataset.target],
    pred,
    pos_label=1,
)
metrics.auc(fpr, tpr)

0.9071588643439532

## Prepare data for MCCE

In [10]:
from carla.models.negative_instances import predict_negative_instances

# Rows selected by CARLA's predict_negative_instances; the first n_test of them are the
# factuals to explain.
factuals = predict_negative_instances(ml_model, dataset.df)
test_factual = factuals.head(n_test)

# Short aliases for the column roles stored on the dataset.
y_col, cont_feat, cat_feat = dataset.target, dataset.continuous, dataset.categorical
# Names of the encoded columns the fitted encoder produces for the categorical features.
cat_feat_encoded = dataset.encoder.get_feature_names(cat_feat)

# Features held fixed, given by their names in the encoded frame.
fixed_features = ['age', 'sex_1']

# dtypes for MCCE(): continuous columns as float, encoded categorical columns as category.
dtypes = dict.fromkeys(cont_feat, "float")
dtypes.update(dict.fromkeys(cat_feat_encoded, "category"))
df = dataset.df.astype(dtypes)


## Fit MCCE

In [None]:
# import time
# start = time.time()
# # fixed_features = names in dataset
# # categorical = original feature names

# mcce = MCCE(fixed_features=fixed_features, continuous=dataset.continuous, categorical=dataset.categorical,\
#             model=ml_model, seed=1, catalog=dataset.catalog)

# mcce.fit(df.drop(y_col, axis=1), dtypes)

# synth_df = mcce.generate(test_factual.drop(y_col, axis=1), k=K)

## Load results

In [11]:
import pandas as pd
# Settings that identify the saved results file (K and n_test are part of its name).
K = 10000
n_test = 100
results_path = f"Results/adult_mcce_results_raw_data_k_{K}_n_{n_test}_inverse_transform.csv"
results_inverse = pd.read_csv(results_path, index_col=0)

# One line per summary value: column means, then the row count, then the mean runtime.
for metric in ("L0", "L2", "feasibility", "violation", "success"):
    print(results_inverse[metric].mean())
print(len(results_inverse))
print(results_inverse['time (seconds)'].mean())

3.74
1.190302474835284
0.14752682721875376
0.0
1.0
100
1131.3876264095304


In [12]:
# Load the rows that are labelled 'Original' in the comparison table built below; the
# first CSV column becomes the index.
# NOTE(review): presumably these are the un-encoded factuals matching `results_inverse`
# row for row - confirm against the script that wrote the file.
true_raw = pd.read_csv(f"Results/adult_raw_data_n_{n_test}.csv", index_col=0)

In [17]:
# Tag each frame with its source, then stack them so that every index label appears once
# per source.
results_inverse['method'] = 'MCCE'
true_raw['method'] = 'Original'
temp = pd.concat([results_inverse, true_raw], axis=0)

# Columns shown in the table, in display order.
cols = [
    'method',
    'age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week',
    'marital-status', 'native-country', 'occupation', 'race', 'relationship', 'sex', 'workclass',
]

# Index labels of the observations picked for the table.
showcase_ids = [1, 31, 122, 124]
to_write = temp[cols].loc[showcase_ids].sort_index()
# Re-applies the same labels, in the same order, that the selection above already has.
to_write.columns = cols

to_write

Unnamed: 0,method,age,fnlwgt,education-num,capital-gain,...,occupation,race,relationship,sex,workclass
1,MCCE,50.0,65408.0,13.0,0.0,...,2,0,0,0,1
1,Original,50.0,83311.0,13.0,0.0,...,2,0,0,0,1
31,MCCE,20.0,273905.0,9.0,34095.0,...,3,0,2,0,0
31,Original,20.0,266015.0,10.0,0.0,...,3,1,2,0,0
122,MCCE,30.0,349148.0,13.0,13550.0,...,0,2,2,0,0
122,Original,30.0,77143.0,13.0,0.0,...,2,1,2,0,0
124,MCCE,19.0,247679.0,10.0,34095.0,...,3,1,2,0,3
124,Original,19.0,301606.0,10.0,0.0,...,3,1,2,0,0


In [None]:
import pandas as pd

# Reload the raw Adult files: `df` was rebound to the encoded frame in the "Prepare data for
# MCCE" section, and the decoding loop in the next cell ranks the original string categories.
raw_columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
               'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
               'hours-per-week', 'native-country', 'income']

train_path = "Data/adult.data"
test_path = "Data/adult.test"

# ", " is a multi-character separator, which only pandas' Python parser supports. Naming the
# engine explicitly gives the same parse as before without the fallback ParserWarning.
train = pd.read_csv(train_path, sep=", ", header=None, names=raw_columns, engine="python")
# skiprows=1: the first line of the test file is skipped rather than parsed as data.
test = pd.read_csv(test_path, skiprows=1, sep=", ", header=None, names=raw_columns,
                   engine="python")
df = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
# Turn the integer levels in `to_write` back into category labels by rebuilding, per column,
# the same frequency ranking that produced the levels (0, 1, 2 for the three most frequent
# categories, 3 for everything else).
# This cell rewrites `to_write` in place, so it can only be run once per freshly built table.
for feature in ["workclass", "marital-status", "occupation", "relationship", "sex", "race", "native-country"]:
    # Category counts, most frequent first.
    d = df.groupby([feature]).size().sort_values(ascending=False)
    # Category -> level table built from the ranked labels. The earlier form wrote into the
    # count Series with integer keys (d[i] = ...) while enumerating it, which depends on
    # pandas treating integer keys on a label-indexed Series as positions (deprecated).
    mapping = {category: min(rank, 3) for rank, category in enumerate(d.index)}
    # Level -> category. Several categories share level 3 and the dict keeps the last one
    # seen, so level 3 decodes to the least frequent category of the column.
    # NOTE(review): that label stands in for the whole lumped group - confirm this is what
    # the table should show.
    dct = {v: k for k, v in mapping.items()}

    to_write[feature] = [dct[item] for item in to_write[feature]]


In [None]:
# Short codes for the table, one lookup per categorical column. Columns are processed in
# the order listed; a label missing from its column's lookup raises KeyError.
# NOTE(review): the lookups presumably list only the labels that occur in `to_write` -
# extend them if other rows are selected.
abbreviations = {
    'marital-status': {'Married-civ-spouse': 'MCS', 'Never-married': 'NM', 'Divorced': 'D', 'Married-AF-spouse': 'MAFS'},
    'native-country': {'United-States': 'US', 'Holand-Netherlands': 'HS'},
    'occupation': {'Exec-managerial': 'EM', 'Armed-Forces': 'AF', 'Prof-specialty': 'P'},
    'race': {'White': 'W', 'Black': 'B', 'Asian-Pac-Islander': 'API'},
    'relationship': {'Husband': 'H', 'Own-child': 'OC'},
    'sex': {'Male': 'M'},
    'workclass': {'Self-emp-not-inc': 'SENI', 'Private': 'P', 'Never-worked': 'NW'},
}

for feature, dct in abbreviations.items():
    to_write[feature] = [dct[item] for item in to_write[feature]]

In [None]:
# Print the table as LaTeX source: no index column, floats formatted with "%.0f".
# (A bare `to_write.head(1)` used to precede this line; as a non-final expression its value
# was discarded and never displayed, so it has been removed.)
print(to_write.to_latex(index=False, float_format="%.0f"))

In [None]:
# feature = 'workclass'
# d = df.groupby([feature]).size().sort_values(ascending=False)
# for i, ind in enumerate(d):
#     if i <= 3:
#         d[i] = i
#     else:
#         d[i] = 3

# mapping = d.to_dict()
# dct = {v: k for k, v in mapping.items()}

# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'marital-status'
# d = df.groupby([feature]).size().sort_values(ascending=False)
# for i, ind in enumerate(d):
#     if i <= 3:
#         d[i] = i
#     else:
#         d[i] = 3
# mapping = d.to_dict()
# dct = {v: k for k, v in mapping.items()}

# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'occupation'
# d = df.groupby([feature]).size().sort_values(ascending=False)
# for i, ind in enumerate(d):
#     if i <= 3:
#         d[i] = i
#     else:
#         d[i] = 3
# mapping = d.to_dict()
# dct = {v: k for k, v in mapping.items()}

# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'relationship'
# d = df.groupby([feature]).size().sort_values(ascending=False)
# for i, ind in enumerate(d):
#     if i <= 3:
#         d[i] = i
#     else:
#         d[i] = 3
# mapping = d.to_dict()
# dct = {v: k for k, v in mapping.items()}

# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'sex'
# d = df.groupby([feature]).size().sort_values(ascending=False)
# for i, ind in enumerate(d):
#     if i <= 3:
#         d[i] = i
#     else:
#         d[i] = 3
# mapping = d.to_dict()
# dct = {v: k for k, v in mapping.items()}

# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'race'
# d = df.groupby([feature]).size().sort_values(ascending=False)
# for i, ind in enumerate(d):
#     if i <= 3:
#         d[i] = i
#     else:
#         d[i] = 3
# mapping = d.to_dict()
# dct = {v: k for k, v in mapping.items()}

# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'native-country'
# d = df.groupby([feature]).size().sort_values(ascending=False)
# for i, ind in enumerate(d):
#     if i <= 3:
#         d[i] = i
#     else:
#         d[i] = 3
# mapping = d.to_dict()
# dct = {v: k for k, v in mapping.items()}

# to_write[feature] = [dct[item] for item in to_write[feature]]

In [None]:
# to_write
# feature = 'workclass'
# [dct[item] for item in to_write[feature]]

In [None]:
# feature = 'marital-status'
# dct = {'Married-civ-spouse': 'MCS', 'Never-married': 'NM', 'Divorced': 'D', 'Married-AF-spouse': 'MAFS'}
# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'native-country'
# dct = {'United-States': 'US', 'Holand-Netherlands': 'HS'}
# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'occupation'
# dct = {'Exec-managerial': 'EM', 'Armed-Forces': 'AF', 'Prof-specialty': 'P'}
# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'race'
# dct = {'White': 'W', 'Black': 'B', 'Asian-Pac-Islander': 'API'}
# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'relationship'
# dct = {'Husband': 'H', 'Own-child': 'OC'}
# to_write[feature] = [dct[item] for item in to_write[feature]]

# feature = 'sex'
# dct = {'Male': 'M'}
# to_write[feature] = [dct[item] for item in to_write[feature]]


# feature = 'workclass'
# dct = {'Self-emp-not-inc': 'SENI', 'Private': 'P', 'Never-worked': 'NW'}
# to_write[feature] = [dct[item] for item in to_write[feature]]


In [None]:
# to_write.head(1)
# print(to_write.to_latex(index=False, float_format="%.0f", ))

In [None]:
# import numpy as np
# data = df
# synth = synth_df
# test = test_factual
# response = y_col
# inverse_transform = dataset.inverse_transform
# cutoff = 0.5
# # Predict response of generated data
# synth[response] = ml_model.predict(synth)
# synth_positive = synth[synth[response]>=cutoff] # drop negative responses

# # Duplicate original test observations N times where N is number of positive counterfactuals
# n_counterfactuals = synth_positive.groupby(synth_positive.index).size()
# n_counterfactuals = pd.DataFrame(n_counterfactuals, columns = ['N'])

# test_repeated = test.copy()

# test_repeated = test_repeated.join(n_counterfactuals)
# test_repeated.dropna(inplace = True)

# test_repeated = test_repeated.reindex(test_repeated.index.repeat(test_repeated.N))
# test_repeated.drop(['N'], axis=1, inplace=True)

# from sklearn.neighbors import NearestNeighbors

# synth=synth_positive
# test=test_repeated

# features = synth.columns.to_list()
# features.remove(response)

# synth_metrics = synth.copy()
# synth.sort_index(inplace=True)

# cols = data.columns
# cols.drop(response)

# feas_results = []
# nbrs = NearestNeighbors(n_neighbors=5).fit(synth[cols].values)

# for i, row in synth[cols].iterrows():
#     knn = nbrs.kneighbors(row.values.reshape((1, -1)), 5, return_distance=True)[0]
    
#     feas_results.append(np.mean(knn))

# synth_metrics['feasibility'] = feas_results

# synth_metrics['success'] = 1

# # 6) Success
# synth_metrics['success'] = 1
# synth.sort_index(inplace=True)

# categorical_encoded = []
# for x in dataset.df.columns:
#     if x not in dataset.continuous:
#         if x not in dataset.target:
#             categorical_encoded.append(x)

# pd.set_option('display.max_columns', None)
# len(synth.index.unique())
# test.loc[1][features].iloc[1:2]

# 1) Distance: Sparsity and Euclidean distance
# factual = test[features].sort_index().to_numpy()
# counterfactuals = synth[features].sort_index().to_numpy()

# cfs_continuous = synth[dataset.continuous].sort_index().to_numpy()
# cfs_categorical = synth[categorical_encoded].sort_index().to_numpy()

# factual_continuous = test[dataset.continuous].sort_index().to_numpy()
# factual_categorical = test[categorical_encoded].sort_index().to_numpy()

# delta_cont = factual_continuous - cfs_continuous
# delta_cat = factual_categorical - cfs_categorical

# delta_cat = np.where(np.abs(delta_cat) > 0, 1, 0)

# delta = np.concatenate((delta_cont, delta_cat), axis=1)
# d1 = np.sum(np.invert(np.isclose(delta, np.zeros_like(delta), atol=1e-5)), axis=1, dtype=float).tolist() # sparsity
# d2 = np.sum(np.abs(delta), axis=1, dtype=float).tolist() # Manhattan distance
# d3 = np.sum(np.square(np.abs(delta)), axis=1, dtype=np.float).tolist() # squared Euclidean distance (no square root is taken)

# synth_metrics['L0'] = d1
# synth_metrics['L1'] = d2
# synth_metrics['L2'] = d3

In [None]:
# df_decoded_cfs = pd.DataFrame(scaler.inverse_transform(synth[continuous]), columns=continuous)
# df_decoded_cfs.index = synth.index

# df_decoded_cfs = pd.concat([df_decoded_cfs, synth[categorical]], axis=1)
# df_decoded_cfs

In [None]:
# def transform(df, continuous, categorical, scaler):
#     df_transform = scaler.transform(df[continuous])
#     df_transform = pd.DataFrame(df_transform, columns=continuous, index=df.index)
#     return pd.concat([df_transform, df[categorical]], axis=1)


# def inverse_transform(df, continuous, categorical, scaler):
#     df_transform = scaler.inverse_transform(df[continuous])
#     df_transform = pd.DataFrame(df_transform, columns=continuous, index=df.index)
#     return pd.concat([df_transform, df[categorical]], axis=1)

In [None]:
# def intersection(lst1, lst2):
#     return list(set(lst1) & set(lst2))


# df_decoded_cfs = dataset.inverse_transform(synth)

# df_factuals = dataset.inverse_transform(test)

# # check continuous using np.isclose to allow for very small numerical differences
# cfs_continuous_immutable = df_decoded_cfs[
#     intersection(dataset.continuous, fixed_features)
# ]

# factual_continuous_immutable = df_factuals[
#     intersection(dataset.continuous, dataset.immutables)
# ]

# continuous_violations = np.invert(
#     np.isclose(cfs_continuous_immutable, factual_continuous_immutable)
# )
# continuous_violations = np.sum(continuous_violations, axis=1).reshape(
#     (-1, 1)
# ) 

# # check categorical by boolean comparison
# cfs_categorical_immutable = df_decoded_cfs[
#     intersection(dataset.categorical, dataset.immutables)
# ]
# # print(cfs_categorical_immutable)
# factual_categorical_immutable = df_factuals[
#     intersection(dataset.categorical, dataset.immutables)
# ]


# cfs_categorical_immutable.sort_index(inplace=True)
# factual_categorical_immutable.sort_index(inplace=True)
# cfs_categorical_immutable.index.name = None

# categorical_violations = cfs_categorical_immutable != factual_categorical_immutable
# categorical_violations = np.sum(categorical_violations.values, axis=1).reshape(
#     (-1, 1)
# )  # sum over features

# synth_metrics['violation'] = continuous_violations + categorical_violations


In [None]:
# pd.set_option('display.max_columns', None)

In [None]:
# results = synth_metrics.copy()
# results_sparse = pd.DataFrame(columns=results.columns)

# for idx in list(set(results.index)):
#     idx_df = results.loc[idx]
#     if(isinstance(idx_df, pd.DataFrame)): # If you have multiple rows
#         sparse = min(idx_df.L0) # 1) find least # features changed
#         sparse_df = idx_df[idx_df.L0 == sparse] 
#         closest = min(sparse_df.L2) # find smallest Gower distance
#         close_df = sparse_df[sparse_df.L2 == closest]

#         if(close_df.shape[0]>1):
#             highest_feasibility = max(close_df.feasibility) #  3) find most feasible
#             close_df = close_df[close_df.feasibility == highest_feasibility].head(1)

#     else: # if you have only one row - return that row
#         close_df = idx_df.to_frame().T
        
#     results_sparse = pd.concat([results_sparse, close_df], axis=0)


In [None]:
# results_sparse[['L0', 'L1', 'L2', 'feasibility', 'violation', 'success']].mean()