## Aggregating results to DataFrame

In [37]:
import os
import lib
import numpy as np
import pandas as pd

DATASETS = [
    "abalone",
    "adult",
    "buddy",
    "california",
    "cardio",
    "churn2",
    "default",
    "diabetes",
    "fb-comments",
    "gesture",
    "higgs-small",
    "house",
    "insurance",
    "king",
    "miniboone",
    "wilt"
]

_REGRESSION = [
    "abalone",
    "california",
    "fb-comments",
    "house",
    "insurance",
    "king",
]


method2exp = {
    "real": "exp/{}/ddpm_cb_best/",
    "tab-ddpm": "exp/{}/ddpm_cb_best/"
    # "smote": "exp/{}/smote/",
    # "ctabgan+": "exp/{}/ctabgan-plus/",
    # "ctabgan": "exp/{}/ctabgan/",
    # "tvae": "exp/{}/tvae/"
}

eval_file = "eval_catboost.json"
show_std = False
df = pd.DataFrame(columns=["method"] + [_[:3].upper() for _ in DATASETS])

for algo in method2exp: 
    algo_res = []
    for ds in DATASETS:
        if not os.path.exists(os.path.join(method2exp[algo].format(ds), eval_file)):
            algo_res.append("--")
            continue
        metric = "r2" if ds in _REGRESSION else "f1"
        res_dict = lib.load_json(os.path.join(method2exp[algo].format(ds), eval_file))

        if algo == "real":
            res = f'{res_dict["real"]["test"][metric + "-mean"]:.4f}' 
            if show_std: res += f'+-{res_dict["real"]["test"][metric + "-std"]:.4f}'
        else:
            res = f'{res_dict["synthetic"]["test"][metric + "-mean"]:.4f}'
            if show_std: res += f'+-{res_dict["synthetic"]["test"][metric + "-std"]:.4f}'

        algo_res.append(res)
    df.loc[len(df)] = [algo] + algo_res

In [38]:
df

Unnamed: 0,method,ABA,ADU,BUD,CAL,CAR,CHU,DEF,DIA,FB-,GES,HIG,HOU,INS,KIN,MIN,WIL
0,real,0.5562,0.8152,0.9063,0.8568,0.7379,0.7403,0.688,0.7849,0.8371,0.6365,0.7238,0.6616,0.8137,0.907,0.9342,0.8982
1,tab-ddpm,0.5499,0.7951,0.9057,0.8362,0.7374,0.7548,0.691,0.7398,0.7128,0.5967,0.7218,0.6766,0.8092,0.8331,0.9362,0.9045


In [39]:
res_dict

{'synthetic': {'val': {'acc-count': 50.0,
   'acc-mean': 0.9963,
   'acc-std': 0.0012,
   'f1-count': 50.0,
   'f1-mean': 0.982,
   'f1-std': 0.0056,
   'roc_auc-count': 50.0,
   'roc_auc-mean': 0.9994,
   'roc_auc-std': 0.0004},
  'test': {'acc-count': 50.0,
   'acc-mean': 0.9805,
   'acc-std': 0.0019,
   'f1-count': 50.0,
   'f1-mean': 0.9045,
   'f1-std': 0.009,
   'roc_auc-count': 50.0,
   'roc_auc-mean': 0.9923,
   'roc_auc-std': 0.0006}},
 'real': {'val': {'acc-count': 10.0,
   'acc-mean': 0.9907,
   'acc-std': 0.001,
   'f1-count': 10.0,
   'f1-mean': 0.9532,
   'f1-std': 0.0045,
   'roc_auc-count': 10.0,
   'roc_auc-mean': 0.9964,
   'roc_auc-std': 0.002},
  'test': {'acc-count': 10.0,
   'acc-mean': 0.9801,
   'acc-std': 0.0009,
   'f1-count': 10.0,
   'f1-mean': 0.8982,
   'f1-std': 0.0063,
   'roc_auc-count': 10.0,
   'roc_auc-mean': 0.9883,
   'roc_auc-std': 0.0019}}}

In [32]:
import zero
dir(zero)

['CONF',
 'Circuit',
 'DESCRIPTION',
 'LOGGER',
 'PROGRAM',
 'ZeroConfig',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_version',
 'add_log_handler',
 'circuit',
 'components',
 'config',
 'elements',
 'format',
 'logging',
 'misc',
 'noise',
 'rcParams',
 'set_log_verbosity']

In [2]:
df

Unnamed: 0,method,ABA,ADU,BUD,CAL,CAR,CHU,DEF,DIA,FB-,GES,HIG,HOU,INS,KIN,MIN,WIL
0,real,0.5562,0.8152,0.9063,0.8568,0.7379,0.7403,0.688,0.7849,0.8371,0.6365,0.7238,0.6616,0.8137,0.9070,0.9342,0.8982
1,tab-ddpm,0.5499,0.7951,0.9057,0.8362,0.7374,0.7548,0.691,0.7398,0.7128,0.5967,0.7218,0.6766,0.8092,0.8331,0.9362,0.9045
2,smote,0.5486,0.7912,0.8906,0.8397,0.7323,0.7432,0.693,0.6835,0.8035,0.6579,0.7219,0.6625,0.8119,0.8416,0.9323,0.9127
3,ctabgan+,0.4672,0.7724,0.8844,0.5247,0.7327,0.7024,0.6865,0.7339,0.5088,0.4055,0.6639,0.5040,0.7966,0.4438,0.892,0.7983
4,ctabgan,--,0.7831,0.8552,--,0.7171,0.6875,0.6437,0.731,--,0.3922,0.5748,--,--,--,0.8892,0.906
5,tvae,0.4328,0.781,0.8638,0.7518,0.7174,0.7317,0.6564,0.7136,0.6853,0.434,0.6378,0.4926,0.7842,0.8238,0.9125,0.5006


In [54]:
import numpy as np
test = np.load('data/diabetes/X_cat_train.npy')
print(test)


FileNotFoundError: [Errno 2] No such file or directory: 'data/diabetes/X_cat_train.npy'

In [45]:
len(test)

491

In [None]:
np_load_old = np.load
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)
np.load = np_load_old

In [None]:
# np_load_old = np.load
# np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)
# np.load = np_load_old
import numpy as np
t2 = np.load('data/abalone/X_num_train.npy')
print(t2)

In [82]:
np.load('data/diabetes/y_train.npy')

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0,

In [42]:
from rdt.transformers.numerical import ClusterBasedNormalizer
transformer = ClusterBasedNormalizer()

In [43]:
dir(transformer)

['INITIAL_FIT_STATE',
 'INPUT_SDTYPE',
 'IS_GENERATOR',
 'STD_MULTIPLIER',
 'SUPPORTED_SDTYPES',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_columns_to_data',
 '_bgm_transformer',
 '_build_output_columns',
 '_dtype',
 '_fit',
 '_get_columns_data',
 '_get_current_random_seed',
 '_get_output_to_property',
 '_max_value',
 '_min_value',
 '_raise_out_of_bounds_error',
 '_reverse_transform',
 '_reverse_transform_helper',
 '_rounding_digits',
 '_set_fitted_parameters',
 '_set_missing_value_generation',
 '_set_missing_value_replacement',
 '_set_model_missing_values',
 '_set_seed',
 '_store_columns',
 '_transform',
 '_validate_values_within_bounds',
 'column_prefix',


In [44]:
import pywavefront
scene = pywavefront.Wavefront('/Users/pigr/Desktop/uzh论文/pythonProject/tab-ddpm/exp/diabetes/tvae/tvae.obj', collect_faces=True)


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte

In [46]:
import pandas as pd
t = pd.read_json('/Users/pigr/Desktop/uzh论文/pythonProject/tab-ddpm/exp/diabetes/ddpm_cb_best/results_catboost.json')
t

Unnamed: 0,eval_type,dataset,metrics
train,synthetic,data/diabetes/,"{'0': {'precision': 0.8668555240793201, 'recal..."
val,synthetic,data/diabetes/,"{'0': {'precision': 0.772727272727272, 'recall..."
test,synthetic,data/diabetes/,"{'0': {'precision': 0.8073394495412841, 'recal..."


In [51]:
t.iloc[2]['metrics']['0']

{'precision': 0.8073394495412841,
 'recall': 0.88,
 'f1-score': 0.842105263157894,
 'support': 100.0}

In [53]:
pd.DataFrame(t.iloc[2]['metrics']['0'],index=[0])

Unnamed: 0,precision,recall,f1-score,support
0,0.807339,0.88,0.842105,100.0


In [63]:
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
def privacy_metrics(real_path,fake_path, data_percent=15):

    """
    Returns privacy metrics

    Inputs:
    1) real_path -> path to real data
    2) fake_path -> path to corresponding synthetic data
    3) data_percent -> percentage of data to be sampled from real and synthetic datasets for computing privacy metrics
    Outputs:
    1) List containing the 5th percentile distance to closest record (DCR) between real and synthetic as well as within real and synthetic datasets
    along with 5th percentile of nearest neighbour distance ratio (NNDR) between real and synthetic as well as within real and synthetic datasets

    """
    task_type = lib.load_json(real_path + "/info.json")["task_type"]
    X_num_real, X_cat_real, y_real = lib.read_pure_data(real_path, 'train')
    X_num_fake, X_cat_fake, y_fake = lib.read_pure_data(fake_path, 'train')

    if task_type == 'regression':
        X_num_real = np.concatenate([X_num_real, y_real[:, np.newaxis]], axis=1)
        X_num_fake = np.concatenate([X_num_fake, y_fake[:, np.newaxis]], axis=1)
    else:
        if X_cat_fake is None:
            X_cat_real = y_real[:, np.newaxis].astype(int).astype(str)
            X_cat_fake = y_fake[:, np.newaxis].astype(int).astype(str)
        else:
            X_cat_real = np.concatenate([X_cat_real, y_real[:, np.newaxis].astype(int).astype(str)], axis=1)
            X_cat_fake = np.concatenate([X_cat_fake, y_fake[:, np.newaxis].astype(int).astype(str)], axis=1)

    if len(y_real) > 50000:
        ixs = np.random.choice(len(y_real), 50000, replace=False)
        X_num_real = X_num_real[ixs]
        X_cat_real = X_cat_real[ixs] if X_cat_real is not None else None

    if len(y_fake) > 50000:
        ixs = np.random.choice(len(y_fake), 50000, replace=False)
        X_num_fake = X_num_fake[ixs]
        X_cat_fake = X_cat_fake[ixs] if X_cat_fake is not None else None


    mm = MinMaxScaler().fit(X_num_real)
    X_real = mm.transform(X_num_real)
    X_fake = mm.transform(X_num_fake)
    if X_cat_real is not None:
        ohe = OneHotEncoder().fit(X_cat_real)
        X_cat_real = ohe.transform(X_cat_real) / np.sqrt(2)
        X_cat_fake = ohe.transform(X_cat_fake) / np.sqrt(2)

        X_real = np.concatenate([X_real, X_cat_real.todense()], axis=1)
        X_fake = np.concatenate([X_fake, X_cat_fake.todense()], axis=1)

    # X_real = np.unique(X_real, axis=0)
    # X_fake = np.unique(X_fake, axis=0)
    X_fake = np.asarray(X_fake)
    X_real = np.asarray(X_real)
    # Computing pair-wise distances between real and synthetic
    dist_rf = pairwise_distances(X_fake, Y=X_real, metric='l2', n_jobs=-1)
    # Computing pair-wise distances within real
    # dist_rr = pairwise_distances(X_real, Y=None, metric='l2', n_jobs=-1)
    # Computing pair-wise distances within synthetic
    # dist_ff = pairwise_distances(X_fake, Y=None, metric='l2', n_jobs=-1)


    # Removes distances of data points to themselves to avoid 0s within real and synthetic
    # rd_dist_rr = dist_rr[~np.eye(dist_rr.shape[0],dtype=bool)].reshape(dist_rr.shape[0],-1)
    # rd_dist_ff = dist_ff[~np.eye(dist_ff.shape[0],dtype=bool)].reshape(dist_ff.shape[0],-1)

    # Computing first and second smallest nearest neighbour distances between real and synthetic
    smallest_two_indexes_rf = [dist_rf[i].argsort()[:2] for i in range(len(dist_rf))]
    smallest_two_rf = [dist_rf[i][smallest_two_indexes_rf[i]] for i in range(len(dist_rf))]
    # Computing first and second smallest nearest neighbour distances within real
    # smallest_two_indexes_rr = [rd_dist_rr[i].argsort()[:2] for i in range(len(rd_dist_rr))]
    # smallest_two_rr = [rd_dist_rr[i][smallest_two_indexes_rr[i]] for i in range(len(rd_dist_rr))]
    # Computing first and second smallest nearest neighbour distances within synthetic
    # smallest_two_indexes_ff = [rd_dist_ff[i].argsort()[:2] for i in range(len(rd_dist_ff))]
    # smallest_two_ff = [rd_dist_ff[i][smallest_two_indexes_ff[i]] for i in range(len(rd_dist_ff))]


    # Computing 5th percentiles for DCR and NNDR between and within real and synthetic datasets
    min_dist_rf = np.array([i[0] for i in smallest_two_rf])
    fifth_perc_rf = np.percentile(min_dist_rf,5)
    # min_dist_rr = np.array([i[0] for i in smallest_two_rr])
    # fifth_perc_rr = np.percentile(min_dist_rr,5)
    # min_dist_ff = np.array([i[0] for i in smallest_two_ff])
    # fifth_perc_ff = np.percentile(min_dist_ff,5)
    # nn_ratio_rf = np.array([i[0]/i[1] for i in smallest_two_rf])
    # nn_fifth_perc_rf = np.percentile(nn_ratio_rf,5)
    # nn_ratio_rr = np.array([i[0]/i[1] for i in smallest_two_rr])
    # nn_fifth_perc_rr = np.percentile(nn_ratio_rr,5)
    # nn_ratio_ff = np.array([i[0]/i[1] for i in smallest_two_ff])
    # nn_fifth_perc_ff = np.percentile(nn_ratio_ff,5)

    # return np.array([fifth_perc_rf,fifth_perc_rr,fifth_perc_ff,nn_fifth_perc_rf,nn_fifth_perc_rr,nn_fifth_perc_ff]).reshape(1,6)
    return min_dist_rf # , min_dist_rr

In [67]:
dists=privacy_metrics(real_path='data/diabetes',fake_path='exp/diabetes/ctabgan-plus', data_percent=15)



In [68]:
privacy_val = np.median(dists)
privacy_val

np.float64(0.19242138529445127)

In [76]:
from collections import defaultdict
from tqdm import tqdm
# datasets = ['abalone','adult','buddy','california','cardio','churn2','diabetes','fb-comments','gesture','higgs-small','house','insurance','king','miniboone','wilt']
# models = ['ddmp_cb_best','ctabgan-plus','tvae','smote']
models = ['ddpm_cb_best','ctabgan-plus','tvae','smote']
# pbar = tqdm(datasets)
mbar = tqdm(models)
data = 'cardio'

res = defaultdict(int)
for model in mbar:
    real_path = 'data/'+data
    fake_path = 'exp/'+data+'/'+model
    dists = privacy_metrics(real_path=real_path,fake_path=fake_path, data_percent=15)
    privacy_val = np.median(dists)
    res[model] = privacy_val
    mbar.set_description(f"{model} is completed")


  0%|          | 0/4 [00:06<?, ?it/s]


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/pigr/Desktop/uzh论文/pythonProject/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3550, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/1l/7pvpcs552dg7h4wlrb9tvv8h0000gn/T/ipykernel_18926/545055158.py", line 14, in <module>
    dists = privacy_metrics(real_path=real_path,fake_path=fake_path, data_percent=15)
  File "/var/folders/1l/7pvpcs552dg7h4wlrb9tvv8h0000gn/T/ipykernel_18926/1606525946.py", line 19, in privacy_metrics
    X_num_fake, X_cat_fake, y_fake = lib.read_pure_data(fake_path, 'train')
  File "/Users/pigr/Desktop/uzh论文/pythonProject/tab-ddpm/lib/data.py", line 678, in read_pure_data
  File "/Users/pigr/Desktop/uzh论文/pythonProject/lib/python3.9/site-packages/numpy/lib/_npyio_impl.py", line 484, in load
  File "/Users/pigr/Desktop/uzh论文/pythonProject/lib/python3.9/site-packages/numpy/lib/format.py", line 827, in read_array
  File "/Users/pigr/Desktop/uzh论文/pythonProject

In [71]:
res

defaultdict(dict,
            {'diabetes': {'ddpm_cb_best': np.float64(0.23167739911266447)}})

In [74]:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, help='choose the model')
parser.add_argument('--dataset', type=str, help='choose the dataset')
args = parser.parse_args()
args.model

usage: ipykernel_launcher.py [-h] [--model MODEL] [--dataset DATASET]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/pigr/Library/Jupyter/runtime/kernel-6a51cf5a-f213-401e-a7dd-c02f8a1df5c9.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [83]:
import torch
model = torch.load('exp/diabetes/ddpm_cb_best/model.pt')

In [84]:
dir(model)

['__class__',
 '__class_getitem__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__ior__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__ror__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '_metadata',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'move_to_end',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [91]:
X_num={'train': np.array([[ 1.50108595, -1.58338678,  1.83391464, ...,  0.57789749,
        -0.34561825,  1.11077162],
       [-0.72791329, -1.52070719, -1.14229039, ...,  0.2533471 ,
         1.57054024, -0.62292572],
       [ 0.96742157,  1.69465522, -0.62292572, ..., -1.41570209,
         0.78390897,  0.1397103 ],
       ...,
       [-0.34069483, -1.34762888, -0.52440051, ...,  0.73365686,
         0.68243484, -0.62292572],
       [-0.72791329, -1.50756199,  0.62292572, ..., -0.95720947,
         0.5642429 , -1.11077162],
       [-0.34069483,  0.02089009, -1.24445179, ..., -0.72791329,
         0.2533471 , -0.2533471 ]])}

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (7,) + inhomogeneous part.

In [None]:
X_num={'train': np.array([[ 1.50108595, -1.58338678,  1.83391464, ...,  0.57789749,
        -0.34561825,  1.11077162],
       [-0.72791329, -1.52070719, -1.14229039, ...,  0.2533471 ,
         1.57054024, -0.62292572],
       [ 0.96742157,  1.69465522, -0.62292572, ..., -1.41570209,
         0.78390897,  0.1397103 ],
       ...,
       [-0.34069483, -1.34762888, -0.52440051, ...,  0.73365686,
         0.68243484, -0.62292572],
       [-0.72791329, -1.50756199,  0.62292572, ..., -0.95720947,
         0.5642429 , -1.11077162],
       [-0.34069483,  0.02089009, -1.24445179, ..., -0.72791329,
         0.2533471 , -0.2533471 ]])}

In [None]:
!pip uninstall tornado

In [None]:
1