In [1]:
import glob
import os

import numpy as np
import pandas as pd

from model.ctabgan import CTABGAN
from model.eval.evaluation import get_utility_metrics, stat_sim, privacy_metrics

In [2]:
# Experiment configuration shared by the cells below.
dataset = "New-Thyroid"                      # output sub-folder name and file-name prefix
real_path = "Real_Datasets/new-thyroid.csv"  # real data handed to the synthesizer and the evaluators
fake_file_root = "Fake_Datasets"             # root folder for the generated CSV files
num_exp = 5                                  # number of fit/generate repetitions

In [3]:
# Configure CTAB-GAN for the New-Thyroid data: 'Class' is the categorical
# classification target, 'T3_resin_uptake' is declared as an integer column
# and the remaining four measurements are passed as "general" columns.
synthesizer = CTABGAN(
    raw_csv_path=real_path,
    test_ratio=0.20,
    categorical_columns=['Class'],
    log_columns=[],
    mixed_columns={},
    general_columns=["Total_serum_thyroxin", "Total_serum_triiodothyronine", "Basal_TSH", "Maximal_TSH"],
    non_categorical_columns=[],
    integer_columns=['T3_resin_uptake'],
    problem_type={"Classification": 'Class'},
)

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists before the first (multi-minute) training run starts
# instead of failing only after that run has finished.
fake_dir = os.path.join(fake_file_root, dataset)
os.makedirs(fake_dir, exist_ok=True)

# Train and sample num_exp times; run i is saved as <dataset>_fake_<i>.csv.
for i in range(num_exp):
    synthesizer.fit()
    syn = synthesizer.generate_samples()
    syn.to_csv(os.path.join(fake_dir, "{name}_fake_{exp}.csv".format(name=dataset, exp=i)), index=False)

100%|██████████| 150/150 [02:59<00:00,  1.19s/it]


Finished training in 179.3764214515686  seconds.


100%|██████████| 150/150 [02:55<00:00,  1.17s/it]


Finished training in 176.18929362297058  seconds.


100%|██████████| 150/150 [02:56<00:00,  1.18s/it]


Finished training in 176.6688358783722  seconds.


100%|██████████| 150/150 [02:55<00:00,  1.17s/it]


Finished training in 176.12769603729248  seconds.


100%|██████████| 150/150 [02:54<00:00,  1.16s/it]

Finished training in 174.90432476997375  seconds.





In [4]:
# Collect the synthetic CSVs written for this dataset. Only *.csv entries are
# matched so stray files or sub-folders are not handed to the evaluators, and
# the list is sorted because glob returns its matches in arbitrary order.
fake_paths = sorted(glob.glob(os.path.join(fake_file_root, dataset, "*.csv")))

In [5]:
# Utility evaluation: one row per classifier listed in model_dict, with the
# Acc / AUC / F1_Score values returned by get_utility_metrics.
# NOTE(review): presumably these are real-vs-synthetic score differences —
# confirm against model.eval.evaluation.
model_dict = {"Classification": ["lr", "dt", "rf", "mlp", "svm"]}
result_mat = get_utility_metrics(
    real_path,
    fake_paths,
    "MinMax",
    model_dict,
    test_ratio=0.20,
)

metric_names = ["Acc", "AUC", "F1_Score"]
result_df = pd.DataFrame(result_mat, columns=metric_names)
# Label the rows with the classifier names (the only entry of model_dict).
result_df.index = model_dict["Classification"]
result_df

Unnamed: 0,Acc,AUC,F1_Score
lr,13.023256,0.335094,0.222329
dt,46.976744,0.409419,0.475265
rf,27.906977,0.286299,0.337677
mlp,7.906977,0.26867,0.144034
svm,25.116279,0.321047,0.36659


In [6]:
# Statistical similarity of every synthetic file against the real data,
# averaged over the files. stat_sim receives the list of categorical columns
# and its result is treated as three values per file (one per stat_columns entry).
thyroid_categorical = ['Class']

# Without any synthetic file the mean/reshape below fails with an obscure
# reshape error; fail early with a message that names the actual problem.
if not fake_paths:
    raise ValueError("fake_paths is empty: no synthetic files to compare against " + real_path)

stat_res_avg = [stat_sim(real_path, fake_path, thyroid_categorical) for fake_path in fake_paths]

stat_columns = [
    "Average WD (Continuous Columns)",  # closing parenthesis was missing in the label
    "Average JSD (Categorical Columns)",
    "Correlation Distance",
]
# Column-wise mean over the per-file results, shown as a single-row table.
stat_results = pd.DataFrame(
    np.array(stat_res_avg).mean(axis=0).reshape(1, len(stat_columns)),
    columns=stat_columns,
)
stat_results

column:  T3_resin_uptake WD:  0.05881660288489845
column:  Total_serum_thyroxin WD:  0.060704296071638505
column:  Total_serum_triiodothyronine WD:  0.07543929858574325
column:  Basal_TSH WD:  0.035117110141842545
column:  Maximal_TSH WD:  0.05653479634864335
column:  Class JSD:  0.15772165912072805
column:  T3_resin_uptake WD:  0.04757138651751547
column:  Total_serum_thyroxin WD:  0.05551922744767432
column:  Total_serum_triiodothyronine WD:  0.05303924597152668
column:  Basal_TSH WD:  0.03562194439128771
column:  Maximal_TSH WD:  0.02805404077836904
column:  Class JSD:  0.06399512596424002
column:  T3_resin_uptake WD:  0.08007065057403592
column:  Total_serum_thyroxin WD:  0.13744572753398293
column:  Total_serum_triiodothyronine WD:  0.04036759961030467
column:  Basal_TSH WD:  0.03870906405961224
column:  Maximal_TSH WD:  0.03628067811816528
column:  Class JSD:  0.1511529235421584
column:  T3_resin_uptake WD:  0.04262584633500148
column:  Total_serum_thyroxin WD:  0.069512837110206

Unnamed: 0,Average WD (Continuous Columns,Average JSD (Categorical Columns),Correlation Distance
0,0.05376,0.136536,2.433817


In [7]:
# Privacy metrics for each synthetic file, averaged over the files.
# privacy_metrics is called once per (real, synthetic) pair and its result is
# treated as six values, one per privacy_columns entry.
priv_res_avg = [privacy_metrics(real_path, fake_path) for fake_path in fake_paths]

privacy_columns = [
    "DCR between Real and Fake (5th perc)",
    "DCR within Real(5th perc)",
    "DCR within Fake (5th perc)",
    "NNDR between Real and Fake (5th perc)",
    "NNDR within Real (5th perc)",
    "NNDR within Fake (5th perc)",
]
# Column-wise mean over the per-file results, shown as a single-row table.
priv_mean = np.array(priv_res_avg).mean(axis=0)
privacy_results = pd.DataFrame(priv_mean.reshape(1, 6), columns=privacy_columns)
privacy_results

Unnamed: 0,DCR between Real and Fake (5th perc),DCR within Real(5th perc),DCR within Fake (5th perc),NNDR between Real and Fake (5th perc),NNDR within Real (5th perc),NNDR within Fake (5th perc)
0,0.62372,0.271198,0.57658,0.518865,0.586215,0.518044


In [9]:
import os
import pandas as pd

from Evaluation.cdf_tail_metrics import CDFTailMetrics
from Evaluation.support_coverage import SupportCoverage
from Evaluation.rare_event_recall import RareEventRecall

# Evaluate exactly as many synthetic files as the generation loop produces.
# Reusing num_exp (instead of repeating the literal 5) keeps the two counts
# from drifting apart when the number of experiments is changed.
n_experiments = num_exp

# Three additional evaluators, all using 'Class' as the label column.
cdf_eval = CDFTailMetrics(label_col="Class", tau=0.9)
sc_eval = SupportCoverage(label_col="Class", n_bins=5, rare_threshold=0.01, include_label_in_combo=True)
rer_eval = RareEventRecall(label_col="Class")

rows = []

for exp in range(n_experiments):
    syn_path = os.path.join(fake_file_root, dataset, f"{dataset}_fake_{exp}.csv")
    # Best effort: report a missing synthetic file and move on to the next one.
    if not os.path.exists(syn_path):
        print("Missing:", syn_path)
        continue

    # One result row per experiment; the evaluators' outputs are merged in a
    # fixed order, so on a key collision the later evaluator wins.
    res = {"exp": exp, "syn_path": syn_path}
    for evaluator in (cdf_eval, sc_eval, rer_eval):
        res.update(evaluator.evaluate_paths(real_path, syn_path))
    rows.append(res)

results_df = pd.DataFrame(rows)
results_df


Unnamed: 0,exp,syn_path,cdf_tail_div_T3_resin_uptake,cdf_tail_div_Total_serum_thyroxin,cdf_tail_div_Total_serum_triiodothyronine,cdf_tail_div_Basal_TSH,cdf_tail_div_Maximal_TSH,cdf_tail_div_mean,support_coverage,num_rare_combos,rare_threshold,n_bins,include_label,rare_class,rare_event_recall,rare_class_count_real,n_classes_real
0,0,Fake_Datasets/New-Thyroid/New-Thyroid_fake_0.csv,0.088372,0.102326,0.195349,0.106977,0.218605,0.142326,0.017143,175,0.01,5,1.0,3.0,0.566667,30,3
1,1,Fake_Datasets/New-Thyroid/New-Thyroid_fake_1.csv,0.051163,0.051163,0.07907,0.069767,0.07907,0.066047,0.022857,175,0.01,5,1.0,3.0,0.066667,30,3
2,2,Fake_Datasets/New-Thyroid/New-Thyroid_fake_2.csv,0.102326,0.102326,0.04186,0.07907,0.093023,0.083721,0.034286,175,0.01,5,1.0,3.0,0.333333,30,3
3,3,Fake_Datasets/New-Thyroid/New-Thyroid_fake_3.csv,0.07907,0.046512,0.055814,0.2,0.181395,0.112558,0.034286,175,0.01,5,1.0,3.0,0.6,30,3
4,4,Fake_Datasets/New-Thyroid/New-Thyroid_fake_4.csv,0.055814,0.102326,0.148837,0.130233,0.125581,0.112558,0.028571,175,0.01,5,1.0,3.0,0.233333,30,3
