In [1]:
import glob
import os

import numpy as np
import pandas as pd

from model.fctgan import FCTGAN
from model.eval.evaluation import get_utility_metrics,stat_sim,privacy_metrics

In [2]:
# Experiment configuration shared by the cells below.
num_exp = 1                             # number of times the fit loop runs
dataset = 'Adult'                       # dataset name; also used in output folder/file names
real_path = 'Real_Datasets/Adult.csv'   # CSV holding the real data
fake_file_root = 'Fake_Datasets'        # root folder for the generated CSV files

In [3]:
# Configure the FCTGAN synthesizer for the Adult data, then train it.
# The column lists below tell the model how each column should be treated;
# 'income' is the target of the classification problem.
synthesizer = FCTGAN(
    raw_csv_path=real_path,
    test_ratio=0.20,
    categorical_columns=[
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'gender', 'native-country', 'income',
    ],
    log_columns=[],
    # NOTE(review): presumably the listed values (0.0) are the special
    # categorical modes of otherwise continuous columns -- confirm in FCTGAN.
    mixed_columns={'capital-loss': [0.0], 'capital-gain': [0.0]},
    general_columns=["age"],
    non_categorical_columns=[],
    integer_columns=[
        'age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week',
    ],
    problem_type={"Classification": 'income'},
)

# fit() is called num_exp times on the same synthesizer instance.
for run_idx in range(num_exp):
    synthesizer.fit()

In [4]:
# Draw several synthetic datasets from the fitted synthesizer and save each
# one as a CSV under <fake_file_root>/<dataset>/.
num_fake_datasets = 3  # how many synthetic copies to generate and evaluate

fake_dir = fake_file_root + "/" + dataset
# to_csv does not create missing parent folders, so make sure the target exists.
os.makedirs(fake_dir, exist_ok=True)

# Record exactly the files written here (in generation order) instead of
# globbing the folder: a "*" glob would also pick up stale or unrelated files
# left by earlier runs and returns them in arbitrary order.
fake_paths = []
for exp in range(num_fake_datasets):
    syn = synthesizer.generate_samples()
    fake_path = fake_dir + "/" + dataset + "_fake_{exp}.csv".format(exp=exp)
    syn.to_csv(fake_path, index=False)
    fake_paths.append(fake_path)

In [5]:
# ML utility
# Classifiers to evaluate, keyed by problem type.
model_dict = {"Classification": ["lr", "dt", "rf", "mlp", "svm"]}

# NOTE(review): "MinMax" is presumably the feature-scaling option used by
# get_utility_metrics -- confirm against model/eval/evaluation.py.
result_mat = get_utility_metrics(
    real_path,
    fake_paths,
    "MinMax",
    model_dict,
    test_ratio=0.20,
)

# One row per classifier, one column per reported metric.
result_df = pd.DataFrame(result_mat, columns=["Acc", "AUC", "F1_Score"])
result_df.index = model_dict["Classification"]
result_df

Unnamed: 0,Acc,AUC,F1_Score
lr,0.71314,0.004609,0.015289
dt,1.699253,0.037113,0.030966
rf,2.214488,0.027903,0.037367
mlp,0.948579,0.012153,0.011448
svm,-0.119425,0.010989,-0.058133


In [6]:
# Statistical similarity
# Categorical columns of the Adult dataset, passed to stat_sim for every
# synthetic file.
adult_categorical = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'gender', 'native-country', 'income',
]

# One stat_sim result per synthetic dataset; the reshape(1, 3) below relies on
# each result holding three values, labelled by stat_columns.
stat_res_avg = [
    stat_sim(real_path, fake_path, adult_categorical)
    for fake_path in fake_paths
]

# Column labels for the summary table (closing parenthesis of the first label
# was missing and showed up in the rendered header).
stat_columns = [
    "Average WD (Continuous Columns)",
    "Average JSD (Categorical Columns)",
    "Correlation Distance",
]

# Average the per-dataset results and show them as a single-row table.
stat_results = pd.DataFrame(
    np.array(stat_res_avg).mean(axis=0).reshape(1, 3),
    columns=stat_columns,
)
stat_results

column:  workclass JSD:  0.03154151227194534
column:  occupation JSD:  0.050341316424578335
column:  gender JSD:  0.006005960807513372
column:  hours-per-week WD:  0.007146743059043361
column:  marital-status JSD:  0.035607382304239966
column:  relationship JSD:  0.036528548537697804
column:  race JSD:  0.03875986118772108
column:  native-country JSD:  0.05966432605872128
column:  age WD:  0.015327309249337954
column:  capital-gain WD:  0.003708720392555884
column:  education JSD:  0.05824565739446568
column:  capital-loss WD:  0.0038093353170540857
column:  fnlwgt WD:  0.007338431016060179
column:  income JSD:  0.02393941579230806
column:  workclass JSD:  0.03241094062327032
column:  occupation JSD:  0.046745241291436514
column:  gender JSD:  0.006557172737421196
column:  hours-per-week WD:  0.006873893245107711
column:  marital-status JSD:  0.03697911964207673
column:  relationship JSD:  0.03357806697241575
column:  race JSD:  0.03204392586517034
column:  native-country JSD:  0.05962

Unnamed: 0,Average WD (Continuous Columns,Average JSD (Categorical Columns),Correlation Distance
0,0.007556,0.037385,0.379249
