In [None]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below
# (e.g. seaborn / pandas) — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings("ignore")

In [None]:
import numpy as np

def _show_timing(header, train_minutes, sampling_seconds):
  """Print the training / sampling time summary lines for one generation method."""
  print(header)
  print(f" - Train time: {train_minutes} min.")
  print(f" - Sampling time: {sampling_seconds} sec.")


# CTGAN: per-run timings in seconds (five runs)
train_time = np.array([182.87606406211853, 163.16899490356445, 160.77447295188904, 169.1652810573578, 161.25752568244934])
sample_time = np.array([6.197754621505737, 5.553525447845459, 5.609967231750488, 5.5619001388549805, 5.4711527824401855])
_show_timing("CTGAN method:", (train_time.mean()/60).round(2), sample_time.mean().round(2))

# CopulaGAN: per-run timings in seconds (five runs)
train_time = np.array([342.2909104824066, 342.98024106025696, 342.817191362381, 344.6774263381958, 344.17060708999634])
sample_time = np.array([1.0080089569091797, 1.030515193939209, 1.0197803974151611, 1.0102648735046387, 1.0290708541870117])
_show_timing("\nCopulaGAN method:", (train_time.mean()/60).round(2), sample_time.mean().round(2))

# TabSyn: the VAE stage is recorded in minutes, the second training stage in seconds,
# so the latter is converted before the two are added.
vae_train_time = np.array([26.7655]) # min
train_time = np.array([390.08692240715027])
sample_time = np.array([4.811263084411621, 4.804065704345703, 4.816697359085083, 4.792133092880249, 4.804851770401001])
_show_timing("\nTabSyn method:", (vae_train_time+(train_time/60)).round(2)[0], sample_time.mean().round(2))

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def encode_cat_cols(df):
  """Build an integer label mapping for every categorical (string) column of `df`.

  A column counts as categorical when its first non-missing value is a `str`.
  Its distinct non-missing values (compared as strings) are sorted and numbered
  from 0, i.e. the same codes a label encoder fitted on those values would assign.

  Missing values are never turned into a category, and the lookup is positional,
  so the frame may carry any index (for example after filtering rows).

  Args:
    df: pandas DataFrame to inspect. It is not modified.

  Returns:
    dict of the form {column_name: {category_value: integer_code}}, containing
    only the categorical columns; usable directly with `DataFrame.replace`.
  """
  label_dict = {}
  for col in df.columns:
    present = df[col].dropna()
    # Columns that are entirely missing, or whose first real value is not text, are skipped
    if present.empty or not isinstance(present.iloc[0], str):
      continue
    labels = sorted(set(present.astype(str)))
    label_dict[col] = {label: code for code, label in enumerate(labels)}

  return label_dict

In [None]:
filename = "california_housing"


def load_clean_csv(path):
  """Read a CSV file and drop every row with a missing value, renumbering the index from 0."""
  return pd.read_csv(path).dropna().reset_index(drop=True)


def load_synthetic_runs(method, n_runs=5):
  """Load the `n_runs` synthetic datasets generated by `method` for the current `filename`.

  Files are expected at ../data_to_eval/<method>/<filename>_<run>.csv with run = 0..n_runs-1.
  Returns a list of cleaned DataFrames, one per run.
  """
  return [load_clean_csv(f'../data_to_eval/{method}/{filename}_{run}.csv') for run in range(n_runs)]


train = pd.read_csv(f'../data_to_eval/train/{filename}.csv') # train dataset

# The label mapping is built on the raw training file (before rows with missing values are dropped).
# Its keys are exactly the columns whose first non-missing value is a string, i.e. the categorical ones.
label_dictionary = encode_cat_cols(train)
categorical_columns = list(label_dictionary.keys())
train = train.dropna().reset_index(drop=True)

test = load_clean_csv(f'../data_to_eval/test/{filename}.csv') # test dataset

# Five generated datasets per method
ctgan = load_synthetic_runs('ctgan')
copulagan = load_synthetic_runs('copulagan')
tabsyn = load_synthetic_runs('tabsyn')
great = load_synthetic_runs('great')
paft = load_synthetic_runs('paft')

In [None]:
def report_missing_categories(name, runs):
  """Print how many known categorical values never appear in the generated data.

  For every run and every categorical column in `label_dictionary`, the categories
  absent from the run are counted. The printed figures are the medians over the runs
  of (a) the number of missing category values and (b) the number of columns with
  at least one missing value.

  Args:
    name: label used in the printed header.
    runs: list of generated DataFrames (one per run).
  """
  print(f"\n{name} dataset")
  total_lack = np.zeros(len(runs), dtype=int)
  total_col_involved = np.zeros(len(runs), dtype=int)
  for i, run in enumerate(runs):
    for col, mapping in label_dictionary.items():
      lack = set(mapping) - set(run[col])
      if lack:
        total_col_involved[i] += 1
      total_lack[i] += len(lack)
  print(f"=> Total missed: {int(np.median(total_lack))} values, {int(np.median(total_col_involved))} columns involved.")


report_missing_categories("CTGAN", ctgan)
report_missing_categories("CopulaGAN", copulagan)
report_missing_categories("TabSyn", tabsyn)
report_missing_categories("Great", great)
report_missing_categories("PAFT", paft)

In [None]:
# Knowledge check: median housing price should be 14999-500001
print("Knowledge check: median housing price should be 14999-500001")


def out_of_range_pct(runs, col='median_house_value', low=14999, high=500001):
  """Return, for each run, the percentage of rows whose `col` lies outside [low, high].

  Each run is normalised by its own row count: runs can have different lengths
  once rows with missing values have been dropped.
  """
  return np.array([((run[col] < low) | (run[col] > high)).mean()*100 for run in runs])


def report_knowledge_check(name, runs):
  """Print mean and standard deviation (over the runs) of the out-of-range percentage."""
  wrong = out_of_range_pct(runs)
  print(f"\n{name} dataset")
  print(f"\n - {name} method:")
  print(f"    -> Error: {np.mean(wrong).round(2)} % (+/-{np.std(wrong).round(2)})")


# The same two header lines are printed for every method
report_knowledge_check("CTGAN", ctgan)
report_knowledge_check("CopulaGAN", copulagan)
report_knowledge_check("TabSyn", tabsyn)
report_knowledge_check("GReaT", great)
report_knowledge_check("PAFT", paft)


In [None]:
# Encode the categorical columns of the real data with the shared label mapping
train = train.replace(label_dictionary)
test = test.replace(label_dictionary)

# Put every generated run in the training column order, then encode it the same way
column_order = list(train.columns)
for runs in (ctgan, copulagan, tabsyn, great, paft):
  for i in range(5):
    runs[i] = runs[i][column_order].replace(label_dictionary)

## Evaluation - ML Efficiency

In [None]:
# train with generated data and test with ground truth
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score, mean_absolute_percentage_error

def MLE(train_df, test_df, label_col, label_col_discrete, random_state=None):
  """Machine-learning efficiency: train on each given dataset, evaluate on the real test set.

  Three models (random forest, linear/logistic regression, MLP) are fitted once per
  training dataset. For each model the mean and standard deviation of the score over
  the datasets are printed, expressed in percent.

  Args:
    train_df: list of training DataFrames (e.g. the generated runs, or `[train]`).
    test_df: real test DataFrame; must contain the columns of the training frames.
    label_col: name of the target column.
    label_col_discrete: True -> classification scored with accuracy;
      False -> regression scored with mean absolute percentage error (MAPE).
    random_state: optional seed forwarded to the stochastic estimators. The default
      (None) leaves them unseeded.

  Returns:
    None. Results are printed.
  """
  if label_col_discrete:
    metric_name, metric = 'accuracy', accuracy_score
    models = [
      ('RF', RandomForestClassifier(random_state=random_state)),
      ('LR', LogisticRegression(random_state=random_state)),
      ('NN', MLPClassifier(solver='adam', hidden_layer_sizes=(150, 100, 50), max_iter=300,
                           activation='relu', random_state=random_state)),
    ]
  else:
    metric_name, metric = 'mape', mean_absolute_percentage_error
    models = [
      ('RF', RandomForestRegressor(random_state=random_state)),
      ('LR', LinearRegression()),
      ('NN', MLPRegressor(solver='adam', hidden_layer_sizes=(150, 100, 50), max_iter=300,
                          activation='relu', random_state=random_state)),
    ]

  y_test = test_df[label_col]
  for model_name, model in models:
    scores = []
    for run in train_df:
      X_train = run.drop(columns=[label_col])
      model.fit(X_train, run[label_col])
      # Pick the test features by name, in the training order, so a different
      # column order in the test frame cannot misalign the features.
      y_pred = model.predict(test_df[list(X_train.columns)])
      scores.append(metric(y_test, y_pred))
    # accuracy_score and mean_absolute_percentage_error both return fractions;
    # scale them so the number matches the "%" printed next to it.
    scores = np.array(scores)*100
    print(f'  -> {model_name} {metric_name}: {np.mean(scores).round(2)} % (+/-{np.std(scores).round(2)})')

print("\nMachine Learning Effienciency:")

# Training sources to compare; the real training set is wrapped in a list because
# MLE expects a list of datasets.
mle_sources = [
  ("Original data", [train]),
  ("CTGAN method", ctgan),
  ("CopulaGAN method", copulagan),
  ("TabSyn method", tabsyn),
  ("GReaT method", great),
  ("PAFT method", paft),
]
for source_title, source_runs in mle_sources:
  print(f"\n - {source_title}:")
  MLE(source_runs, test, 'median_house_value', label_col_discrete=False)

## Evaluation - Discriminator

In [None]:
# train with ground truth + random data (as different as possible), then test generated data to see if it's real/fake
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

def Discriminator(train_df, test_df, generated_df, random_state=None):
  """Train a real-vs-random classifier and check how it labels generated data.

  Training set: the real training rows (label 1) plus the same number of "random"
  rows (label 0). A random row is built by drawing every column independently, with
  replacement, from the real training values, so per-column marginals are kept while
  relations between columns are broken.

  Test sets (one per generated run): generated rows (label 0) mixed with the same
  number of real test rows (label 1). A random forest is fitted for every run, and
  the mean / standard deviation of its accuracy over the runs is printed.

  Args:
    train_df: real training DataFrame (already numerically encoded).
    test_df: real test DataFrame with the same columns.
    generated_df: list of generated DataFrames, one per run.
    random_state: optional seed for the sampling, shuffling and the forest.
      The default (None) keeps the global NumPy random state in use.

  Returns:
    None. The result is printed.

  Raises:
    ValueError: if `generated_df` is empty.
  """
  if len(generated_df) == 0:
    raise ValueError("generated_df must contain at least one generated dataset")

  rs = None if random_state is None else np.random.RandomState(random_state)
  rng = np.random if rs is None else rs

  real_train = train_df.copy()
  # Column-wise independent resampling produces the "random" negatives
  random_train = pd.DataFrame({col: rng.choice(real_train[col], len(real_train)) for col in real_train.columns})
  real_train['real'] = 1
  random_train['real'] = 0
  train_mix = shuffle(pd.concat([real_train, random_train]), random_state=rs).reset_index(drop=True)
  y_train = train_mix['real']
  X_train = train_mix.drop(['real'], axis=1)
  feature_order = list(X_train.columns)

  real_test = test_df.copy()
  real_test['real'] = 1

  X_test = []
  y_test = []
  for generated in generated_df:
    generated_data = generated.copy()
    generated_data['real'] = 0
    # Keep both classes the same size; if the run is shorter than the test set,
    # the real test rows are down-sampled instead of failing.
    n_rows = min(len(generated_data), len(real_test))
    generated_data = generated_data.sample(n_rows, random_state=rs)
    test_data = real_test if n_rows == len(real_test) else real_test.sample(n_rows, random_state=rs)
    mixed = shuffle(pd.concat([generated_data, test_data]), random_state=rs).reset_index(drop=True)
    y_test.append(mixed['real'])
    # Select features by name in the training order
    X_test.append(mixed[feature_order])

  RF = RandomForestClassifier(random_state=rs)
  accuracy = []
  for X_run, y_run in zip(X_test, y_test):
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_run)
    accuracy.append(accuracy_score(y_run, y_pred))
  accuracy = np.array(accuracy)*100
  print(f'    -> RF accuracy: {np.mean(accuracy).round(2)} % (+/-{np.std(accuracy).round(2)})')

print("\nDiscriminator (Training with Real + Random data. Closer to 50% accuracy is better):")

# One discriminator evaluation per generation method, in the usual reporting order
discriminator_inputs = [
  ("CTGAN", ctgan),
  ("CopulaGAN", copulagan),
  ("TabSyn", tabsyn),
  ("GReaT", great),
  ("PAFT", paft),
]
for method_label, method_runs in discriminator_inputs:
  print(f"\n - {method_label} method:")
  Discriminator(train, test, method_runs)

## Distribution (single/multi-variate)

In [None]:
# Reload the un-encoded data for plotting (categorical columns keep their string values)
train_df = pd.read_csv(f'../data_to_eval/train/{filename}.csv') # train dataset
# Clean the frame that was just loaded, the same way as every generated run below
train_df = train_df.dropna().reset_index(drop=True)
plot_columns = list(train_df.columns)


def load_plot_runs(method, n_runs=5):
    """Load the raw generated datasets of `method`, with columns in training order and NaN rows dropped.

    Files are read from ../data_to_eval/<method>/<filename>_<run>.csv for run = 0..n_runs-1.
    Returns a list with one DataFrame per run.
    """
    runs = []
    for run in range(n_runs):
        df = pd.read_csv(f'../data_to_eval/{method}/{filename}_{run}.csv')
        df = df[plot_columns] # re-order columns
        runs.append(df.dropna().reset_index(drop=True))
    return runs


ctgan_df = load_plot_runs('ctgan')
copulagan_df = load_plot_runs('copulagan')
tabsyn_df = load_plot_runs('tabsyn')
great_df = load_plot_runs('great')
paft_df = load_plot_runs('paft')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math

def distribution(train, col):
    """Plot the distribution of `col` for real vs. generated data, one panel per run.

    The figure has five panels (runs 0-4). Each panel overlays the real training data
    with the CTGAN, CopulaGAN, TabSyn, GReaT and PAFT output of that run: a count plot
    when `col` is in `categorical_columns`, a filled KDE otherwise. The figure is saved
    to ./distribution/<filename>_<col>.png and then shown.

    Data comes from the notebook-level frames `train_df`, `ctgan_df`, `copulagan_df`,
    `tabsyn_df`, `great_df` and `paft_df`.

    NOTE: a later cell defines another function that is also called `distribution`
    but takes different arguments; once that cell has run, this one is shadowed.

    Args:
        train: unused; kept so existing calls keep working.
        col: name of the column to plot.
    """
    import os  # local import: only needed to create the output folder

    n_runs = 5
    palette = ['r', '#b9f2f0', '#d0bbff', '#8de5a1', '#FFE48F', 'b']
    generated = [
        ("CTGAN", ctgan_df),
        ("CopulaGAN", copulagan_df),
        ("TabSyn", tabsyn_df),
        ("GReaT", great_df),
        ("PAFT", paft_df),
    ]

    fig, axes = plt.subplots(1, n_runs, figsize=(30, 5))

    train_data = train_df.copy()
    train_data["method"] = "Real"
    for i in range(n_runs):
        # Real data first so the palette order is Real, CTGAN, CopulaGAN, TabSyn, GReaT, PAFT
        frames = [train_data] + [runs[i].assign(method=label) for label, runs in generated]
        df = pd.concat(frames).reset_index(drop=True)

        if col in categorical_columns:
            sns.countplot(data=df, x=col, hue="method", ax=axes[i], palette=palette)
            axes[i].tick_params(axis='x', rotation=90)
        else:
            # `fill` replaces the deprecated `shade` argument of kdeplot
            sns.kdeplot(data=df, x=col, hue="method", ax=axes[i], fill=True, palette=palette)
        axes[i].set_title(f"Run {i}")

    # Make sure the target folder exists before saving
    os.makedirs('./distribution', exist_ok=True)
    fig.savefig(f'./distribution/{filename}_{col}.png')
    plt.show()
    # Release the figure: this function is called once per column
    plt.close(fig)

print("\nDistribution:")

# One five-panel figure per column of the training data
for column_name in train.columns:
  print(f" - {column_name} column:")
  distribution(train, column_name)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math

def distribution(train, columns, indexes):
    """Draw a two-panel summary figure: a KDE panel on top, a count-plot panel below.

    `columns` and `indexes` are walked in parallel: each column is plotted using the
    generated run with the paired index. Columns listed in `categorical_columns` go to
    the lower (count) panel, all others to the upper (density) panel. Data is taken
    from the notebook-level frames `train_df`, `ctgan_df`, `copulagan_df`, `tabsyn_df`,
    `great_df` and `paft_df`; the `train` argument is not used.
    """
    method_colors = ['r', '#b9f2f0', '#d0bbff', '#8de5a1', '#FFE48F', 'b']
    fig, axes = plt.subplots(2, 1, figsize=(7, 15))
    density_ax = axes[0]
    count_ax = axes[1]

    real_rows = train_df.copy()
    real_rows["method"] = "Real"

    for col, i in zip(columns, indexes):
        # Real data first, then each method's run `i`, tagged with the method name
        tagged = [real_rows]
        for label, runs in (("CTGAN", ctgan_df), ("CopulaGAN", copulagan_df), ("TabSyn", tabsyn_df),
                            ("GReaT", great_df), ("PAFT", paft_df)):
            run_rows = runs[i].copy()
            run_rows["method"] = label
            tagged.append(run_rows)
        df = pd.concat(tagged).reset_index(drop=True)

        if col not in categorical_columns:
            sns.kdeplot(data=df, x=col, hue="method", ax=density_ax, palette=method_colors, fill=True)
            density_ax.set_xlabel(col, fontsize=20)
            density_ax.set_ylabel('Density', fontsize=20)
            density_ax.set_title('California', fontsize=24)
            continue

        sns.countplot(data=df, x=col, hue="method", ax=count_ax, palette=method_colors)
        count_ax.set_xlabel(col, fontsize=20)
        count_ax.set_ylabel('# Count', fontsize=20)
        count_ax.legend(loc='upper right')

    plt.show()

print("\nDistribution:")

# Categorical column is drawn from run 2, the numeric one from run 0
columns = ['ocean_proximity', 'median_house_value']
run_indexes = [2, 0]
distribution(train, columns, run_indexes)