In [1]:
import sys
sys.path.append("..")
from src.dataset import Dataset
import pandas as pd
import numpy as np
# NOTE(review): star import hides which names come from `definitions`; prefer explicit names.
from definitions import *
import copy
import time

import matplotlib.pyplot as plt

import warnings

# Suppress LightGBM categorical_feature warning
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature keyword has been found*")
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature in param dict is overridden*")

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import RepeatedKFold
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb






In [2]:
# Dataset selected for this notebook run.
dataset_name = "adult"

# Backslash-prefixed form of the dataset name (e.g. "\adult"); presumably a LaTeX macro
# used when exporting results -- TODO confirm. The "credit" dataset gets a "dataset" suffix.
latex_suffix = "dataset" if dataset_name == "credit" else ""
dataset_name_latex = "\\" + dataset_name + latex_suffix

# Build the project's dataset wrapper and work on a private copy of its original dataframe.
dataset_generator = Dataset(dataset_name)
all_data = dataset_generator.original_dataframe.copy()

Dataset adult_fnlwgt_educational-num has ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country'] categorical and ['age', 'capital-gain', 'capital-loss', 'hours-per-week'] numerical columns.


In [3]:
# Timing benchmark: how long each generative method takes to fit and to sample.
#
# For every method, a synthesizer is trained and sampled once per fold of a repeated
# K-fold split of `all_data`. Wall-clock durations (in seconds) are collected per fold and
# then aggregated into the `*_all_mean` / `*_all_std` lists, which hold one entry per method
# in the same order as `generative_methods`.
generative_methods = ["gaussian_copula", "ctgan", "tvae", "cart", "smote"]
num_folds = 3
num_repeats = 5
size = 10000  # number of synthetic rows requested from every fitted synthesizer

# Methods trained with encode=False and sampled with decode=False.
# NOTE(review): presumably these synthesizers consume the raw columns directly -- confirm.
unencoded_methods = {"gaussian_copula", "ctgan", "tvae"}

fit_times_all_mean = []
fit_times_all_std = []
sample_times_all_mean = []
sample_times_all_std = []

both_times_all_mean = []
both_times_all_std = []

# The target column does not change between methods or folds.
target = dataset_generator.target

for generative_method in generative_methods:
    print(generative_method)
    needs_encoding = generative_method not in unencoded_methods
    fit_times = []
    sample_times = []
    both_times = []
    # Same seed for every method, so all methods are timed on identical splits.
    rkf = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=42)
    for i, (train_index, test_index) in enumerate(rkf.split(all_data)):
        # The splitter yields positional indices, so select rows by position (`iloc`);
        # label-based `loc` is only equivalent when the frame has a default RangeIndex.
        data_train, data_test = all_data.iloc[train_index], all_data.iloc[test_index]

        # NOTE(review): the encoded frames are not used by the timing below. The calls are
        # kept because `encode` may update state on `dataset_generator` -- confirm, and
        # drop them if `encode` turns out to be side-effect free.
        data_train_encoded = dataset_generator.encode(data_train, keep_dtypes=True)
        data_test_encoded = dataset_generator.encode(data_test)

        # Training features: everything except the target and the 'sex' column.
        X_train_real = data_train.drop(columns=[target])
        class_split_df = X_train_real.drop(columns=["sex"])

        # --- fit -------------------------------------------------------------------
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        split_synthesizer = dataset_generator.train_synthesizer(
            generative_method, class_split_df, encode=needs_encoding, random_state=i
        )
        time_taken1 = time.perf_counter() - start_time
        fit_times.append(time_taken1)

        # --- sample ----------------------------------------------------------------
        start_time = time.perf_counter()
        if needs_encoding:
            split_synthetic_data = dataset_generator.generate_data(
                split_synthesizer, num=size, random_state=i
            )
        else:
            split_synthetic_data = dataset_generator.generate_data(
                split_synthesizer, num=size, name=generative_method, decode=False, random_state=i
            )
        time_taken = time.perf_counter() - start_time
        sample_times.append(time_taken)
        both_times.append(time_taken1 + time_taken)

    # Aggregate this method's per-fold timings (population std, numpy's default ddof=0).
    fit_times_all_mean.append(np.mean(fit_times))
    fit_times_all_std.append(np.std(fit_times))

    sample_times_all_mean.append(np.mean(sample_times))
    sample_times_all_std.append(np.std(sample_times))

    both_times_all_mean.append(np.mean(both_times))
    both_times_all_std.append(np.std(both_times))

gaussian_copula
ctgan




tvae


  0.19817217]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
  data.iloc[:, 0] = selected_normalized_value
 -0.00760162]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
  data.iloc[:, 0] = selected_normalized_value
  0.01937258]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
  data.iloc[:, 0] = selected_normalized_value
 -0.14519947]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
  data.iloc[:, 0] = selected_normalized_value
  0.30258611]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
  data.iloc[:, 0] = selected_normalized_value
 -0.00103099]' has dtype incompatible with float32, please explicitly cast to a compatible dtype first.
  data.iloc[:, 0] = selected_normalized_value
  0.01014797]' has dtype incompatible with float32, please explicitly cast to a compatible dtype fir

cart
smote


In [4]:
# Report the timing summary, one list per line: the method names first, then the mean and
# standard deviation of the fit times, the sampling times, and the combined fit+sample
# times (seconds). Every list is ordered like `generative_methods`.
for summary_row in (
    generative_methods,
    fit_times_all_mean,
    fit_times_all_std,
    sample_times_all_mean,
    sample_times_all_std,
    both_times_all_mean,
    both_times_all_std,
):
    print(summary_row)

['gaussian_copula', 'ctgan', 'tvae', 'cart', 'smote']
[2.2590940316518147, 215.49478891690572, 73.52785964012146, 1.0198370774586996, 0.016304747263590495]
[0.137682621283486, 49.118277427023344, 7.855912847482573, 0.01790332718083524, 0.004422130339486343]
[0.16322725613911945, 0.15528831481933594, 0.15989872614542644, 0.32647811571756996, 18.955435276031494]
[0.01396410154343295, 0.027763671747503042, 0.27184605944230217, 0.0020027551568974223, 1.6070019652443603]
[2.4223212877909344, 215.65007723172505, 73.68775836626689, 1.3463151931762696, 18.971740023295084]
[0.13851753560966126, 49.132941493935654, 7.957795919995051, 0.018595690990526163, 1.6092161336178314]
