In [1]:
import sys
sys.path.append("..")
from src.dataset import Dataset
import pandas as pd
import numpy as np
# NOTE: original import order is kept around this star import so that name
# shadowing between `definitions` and the explicit imports is unchanged.
from definitions import *
import copy
import time

import matplotlib.pyplot as plt

import warnings

# Suppress LightGBM categorical_feature warning
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature keyword has been found*")
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature in param dict is overridden*")

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import RepeatedKFold
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb






In [2]:
# Name of the dataset to load; also the stem of its LaTeX macro name.
dataset_name = "adult"

# LaTeX macro name: a backslash followed by the dataset name, with the suffix
# "dataset" appended only for the "credit" dataset.
dataset_name_latex = "\\" + dataset_name + ("dataset" if dataset_name == "credit" else "")

# Project dataset wrapper, plus a private copy of its original dataframe so
# later cells can slice it without touching the wrapper's own frame.
dataset_generator = Dataset(dataset_name)
all_data = dataset_generator.original_dataframe.copy()

Dataset adult_fnlwgt_educational-num has ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'] categorical and ['age', 'capital-gain', 'capital-loss', 'hours-per-week'] numerical columns.


In [3]:
# ---------------------------------------------------------------------------
# Timing benchmark.
# For every generative method, fit a synthesizer on each train split of a
# repeated k-fold partition of `all_data`, then sample `size` synthetic rows.
# The fit time, sample time and their sum are recorded per split, and the
# mean / standard deviation per method are appended to the *_all_* lists.
# ---------------------------------------------------------------------------

# generative_methods = ["tabfairgan", "gaussian_copula", "ctgan", "tvae", "cart", "smote"]
generative_methods = ["tabfairgan"]
num_folds = 3
num_repeats = 3
size = 10000  # number of synthetic rows requested from each fitted synthesizer

# Methods that are trained with encode=False and sampled with decode=False.
no_encode_methods = {"tvae", "ctgan", "gaussian_copula"}

# One entry per generative method, in the order of `generative_methods`.
fit_times_all_mean = []
fit_times_all_std = []
sample_times_all_mean = []
sample_times_all_std = []
both_times_all_mean = []
both_times_all_std = []

target = dataset_generator.target
target_class_desired = dataset_generator.target_class_desired
dtype_map = dataset_generator.dtype_map

tab_fair_gan_args = {
    "verbose": False,
    "protected_attributes": ["sex"],
    "target": target,
    "target_class_desired": target_class_desired,
}

for generative_method in generative_methods:
    print(generative_method)

    # Per-split timings (seconds) for the current method.
    fit_times = []
    sample_times = []
    both_times = []

    rkf = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=42)
    for i, (train_index, test_index) in enumerate(rkf.split(all_data)):
        # split() yields *positional* row indices, so select with .iloc;
        # .loc would look them up as index labels and is only equivalent
        # when the frame carries a default RangeIndex.
        data_train, data_test = all_data.iloc[train_index], all_data.iloc[test_index]

        # NOTE(review): the encoded frames are not used further down in this
        # cell. The calls are kept in case encode() has side effects on
        # dataset_generator — confirm, then drop them if it has none.
        data_train_encoded = dataset_generator.encode(data_train, keep_dtypes=True)
        data_test_encoded = dataset_generator.encode(data_test)

        # Training features without the target, and the same features without
        # the "sex" column (the input for every method except tabfairgan).
        X_train_real = data_train.drop(columns=[target])
        class_split_df = X_train_real.drop(columns=["sex"])

        # --- fit -----------------------------------------------------------
        # perf_counter() is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments during a multi-minute fit.
        fit_start = time.perf_counter()
        if generative_method in no_encode_methods:
            split_synthesizer = dataset_generator.train_synthesizer(
                generative_method, class_split_df, encode=False, random_state=i
            )
        elif generative_method == "tabfairgan":
            # TabFairGAN is fitted on the full training split (target and
            # "sex" included); the fold counter doubles as the seed.
            split_synthesizer = TabFairGAN(seed=i, dtype_map=dtype_map, **tab_fair_gan_args)
            split_synthesizer.fit(data_train)
        else:
            split_synthesizer = dataset_generator.train_synthesizer(
                generative_method, class_split_df, encode=True, random_state=i
            )
        fit_seconds = time.perf_counter() - fit_start
        fit_times.append(fit_seconds)

        # --- sample --------------------------------------------------------
        sample_start = time.perf_counter()
        if generative_method in no_encode_methods:
            split_synthetic_data = dataset_generator.generate_data(
                split_synthesizer, num=size, name=generative_method, decode=False, random_state=i
            )
        elif generative_method == "tabfairgan":
            split_synthetic_data = split_synthesizer.generate(int(size))
        else:
            split_synthetic_data = dataset_generator.generate_data(
                split_synthesizer, num=size, random_state=i
            )
        sample_seconds = time.perf_counter() - sample_start
        sample_times.append(sample_seconds)

        both_times.append(fit_seconds + sample_seconds)

    # Aggregate over all splits of this method (np.std defaults to ddof=0).
    fit_times_all_mean.append(np.mean(fit_times))
    fit_times_all_std.append(np.std(fit_times))

    sample_times_all_mean.append(np.mean(sample_times))
    sample_times_all_std.append(np.std(sample_times))

    both_times_all_mean.append(np.mean(both_times))
    both_times_all_std.append(np.std(both_times))

tabfairgan


Training epochs TabFairGAN: 100%|██████████| 200/200 [03:02<00:00,  1.10it/s]
Training epochs TabFairGAN: 100%|██████████| 200/200 [03:12<00:00,  1.04it/s]
Training epochs TabFairGAN: 100%|██████████| 200/200 [03:14<00:00,  1.03it/s]
Training epochs TabFairGAN: 100%|██████████| 200/200 [02:57<00:00,  1.13it/s]
Training epochs TabFairGAN: 100%|██████████| 200/200 [03:04<00:00,  1.09it/s]
Training epochs TabFairGAN: 100%|██████████| 200/200 [03:10<00:00,  1.05it/s]
Training epochs TabFairGAN: 100%|██████████| 200/200 [02:54<00:00,  1.15it/s]
Training epochs TabFairGAN: 100%|██████████| 200/200 [03:16<00:00,  1.02it/s]
Training epochs TabFairGAN: 100%|██████████| 200/200 [03:14<00:00,  1.03it/s]


In [4]:
# Print the benchmark results, one list per line: the method names, then the
# mean and standard deviation of the fit, sampling and combined times
# (each list holds one entry per generative method).
timing_report = (
    generative_methods,
    fit_times_all_mean,
    fit_times_all_std,
    sample_times_all_mean,
    sample_times_all_std,
    both_times_all_mean,
    both_times_all_std,
)
for report_row in timing_report:
    print(report_row)

['tabfairgan']
[187.52189183235168]
[7.641595471229717]
[0.011350525750054253]
[0.005235342892258026]
[187.53324235810175]
[7.6401043596181495]
