In [1]:
from sdv.single_table import TVAESynthesizer
from sdv.metadata import SingleTableMetadata
import json
import pandas as pd
import numpy as np
import sdv
from src.utils_methods import encode, get_var_metadata, post_process, transform_df_to_json, generate_plots, check_constraints

RUNNING ON GPU


In [2]:
# Folder layout, all relative to `main_folder` (empty string = current working directory).
main_folder = ''
data_folder = f"{main_folder}data/"
res_folder = f"{main_folder}results/VAE/"

# The real structures are read from the working directory (this path does not go through `data_folder`).
with open("real_structures.json") as f:
    real_orig = json.load(f)

# Parallel lists: DATA_NAMES[i] is the label attached to the rows of DATA_ARRAY[i].
DATA_NAMES = ["real"]
DATA_ARRAY = [real_orig]

In [3]:
# Brace type -> integer code. The code is simply the position in this list,
# so the order below must not change.
brace_dict = {
    brace_name: brace_code
    for brace_code, brace_name in enumerate(
        ["NONE", "H", "Z", "IZ", "ZH", "IZH", "K", "X", "XH", "nan"]
    )
}

# Reverse lookup: integer code -> brace type.
brace_dict_inv = {brace_code: brace_name for brace_name, brace_code in brace_dict.items()}

# Number of distinct brace codes (the "nan" placeholder included).
N_BRACES = len(brace_dict)


In [4]:
# The structure with the most layers fixes how many brace / layer-height columns exist.
max_layers = max(d.get('n_layers', 0) for sublist in DATA_ARRAY for d in sublist)

# Column layout of one encoded structure: five scalar features, then one brace
# column per index in range(max_layers - 1), then one layer-height column per
# index in range(max_layers - 2).
# NOTE(review): this order presumably mirrors the vector returned by `encode` - confirm.
transformed_columns = ["legs", "total_height", "radius_bottom", "radius_top", "n_layers"]
brace_cols = ["brace" + str(i) for i in range(max_layers-1)]
layer_cols = ["layer_height" + str(i) for i in range(max_layers-2)]
transformed_columns += brace_cols
transformed_columns += layer_cols

# Encode every dataset into a frame and tag each row with its dataset name.
dataframes = []
for dataset, name in zip(DATA_ARRAY, DATA_NAMES):
    encoding = [encode(d, max_layers, brace_dict, N_BRACES, one_hot=False, native=True, normalize_layer_heights=True) for d in dataset]
    df_ = pd.DataFrame(encoding, columns=transformed_columns)
    df_["label"] = name  # scalar broadcast: same value on every row
    dataframes.append(df_)  # `df_` is rebound on each iteration, so no defensive copy is needed

train_original = pd.concat(dataframes, axis=0, ignore_index=True)

# Feature groups used by the rest of the notebook.
nominal_features = brace_cols
ordinal_features = ["n_layers", "legs"]
BERNOULLI = ["legs"]

discrete_features = nominal_features + ordinal_features

# Everything that is neither nominal nor ordinal is continuous. Filtering the
# column list (instead of subtracting sets) keeps the result in column order,
# so it is identical from one kernel session to the next.
continuous_features = [
    col for col in transformed_columns
    if col not in nominal_features and col not in ordinal_features
]

train_original[ordinal_features] = train_original[ordinal_features].astype("int")
train_original[continuous_features] = train_original[continuous_features].astype("float")

# synth_original[ordinal_features] = synth_original[ordinal_features].astype("int")
# synth_original[continuous_features] = synth_original[continuous_features].astype("float")

# Training frame: same rows, without the dataset label.
train = train_original.drop("label", axis=1)

In [5]:

# Python type associated with each variable kind.
dtypes_dict = {
    'continuous': float,
    'categorical': str,
    'ordinal': int,
    'bernoulli': int,
    'binomial': int,
}

# Same kinds, but every non-continuous kind is mapped to `str`.
dtypes_dict_famd = {
    'continuous': float,
    'categorical': str,
    'ordinal': str,
    'bernoulli': str,
    'binomial': str,
}

# Per-variable metadata from the project helper; note that `brace_cols` is rebound here.
(
    var_distrib,
    var_transform_only,
    le_dict,
    brace_cols,
    unique_braces,
) = get_var_metadata(train, train_original, brace_cols, BERNOULLI)

# Number of training columns.
p = train.shape[1]

# Column name -> dtype, looked up through the kind stored at the same position in `var_transform_only`.
dtype = {
    column: dtypes_dict_famd[var_transform_only[position]]
    for position, column in enumerate(train.columns)
}

In [6]:
from vae.tvae_ours import TVAE
from sdv.single_table.utils import detect_discrete_columns

In [7]:
# Copy of the labelled training frame with the two ordinal columns cast to strings.
# (The variable name keeps its original spelling because it is a notebook-level global.)
dicrete_dtypes = train_original.astype({"legs": "str", "n_layers": "str"})


In [8]:
# Let SDV infer one sdtype per column. Detection runs on `train_original`,
# i.e. the frame that still carries the "label" column.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=train_original)
print(metadata)
# print(train.dtypes)
# print(train_original.dtypes)
# synthesizer = TVAESynthesizer(metadata)
# Discrete columns are resolved against `train` (the frame without "label").
# NOTE(review): the saved output of this cell lists "legs" and "n_layers" as
# "numerical" - confirm that treating them as continuous is intended.
discrete_columns = detect_discrete_columns(metadata, train)
print(discrete_columns)

{
    "columns": {
        "legs": {
            "sdtype": "numerical"
        },
        "total_height": {
            "sdtype": "numerical"
        },
        "radius_bottom": {
            "sdtype": "numerical"
        },
        "radius_top": {
            "sdtype": "numerical"
        },
        "n_layers": {
            "sdtype": "numerical"
        },
        "brace0": {
            "sdtype": "categorical"
        },
        "brace1": {
            "sdtype": "categorical"
        },
        "brace2": {
            "sdtype": "categorical"
        },
        "brace3": {
            "sdtype": "categorical"
        },
        "brace4": {
            "sdtype": "categorical"
        },
        "layer_height0": {
            "sdtype": "numerical"
        },
        "layer_height1": {
            "sdtype": "numerical"
        },
        "layer_height2": {
            "sdtype": "numerical"
        },
        "layer_height3": {
            "sdtype": "numerical"
        },
        "label":

In [9]:
# Alternative configuration with explicit layer sizes, kept for reference:
# numm = 50
# num2 = 50
# synthesizer_ours = TVAE(embedding_dim=2, compress_dims=(numm,num2), decompress_dims=(num2,numm))
# Project TVAE with `embedding_dim=2` (presumably the latent-space size - confirm
# in vae.tvae_ours); every other setting is left at the class default.
synthesizer_ours = TVAE(embedding_dim=2)
# Fit on the unlabelled training frame, passing the discrete columns detected above.
# NOTE(review): no random seed is set anywhere visible, so the fit is likely not
# reproducible run to run - confirm.
synthesizer_ours.fit(train, discrete_columns=discrete_columns)

(128, 128)
(128, 128)
23044
22876


In [10]:
# import time
# execution_times = []
# for _ in range(5):
#     synthesizer_ours = TVAE(embedding_dim=2)
#     start_time = time.time()
#     synthesizer_ours.fit(train, discrete_columns=discrete_columns)
#     end_time = time.time()
#     execution_time = end_time - start_time
#     execution_times.append(execution_time)
# print(np.mean(execution_times))

In [11]:
# Number of constraint-satisfying synthetic rows to produce.
target_nb_pseudo_obs = 1000
# Rows requested per sampling round; more rounds may be needed because we lose
# some samples to the constraint check.
nb_points= target_nb_pseudo_obs

# Positional indices (not names) of the layer-height and brace columns in `train`.
cols = train.columns
layer_cols_ = np.array([i for i, s in enumerate(cols) if s.startswith('layer_height')])
brace_cols_ = np.array([i for i, s in enumerate(cols) if s.startswith('brace')])

# Indices of the continuous columns that are NOT layer heights.
# The subtraction has to use the index array `layer_cols_`: `layer_cols` holds
# column *names*, and removing strings from a set of integers removes nothing.
# `sorted` makes the resulting order deterministic.
continuous_idx = np.where(var_distrib == 'continuous')[0]
other_continuous_cols = sorted(set(continuous_idx.tolist()) - set(layer_cols_.tolist()))

# Observed (min, max) of every training column, one row per column.
min_values = train.min()
max_values = train.max()

ranges = np.array(list(zip(min_values, max_values)))
# [ranges of the selected columns, their positional indices] - forwarded to
# `check_constraints` as `continuous_ranges`.
ranges_cont = [ranges[other_continuous_cols], other_continuous_cols]


def get_samples(synthesizer, target_nb_pseudo_obs, var_distrib, le_dict, layer_cols_, brace_cols_, ranges_cont):
    """Sample from `synthesizer` until enough rows pass `check_constraints`.

    Rows are drawn in rounds. Each round keeps only the rows for which
    `check_constraints` returns a truthy value, and the next round asks for
    the number of rows still missing plus a margin of 10% of the target.
    Sampling stops as soon as the target is reached.

    Parameters
    ----------
    synthesizer : object
        Must expose `sample(samples=n)` returning a 2-D array-like with one
        row per synthetic observation.
    target_nb_pseudo_obs : int
        Number of valid rows to return; must be positive.
    var_distrib, le_dict, layer_cols_, brace_cols_ :
        Forwarded unchanged to `check_constraints`.
    ranges_cont :
        Forwarded to `check_constraints` as `continuous_ranges`.

    Returns
    -------
    The first `target_nb_pseudo_obs` accepted rows, concatenated in the order
    they were sampled.

    Raises
    ------
    ValueError
        If `target_nb_pseudo_obs` is not positive.

    Notes
    -----
    The column index `cols` is read from module scope, not passed in.
    There is no cap on the number of rounds: if no row ever satisfies the
    constraints, this function does not terminate.
    """
    if target_nb_pseudo_obs <= 0:
        raise ValueError("target_nb_pseudo_obs must be a positive integer")

    accepted_batches = []
    nb_pseudo_obs = 0
    nb_points = target_nb_pseudo_obs
    # Strict comparison: stop once the target is met. With `<=` an extra round
    # was always run after success, and for targets below 10 that round asked
    # for zero rows, which `np.apply_along_axis` cannot handle.
    while nb_pseudo_obs < target_nb_pseudo_obs:
        y_new = synthesizer.sample(samples=nb_points)
        mask = np.apply_along_axis(
            lambda x: check_constraints(x, var_distrib, le_dict, cols, layer_cols_, brace_cols_, continuous_ranges=ranges_cont),
            axis=1,
            arr=y_new,
        )
        filtered_rows = y_new[mask]
        accepted_batches.append(filtered_rows)

        # Running count instead of re-concatenating every batch on each round.
        nb_pseudo_obs += len(filtered_rows)
        # Ask for what is still missing, plus a 10% safety margin.
        nb_points = target_nb_pseudo_obs - nb_pseudo_obs + target_nb_pseudo_obs//10

    # Keep target_nb_pseudo_obs pseudo-observations
    y_new_all = np.concatenate(accepted_batches)
    return y_new_all[:target_nb_pseudo_obs]

# Draw the constraint-satisfying synthetic rows from the fitted model ...
y_new_all = get_samples(
    synthesizer=synthesizer_ours,
    target_nb_pseudo_obs=target_nb_pseudo_obs,
    var_distrib=var_distrib,
    le_dict=le_dict,
    layer_cols_=layer_cols_,
    brace_cols_=brace_cols_,
    ranges_cont=ranges_cont,
)
# ... and label them with the training column names.
pred = pd.DataFrame(data=y_new_all, columns=train.columns)

In [12]:
# Display the sampled frame as returned by the model (before `post_process` is applied).
pred

Unnamed: 0,legs,total_height,radius_bottom,radius_top,n_layers,brace0,brace1,brace2,brace3,brace4,layer_height0,layer_height1,layer_height2,layer_height3
0,1.0,53.440403,14.999383,8.669183,3.0,1.0,0.0,1.0,1.0,1.0,0.294616,0.600353,0.785857,0.999703
1,1.0,53.373935,21.572395,12.655244,3.0,1.0,0.0,1.0,1.0,1.0,0.368346,0.599615,0.885198,0.999933
2,1.0,53.597140,17.297660,6.875897,1.0,1.0,0.0,1.0,1.0,1.0,0.255153,0.490946,0.785924,0.999634
3,1.0,63.446582,20.568390,15.721328,2.0,1.0,0.0,1.0,2.0,1.0,0.408644,0.701818,0.997866,0.999910
4,1.0,50.648011,18.332917,8.521645,3.0,1.0,0.0,1.0,1.0,1.0,0.209134,0.545788,0.744488,0.999884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1.0,49.931788,21.351023,8.716566,3.0,1.0,0.0,1.0,1.0,1.0,0.369962,0.542148,0.855061,0.999909
996,1.0,45.676806,15.338427,7.565653,3.0,1.0,0.0,1.0,1.0,1.0,0.274879,0.509877,0.765964,0.999540
997,1.0,40.518593,20.074015,8.647101,3.0,1.0,0.0,1.0,1.0,1.0,0.255073,0.542651,0.853758,0.999908
998,1.0,49.893226,16.542276,7.816276,3.0,1.0,0.0,1.0,1.0,1.0,0.303275,0.523787,0.782815,0.999841


In [13]:
# Post-process the raw samples with the project helper. It returns two frames:
# `pred_post` is the one exported to JSON below; `pred_post_famd` is only
# referenced by the commented-out latent-space plotting code.
# NOTE(review): presumably this decodes the label-encoded columns and applies
# the dtypes in `dtype` - confirm in src.utils_methods.
pred_post, pred_post_famd = post_process(pred, train.columns, le_dict, discrete_features, brace_cols, dtype, layer_cols_)

In [14]:
import json
import os

# Convert the post-processed samples to the project's JSON structure.
json_output = transform_df_to_json(pred_post)

# File name pattern: vae(<dataset names joined by "_">)_<target sample count>.json
out_name = "vae("+"_".join(DATA_NAMES) + ")_" + str(target_nb_pseudo_obs) + ".json"
output_filename = res_folder + out_name

# On a fresh checkout the results folder may not exist yet, and `open(..., "w")`
# does not create missing parent directories.
if res_folder:
    os.makedirs(res_folder, exist_ok=True)

with open(output_filename, "w") as json_file:
    json.dump(json_output, json_file, indent=4)

In [15]:
# import os
# def generate_seeds(target_nb_pseudo_obs, num_seeds=5):
#     for seed in range(num_seeds):
#         y_new_all = get_samples(synthesizer_ours, target_nb_pseudo_obs, var_distrib, le_dict, layer_cols_, brace_cols_, ranges_cont)
#         pred = pd.DataFrame(y_new_all, columns = train.columns) 
#         pred_post, pred_post_famd = post_process(pred, train.columns, le_dict, discrete_features, brace_cols, dtype, layer_cols_)
#         json_output = transform_df_to_json(pred_post)
#         outfolder = res_folder + "seeds/" + str(target_nb_pseudo_obs) +"/"
#         if not os.path.exists(outfolder):
#             os.makedirs(outfolder)
#         out_name = "vae("+"_".join(DATA_NAMES) + ")_" + str(target_nb_pseudo_obs) + "_seed_" + str(seed) + ".json"
#         output_filename = outfolder + out_name
#         with open(output_filename, "w") as json_file:
#             json.dump(json_output, json_file, indent=4)

# checkpoints = [100, 500, 1000, 1500, 2000]
# for checkpoint in checkpoints:
#     generate_seeds(checkpoint, 5)

In [16]:

# zz_train, _, _ = synthesizer_ours.encode(train)
# zz_pred, _, _ = synthesizer_ours.encode(pred_post_famd)

# generate_plots(["real", "synthetic (GA)", "synthetic"], train_original, pred_post, zz_train, zz_pred, var_distrib, le_dict, brace_cols, unique_braces, res_folder, percentage=False)        

In [22]:
import time
import numpy as np
import matplotlib.pyplot as plt

# Benchmark: wall-clock time of `get_samples` for each target sample count.
# Full sweep (slow): target_values = range(100, 10001, 100)
target_values = [100]
num_iterations = 5 # Number of times to run the function for each target value
execution_times = []  # one list of timings (seconds) per entry of `target_values`

# The loop variable is deliberately NOT called `target_nb_pseudo_obs`: that
# global names the exported JSON file and must keep the value used to build `pred`.
for bench_target in target_values:
    times_for_target = []
    for iteration in range(num_iterations):
        print(f"Processing target_nb_pseudo_obs = {bench_target}, Iteration {iteration+1}/{num_iterations}...")
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start_time = time.perf_counter()
        # The result is discarded on purpose so that `y_new_all` (the samples
        # behind `pred`) is not overwritten by benchmark output.
        get_samples(synthesizer_ours, bench_target, var_distrib, le_dict, layer_cols_, brace_cols_, ranges_cont)
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        times_for_target.append(execution_time)

    execution_times.append(times_for_target)

# Calculate mean and standard deviation for each target value
mean_execution_times = [np.mean(times) for times in execution_times]
std_execution_times = [np.std(times) for times in execution_times]

print(mean_execution_times)


Processing target_nb_pseudo_obs = 100, Iteration 1/5...
Processing target_nb_pseudo_obs = 100, Iteration 2/5...
Processing target_nb_pseudo_obs = 100, Iteration 3/5...
Processing target_nb_pseudo_obs = 100, Iteration 4/5...
Processing target_nb_pseudo_obs = 100, Iteration 5/5...
[0.11726069450378418]


In [18]:
# # Save results to a file
# with open(res_folder+'execution_times.txt', 'w') as f:
#     for target_nb_pseudo_obs, mean_time, std_time in zip(target_values, mean_execution_times, std_execution_times):
#         f.write(f'Target obs: {target_nb_pseudo_obs}, Mean Time: {mean_time:.4f}, Std Dev: {std_time:.4f}\n')



In [19]:
# # Plot the results with mean and std
# plt.plot(target_values, mean_execution_times, label='Mean Execution Time', color='b')
# plt.fill_between(target_values, 
#                  np.array(mean_execution_times) - np.array(std_execution_times), 
#                  np.array(mean_execution_times) + np.array(std_execution_times), 
#                  color='b', alpha=0.2, label='Standard Deviation')
# plt.xlabel('Target Pseudo Observations')
# plt.ylabel('Execution Time (seconds)')
# plt.title('Execution Time with Uncertainty vs. Target Pseudo Observations')
# plt.legend()
# plt.show()