In [1]:
from be_great import GReaT
import json
import pandas as pd
import numpy as np
from src.utils_methods import encode, get_var_metadata, post_process, transform_df_to_json, generate_plots, check_constraints


RUNNING ON GPU


In [2]:
main_folder = ''
res_folder = main_folder + "results/LLM/"
data_folder = main_folder + "data/"  

with open(data_folder + "real_structures.json") as f:
    real_orig = json.load(f)
DATA_ARRAY = [real_orig]
DATA_NAMES = ["real"]

USE_INFORMATIVE_NAMES = True

In [3]:
brace_dict = {    
    "NONE": 0,
    "H": 1,
    "Z": 2,
    "IZ": 3,
    "ZH": 4,
    "IZH": 5,
    "K": 6,
    "X": 7,
    "XH": 8,
    "nan": 9,
}

brace_dict_inv = dict(zip(brace_dict.values(), brace_dict.keys()))

N_BRACES = len(brace_dict)


In [4]:
max_layers = max(d.get('n_layers', 0) for sublist in DATA_ARRAY for d in sublist)
transformed_columns = ["legs", "total_height", "radius_bottom", "radius_top", "n_layers"]
brace_cols = ["brace" + str(i) for i in range(max_layers-1)] 
layer_cols = ["layer_height" + str(i) for i in range(max_layers-2)]
transformed_columns += brace_cols
transformed_columns += layer_cols

dataframes = []
for dataset, name in zip(DATA_ARRAY, DATA_NAMES):
    encoding = [encode(d, max_layers, brace_dict, N_BRACES, one_hot=False, native=True, normalize_layer_heights=True) for d in dataset]
    df_ = pd.DataFrame(encoding, columns=transformed_columns)
    df_["label"] = [name]*len(df_)
    dataframes.append(df_.copy())
    
train_original = pd.concat(dataframes, axis=0, ignore_index=True)

nominal_features = brace_cols
ordinal_features = ["n_layers", "legs"]
BERNOULLI = ["legs"]

discrete_features = nominal_features + ordinal_features


continuous_features = list(set(transformed_columns) - set(nominal_features) - set(ordinal_features))

train_original[ordinal_features] = train_original[ordinal_features].astype("int")
train_original[continuous_features] = train_original[continuous_features].astype("float")

# synth_original[ordinal_features] = synth_original[ordinal_features].astype("int")
# synth_original[continuous_features] = synth_original[continuous_features].astype("float")

train = train_original.drop("label", axis=1)


def get_ordinal(n):
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return f"{n}{suffix}"

In [5]:

dtypes_dict = {'continuous': float, 'categorical': str, 'ordinal': int,\
              'bernoulli': int, 'binomial': int}
    
dtypes_dict_famd = {'continuous': float, 'categorical': str, 'ordinal': str,\
              'bernoulli': str, 'binomial': str}

var_distrib, var_transform_only, le_dict, brace_cols, unique_braces = get_var_metadata(train, train_original, brace_cols, BERNOULLI)
p = train.shape[1]
dtype = {train.columns[j]: dtypes_dict_famd[var_transform_only[j]] for j in range(p)}

In [6]:
model = GReaT(llm='distilgpt2', batch_size=32, epochs=100)

In [7]:
brace_cols_informative = ["brace type of {} layer".format(get_ordinal(i+1)) for i in range(len(brace_cols))]
layer_cols_informative = ["{} layer height".format(get_ordinal(i+1)) for i in range(len(layer_cols))]
columns_informative = ["number of vertical legs", "total height", "bottom radius", "top radius", "number of layers"]

columns_informative += brace_cols_informative
columns_informative += layer_cols_informative
if USE_INFORMATIVE_NAMES:
    train_informative = train.copy()
    train_informative.columns = columns_informative
    print(columns_informative)
    model.fit(train_informative)
else:
    model.fit(train)

['number of vertical legs', 'total height', 'bottom radius', 'top radius', 'number of layers', 'brace type of 1st layer', 'brace type of 2nd layer', 'brace type of 3rd layer', 'brace type of 4th layer', 'brace type of 5th layer', '1st layer height', '2nd layer height', '3rd layer height', '4th layer height']


  0%|          | 0/400 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'train_runtime': 121.2799, 'train_samples_per_second': 82.454, 'train_steps_per_second': 3.298, 'train_loss': 0.8250865173339844, 'epoch': 100.0}


In [8]:

target_nb_pseudo_obs = 20
# sample 5 times because we loose some from constraints


cols = train.columns


layer_cols_ = [train.columns.get_loc(col) for col in layer_cols]
brace_cols_ = [train.columns.get_loc(col) for col in brace_cols]



other_continuous_cols = list(set(np.where(var_distrib == 'continuous')[0]) - set(layer_cols))
min_values = train.min()
max_values = train.max()

ranges = np.array(list(zip(min_values, max_values)))
ranges_cont = [ranges[other_continuous_cols], other_continuous_cols]


def get_samples(target_nb_pseudo_obs):
    y_new_all = []
    total_nb_obs_generated = 0
    nb_pseudo_obs = 0
    nb_points = target_nb_pseudo_obs
    while nb_pseudo_obs <= target_nb_pseudo_obs:
        y_new = model.sample(n_samples=nb_points,max_length=4000)
        if USE_INFORMATIVE_NAMES:
            y_new.columns = train.columns
        mask = np.apply_along_axis(lambda x: check_constraints(x, var_distrib, le_dict, cols, layer_cols_, brace_cols_, continuous_ranges=ranges_cont), axis=1, arr=y_new)
        filtered_rows = y_new[mask]
        y_new_all.append(filtered_rows)

        total_nb_obs_generated += len(y_new)

        nb_pseudo_obs = len(np.concatenate(y_new_all))

        nb_points = target_nb_pseudo_obs - nb_pseudo_obs + target_nb_pseudo_obs//10

    # Keep target_nb_pseudo_obs pseudo-observations
    y_new_all = np.concatenate(y_new_all)
    y_new_all = y_new_all[:target_nb_pseudo_obs]
    return y_new_all

# y_new_all = get_samples(target_nb_pseudo_obs)
# pred = pd.DataFrame(y_new_all, columns = train.columns) 

In [9]:
mask = np.apply_along_axis(lambda x: check_constraints(x, var_distrib, le_dict, cols, layer_cols_, brace_cols_, continuous_ranges=ranges_cont), axis=1, arr=y_new)


NameError: name 'y_new' is not defined

In [None]:
pred.shape

In [None]:
pred_post, pred_post_famd = post_process(pred, train.columns, le_dict, discrete_features, brace_cols, dtype, layer_cols_)


In [None]:
import json
json_output = transform_df_to_json(pred_post)

prfx = "_informative" if USE_INFORMATIVE_NAMES else ""
out_name = "llm("+"_".join(DATA_NAMES) + ")_" + str(target_nb_pseudo_obs) + prfx +".json"
output_filename = res_folder + out_name
with open(output_filename, "w") as json_file:
    json.dump(json_output, json_file, indent=4)

In [None]:

generate_plots(["real", "synthetic (GA)", "synthetic"], train_original, pred_post, None, None, var_distrib, le_dict, brace_cols, unique_braces, res_folder)        

In [10]:
import time
import numpy as np
import matplotlib.pyplot as plt

# Initialize lists to store results
target_values = [100, 500, 1000]
num_iterations = 1  # Number of times to run the function for each target value
execution_times = []

for target_nb_pseudo_obs in target_values:
    times_for_target = []
    print(f"Processing target_nb_pseudo_obs = {target_nb_pseudo_obs}")
    start_time = time.time()
    y_new_all = get_samples(target_nb_pseudo_obs)
    end_time = time.time()
    execution_time = end_time - start_time

    execution_times.append(execution_time)


Processing target_nb_pseudo_obs = 100


100%|██████████| 100/100 [01:06<00:00,  1.51it/s]
57it [00:46,  1.22it/s]                        
100%|██████████| 28/28 [00:28<00:00,  1.01s/it]
100%|██████████| 16/16 [00:13<00:00,  1.15it/s]


Processing target_nb_pseudo_obs = 500


501it [06:47,  1.23it/s]                         
324it [04:20,  1.24it/s]                         
193it [02:49,  1.14it/s]                         
96it [01:19,  1.21it/s]                        
100%|██████████| 50/50 [00:41<00:00,  1.20it/s]


Processing target_nb_pseudo_obs = 1000


1001it [13:50,  1.21it/s]                         
100%|██████████| 609/609 [08:54<00:00,  1.14it/s]
328it [04:24,  1.24it/s]                         
180it [02:20,  1.28it/s]                         


In [12]:
# Save results to a file
with open(res_folder+'execution_times.txt', 'w') as f:
    for target_nb_pseudo_obs, mean_time in zip(target_values, execution_times):
        f.write(f'Target obs: {target_nb_pseudo_obs}, Mean Time: {mean_time:.4f}\n')