In [1]:
from be_great import GReaT
import json
import pandas as pd
import numpy as np
from src.utils_methods import encode, get_var_metadata, post_process, transform_df_to_json, generate_plots, check_constraints
import be_great

RUNNING ON GPU


In [2]:
# Print the file path of the imported be_great package to show which installation is in use.
print(be_great.__file__)

/home/panagiotou/anaconda3/envs/samplestructures/lib/python3.11/site-packages/be_great/__init__.py


In [3]:
# Folder layout; every path below is built relative to `main_folder`.
main_folder = ''
USE_INFORMATIVE_NAMES = False
# Results go to a separate folder when informative column names are used.
res_folder = main_folder + (
    "results/LLM_informative/" if USE_INFORMATIVE_NAMES else "results/LLM/"
)
data_folder = main_folder + "data/"

# Load the real structures; they are the only dataset registered here.
with open("real_structures.json") as f:
    real_orig = json.load(f)
DATA_NAMES = ["real"]
DATA_ARRAY = [real_orig]


In [4]:
# Brace type -> integer code; the code is the position of the name in the list.
brace_dict = {
    brace: code
    for code, brace in enumerate(
        ["NONE", "H", "Z", "IZ", "ZH", "IZH", "K", "X", "XH", "nan"]
    )
}

# Reverse lookup: integer code -> brace type.
brace_dict_inv = {code: brace for brace, code in brace_dict.items()}

# Number of distinct brace codes (the "nan" entry included).
N_BRACES = len(brace_dict)


In [5]:
# The largest layer count over all datasets fixes how many brace / layer-height columns exist.
max_layers = max(
    structure.get('n_layers', 0)
    for structures in DATA_ARRAY
    for structure in structures
)
brace_cols = [f"brace{i}" for i in range(max_layers - 1)]
layer_cols = [f"layer_height{i}" for i in range(max_layers - 2)]
transformed_columns = [
    "legs", "total_height", "radius_bottom", "radius_top", "n_layers",
    *brace_cols,
    *layer_cols,
]

# Encode every dataset into a frame and tag its rows with the dataset name.
dataframes = []
for dataset, name in zip(DATA_ARRAY, DATA_NAMES):
    encoding = [
        encode(
            d,
            max_layers,
            brace_dict,
            N_BRACES,
            one_hot=False,
            native=True,
            normalize_layer_heights=True,
        )
        for d in dataset
    ]
    df_ = pd.DataFrame(encoding, columns=transformed_columns)
    df_["label"] = [name] * len(df_)
    dataframes.append(df_.copy())

train_original = pd.concat(dataframes, ignore_index=True)

# Feature groups used further down.
nominal_features = brace_cols
ordinal_features = ["n_layers", "legs"]
BERNOULLI = ["legs"]

discrete_features = nominal_features + ordinal_features

# Everything that is neither nominal nor ordinal is treated as continuous.
continuous_features = list(set(transformed_columns) - set(nominal_features) - set(ordinal_features))

train_original[ordinal_features] = train_original[ordinal_features].astype("int")
train_original[continuous_features] = train_original[continuous_features].astype("float")

# Disabled: the same casts for a synthetic frame.
# synth_original[ordinal_features] = synth_original[ordinal_features].astype("int")
# synth_original[continuous_features] = synth_original[continuous_features].astype("float")

# Training frame without the dataset label.
train = train_original.drop(columns="label")


def get_ordinal(n):
    """Return ``n`` followed by its English ordinal suffix, e.g. ``1st``, ``12th``, ``23rd``.

    Numbers whose last two digits lie in 10..20 always take ``th``; otherwise
    the last digit selects ``st`` / ``nd`` / ``rd``, with ``th`` as the fallback.
    """
    by_last_digit = {1: 'st', 2: 'nd', 3: 'rd'}
    suffix = 'th' if 10 <= n % 100 <= 20 else by_last_digit.get(n % 10, 'th')
    return f"{n}{suffix}"

In [6]:

# Python type associated with each variable kind.
dtypes_dict = {
    'continuous': float,
    'categorical': str,
    'ordinal': int,
    'bernoulli': int,
    'binomial': int,
}

# Same kinds, but every non-continuous kind is mapped to str.
dtypes_dict_famd = {
    'continuous': float,
    'categorical': str,
    'ordinal': str,
    'bernoulli': str,
    'binomial': str,
}

var_distrib, var_transform_only, le_dict, brace_cols, unique_braces = get_var_metadata(
    train, train_original, brace_cols, BERNOULLI
)
p = train.shape[1]
# Column name -> type, looked up through the kind stored at the same position in `var_transform_only`.
dtype = {
    column: dtypes_dict_famd[var_transform_only[j]]
    for j, column in enumerate(train.columns)
}

In [7]:
# GReaT generator configured with the 'distilgpt2' language model.
model = GReaT(
    llm='distilgpt2',
    batch_size=32,
    epochs=100,
)

In [8]:
# Total number of elements over all parameters that require gradients.
pytorch_total_params = sum(
    param.numel()
    for param in model.model.parameters()
    if param.requires_grad
)
print(pytorch_total_params)

81912576


In [9]:
import torch
def get_flat_params(model):
    """Concatenate all parameters of ``model`` into a single 1-D tensor.

    Parameters are visited in the order given by ``model.parameters()`` and
    each one is flattened before concatenation.
    """
    flattened = [torch.flatten(weight) for weight in model.parameters()]
    return torch.cat(flattened)
# Keep an independent copy (detach + clone) of the flattened parameters so it can be
# compared with the model's parameters later on.
initial_params = get_flat_params(model.model).detach().clone()

In [10]:
# Display the (rows, columns) shape of the training frame.
train.shape

(100, 14)

In [11]:
# Human-readable column names: five base columns, then one name per brace
# column, then one name per layer-height column.
brace_cols_informative = [
    f"brace type of {get_ordinal(position)} layer"
    for position in range(1, len(brace_cols) + 1)
]
layer_cols_informative = [
    f"{get_ordinal(position)} layer height"
    for position in range(1, len(layer_cols) + 1)
]
columns_informative = [
    "number of vertical legs", "total height", "bottom radius", "top radius", "number of layers",
    *brace_cols_informative,
    *layer_cols_informative,
]

# Fit on the plain frame, or on a renamed copy when informative names are requested.
if not USE_INFORMATIVE_NAMES:
    model.fit(train)
else:
    train_informative = train.copy()
    train_informative.columns = columns_informative
    print(columns_informative)
    model.fit(train_informative)

  0%|          | 0/400 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'train_runtime': 97.5673, 'train_samples_per_second': 102.493, 'train_steps_per_second': 4.1, 'train_loss': 0.9344205474853515, 'epoch': 100.0}


In [12]:
# Flatten the current parameters and move them to the CPU for the comparison.
final_params = get_flat_params(model.model).cpu()

# Positions whose value differs from the earlier snapshot, and the values found there.
changed_indices = (initial_params != final_params).nonzero()
changed_params = final_params[changed_indices]

print(f"Num of changed parameters: {len(changed_params)}")

Num of changed parameters: 81208978


In [13]:

target_nb_pseudo_obs = 10
# Some generated rows are lost to the constraint check, so sampling is repeated
# until enough valid rows are available (see get_samples).

cols = train.columns

# Positional indices (not names) of the layer-height and brace columns in `train`.
layer_cols_ = [train.columns.get_loc(col) for col in layer_cols]
brace_cols_ = [train.columns.get_loc(col) for col in brace_cols]

# Continuous columns other than the layer heights, as positional indices.
# `np.where` returns positions, so they have to be compared with `layer_cols_`
# (positions); subtracting `layer_cols` (names) would never remove anything.
# Sorting keeps the index order stable between runs.
other_continuous_cols = sorted(
    set(np.where(var_distrib == 'continuous')[0]) - set(layer_cols_)
)

# Observed (min, max) of every training column, one row per column.
min_values = train.min()
max_values = train.max()

ranges = np.array(list(zip(min_values, max_values)))
# Ranges of the selected continuous columns together with their positions.
ranges_cont = [ranges[other_continuous_cols], other_continuous_cols]


def get_samples(target_nb_pseudo_obs, max_length=4000):
    """Sample from ``model`` until enough rows pass ``check_constraints``.

    Each round draws a batch with ``model.sample``, keeps the rows for which
    ``check_constraints`` is truthy and, if more rows are still needed, asks
    for the missing amount plus 10% of the target in the next round.

    Parameters
    ----------
    target_nb_pseudo_obs : int
        Number of valid rows to return.
    max_length : int, optional
        Forwarded to ``model.sample`` (default 4000, the value used so far).

    Returns
    -------
    numpy.ndarray
        The first ``target_nb_pseudo_obs`` accepted rows, one row per sample.
    """
    if target_nb_pseudo_obs <= 0:
        # Nothing requested: np.concatenate would fail on an empty list.
        return np.empty((0, train.shape[1]), dtype=object)

    def _is_valid(row):
        return check_constraints(
            row, var_distrib, le_dict, cols, layer_cols_, brace_cols_,
            continuous_ranges=ranges_cont,
        )

    accepted_batches = []
    nb_pseudo_obs = 0
    nb_points = target_nb_pseudo_obs
    # Strict comparison: stop as soon as the target is reached instead of
    # drawing one more batch when the count matches the target exactly.
    while nb_pseudo_obs < target_nb_pseudo_obs:
        y_new = model.sample(n_samples=nb_points, max_length=max_length)
        if USE_INFORMATIVE_NAMES:
            y_new.columns = train.columns
        mask = np.apply_along_axis(_is_valid, axis=1, arr=y_new)
        filtered_rows = y_new[mask]
        accepted_batches.append(filtered_rows)

        # Running count; avoids re-concatenating every batch on each round.
        nb_pseudo_obs += len(filtered_rows)

        nb_points = target_nb_pseudo_obs - nb_pseudo_obs + target_nb_pseudo_obs // 10

    # Keep target_nb_pseudo_obs pseudo-observations
    y_new_all = np.concatenate(accepted_batches)
    return y_new_all[:target_nb_pseudo_obs]

# Draw the requested number of valid samples and give them the columns of `train`.
y_new_all = get_samples(target_nb_pseudo_obs)
pred = pd.DataFrame(data=y_new_all, columns=train.columns)

22it [00:00, 26.25it/s]               
15it [00:01,  9.66it/s]                      
27it [00:00, 35.38it/s]              


In [14]:
# mask = np.apply_along_axis(lambda x: check_constraints(x, var_distrib, le_dict, cols, layer_cols_, brace_cols_, continuous_ranges=ranges_cont), axis=1, arr=y_new)


In [15]:
# Display the (rows, columns) shape of the sampled frame.
pred.shape

(10, 14)

In [16]:
# Post-process the sampled frame; the call returns two results.
pred_post, pred_post_famd = post_process(
    pred,
    train.columns,
    le_dict,
    discrete_features,
    brace_cols,
    dtype,
    layer_cols_,
)


In [17]:
import json
import os

# Convert the post-processed samples and write them as JSON into `res_folder`.
json_output = transform_df_to_json(pred_post)

prfx = "_informative" if USE_INFORMATIVE_NAMES else ""
out_name = "llm("+"_".join(DATA_NAMES) + ")_" + str(target_nb_pseudo_obs) + prfx +".json"
output_filename = res_folder + out_name
# Create the results folder on first use; open() does not create missing directories.
os.makedirs(res_folder, exist_ok=True)
with open(output_filename, "w") as json_file:
    json.dump(json_output, json_file, indent=4)

In [18]:
# import os
# def generate_seeds(target_nb_pseudo_obs, num_seeds=5):
#     for seed in range(num_seeds):
#         y_new_all = get_samples(target_nb_pseudo_obs)
#         pred = pd.DataFrame(y_new_all, columns = train.columns) 
#         pred_post, pred_post_famd = post_process(pred, train.columns, le_dict, discrete_features, brace_cols, dtype, layer_cols_)
#         json_output = transform_df_to_json(pred_post)
#         outfolder = res_folder + "seeds/" + str(target_nb_pseudo_obs) +"/"
#         if not os.path.exists(outfolder):
#             os.makedirs(outfolder)

#         pre = "llm"
#         if USE_INFORMATIVE_NAMES:
#             pre = "llm_informative"
#         out_name = pre + "("+"_".join(DATA_NAMES) + ")_" + str(target_nb_pseudo_obs) + "_seed_" + str(seed) + ".json"
#         output_filename = outfolder + out_name
#         with open(output_filename, "w") as json_file:
#             json.dump(json_output, json_file, indent=4)

# checkpoints = [1500, 2000]
# for checkpoint in checkpoints:
#     generate_seeds(checkpoint, 5)

In [19]:

# generate_plots(["real", "synthetic (GA)", "synthetic"], train_original, pred_post, None, None, var_distrib, le_dict, brace_cols, unique_braces, res_folder)        

In [22]:
import time
import numpy as np
import matplotlib.pyplot as plt

# Benchmark settings
target_values = [100, 500, 1000]
num_iterations = 3  # Number of times to run the function for each target value

# One mean runtime (in seconds) per entry of `target_values`, in the same order.
# It is filled once per target instead of being reset, so it stays aligned with
# `target_values` after the loop.
execution_times = []
for target_nb_pseudo_obs in target_values:
    run_times = []  # raw timings of the repetitions for this target only
    for _ in range(num_iterations):
        print(f"Processing target_nb_pseudo_obs = {target_nb_pseudo_obs}")
        # perf_counter is a monotonic, high-resolution clock meant for measuring durations.
        start_time = time.perf_counter()
        y_new_all = get_samples(target_nb_pseudo_obs)
        end_time = time.perf_counter()
        run_times.append(end_time - start_time)
    mean_time = np.mean(run_times)
    execution_times.append(mean_time)
    print(target_nb_pseudo_obs, mean_time)

Processing target_nb_pseudo_obs = 100


118it [00:06, 19.50it/s]                        
58it [00:04, 12.48it/s]                        
100%|██████████| 22/22 [00:01<00:00, 14.07it/s]
18it [00:00, 23.19it/s]               


Processing target_nb_pseudo_obs = 100


108it [00:06, 17.81it/s]                        
62it [00:03, 19.35it/s]                        
28it [00:01, 18.19it/s]                        
18it [00:01, 11.51it/s]                       


Processing target_nb_pseudo_obs = 100


118it [00:05, 22.13it/s]                        
76it [00:03, 24.53it/s]                        
36it [00:02, 15.25it/s]                        
17it [00:01, 10.85it/s]                       


100 12.636540730794271
Processing target_nb_pseudo_obs = 500


504it [00:32, 15.64it/s]                         
100%|██████████| 279/279 [00:14<00:00, 18.78it/s]
129it [00:06, 18.44it/s]                         
59it [00:03, 19.09it/s]                        


Processing target_nb_pseudo_obs = 500


501it [00:29, 17.21it/s]                         
278it [00:15, 18.03it/s]                         
126it [00:07, 17.89it/s]                         
63it [00:03, 16.01it/s]                        


Processing target_nb_pseudo_obs = 500


502it [00:29, 17.14it/s]                         
280it [00:16, 16.94it/s]                         
121it [00:08, 13.90it/s]                         
63it [00:03, 19.88it/s]                        


500 56.99722329775492
Processing target_nb_pseudo_obs = 1000


1006it [00:47, 21.27it/s]                         
548it [00:29, 18.50it/s]                         
265it [00:17, 15.27it/s]                         
134it [00:07, 18.54it/s]                         


Processing target_nb_pseudo_obs = 1000


1007it [00:58, 17.10it/s]                         
560it [00:30, 18.42it/s]                         
261it [00:17, 15.02it/s]                         
130it [00:06, 19.61it/s]                         


Processing target_nb_pseudo_obs = 1000


1006it [00:55, 18.03it/s]                         
562it [00:35, 15.76it/s]                         
268it [00:16, 16.14it/s]                         
100%|██████████| 108/108 [00:07<00:00, 14.30it/s]

1000 110.56907526652019





In [21]:
# Save results to a file
# NOTE(review): each entry of `target_values` is paired with the entry of
# `execution_times` at the same position and written as "Mean Time", so
# `execution_times` must hold exactly one mean per target value — confirm that
# this is what the timing cell leaves behind before trusting this file.
# The loop also rebinds the global `target_nb_pseudo_obs`.
with open(res_folder+'execution_times_new.txt', 'w') as f:
    for target_nb_pseudo_obs, mean_time in zip(target_values, execution_times):
        f.write(f'Target obs: {target_nb_pseudo_obs}, Mean Time: {mean_time:.4f}\n')