In [1]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoConfig, AutoTokenizer, T5Tokenizer, Trainer, TrainingArguments, PreTrainedTokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments, convert_slow_tokenizer
from utils import filter_function, preprocess_function, encode_rare_chars, tokenize, create_metrics_computer
import torch
import wandb
import json

In [2]:
# Start from the published "google/t5-efficient-tiny" configuration and
# override a few size-related hyperparameters before building the model.
config = AutoConfig.from_pretrained("google/t5-efficient-tiny")
# print(config)
# Modify parameters
# config.num_layers = 3  # Set number of encoder layers
# config.num_decoder_layers = 3  # Set number of decoder layers
config.num_heads = 4  # Set number of attention heads
config.d_model = 128  # Set embedding dimension
config.d_ff = 64  # Set feed-forward dimension
config.d_kv = 32  # Per-head key/value dimension (num_heads * d_kv == d_model here)
# config.dropout_rate = 0.2  # the printed T5Config exposes dropout as `dropout_rate`

print("Modified config:", config)

# Initialize the model from scratch (random weights) using the modified
# configuration. The model is built exactly once; a second from_config() call
# would only discard the first instance.
model = AutoModelForSeq2SeqLM.from_config(config)

Modified config: T5Config {
  "_name_or_path": "google/t5-efficient-tiny",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 64,
  "d_kv": 32,
  "d_model": 128,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 4,
  "num_heads": 4,
  "num_layers": 4,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "torch_dtype": "float32",
  "transformers_version": "4.47.1",
  "use_cache": true,
  "vocab_size": 32128
}



In [3]:
# Alternatives tried previously:
# tokenizer = AutoTokenizer.from_pretrained("google/t5-efficient-tiny")
# tokenizer = T5Tokenizer(vocab_file="tokenizers/sp_16k_bpe_1.model", legacy=False, load_from_cache_file=False)

# Build the tokenizer in three explicit steps: load the SentencePiece model
# through the slow T5 tokenizer, convert it to a fast backend, then wrap that
# backend in a PreTrainedTokenizerFast.
slow_tokenizer = T5Tokenizer(
    "tokenizers/sp_512_bpe_encoded.model",
    legacy=False,
    load_from_cache_file=False,
)
fast_backend = convert_slow_tokenizer.convert_slow_tokenizer(slow_tokenizer)
tokenizer = PreTrainedTokenizerFast(tokenizer_object=fast_backend)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Match the embedding matrix to the tokenizer's vocabulary size; as the last
# expression of the cell, the resized embedding is displayed.
model.resize_token_embeddings(len(tokenizer))

Embedding(512, 128)

In [4]:
# Smoke-test the tokenizer on one example: show the ids, then each id decoded
# on its own so the individual sub-word pieces are visible.
sample_text = "How often did germany win gold in the 1994 olympics?[SEP]name[SEP]team[SEP]country[SEP]ikhasbd"
tokens = tokenizer.encode(sample_text)
print(tokens)
pieces = []
for token_id in tokens:
    pieces.append(tokenizer.decode(token_id))
print(pieces)

[268, 51, 137, 79, 166, 127, 8, 345, 13, 356, 40, 12, 127, 322, 106, 21, 160, 306, 46, 341, 356, 345, 352, 55, 340, 372, 3, 337, 74, 3, 335, 54, 3, 344, 208, 3, 338, 374, 342, 43, 354, 346, 2]
['How', 'of', 'ten', 'd', 'id', 'g', 'er', 'm', 'an', 'y', 'w', 'in', 'g', 'old', 'in', 'the', '19', '94', 'o', 'l', 'y', 'm', 'p', 'ic', 's', '?', '[SEP]', 'n', 'ame', '[SEP]', 't', 'eam', '[SEP]', 'c', 'ountry', '[SEP]', 'i', 'k', 'h', 'as', 'b', 'd', '</s>']


In [5]:
def _trainable_param_count(module):
    """Return the number of scalar parameters in `module` that require gradients."""
    count = 0
    for weight in module.parameters():
        if weight.requires_grad:
            count += weight.numel()
    return count


token_embedding = model.shared  # Shared token embedding layer
num_token_embedding_params = _trainable_param_count(token_embedding)
print(f"Number of trainable parameters in the token embedding layer: {num_token_embedding_params}")

total_params = _trainable_param_count(model)
print(f"Total number of trainable parameters: {total_params}")

Number of trainable parameters in the token embedding layer: 65536
Total number of trainable parameters: 986112


In [6]:
# Root folder of the local WikiSQL copy; later cells also build the path to the
# validation tables database from it.
path = '../datasets/wikisql'
data_dir = f"{path}/data"
dataset = load_dataset(data_dir)

In [7]:
# Apply preprocess_function to every split in batches of 2048 rows.
preprocess_options = {"batched": True, "batch_size": 2048}
preprocessed_dataset = dataset.map(preprocess_function, **preprocess_options)

In [8]:
def _read_json(json_path):
    """Parse and return the UTF-8 encoded JSON document stored at `json_path`."""
    with open(json_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Forward mapping (passed to encode_rare_chars) and its reverse (passed to the
# metrics computer).
mapping_file_path = 'mapping.json'
reverse_mapping_file_path = 'reverse_mapping.json'

mapping = _read_json(mapping_file_path)
reverse_mapping = _read_json(reverse_mapping_file_path)

In [9]:
def _encode_batch(batch):
    """Run encode_rare_chars on one batch using the notebook-level `mapping`."""
    return encode_rare_chars(batch, mapping)


preprocessed_dataset = preprocessed_dataset.map(
    _encode_batch,
    batched=True,
    batch_size=2048,
)

In [10]:
def _tokenize_batch(batch):
    """Run tokenize on one batch using the notebook-level `tokenizer`."""
    return tokenize(batch, tokenizer)


tokenized_dataset = preprocessed_dataset.map(
    _tokenize_batch,
    batched=True,
    batch_size=2048,
)

In [11]:
# Pull out the two splits used for training and evaluation, then display the
# training split's summary as the cell output.
train_data, val_data = (tokenized_dataset[split] for split in ("train", "validation"))
train_data

Dataset({
    features: ['phase', 'question', 'table', 'sql', 'input_text', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 56355
})

In [12]:
# train_data = train_data.filter(lambda sample: filter_function(sample, tokenizer), batched=False)

In [13]:
def experiment(project, experiment_name, lr=2e-4, batch_size=128, seeds=(1337,)):
    """Train one freshly initialised model per seed and evaluate each on the full validation split.

    Reads the notebook-level names `config`, `tokenizer`, `train_data`,
    `val_data`, `path` and `reverse_mapping`.

    Args:
        project: wandb project the runs are logged to.
        experiment_name: wandb group name; each run is named
            "<experiment_name>_<k>" with k starting at 1.
        lr: learning rate passed to the training arguments.
        batch_size: per-device training batch size.
        seeds: one training run is performed per entry (e.g. (1337, 69, 42)).

    Returns:
        A list with one entry per seed: the value returned by
        `trainer.evaluate(eval_dataset=val_data)` after that run's training.
    """
    # Local import so this cell does not depend on editing the import cell.
    from transformers import set_seed

    compute_metrics = create_metrics_computer(val_data, tokenizer, path+'/tables/validation/dev.db', reverse_mapping)
    # Evaluation is slow, so per-epoch evaluation uses a fixed 500-row subset;
    # it is the same for every run, so build it once.
    eval_subset = val_data.shuffle(seed=42).select(range(500))

    full_metrics = []
    for run_number, seed in enumerate(seeds, start=1):
        # Seed before the model is created: the trainer only applies `seed`
        # when it is constructed, which is after the weights are initialised.
        set_seed(seed)
        model = AutoModelForSeq2SeqLM.from_config(config)
        model.resize_token_embeddings(len(tokenizer))
        run_name = experiment_name + "_" + str(run_number)
        training_args = Seq2SeqTrainingArguments(
            output_dir="./results/"+run_name,
            run_name=run_name,
            report_to="wandb",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            eval_strategy="epoch",
            num_train_epochs=50,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=256,
            learning_rate=lr,
            # weight_decay=experiment[3],
            predict_with_generate=True,
            generation_max_length=64,
            generation_num_beams=5,
            seed=seed,
            optim="lion_32bit",
            # adam_beta2=0.99,
            lr_scheduler_type="constant"
            # warmup_steps=(56355//batch_size+1)*4,
            # lr_scheduler_kwargs={"num_cycles": 3}
        )

        # Trainer
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=eval_subset,
            compute_metrics=compute_metrics
        )

        # Train
        wandb.init(project=project, group=experiment_name, name=run_name)
        try:
            trainer.train()
            # Evaluate on the full dataset after training
            full_metrics.append(trainer.evaluate(eval_dataset=val_data))
        finally:
            # Close the wandb run even if training or evaluation raises, so the
            # next run does not log into a stale one.
            wandb.finish()

    return full_metrics

In [14]:
# Keep the call's result under `full_metrics`, the name the averaging cell
# further down reads.
# NOTE(review): that cell needs a list of per-run metric dicts here - confirm
# that experiment() returns one.
full_metrics = experiment("ablation-studies2", "4_heads_2e-4_lr_constant_512MappingTokenizer_128_bs_64_dff_32_kv_128d", 2e-4) # "cosine_with_restarts_and_warmup"

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: bruno-heberle (afy_shk). Use `wandb login --relogin` to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Overall Accuracy,Sel Accuracy,Agg Accuracy,Conds Accuracy,Execution Accuracy
1,No log,1.265791,0.0,0.188,0.314,0.002,0.002
2,1.666700,1.003437,0.0,0.222,0.442,0.006,0.002
3,1.215600,0.840438,0.004,0.272,0.562,0.036,0.006
4,1.007000,0.732142,0.018,0.318,0.626,0.058,0.008
5,0.881700,0.653053,0.022,0.378,0.742,0.066,0.024
6,0.795400,0.598671,0.034,0.426,0.746,0.072,0.036
7,0.729600,0.55297,0.052,0.434,0.788,0.076,0.046
8,0.676800,0.510354,0.064,0.474,0.8,0.092,0.052
9,0.676800,0.470452,0.078,0.518,0.806,0.106,0.074
10,0.625800,0.429132,0.108,0.6,0.836,0.138,0.088


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


0,1
eval/agg_accuracy,▁▃▄▅▆▇▇▇▇▇██████████████████████████████
eval/conds_accuracy,▁▁▁▂▂▂▂▂▂▃▃▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇███████████
eval/execution_accuracy,▁▁▁▁▁▂▂▂▂▃▄▄▅▅▆▇▇▇▇▇▇▇▇▇▇███████████████
eval/loss,█▆▅▄▄▄▃▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/overall_accuracy,▁▁▁▁▁▂▂▂▃▃▃▄▅▅▅▆▆▆▇▇▇▇▇▇▇▇██████████████
eval/runtime,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█
eval/samples_per_second,▄▇▃▃▄▆▆▅▆█▅▁▃▁▄▇▅▆▇█▆▆▇▇▅▇▆█▇█▇▆▇█▇▂▇▆▂▅
eval/sel_accuracy,▁▁▂▂▃▃▄▄▅▆▆▆▇▇▇▇█▇▇█████████████████████
eval/steps_per_second,▆█▃▃▃▆▆▆██▆▁▁▆▆▆████▆▆██▆█▆█████▆██▃█▆▃▃
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▆▆▆▆▇█████

0,1
eval/agg_accuracy,0.89799
eval/conds_accuracy,0.74599
eval/execution_accuracy,0.53462
eval/loss,0.04268
eval/overall_accuracy,0.64434
eval/runtime,525.7207
eval/samples_per_second,16.018
eval/sel_accuracy,0.93267
eval/steps_per_second,0.063
total_flos,1991601930240000.0


In [None]:
# Sweep batch sizes 256, 128, 64 (32 * 2**k for k = 3, 2, 1), scaling the
# learning rate with the square root of the batch-size factor.
BASE_LR = 1e-4
BASE_BATCH_SIZE = 32
for exponent in reversed(range(1, 4)):
    factor = 2**exponent
    lr = BASE_LR * factor**0.5
    batch_size = BASE_BATCH_SIZE * factor
    print(lr, batch_size)
    experiment(
        "ablation-studies2",
        f"2_heads_{lr:.3e}_lr_constant_512MappingTokenizer_{batch_size}_bs_redo2",
        lr,
        batch_size,
    )

In [None]:
# Resume the batch-size sweep runs from their epoch-25 checkpoints and train
# them on to 50 epochs, then evaluate on the full validation split.
compute_metrics = create_metrics_computer(val_data, tokenizer, path+'/tables/validation/dev.db', reverse_mapping)
# Derive the sample count from the dataset actually passed to the trainer
# instead of hard-coding it.
samples = len(train_data)
for i in range(3, 0, -1):
    factor = 2**i
    lr = 1e-4 * factor**0.5
    batch_size = round(32 * factor)
    # Ceiling division: an exact multiple must not count an extra batch.
    batches_per_epoch = -(-samples // batch_size)
    # Step number of the checkpoint after 25 epochs.
    # NOTE(review): assumes one optimizer step per batch (single device, no
    # gradient accumulation) - confirm against the checkpoint folders on disk.
    total_batches = batches_per_epoch*25
    run_name = f"2_heads_{lr:.3e}_lr_constant_512MappingTokenizer_{batch_size}_bs_1"
    checkpoint = f"./results/{run_name}/checkpoint-{total_batches}"
    model = AutoModelForSeq2SeqLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))

    training_args = Seq2SeqTrainingArguments(
        output_dir="./results/"+run_name,
        run_name=run_name,
        report_to="wandb",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        eval_strategy="epoch",
        num_train_epochs=50,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=256,
        learning_rate=lr,
        # weight_decay=experiment[3],
        predict_with_generate=True,
        generation_max_length=64,
        generation_num_beams=5,
        seed=1337,
        optim="lion_32bit",
        # adam_beta2=0.99,
        lr_scheduler_type="constant"
        # warmup_steps=(56355//batch_size+1)*4,
        # lr_scheduler_kwargs={"num_cycles": 3}
    )

    # Use Seq2SeqTrainer, as experiment() does: the plain Trainer does not act
    # on predict_with_generate / generation_* settings, so evaluation would not
    # run generation.
    trainer = Seq2SeqTrainer(
        model=model,                     # Your model instance
        args=training_args,              # Training arguments
        train_dataset=train_data,        # Your training dataset
        eval_dataset=val_data.shuffle(seed=42).select(range(500)), # evaluation is slow, do it on subset
        compute_metrics=compute_metrics
    )

    # Resume training from the checkpoint
    trainer.train(resume_from_checkpoint=checkpoint)
    trainer.evaluate(eval_dataset=val_data)
    wandb.finish()

In [None]:
# Average every metric across runs. Expects `full_metrics` to be a non-empty
# list of metric dicts at notebook scope; the key set is taken from the first
# run.
metric_names = list(full_metrics[0])
average_metrics = {}
for metric_name in metric_names:
    total = sum(run[metric_name] for run in full_metrics)
    average_metrics[metric_name] = total / len(full_metrics)
average_metrics

In [None]:
# Function to log or update the summary table
def update_results_table(experiment_name, metrics, project_name=None):
    """Append one row of metrics to the shared "experiment_results" table artifact.

    The latest version of the artifact is fetched (if it can be), its rows are
    copied into a fresh table, the new row is appended, and the result is
    logged as a new artifact version on the active wandb run.

    Args:
        experiment_name: value stored in the "Experiment" column.
        metrics: dict mapping metric name to value. When a table already
            exists, values are matched to its columns by name and metrics
            missing from `metrics` are stored as None.
        project_name: wandb project holding the artifact. Defaults to the
            project of the active run.

    Raises:
        RuntimeError: if no wandb run is active (logging an artifact needs one).
    """
    artifact_name = "experiment_results"
    table_key = "results_table"

    if wandb.run is None:
        raise RuntimeError("update_results_table needs an active wandb run; call wandb.init() first.")
    if project_name is None:
        project_name = wandb.run.project

    try:
        previous_table = wandb.Api().artifact(project_name + "/" + artifact_name + ":latest").get(table_key)
    except Exception as err:
        # Best effort, as before: if the artifact cannot be fetched (e.g. it
        # does not exist yet), start a new table - but say so instead of
        # swallowing the error silently.
        print(f"Could not load existing '{artifact_name}' artifact ({err!r}); starting a new table.")
        previous_table = None

    if previous_table is None:
        columns = ["Experiment"] + list(metrics.keys())
        previous_rows = []
    else:
        columns = list(previous_table.columns)
        previous_rows = [list(row) for row in previous_table.data]

    # Copy into a fresh table rather than mutating the fetched one.
    # NOTE(review): a table loaded from a logged artifact appears to be
    # re-added by reference, which would drop the new row - verify against the
    # wandb docs.
    artifact_table = wandb.Table(columns=columns)
    for row in previous_rows:
        artifact_table.add_data(*row)

    # Align the new row to the table's columns by metric name.
    artifact_table.add_data(experiment_name, *[metrics.get(column) for column in columns[1:]])

    # Create a new artifact with the updated table; logging it under the same
    # name adds a new version.
    artifact = wandb.Artifact(artifact_name, type="results_summary")
    artifact.add(artifact_table, table_key)

    # Log the updated artifact
    wandb.log_artifact(artifact)

# Append the averaged metrics to the shared results table, then close the
# wandb run.
# NOTE(review): `experiment_name` is not assigned at notebook scope in any
# visible cell (it only appears as a parameter of experiment()); assign it
# before running this cell.
update_results_table(experiment_name, average_metrics)
wandb.finish()

In [None]:
# manually validate model
# Use the validation split defined earlier in the notebook (`val_data`); no
# visible cell defines a `tokenized_val_data`.
# NOTE(review): `model` is whatever the notebook-level name currently points
# to - experiment() trains a function-local model, so load a checkpoint first
# if trained weights are wanted.
input_ids = val_data["input_ids"]
labels = val_data["labels"]

# Run the model to generate predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient computation
    # Put the inputs on the model's own device instead of assuming CUDA.
    # NOTE(review): torch.tensor() needs equal-length sequences - confirm that
    # tokenize() pads to a fixed length.
    predictions = model.generate(input_ids=torch.tensor(input_ids).to(model.device))

print(predictions, labels)

In [None]:
def _decode_sequences(sequences):
    """Decode each id sequence to text, dropping special tokens."""
    decoded = []
    for sequence in sequences:
        decoded.append(tokenizer.decode(sequence, skip_special_tokens=True))
    return decoded


# Turn inputs, generated predictions and reference labels back into text.
input_text = _decode_sequences(input_ids)
predictions_text = _decode_sequences(predictions)
labels_text = _decode_sequences(labels)
for decoded_texts in (input_text, predictions_text, labels_text):
    print(decoded_texts)

In [None]:
# Log a table of decoded inputs, predictions and references to wandb, with a
# flag for exact string matches.
wandb.init(project="ablation-studies", name="predictions_table")
# Initialize the wandb.Table
table = wandb.Table(columns=["Input", "Prediction", "Correct Output", "Match"])

# Add rows to the table
# enumerate() supplies the row index used in the debug print, which previously
# referenced an undefined `idx`.
for idx, (inp, pred, correct) in enumerate(zip(input_text, predictions_text, labels_text)):
    match = pred == correct
    print(f"Adding row: {idx}, {pred}, {correct}, {match}")  # Debugging
    table.add_data(inp, pred, correct, match)

# Log the table
wandb.log({"Predictions Table": table})


In [None]:
# Checkpoint folder of a previous run; the next cell also derives its output
# directory from it.
checkpoint_dir = 'results/lion_32bit_bs16_3/checkpoint-3523'

# Replace the notebook-level model with the weights stored in that checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=checkpoint_dir,
)

In [None]:
# Build a trainer around the loaded checkpoint so it can be evaluated with
# generation-based metrics.
training_args = Seq2SeqTrainingArguments(
    output_dir="./" + checkpoint_dir + "/eval",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    num_train_epochs=25,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=512,
    predict_with_generate=True,
    generation_max_length=48,
    generation_num_beams=5,
    optim="lion_32bit"
)

# Use the splits defined earlier in the notebook (`train_data` / `val_data`);
# no visible cell defines `tokenized_train_data` / `tokenized_val_data`.
# `reverse_mapping` is passed as in the other create_metrics_computer calls in
# this notebook.
compute_metrics = create_metrics_computer(val_data, tokenizer, path+'/tables/validation/dev.db', reverse_mapping)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics
)

In [None]:
# Evaluate with the notebook-level `trainer` on the eval dataset it was built
# with; as the last expression of the cell, the return value is displayed.
trainer.evaluate()

In [None]:
# Scratch value; not referenced by any other visible cell.
test = dict(hello="world")

In [None]:
# Scratch check: print 2**k for k = 3 down to -2, one value per line.
for exponent in reversed(range(-2, 4)):
    power = 2**exponent
    print(power)