In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoConfig, AutoTokenizer, T5Tokenizer, Trainer, TrainingArguments, PreTrainedTokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments, convert_slow_tokenizer
from utils import filter_function, preprocess_function, encode_rare_chars, tokenize, create_metrics_computer
import torch
import wandb
import json

In [None]:
# Start from the "google/t5-efficient-tiny" configuration and override selected fields.
config = AutoConfig.from_pretrained("google/t5-efficient-tiny")
# print(config)

# Overrides that are currently switched off (the loaded values are kept):
#   config.num_layers = 3          # number of encoder layers
#   config.num_decoder_layers = 3  # number of decoder layers
#   config.dropout = 0.2
architecture_overrides = {
    "num_heads": 4,   # number of attention heads
    "d_model": 128,   # embedding dimension
    "d_ff": 64,       # feed-forward dimension
    "d_kv": 32,
}
for attribute_name, attribute_value in architecture_overrides.items():
    setattr(config, attribute_name, attribute_value)

print("Modified config:", config)

# Build an untrained model directly from the modified configuration.
model = AutoModelForSeq2SeqLM.from_config(config)

In [None]:
# Alternatives kept for reference:
#   tokenizer = AutoTokenizer.from_pretrained("google/t5-efficient-tiny")
#   tokenizer = T5Tokenizer(vocab_file="tokenizers/sp_16k_bpe_1.model", legacy=False, load_from_cache_file=False)

# Build a slow T5Tokenizer from the local model file, convert it, and wrap the
# converted object in a PreTrainedTokenizerFast.
slow_tokenizer = T5Tokenizer(
    "tokenizers/sp_512_bpe_encoded.model",
    legacy=False,
    load_from_cache_file=False,
)
converted_tokenizer = convert_slow_tokenizer.convert_slow_tokenizer(slow_tokenizer)
tokenizer = PreTrainedTokenizerFast(tokenizer_object=converted_tokenizer)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize the model's token embeddings to the tokenizer length.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Encode one example input, then decode each token id on its own to inspect
# how the tokenizer splits the text.
sample_input = "How often did germany win gold in the 1994 olympics?[SEP]name[SEP]team[SEP]country[SEP]ikhasbd"
tokens = tokenizer.encode(sample_input)
print(tokens)
print(list(map(tokenizer.decode, tokens)))

In [None]:
def _count_trainable(module):
    """Return the total element count of the module's parameters that require gradients."""
    return sum(param.numel() for param in module.parameters() if param.requires_grad)


# Shared token embedding layer
token_embedding = model.shared
num_token_embedding_params = _count_trainable(token_embedding)
print(f"Number of trainable parameters in the token embedding layer: {num_token_embedding_params}")

total_params = _count_trainable(model)
print(f"Total number of trainable parameters: {total_params}")

In [None]:
# Root folder of the local WikiSQL copy; later cells also build the table
# database location from `path`.
path = '../datasets/wikisql'
dataset = load_dataset(f"{path}/data")

In [None]:
# Run preprocess_function (imported from utils) over the loaded dataset,
# passing it batches of up to 2048 examples at a time.
preprocessed_dataset = dataset.map(preprocess_function, batched=True, batch_size=2048)

In [None]:
def _read_json(file_path):
    """Parse and return the contents of a UTF-8 encoded JSON file."""
    with open(file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


mapping_file_path = 'mapping.json'
reverse_mapping_file_path = 'reverse_mapping.json'

# `mapping` is handed to encode_rare_chars and `reverse_mapping` to
# create_metrics_computer in later cells.
mapping = _read_json(mapping_file_path)
reverse_mapping = _read_json(reverse_mapping_file_path)

In [None]:
def _encode_rare_chars_batch(batch):
    """Apply encode_rare_chars to one batch using the loaded `mapping`."""
    return encode_rare_chars(batch, mapping)


preprocessed_dataset = preprocessed_dataset.map(
    _encode_rare_chars_batch,
    batched=True,
    batch_size=2048,
)

In [None]:
def _tokenize_batch(batch):
    """Apply tokenize to one batch using the notebook's `tokenizer`."""
    return tokenize(batch, tokenizer)


tokenized_dataset = preprocessed_dataset.map(
    _tokenize_batch,
    batched=True,
    batch_size=2048,
)

In [None]:
# Pull the two splits used for training and evaluation out of the tokenized dataset.
train_data, val_data = (tokenized_dataset[split] for split in ("train", "validation"))
train_data

In [None]:
# train_data = train_data.filter(lambda sample: filter_function(sample, tokenizer), batched=False)

In [None]:
def experiment(project, experiment_name, lr=2e-4, batch_size=128, seeds=(1337,)):
    """Train one freshly initialised model per seed and evaluate it on the full validation split.

    Relies on the notebook globals ``config``, ``tokenizer``, ``train_data``,
    ``val_data``, ``path`` and ``reverse_mapping``.

    Args:
        project: wandb project the runs are logged to.
        experiment_name: wandb group name; each run is named
            ``f"{experiment_name}_{k}"`` with ``k`` starting at 1.
        lr: learning rate passed to the training arguments.
        batch_size: per-device training batch size.
        seeds: one training seed per run (e.g. ``(1337, 69, 42)``).

    Returns:
        list: the result of ``trainer.evaluate(eval_dataset=val_data)`` for
        each seed, in the order of ``seeds``.
    """
    compute_metrics = create_metrics_computer(val_data, tokenizer, path+'/tables/validation/dev.db', reverse_mapping)
    # Evaluation is slow, so the per-epoch evaluation runs on a fixed 500-example subset.
    # The subset does not depend on the run, so it is built once.
    eval_subset = val_data.shuffle(seed=42).select(range(500))
    full_metrics = []
    for run, seed in enumerate(seeds):
        # The trainer only applies `seed` when it is constructed, which happens
        # after the model weights are drawn. Seed torch first so the random
        # initialisation is reproducible for a given seed as well.
        torch.manual_seed(seed)
        model = AutoModelForSeq2SeqLM.from_config(config)
        model.resize_token_embeddings(len(tokenizer))
        run_name = experiment_name + "_" + str(run+1)
        training_args = Seq2SeqTrainingArguments(
            output_dir="./results/"+run_name,
            run_name=run_name,
            report_to="wandb",
            save_strategy="epoch",
            save_total_limit=1,
            load_best_model_at_end=True,
            eval_strategy="epoch",
            num_train_epochs=50,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=256,
            learning_rate=lr,
            # weight_decay=experiment[3],
            predict_with_generate=True,
            generation_max_length=64,
            generation_num_beams=5,
            seed=seed,
            optim="lion_32bit",
            # adam_beta2=0.99,
            lr_scheduler_type="constant"
            # warmup_steps=(56355//batch_size+1)*4,
            # lr_scheduler_kwargs={"num_cycles": 3}
        )

        # Trainer
        trainer = Seq2SeqTrainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=eval_subset,
            compute_metrics=compute_metrics
        )

        # Train
        wandb.init(project=project, group=experiment_name, name=run_name)
        try:
            trainer.train()
            # Evaluate on the full dataset after training
            full_metrics.append(trainer.evaluate(eval_dataset=val_data))
        finally:
            # Always close the wandb run, so a failed run cannot leak into the next one.
            wandb.finish()
    return full_metrics

In [None]:
# One run with the notebook's current `config`, using a learning rate of 1.5e-4.
single_run_name = "4_heads_1.5e-4_lr_constant_512MappingTokenizer_128_bs_64_dff_32kv_128d"
experiment("ablation-studies2", single_run_name, 1.5e-4)  # "cosine_with_restarts_and_warmup"

In [None]:
# Sweep from the largest batch size down: the batch size is 32 * factor and
# the learning rate is 1e-4 * sqrt(factor), with factor = 8, 4, 2.
for i in (3, 2, 1):
    factor = 2 ** i
    lr = 1e-4 * factor ** 0.5
    batch_size = round(32 * factor)
    print(lr, batch_size)
    sweep_run_name = f"2_heads_{lr:.3e}_lr_constant_512MappingTokenizer_{batch_size}_bs_redo2"
    experiment("ablation-studies2", sweep_run_name, lr, batch_size)

In [None]:
# Resume the earlier sweep runs from their saved checkpoints and keep training
# with num_train_epochs=50.
compute_metrics = create_metrics_computer(val_data, tokenizer, path+'/tables/validation/dev.db', reverse_mapping)
samples = 56355  # NOTE(review): presumably len(train_data) — confirm before reusing with other data
# Evaluation is slow, so the per-epoch evaluation runs on a fixed 500-example subset.
eval_subset = val_data.shuffle(seed=42).select(range(500))
for i in range(3, 0, -1):
    factor = 2**i
    lr = 1e-4 * factor**0.5
    batch_size = round(32 * factor)
    # Ceiling division: the number of batches needed to cover `samples`.
    # The previous int(samples/batch_size)+1 counted one batch too many
    # whenever `samples` was an exact multiple of `batch_size`.
    batches_per_epoch = (samples + batch_size - 1) // batch_size
    # Step count encoded in the checkpoint folder name: 25 epochs' worth of
    # batches (assumes one optimizer step per batch on one device — TODO confirm).
    total_batches = batches_per_epoch*25
    run_name = f"2_heads_{lr:.3e}_lr_constant_512MappingTokenizer_{batch_size}_bs_1"
    checkpoint = f"./results/{run_name}/checkpoint-{total_batches}"
    model = AutoModelForSeq2SeqLM.from_config(config)
    model.resize_token_embeddings(len(tokenizer))

    training_args = Seq2SeqTrainingArguments(
        output_dir="./results/"+run_name,
        run_name=run_name,
        report_to="wandb",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        eval_strategy="epoch",
        num_train_epochs=50,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=256,
        learning_rate=lr,
        # weight_decay=experiment[3],
        predict_with_generate=True,
        generation_max_length=64,
        generation_num_beams=5,
        seed=1337,
        optim="lion_32bit",
        # adam_beta2=0.99,
        lr_scheduler_type="constant"
        # warmup_steps=(56355//batch_size+1)*4,
        # lr_scheduler_kwargs={"num_cycles": 3}
    )

    # Seq2SeqTrainer (not the plain Trainer) is required for
    # predict_with_generate / generation_* to take effect, so that
    # compute_metrics receives generated sequences as it does in `experiment`.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=eval_subset,
        compute_metrics=compute_metrics
    )

    # Resume training from the checkpoint
    trainer.train(resume_from_checkpoint=checkpoint)
    trainer.evaluate(eval_dataset=val_data)
    wandb.finish()

In [None]:
# manually validate model
input_ids = tokenized_val_data["input_ids"]
labels = tokenized_val_data["labels"]

# Run the model to generate predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient computation
    predictions = model.generate(input_ids=torch.tensor(input_ids).to(torch.device("cuda")))

print(predictions, labels)

In [None]:
# Decode predictions and labels
input_text = [tokenizer.decode(inputs, skip_special_tokens=True) for inputs in input_ids]
predictions_text = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions]
labels_text = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]
print(input_text)
print(predictions_text)
print(labels_text)

In [None]:
# Log every example whose decoded prediction differs from the decoded label
# to a wandb table.
wandb.init(project="ablation-studies2", name="wrong predictions")
# Initialize the wandb.Table
table = wandb.Table(columns=["Input", "Prediction", "Correct Output"])

# Add rows to the table
# enumerate supplies the example index used in the debug message; `idx` was
# previously undefined, so the first mismatch raised a NameError.
for idx, (inp, pred, correct) in enumerate(zip(input_text, predictions_text, labels_text)):
    if pred == correct:
        continue
    print(f"Adding row: {idx}, {pred}, {correct}")  # Debugging
    table.add_data(inp, pred, correct)

# Log the table
wandb.log({"Predictions Table": table})
# Close the run so later cells do not keep logging into it.
wandb.finish()


In [None]:
# Checkpoint folder (relative path) whose weights are loaded below; the next
# cell also derives its output directory from it.
checkpoint_dir = 'results/lion_32bit_bs16_3/checkpoint-3523'

# Load the model
# NOTE(review): this rebinds the notebook-level `model` created in earlier cells.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

In [None]:
# Build a trainer around the loaded checkpoint so it can be evaluated with generation.
training_args = Seq2SeqTrainingArguments(
    output_dir="./" + checkpoint_dir + "/eval",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    eval_strategy="epoch",
    num_train_epochs=25,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=512,
    predict_with_generate=True,
    generation_max_length=48,
    generation_num_beams=5,
    optim="lion_32bit"
)

# Use the splits defined earlier in the notebook (`train_data` / `val_data`);
# the previously referenced `tokenized_train_data` / `tokenized_val_data` are
# not defined anywhere. `reverse_mapping` is passed as in the other
# create_metrics_computer calls in this notebook.
# NOTE(review): confirm this checkpoint was trained with the same tokenizer and
# character mapping.
compute_metrics = create_metrics_computer(val_data, tokenizer, path+'/tables/validation/dev.db', reverse_mapping)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics
)

In [None]:
# Evaluate with the `trainer` built in the previous cell (its eval_dataset is used).
# As the last expression of the cell, the returned value is shown as the cell output.
trainer.evaluate()