In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoConfig, AutoTokenizer, Trainer, TrainingArguments
from utils import filter_function, preprocess_function, create_metrics_computer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import wandb

In [None]:
# Seed torch's RNG before the model is built: `from_config` creates randomly
# initialised weights, and the `seed` given to the training arguments is only
# applied when training starts -- too late to make the initial weights
# reproducible across kernel restarts. The value matches the `seed=1337` used
# in the training arguments.
torch.manual_seed(1337)

# Load the architecture configuration for the model (no weights are fetched)
config = AutoConfig.from_pretrained("google/t5-efficient-tiny")

# Initialize the model from scratch using the configuration
model = AutoModelForSeq2SeqLM.from_config(config)

# Report the model size as the total element count over all parameter tensors
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params}")

# The tokenizer is still the published one for this checkpoint name
tokenizer = AutoTokenizer.from_pretrained("google/t5-efficient-tiny")

In [None]:
# Root folder of the local WikiSQL copy; the dataset files live under "data".
path = "../datasets/wikisql"
dataset = load_dataset(f"{path}/data")

# Full training split, but only the first 1024 validation rows.
train_data, val_data = dataset["train"], dataset["validation"].select(range(1024))

In [None]:
def _keep_sample(sample):
    """Apply the project's `filter_function` to one sample with the notebook tokenizer."""
    return filter_function(sample, tokenizer)


# Evaluate the filter one sample at a time (not in batches).
train_data = train_data.filter(_keep_sample, batched=False)

In [None]:
def _tokenize(batch):
    """Run the project's `preprocess_function` on a batch with the notebook tokenizer."""
    return preprocess_function(batch, tokenizer)


# Both splits are mapped in batches of 2048 rows.
map_options = {"batched": True, "batch_size": 2048}
tokenized_train_data = train_data.map(_tokenize, **map_options)
tokenized_val_data = val_data.map(_tokenize, **map_options)

# Display the tokenised validation split as the cell output.
tokenized_val_data

In [None]:
# Run name: used for the output folder here and as the wandb run name later.
name = "t5-tiny-bs32-lr0.0001-lossFixed"

training_args = Seq2SeqTrainingArguments(
    # Output and logging
    output_dir=f"./results/{name}",
    report_to="wandb",
    # Evaluate and checkpoint once per epoch; keep a limited number of
    # checkpoints and reload the best one when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Optimisation
    num_train_epochs=25,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Evaluation generates sequences (beam search) instead of only scoring logits
    predict_with_generate=True,
    generation_max_length=48,
    generation_num_beams=5,
    seed=1337,
)

# Metric callback built by the project helper from the tokenised validation
# split, the tokenizer and the validation database file.
validation_db = f"{path}/tables/validation/dev.db"
compute_metrics = create_metrics_computer(tokenized_val_data, tokenizer, validation_db)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_val_data,
)

In [None]:
# Open the wandb run under the same name as the output folder, then train.
wandb.init(name=name, project="test-learning-rate")
trainer.train()

In [None]:
# Raise the epoch budget on the existing arguments object from 25 to 30, then
# continue training from a saved checkpoint instead of starting over.
# NOTE(review): this mutates `training_args` in place, so re-running the
# earlier training cell afterwards would also train for 30 epochs.
# NOTE(review): presumably the learning-rate schedule is rebuilt for the new
# total step count on resume -- confirm this is the intended schedule.
training_args.num_train_epochs = 30
trainer.train(resume_from_checkpoint=True)

In [None]:
# Prepare input for the model
input_ids = tokenized_val_data["input_ids"]
labels = tokenized_val_data["labels"]

# Put the inputs on whichever device the model currently lives on, rather than
# assuming CUDA: the model is on CPU when no GPU is available or right after it
# has been reloaded from a checkpoint, and a device mismatch would make
# `generate` fail.
# NOTE(review): assumes every row of `input_ids` has the same length (already
# padded by preprocessing) so it can be stacked into one tensor -- confirm.
input_tensor = torch.tensor(input_ids).to(model.device)

# Run the model to generate predictions
model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient computation
    # Use the same generation settings as the trainer's evaluation so these
    # predictions are comparable with the reported metrics; the library
    # defaults would otherwise apply.
    predictions = model.generate(
        input_ids=input_tensor,
        max_length=training_args.generation_max_length,
        num_beams=training_args.generation_num_beams,
    )

print(predictions, labels)

In [None]:
# Decode predictions and labels
# Label positions excluded from the loss are conventionally stored as -100,
# which is not a valid token id and cannot be decoded. Map any such entries
# back to the pad id first; this is a no-op when the labels contain none.
# `labels` itself is left untouched.
pad_id = tokenizer.pad_token_id
decodable_labels = [
    [token if token != -100 else pad_id for token in label]
    for label in labels
]

predictions_text = tokenizer.batch_decode(predictions, skip_special_tokens=True)
labels_text = tokenizer.batch_decode(decodable_labels, skip_special_tokens=True)
print(predictions_text)
print(labels_text)

In [None]:
# Restore the seq2seq model from a specific saved checkpoint folder.
# NOTE(review): the training cell writes to "./results/<run name>/", so this
# path looks like it belongs to a different run -- confirm it is the intended
# checkpoint.
# NOTE(review): this only rebinds the notebook-level name `model`; the
# existing `trainer` object was constructed with the previous model.
checkpoint_dir = "results/checkpoint-13230"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

In [None]:
# `trainer` keeps the model object it was constructed with, so rebinding the
# notebook-level name `model` (for example by loading a checkpoint) does not
# change what `trainer.evaluate()` scores. Evaluate through a trainer built
# around whatever `model` currently refers to, with the same arguments,
# validation data and metric function, so the reported metrics always describe
# that model. If `model` was never rebound, this evaluates the same model as
# before.
eval_trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_val_data,
    compute_metrics=compute_metrics,
)
eval_trainer.evaluate()