# Try Train a T5 Network

In [2]:
import pandas as pd
import numpy as np

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
import evaluate

In [3]:
df = pd.read_csv('questions.csv')

In [5]:
tokenizer = AutoTokenizer.from_pretrained("voidful/context-only-question-generator")
model = AutoModelForSeq2SeqLM.from_pretrained("voidful/context-only-question-generator")


In [6]:
model.to('cuda');

In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
metric = evaluate.load("bleu")

In [8]:
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)

In [12]:
def preprocess_function(examples):
    return tokenizer(examples["context"], truncation=True)

In [13]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    print(pred)
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    return metric.compute(predictions=decoded_preds, references=decoded_labels)

In [14]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

                                                   

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_model",
    predict_with_generate=True,
    save_strategy= "epoch",
    per_device_train_batch_size= 16,
    per_device_eval_batch_size= 16,
    learning_rate= .1e-4,
    weight_decay= 0.01,
    num_train_epochs= 10,
    warmup_steps= 0,
    save_total_limit= 3,
    evaluation_strategy= "epoch",
    # eval_steps: 15
    logging_dir= "./seq2seq/logs",
    logging_steps= 10,
    metric_for_best_model= "eval_loss",
    warmup_ratio= 0.01,
    push_to_hub= False,
)

In [17]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [18]:
trainer.train()

  0%|          | 0/340 [00:00<?, ?it/s]You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values,encoder_last_hidden_state. For reference, the inputs it received are input_ids,attention_mask.