In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, GenerationConfig, pipeline
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer, EarlyStoppingCallback
from datasets import Dataset, DatasetDict, load_metric
import evaluate
import torch
import os
from os import listdir
from os.path import isfile, join
import json
import re
import numpy as np
import pandas as pd

In [None]:
# Report whether PyTorch can see a CUDA device (shown as the cell output).
torch.cuda.is_available()

True

In [None]:
# Load the three pre-split JSON samples (train / validation / test).
# The encoding is pinned to UTF-8: without it, open() uses the platform locale
# encoding (e.g. cp1252 on Windows), which can mis-decode or reject non-ASCII
# characters in the article text.
with open("sample1k_train.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

with open("sample1k_val.json", "r", encoding="utf-8") as f:
    val_data = json.load(f)

with open("sample1k_test.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

In [None]:
# Wrap each loaded JSON split in a DataFrame (same order: train, val, test).
df_train, df_val, df_test = (
    pd.DataFrame(records) for records in (train_data, val_data, test_data)
)

In [None]:
def context_template(df):
    """Attach the model prompt to a single record and return that record.

    Despite its name, ``df`` is one row: it is called through
    ``DataFrame.apply(..., axis=1)``. The row's ``"text"`` and ``"questions"``
    values are formatted into a "Context / Question" prompt, which is stored
    under ``"context"`` on the same object (the input is modified in place).
    """
    article = df["text"]
    query = df["questions"]
    df["context"] = "Context:\n{}\n\nQuestion:\n{}".format(article, query)
    return df

# Build the prompt column row by row for every split.
df_train, df_val, df_test = (
    frame.apply(context_template, axis=1) for frame in (df_train, df_val, df_test)
)

In [None]:
# (rows, columns) of each split, in train / val / test order.
tuple(frame.shape for frame in (df_train, df_val, df_test))

((4131, 4), (885, 4), (886, 4))

In [None]:
# Number of fully duplicated rows in each split, in train / val / test order.
tuple(frame.duplicated().sum() for frame in (df_train, df_val, df_test))

(0, 0, 0)

In [None]:
# Convert each DataFrame into a Hugging Face Dataset, then bundle the three
# splits under the conventional split names.
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (df_train, df_val, df_test)
)

dataset_dict = DatasetDict(
    {
        "train": train_dataset,
        "validation": valid_dataset,
        "test": test_dataset,
    }
)

# Define Model

In [None]:
# Tokenizer and model are loaded from the same pretrained checkpoint.
MODEL_CHECKPOINT = "google/flan-t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(MODEL_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Optional task prefix prepended to every prompt (currently empty).
prefix = ""
def preprocess_function(examples):
    """Tokenize one batch of prompts and answers for seq2seq training.

    Reads the ``"context"`` (prompt) and ``"answers"`` (target) columns of the
    batch. Prompts are truncated to 512 tokens and targets to 128 tokens.
    Returns the tokenized prompts with the target token ids added under
    ``"labels"``.

    NOTE(review): the prompt template puts the "Question:" section after the
    article text, so if truncation is right-sided (presumably the tokenizer
    default; confirm) long articles lose their question entirely.
    """
    prompts = []
    for doc in examples["context"]:
        prompts.append(prefix + doc)

    encoded = tokenizer(prompts, max_length=512, truncation=True)
    targets = tokenizer(text_target=examples["answers"], max_length=128, truncation=True)

    encoded["labels"] = targets["input_ids"]
    return encoded

In [None]:
# Run preprocess_function over every split in batches; each split gains the
# tokenizer outputs plus a "labels" column.
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/4131 [00:00<?, ? examples/s]

Map:   0%|          | 0/885 [00:00<?, ? examples/s]

Map:   0%|          | 0/886 [00:00<?, ? examples/s]

# Training

In [None]:
# Fine-tuning configuration consumed by the Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
    # Checkpoints: written under ./results, at most three kept on disk.
    output_dir="./results",
    save_total_limit=3,
    # Evaluate and checkpoint once per epoch; log every 100 steps.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=100,
    # Optimisation: 8 examples per device step x 2 accumulation steps per update.
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.03,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # Generate sequences during evaluation so the metric sees decoded answers.
    predict_with_generate=True,
    # At the end of training, reload the checkpoint with the highest exact_match.
    load_best_model_at_end=True,
    metric_for_best_model="exact_match",
    greater_is_better=True,
)

In [None]:
metric = evaluate.load('exact_match')
def compute_metrics(eval_pred):
    """Compute exact-match between generated answers and reference answers.

    Args:
        eval_pred: a ``(predictions, labels)`` pair of token-id arrays as handed
            over by the trainer. ``predictions`` may itself be a tuple, in which
            case only its first element is used.

    Returns:
        The dict produced by the ``exact_match`` metric.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a real token id, and cannot be decoded.
    # It can appear in the predictions as well as the labels (when batches of
    # different lengths are gathered), so both arrays are sanitised.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    return metric.compute(predictions=decoded_preds, references=decoded_labels)

In [None]:
# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    # Patience must be smaller than the number of evaluations to have any effect.
    # training_args evaluates once per epoch for 5 epochs, so the previous
    # patience of 5 could never stop the run. Stop after 2 evaluations without
    # improvement in the tracked metric.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [None]:
# Baseline: score the not-yet-fine-tuned model on the test split.
trainer.evaluate(eval_dataset=tokenized_dataset["test"])



{'eval_loss': 2.3595364093780518,
 'eval_exact_match': 0.05191873589164785,
 'eval_runtime': 41.5051,
 'eval_samples_per_second': 21.347,
 'eval_steps_per_second': 2.674}

In [None]:
# Release cached GPU memory left over from the baseline evaluation before training.
torch.cuda.empty_cache()

In [None]:
# Fine-tune the model; the returned TrainOutput is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Exact Match
0,1.4807,1.245798,0.131073
2,1.2612,1.229409,0.132203
4,1.2148,1.227856,0.132203


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=1290, training_loss=1.3401988125586695, metrics={'train_runtime': 1095.2883, 'train_samples_per_second': 18.858, 'train_steps_per_second': 1.178, 'total_flos': 3833062206996480.0, 'train_loss': 1.3401988125586695, 'epoch': 4.990328820116054})

In [None]:
# Score the trainer's model on the held-out test split after training and
# print the resulting metrics dict.
eval_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(eval_results)



{'eval_loss': 1.3321141004562378, 'eval_exact_match': 0.12641083521444696, 'eval_runtime': 38.1251, 'eval_samples_per_second': 23.239, 'eval_steps_per_second': 2.911, 'epoch': 4.990328820116054}


In [None]:
# Publish the tokenizer and fine-tuned weights to the "t5_news_qa" Hub repo.
# The access token is read from the HF_TOKEN environment variable instead of
# being embedded in the notebook, where it would leak through version control
# or sharing. If HF_TOKEN is unset, None is passed and push_to_hub is expected
# to fall back to the locally stored login credentials.
hf_token = os.environ.get("HF_TOKEN")
tokenizer.push_to_hub("t5_news_qa", token=hf_token)
model.push_to_hub("t5_news_qa", token=hf_token)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/andreanstev/t5_news_qa/commit/519ac0549be79da60b5019ed71fd67da8fe36412', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='519ac0549be79da60b5019ed71fd67da8fe36412', pr_url=None, pr_revision=None, pr_num=None)

# Inference

In [None]:
# Load the published checkpoint for inference. Use the GPU when one is visible
# and fall back to CPU otherwise, so the cell also runs on machines without CUDA.
qa = pipeline(
    "text2text-generation",
    model="andreanstev/t5_news_qa",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/118 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.69k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.67k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [None]:
# Sample news excerpt used as the inference context. The text is cut off
# mid-sentence and spans three physical lines inside the literal.
ARTICLE = """\'SINDH KALAY\', England (CNN) -- The aroma of freshly baking flatbread wafts through the air as a unit of British soldiers position themselves for a quick patrol around the village of Sindh Kalay. A British soldier on patrol in the mock Afghan village of Sindh Kalay. Market vendors hawk grapes and melons, as a group of village elders sit smoking water pipes and suspicious-looking men lurk beside battered motorcycles. What should the soldiers do? Conduct a weapons search? Approach the village elders first? In the complex political and cultural terrain of Afghanistan, what is the best course of action? Except this is not Afghanistan. It\'s Norfolk, England. Instead of the Hindu Kush mountains, it is the green ladscape and tidy farmhouses of the English countryside that stretch out behind them. Welcome to the British Army\'s state-of-the art training ground. It cost more than $20 million to build and every British soldier serving in Afghanistan will do his or her training here. "I think it\'s the closest thing you are going to get short of being in Afghanistan itself," says Col. David Colthup of the 2nd Battalion of the Yorkshire Regiment. His troops have already served one tour of duty in Afghanistan\'s Helmand province and are training for another. British troops serving in Helmand province are tasked with mentoring and training Afghan security forces. Not an easy job in a Taliban stronghold and Afghanistan\'s center of opium production. "Ultimately, a soldier joins the army and trains to fight. That\'s what a soldier trains to do. But today, it\'s a much, much more complex environment," explains Colthup. "The business of being able to interact either through an interpreter or through Afghan security forces, whether they are police or army. And to understand how the people operate and how we can interact better with them. Because ultimately, that\'s what it\'s about," he says. 
The most distinctive features of Sindh Kalay are the high three-meter walls that make up the village compound, creating narrow alleyways difficult for troops to patrol. The village is staffed with Afghan asylum-seekers, many of whom have fled the Taliban. They play the roles of market vendors, village elders and sometimes Afghan security forces. Several Afghan women are also on hand, useful for training British soldiers on the religious and cultural sensitivities of entering an Afghan home.  Watch British troops training in mock Afghan village » The Taliban insurgents are played by Nepalese Ghurkha soldiers authorized to handle weapons. They play their roles silently, unable to partake in the Pashtun banter among the Afghans. Fazel Beria is also an asylum-seeker from Afghanistan. He is responsible for recruiting and for creating the sights and smells of Sindh Kalay and is easily identifiable as the only Afghan in the market in Western clothes. He beams with pride walking down the bazaar and clearly relishes his role in training the British Army. "Everything with the culture comes up with the issue of hearts and minds," he explains. "If you want to win that, you need to know about their culture. You need to respect their culture, their religion and their way of life." He gives high marks to the soldiers training so far. After each exercise, the Afghan actors talk directly to the soldiers about what went wrong and what went right. Sometimes, it\'s the little things that count. "Yes, there have been quite a lot of surprises," Beria says. Like Afghan will sit cross legged for hours. "The British soldier cannot do that," he laughs. "The Afghan will be sitting very comfortable and the British soldier is not. So, they have to get used to it."  See photos of British troops on patrol in Sindh Kalay -- and for real in Afghanistan » Previously, the army trained on farmhouses and in urban neighborhoods that resembled Northern Ireland more than Afghanistan. 
But Sindh Kalay does more than mimic the physical reality of Afghanistan. It also mirrors the changing tactics on the ground. Troops are grilled in"""
question = "Where will British troops be deployed?"
# Same "Context / Question" layout the model was fine-tuned on. The variable is
# named `prompt` rather than `input` so the built-in input() is not shadowed for
# the rest of the kernel session.
# NOTE(review): the tokenizer warning reports 904 tokens for this prompt, while
# training inputs were truncated to 512. The model is being queried on a longer
# sequence than it was trained on; confirm this is intended.
prompt = f"""Context:
{ARTICLE}

Question:
{question}"""
res = qa(prompt)
res

Token indices sequence length is longer than the specified maximum sequence length for this model (904 > 512). Running this sequence through the model will result in indexing errors


[{'generated_text': 'Helmand province'}]