In [1]:
!pip install transformers #Using HuggingFace for getting the Google Flan-T5-small model
!pip install sentencepiece #T5 tokenizer uses SentencePiece tokenizer
!pip install transformers datasets evaluate

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m35.4 MB/s[0m eta [36m0:00:0

In [30]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoConfig
from transformers.data.metrics.squad_metrics import compute_exact, compute_f1

from datasets import load_metric, load_dataset
import torch
import tqdm

In [3]:
# Hub identifier of the checkpoint; the tokenizer and the seq2seq model are
# both resolved from this one name.
model_name = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [6]:
# Load the TopiOCQA dataset and keep a separate handle on its train and
# validation splits.
dataset = load_dataset('McGill-NLP/TopiOCQA')
train_dataset, val_dataset = dataset['train'], dataset['validation']

Downloading builder script:   0%|          | 0.00/4.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/55.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.77M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [7]:
# Switch the model to evaluation mode before any generation is run.
model.eval()

# Metric objects obtained through the `datasets` metric loader.
# NOTE(review): the scoring cells visible in this notebook rely on
# compute_exact / compute_f1 and never reference `metric` or `f1_metric` --
# confirm whether these two objects are still needed.
metric = load_metric("accuracy")
f1_metric = load_metric("f1")

# Accumulators for the generated answers and their gold references.
predictions, references = [], []

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

In [15]:
# Number of validation examples. len() is the idiomatic way to invoke __len__().
c = len(val_dataset)

In [16]:
# Generate one answer per validation example and record it next to the gold answer.
#
# The accumulators are reset here so that re-running this cell starts from a
# clean slate instead of appending a second copy of every prediction.
predictions = []
references = []

for example in tqdm.tqdm(val_dataset):
    rationale = example["Rationale"]
    question = example["Question"]
    reference = example["Answer"]

    # Prompt layout: the question first, then the rationale text as context.
    input_text = f"question: {question} context: {rationale}"

    # Examples are encoded one at a time, so there is nothing to pad against;
    # padding each prompt out to 512 tokens only adds masked positions for the
    # model to process. Truncation still caps over-long prompts at 512 tokens.
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    # Place the input tensors on the same device as the model.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # No gradients are needed for generation.
    with torch.no_grad():
        outputs = model.generate(**inputs)

    # Decode the first (only) generated sequence back to plain text.
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    predictions.append(answer)
    references.append(reference)

100%|██████████| 2514/2514 [36:06<00:00,  1.16it/s]


In [31]:
# Score every prediction against its gold answer (exact match and token-level F1).

# The two lists are filled in lock-step by the generation loop; fail loudly if
# they ever get out of sync instead of scoring mismatched pairs.
assert len(predictions) == len(references), (
    f"{len(predictions)} predictions vs {len(references)} references"
)

# Per-example results, in the same order as `references` / `predictions`.
all_exact_scores = []
all_f1_scores = []

# Pair the lists directly rather than indexing by len(val_dataset), so the
# scores always cover exactly the examples that were generated.
for gold_answer, predicted_answer in zip(references, predictions):
    # Both helpers are called with the gold answer first, the prediction second.
    exact_score = compute_exact(gold_answer, predicted_answer)
    f1_score = compute_f1(gold_answer, predicted_answer)

    all_exact_scores.append(exact_score)
    all_f1_scores.append(f1_score)

In [33]:
# Reduce the per-example scores to their arithmetic means.
exact_total = sum(all_exact_scores)
average_exact_score = exact_total / len(all_exact_scores)

f1_total = sum(all_f1_scores)
average_f1_score = f1_total / len(all_f1_scores)

# Report both aggregate numbers, exact match first.
for summary_line in (
    f"Average Exact Match (EM) score: {average_exact_score}",
    f"Average F1 score: {average_f1_score}",
):
    print(summary_line)

Average Exact Match (EM) score: 0.15672235481304694
Average F1 score: 0.3191207599427022
