# Fine-Tuning pre-trained T5 Question-Answering model by Christian Di Maio and Giacomo Nunziati

In [None]:
# Use this because of dependancy error
!pip uninstall transformers accelerate

!pip install transformers[torch]


Found existing installation: transformers 4.41.2
Uninstalling transformers-4.41.2:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.10/dist-packages/transformers-4.41.2.dist-info/*
    /usr/local/lib/python3.10/dist-packages/transformers/*
Proceed (Y/n)? y
  Successfully uninstalled transformers-4.41.2
[0mCollecting transformers[torch]
  Downloading transformers-4.41.2-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (f

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
tokenizer = AutoTokenizer.from_pretrained("MaRiOrOsSi/t5-base-finetuned-question-answering")
model = AutoModelForSeq2SeqLM.from_pretrained("MaRiOrOsSi/t5-base-finetuned-question-answering")


# Loading SQuAD v1.1 dataset from datasets library

In [None]:
# Install this and restart the run-time because of pyarrow dependency error
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19

## Prepare SQuAD v1.1 dataset

In [None]:
from datasets import load_dataset

squad = load_dataset("squad")


Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

## Tokenize data

In [None]:
def preprocess_function(examples):
    inputs = [q + " " + c for q, c in zip(examples["question"], examples["context"])]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)

    # Tokenize the targets
    targets = [answer['text'][0] for answer in examples['answers']]
    with tokenizer.as_target_tokenizer():
        model_inputs["labels"] = tokenizer(targets, max_length=64, padding="max_length", truncation=True)["input_ids"]
    return model_inputs

# Apply the preprocessing function to the dataset
# tokenized_squad = squad.map(preprocess_function, batched=True)

In [None]:
# Apply the preprocessing function to the dataset
tokenized_squad = squad.map(preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# The codes below are to make sure that the dataset is as required
# Function to decode tokenized examples
def decode_example(tokenized_example):
    input_ids = tokenized_example['input_ids']
    labels = tokenized_example['labels']
    input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
    target_text = tokenizer.decode(labels, skip_special_tokens=True)
    return input_text, target_text

# Inspect the first few examples
for i in range(3):
    input_text, target_text = decode_example(tokenized_squad['train'][i])
    print(f"Example {i+1}:")
    print(f"Input: {input_text}")
    print(f"Target: {target_text}")
    print("\n")

Example 1:
Input: To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France? Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
Target: Saint Bernadette Soubirous


Example 2:
Input: What is in front of the Notre Dame Main Building? Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue o

# Just using 5000 data items because of resource constraints

In [None]:
from datasets import load_dataset
import random

# Load the SQuAD dataset
squad = load_dataset("squad")

# Shuffle the dataset
squad = squad.shuffle(seed=42)

# Take a subset of 5000 examples for training
train_dataset = squad["train"].select(range(5000))

# Take a subset of 1000 examples for validation
validation_dataset = squad["validation"].select(range(1000))

# Print sizes of subsets
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")


Train dataset size: 5000
Validation dataset size: 1000


In [None]:
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_validation_dataset = validation_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/VivekaHackathon2024/QAmodelsTrained",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,    # Adjust batch size as needed
    per_device_eval_batch_size=8,     # Adjust batch size as needed
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/VivekaHackathon2024/QAlogs',
    logging_steps=100,
)


# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_validation_dataset,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)



Epoch,Training Loss,Validation Loss
1,0.3226,0.233877
2,0.1144,0.077044


Epoch,Training Loss,Validation Loss
1,0.3226,0.233877
2,0.1144,0.077044
3,0.0698,0.071023


{'eval_loss': 0.07102292776107788, 'eval_runtime': 47.3901, 'eval_samples_per_second': 21.101, 'eval_steps_per_second': 2.638, 'epoch': 3.0}


# Inference

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Step 1: Load Model and Tokenizer from Google Drive
model_path = '/content/drive/MyDrive/VivekaHackathon2024/QAmodelsTrained/checkpoint-1500'
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)




In [15]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text2text-generation", model=model, tokenizer = tokenizer)

In [16]:
import re

def clean_and_format_text(text):
    # Remove extra whitespace and newlines
    text = ' '.join(text.split())

    # Optionally, normalize punctuation (depends on tokenizer requirements)
    text = re.sub(r'([.,!?])', r' \1 ', text)
    text = re.sub(r'\s{2,}', ' ', text)  # Remove multiple spaces

    # Lowercase the text (if necessary)
    text = text.lower()

    return text


In [72]:
context = '''
Lung cancer is a kind of cancer that starts as a growth of cells in the lungs. The lungs are two spongy organs in the chest that control breathing.

Lung cancer is the leading cause of cancer deaths worldwide.

People who smoke have the greatest risk of lung cancer. The risk of lung cancer increases with the length of time and number of cigarettes smoked. Quitting smoking, even after smoking for many years, significantly lowers the chances of developing lung cancer. Lung cancer also can happen in people who have never smoked.

Products & Services
A Book: Mayo Clinic Family Health Book
Show more products from Mayo Clinic
Symptoms
Lung cancer typically doesn't cause symptoms early on. Symptoms of lung cancer usually happen when the disease is advanced.

Signs and symptoms of lung cancer that happen in and around the lungs may include:

A new cough that doesn't go away.
Chest pain.
Coughing up blood, even a small amount.
Hoarseness.
Shortness of breath.
Wheezing.
Signs and symptoms that happen when lung cancer spreads to other parts of the body may include:

Bone pain.
Headache.
Losing weight without trying.
Loss of appetite.
Swelling in the face or neck.
'''
question = "How can I be safe from lung cancer?"


In [73]:
input_text = f"question: {question} context: {context}"

In [74]:
result = pipe(input_text)

# Perform text generation (which in this case will answer the question)
generated_text = pipe(input_text, max_length = 200)

# Extract the generated answer from the output
answer = generated_text[0]['generated_text'].strip()

# Print the question, context, and answer
print("Question:", question)
print("Context:", context)
print("Answer:", answer)

Question: How can I be safe from lung cancer?
Context: 
Lung cancer is a kind of cancer that starts as a growth of cells in the lungs. The lungs are two spongy organs in the chest that control breathing.

Lung cancer is the leading cause of cancer deaths worldwide.

People who smoke have the greatest risk of lung cancer. The risk of lung cancer increases with the length of time and number of cigarettes smoked. Quitting smoking, even after smoking for many years, significantly lowers the chances of developing lung cancer. Lung cancer also can happen in people who have never smoked.

Products & Services
A Book: Mayo Clinic Family Health Book
Show more products from Mayo Clinic
Symptoms
Lung cancer typically doesn't cause symptoms early on. Symptoms of lung cancer usually happen when the disease is advanced.

Signs and symptoms of lung cancer that happen in and around the lungs may include:

A new cough that doesn't go away.
Chest pain.
Coughing up blood, even a small amount.
Hoarseness.


# Thank You!!