In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
squad_dataset = load_dataset("squad")

In [4]:
# Tokenize
tokenizer = T5Tokenizer.from_pretrained("t5-base")
tokenizer.pad_token = tokenizer.eos_token

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
for split in ['train', 'validation']:
    squad_dataset[split] = squad_dataset[split].map(
        lambda example: {
            **example,
            'input_ids': tokenizer.encode(
                f"question: {example['question']} context: {example['context']}",
                add_special_tokens=True,
                max_length=512,  # Set max_length to truncate long sequences.
                truncation=True,
            ),
            'target_ids': tokenizer.encode(
                example['answers']['text'][0],
                add_special_tokens=True,
                max_length=512,  # Set max_length to truncate long sequences.
                truncation=True,
            ) if example['answers'] else [],
        }
    )

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [6]:
# custom PyTorch Dataset
class SquadDataset(Dataset):
    def __init__(self, input_ids, target_ids):
        self.input_ids = input_ids
        self.target_ids = target_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.input_ids[idx], dtype=torch.long)
        target_ids = torch.tensor(self.target_ids[idx], dtype=torch.long)
        return {'input_ids': input_ids, 'labels': target_ids}

def collate_batch(batch):
    input_ids = pad_sequence([example['input_ids'] for example in batch], batch_first=True, padding_value=tokenizer.pad_token_id)
    labels = pad_sequence([example['labels'] for example in batch], batch_first=True, padding_value=tokenizer.pad_token_id)
    
    return {'input_ids': input_ids, 'labels': labels}

In [7]:
# Extract tokenized data
input_ids_train = squad_dataset['train']['input_ids']
target_ids_train = squad_dataset['train']['target_ids']

# Create SquadDataset
squad_train_dataset = SquadDataset(input_ids_train, target_ids_train)

# Create DataLoader
squad_train_dataloader = DataLoader(squad_train_dataset, batch_size=4, collate_fn=collate_batch)


In [8]:
# T5 model and optimizer
config = T5Config.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base", config=config)
optimizer = AdamW(model.parameters(), lr=5e-5)




In [9]:
# Create a linear schedule with warmup
num_warmup_steps = 0
num_training_steps = len(squad_train_dataloader)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)


In [10]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./t5-squad-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=squad_train_dataloader,
)


In [11]:
# Training loop
model.to(device)
model.train()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [12]:
for epoch in range(training_args.num_train_epochs):
    progress_bar = tqdm(squad_train_dataloader, desc=f"Epoch {epoch + 1}/{training_args.num_train_epochs}")

    for batch in progress_bar:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=inputs, labels=labels)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'Loss': loss.item()})


Epoch 1/3: 100%|██████████| 21900/21900 [57:05<00:00,  6.39it/s, Loss=0.317]   
Epoch 2/3: 100%|██████████| 21900/21900 [56:34<00:00,  6.45it/s, Loss=0.305]   
Epoch 3/3: 100%|██████████| 21900/21900 [1:02:16<00:00,  5.86it/s, Loss=0.16]   


In [16]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the trained model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-squad-finetuned/t5-squad-finetuned")
tokenizer = T5Tokenizer.from_pretrained("t5-squad-finetuned/t5-squad-finetuned")

# Example input
input_text = "question: Explain the concept of quantum entanglement in the context of theoretical physics. context: Quantum entanglement is a phenomenon in quantum mechanics where two or more particles become connected in such a way that the state of one particle cannot be independently described without reference to the state of the others, even when separated by large distances. This non-local correlation challenges classical intuitions and has been a subject of extensive study in theoretical physics, with applications in quantum computing and communication."

# Tokenize the input text
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate output
output = model.generate(input_ids, max_length=50, num_beams=4, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:", generated_text)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Generated Text: <pad> non-local correlation challenges classical intuitions


In [17]:
from nltk.translate.bleu_score import sentence_bleu

reference = ["The cat is on the mat"]
candidate = "The cat is sitting on the mat"

bleu_score = sentence_bleu(reference, candidate)
print(f"BLEU Score: {bleu_score}")


BLEU Score: 0.6787145495013929


In [25]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_bleu_score(reference_texts, candidate_text):
    # Tokenize reference and candidate texts
    reference_tokens = [tokenizer.tokenize(reference_text) for reference_text in reference_texts]
    candidate_tokens = tokenizer.tokenize(candidate_text)

    # Calculate BLEU score
    smoothing_function = SmoothingFunction().method1  # Choose a smoothing function
    bleu_score = sentence_bleu(reference_tokens, candidate_tokens, smoothing_function=smoothing_function)

    return bleu_score

# Example usage:
reference_texts = ["The cat is on the mat.", "There is a cat on the mat."]
candidate_text = "A cat is sitting on the mat."

bleu_score = calculate_bleu_score(reference_texts, candidate_text)
print(f"BLEU Score: {bleu_score}")


BLEU Score: 0.4111336169005197


In [13]:
model.save_pretrained("t5-squad-finetuned/t5-squad-finetuned")
tokenizer.save_pretrained("t5-squad-finetuned/t5-squad-finetuned")

('t5-squad-finetuned/t5-squad-finetuned\\tokenizer_config.json',
 't5-squad-finetuned/t5-squad-finetuned\\special_tokens_map.json',
 't5-squad-finetuned/t5-squad-finetuned\\spiece.model',
 't5-squad-finetuned/t5-squad-finetuned\\added_tokens.json')