In [1]:
import torch
from transformers import T5Tokenizer
from models import EmbeddingLayer, KnowledgeDistillation, PrimalDualEncoder
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Single source of truth for the checkpoint: the tokenizer and every model
# component in this notebook are built from the same name, so they cannot
# silently drift apart if the checkpoint is changed.
pretrained_t5_name = 't5-small'
d_model = 512  # for t5-small; update together with pretrained_t5_name

# model_max_length is passed explicitly instead of relying on the tokenizer default.
tokenizer = T5Tokenizer.from_pretrained(pretrained_t5_name, model_max_length=512)

In [3]:
# Toy (passage, answer, question) triple used as the running example
passage, answer, question = (
    "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France.",
    "Eiffel Tower",
    "What is the famous iron lattice tower located in Paris?",
)

In [4]:
# Tokenize the passage only (the answer and question are not tokenized in this cell).
# The result is kept as token strings; conversion to vocabulary ids happens separately.
passage_tokens = tokenizer.tokenize(passage)

In [5]:
# Create a mask for the input tokens
def create_distillation_mask(tokens, mask_rate=0.15, rng=None):
    """Randomly flag tokens for knowledge distillation.

    Every element of ``tokens`` is flagged independently with probability
    ``mask_rate``; exactly one random draw is made per token.

    Args:
        tokens: Iterable of tokens. Only the number of elements matters; the
            token values themselves are never inspected.
        mask_rate: Probability, in ``[0, 1]``, that a given token is flagged.
        rng: Optional object exposing a ``random()`` method (for example a
            ``random.Random(seed)`` instance) used to draw the samples. When
            ``None`` the module-level ``random`` generator is used, which is
            the original behaviour; pass a seeded generator to make the mask
            reproducible across kernel restarts.

    Returns:
        A list of ``bool`` with one entry per token; ``True`` marks a flagged
        token.

    Raises:
        ValueError: If ``mask_rate`` is not within ``[0, 1]``.
    """
    # The chained comparison also rejects NaN, which would silently produce
    # an all-False mask.
    if not 0.0 <= mask_rate <= 1.0:
        raise ValueError(f"mask_rate must be within [0, 1], got {mask_rate!r}")

    draw = random.random if rng is None else rng.random
    return [draw() < mask_rate for _ in tokens]

In [6]:
# Build the KD input: map the passage tokens to vocabulary ids and wrap them
# in an outer list so the resulting tensor carries a leading batch dimension.
input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(passage_tokens)])

In [7]:
# Task id convention: 0 for question generation, 1 for question answering, 2 for KD
task_id = 2  # Knowledge distillation

# One task id per input position. Built from input_ids itself so the shape
# and dtype always match the tensor it accompanies.
task_ids = torch.full_like(input_ids, task_id)

# Segment ids (0 for passage), one per input position. Also derived from
# input_ids rather than from passage_tokens, so both auxiliary tensors are
# sized from the same source and cannot disagree.
segment_ids = torch.zeros_like(input_ids)

In [8]:
# Forward pass through the embedding layer
# EmbeddingLayer comes from the project's `models` module; here it is built
# from the checkpoint name and d_model.
embedding_layer = EmbeddingLayer(pretrained_t5_name, d_model)
# The layer receives the token ids together with the per-token task and segment ids.
# NOTE(review): presumably all three must share the same (batch, seq_len) shape — confirm in models.py.
embeddings = embedding_layer(input_ids, task_ids, segment_ids)

In [9]:
def create_attention_mask(input_ids, pad_token_id=None):
    """Build a boolean mask that is True wherever ``input_ids`` is not padding.

    Args:
        input_ids: Tensor of token ids; compared element-wise against the pad id.
        pad_token_id: Id that marks padding positions. When ``None`` (the
            default) the pad id of the notebook-level ``tokenizer`` is used,
            which is the original behaviour.

    Returns:
        The element-wise result of ``input_ids != pad_token_id``; for a tensor
        input this is a boolean tensor of the same shape.

    Raises:
        ValueError: If no pad id is supplied and the tokenizer does not define one.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        # Comparing a tensor with None yields a plain Python bool rather than
        # an element-wise mask, so fail loudly instead of returning it.
        raise ValueError("pad_token_id is not defined; pass it explicitly")
    return input_ids != pad_token_id
# Mask for the batched passage ids; True marks positions that are not padding.
attention_mask = create_attention_mask(input_ids)

In [10]:
# Forward pass through PrimalDualEncoder
# PrimalDualEncoder comes from the project's `models` module and is built
# from the same checkpoint name as the embedding layer.
primal_dual_encoder = PrimalDualEncoder(pretrained_t5_name)
# The encoder consumes the embeddings (not the raw token ids) plus the padding mask.
# NOTE(review): the structure of encoder_outputs is defined in models.py — confirm there.
encoder_outputs = primal_dual_encoder(embeddings, attention_mask)

In [11]:
# Instantiate the KnowledgeDistillation
# (built from the checkpoint name, d_model and the tokenizer's vocabulary size)
kd_layer = KnowledgeDistillation(pretrained_t5_name, d_model, tokenizer.vocab_size)

# Create a mask for distillation
# One boolean per passage token, drawn at random with the default mask_rate.
# NOTE(review): no random seed is set in the visible cells, so this mask differs between runs.
distillation_mask = create_distillation_mask(passage_tokens)
# Wrapping the list in an outer list adds the batch dimension.
distillation_mask_tensor = torch.tensor([distillation_mask], dtype=torch.bool)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [12]:
# Run the distillation head on the token ids, the boolean distillation mask
# and the encoder outputs; it returns two values.
# NOTE(review): the meaning of y_en / y_pre is defined in models.py — confirm there.
y_en, y_pre = kd_layer(input_ids, distillation_mask_tensor, encoder_outputs)