In [24]:
import math
from typing import Tuple
from transformers import BertTokenizer, BertModel
import numpy as np
import torch
from torch import nn, Tensor
from torch.utils.data import dataset

In [25]:
device = torch.device('cpu')#'cuda' if torch.cuda.is_available() else 'cpu')

In [26]:
class BERTSentenceEncoder:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)

    def encode_sentences(self, input_sentences):
        tokenized_input = self.tokenizer(input_sentences, return_tensors='pt', padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**tokenized_input)
        encoded_sentences = outputs.last_hidden_state

        # take only the CLS mode
        context_vector = encoded_sentences[:, 0,:]
        return tokenized_input, encoded_sentences, context_vector

In [41]:
# Define a simple model using PyTorch's TransformerDecoder
class SimpleTransformerDecoderModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward):
        super(SimpleTransformerDecoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer_decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model, nhead, dim_feedforward), 
            num_layers
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, memory):
        tgt = self.embedding(tgt) * np.sqrt(d_model)
        tgt = self.pos_encoder(tgt)
        output = self.transformer_decoder(tgt, memory)
        output = self.fc_out(output)
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.encoding = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        self.encoding[:, 0::2] = torch.sin(position * div_term)
        self.encoding[:, 1::2] = torch.cos(position * div_term)
        self.encoding = self.encoding.unsqueeze(0).transpose(0, 1)

    def forward(self, x):
        return x + self.encoding[:x.size(0), :]

# Parameters
vocab_size = 101  # For integers 0-100
d_model = 128  # smaller d_model for simplicity
nhead = 4  # ensure d_model is divisible by nhead
num_layers = 3
dim_feedforward = 512

# Model
model = SimpleTransformerDecoderModel(vocab_size, d_model, nhead, num_layers, dim_feedforward)

# Example input
tgt_seq = torch.tensor([[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]]).T  # Transpose for correct shape
memory = torch.rand(10, 1, d_model)  # Random memory, in practice comes from the encoder

# Forward pass
output = model(tgt_seq, memory)
print("Output shape:", output.shape)  # [sequence length, batch size, vocab size]

Output shape: torch.Size([10, 1, 101])


In [35]:
bert_encoder = BERTSentenceEncoder()

input_sentences = ["Your first sentence", "Your second sentence . This at the test"]

tokenized_input, encoded_sentences, context_vector = bert_encoder.encode_sentences(input_sentences)

print(encoded_sentences.shape)

torch.Size([2, 10, 768])


In [36]:
print(encoded_sentences.shape)
print(tokenized_input)
print(context_vector.shape)
#tokenized_input, encoded_sentences, context_vector

torch.Size([2, 10, 768])
{'input_ids': tensor([[ 101, 2115, 2034, 6251,  102,    0,    0,    0,    0,    0],
        [ 101, 2115, 2117, 6251, 1012, 2023, 2012, 1996, 3231,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([2, 768])


In [37]:
import torch
import torch.nn as nn
import torch.optim as optim

# Sample data
input_sequence = encoded_sentences#torch.randint(0, 100, (10,))  # Input sequence of length 10 with token IDs from 0 to 99
target_sequence = torch.randint(0, 110, (10,))  # Target sequence of length 10 with token IDs from 0 to 109

# Parameters
target_vocab_size = 101  # For integers 0-100
d_model = 128  # smaller d_model for simplicity
nhead = 4  # ensure d_model is divisible by nhead
num_layers = 3
dim_feedforward = 512


# Instantiate the model
model = TransformerDecoder(target_vocab_size, d_model, nhead, num_layers, dim_feedforward)


In [38]:
# Get BERT encoder output
with torch.no_grad():
    bert_output = model(input_sequence.unsqueeze(0))[0]

# Generate output during inference
output_sequence = model.inference(input_sequences, bert_output)

print("Input Text:", input_text)
print("Output Sequence:", output_sequence)

TypeError: TransformerDecoder.forward() missing 1 required positional argument: 'memory'