In [155]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy corpus of customer-complaint sentences; it provides both the vocabulary
# and the single demo batch fed through the model.
sentences = [
    "The product arrived damaged and is unusable.",
    "I received the wrong item in my order.",
    "The delivery was significantly delayed.",
    "The customer service representative was rude and unhelpful.",
    "I was charged for an item I did not purchase.",
    "The product quality did not meet my expectations.",
    "My order was incomplete and missing several items.",
    "I encountered technical issues while using the product.",
    "The packaging was inadequate and caused the product to break.",
    "The website was difficult to navigate and place an order.",
    "I was overcharged for shipping and handling.",
    "The product I received was different from what was advertised.",
    "I did not receive a confirmation email for my order.",
    "The return process was too complicated and time-consuming.",
    "I had trouble reaching customer support via phone.",
    "The product was defective and stopped working after a few uses.",
    "I was not informed about the additional fees before purchasing.",
    "The item was missing from my shipment.",
    "The store's policies were not clearly stated on the website.",
    "The quality of customer support was poor and not satisfactory."
]

# Index 0 is reserved for an explicit padding token so that padded positions
# never alias a real word. The remaining words are sorted because iterating a
# set of strings yields a different order from run to run, which would make
# the token ids non-reproducible.
PAD_TOKEN = "[PAD]"
vocab = [PAD_TOKEN] + sorted({word for sentence in sentences for word in sentence.split()})
vocab_size = len(vocab)
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
pad_idx = word_to_idx[PAD_TOKEN]

# NOTE(review): BERT-base uses a hidden size of 768; confirm 786 is intended.
d_model = 786
d_ff = 16
# Every sentence is padded up to the longest one (whitespace tokenisation,
# punctuation stays attached to the word).
seq_len = max(len(sentence.split()) for sentence in sentences)
batch_size = len(sentences)
tokenized_sentences = [[word_to_idx[word] for word in sentence.split()] for sentence in sentences]
padded_sentences = [tokens + [pad_idx] * (seq_len - len(tokens)) for tokens in tokenized_sentences]
input_ids = torch.tensor(padded_sentences)

class Embeddings(nn.Module):
    """Sum of learned word embeddings and learned position embeddings.

    Args:
        vocab_size: number of rows in the word-embedding table.
        d_model: embedding width.
        seq_len: number of rows in the position-embedding table, i.e. the
            longest sequence this module can embed.
    """

    def __init__(self, vocab_size, d_model, seq_len):
        super().__init__()
        self.word_embeddings = nn.Embedding(vocab_size, d_model)
        self.position_embeddings = nn.Embedding(seq_len, d_model)

    def forward(self, input_ids):
        """Embed a batch of token ids and print the intermediate shapes.

        Args:
            input_ids: integer tensor of token ids; the last two dimensions
                are treated as (batch, sequence).

        Returns:
            Tensor of shape ``input_ids.shape + (d_model,)``.

        Raises:
            ValueError: if the sequence is longer than the position table.
        """
        word_embeds = self.word_embeddings(input_ids)

        # Take batch/sequence sizes from the input itself rather than from
        # module-level globals, so any batch that fits the table works.
        cur_len = input_ids.size(-1)
        max_len = self.position_embeddings.num_embeddings
        if cur_len > max_len:
            raise ValueError(
                f"sequence length {cur_len} exceeds the maximum supported length {max_len}"
            )

        # Build the position ids on the same device as the input so the
        # lookup also works when the model lives on an accelerator.
        position_ids = torch.arange(cur_len, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        position_embeds = self.position_embeddings(position_ids)
        embeddings = word_embeds + position_embeds

        print("Word Embeddings Shape:", word_embeds.shape)
        print("Position Embeddings Shape:", position_embeds.shape)
        print("Final Input Embeddings Shape:", embeddings.shape)
        print("\n")

        return embeddings

class EncoderLayer(nn.Module):
    """Single-head self-attention followed by a feed-forward network.

    Each of the two sub-layers is wrapped in a residual connection followed by
    layer normalisation. The forward pass prints intermediate shapes.

    Args:
        d_model: width of the input/output features.
        d_ff: hidden width of the feed-forward network.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        # One fused projection produces Q, K and V; it is split in forward().
        self.attention = nn.Linear(d_model, d_model * 3)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )

    def forward(self, x):
        """Run the layer on ``x`` and return a tensor of the same shape.

        Args:
            x: float tensor whose last dimension is ``d_model`` and whose
                second-to-last dimension is the sequence axis.

        Returns:
            The output of the second layer normalisation.
        """
        QKV = self.attention(x)
        Q, K, V = torch.chunk(QKV, 3, dim=-1)

        print("Q Shape:", Q.shape)
        print("K Shape:", K.shape)
        print("V Shape:", V.shape)

        # Scale by sqrt of the key width taken from the tensor itself, not
        # from a module-level global.
        scale = K.size(-1) ** 0.5
        scores = torch.matmul(Q, K.transpose(-2, -1)) / scale
        # Normalise over the key axis so every query gets a probability
        # distribution; without this the raw scores would be used as weights.
        attention_weights = F.softmax(scores, dim=-1)
        attention_output = torch.matmul(attention_weights, V)

        # The label is kept as-is for output compatibility; the tensor is the
        # attention output (weights applied to V).
        print("Attention Scores Shape:", attention_output.shape)
        print("\n")

        residual = x + attention_output
        norm1_output = self.layer_norm1(residual)

        print("Residual Matrix Shape:", residual.shape)
        print("Layer Norm 1 Output Shape:", norm1_output.shape)
        print("\n")

        ffn_output = self.ffn(norm1_output)

        # Report parameter shapes of the linear sub-modules only; idx is the
        # position inside the nn.Sequential, so the ReLU slot is skipped.
        for idx, layer in enumerate(self.ffn):
            if isinstance(layer, nn.Linear):
                print(f"FFN Layer {idx+1} Weight Shape: {layer.weight.shape}")
                print(f"FFN Layer {idx+1} Bias Shape: {layer.bias.shape}")

        print("FFN Output Shape:", ffn_output.shape)
        print("\n")

        residual2 = norm1_output + ffn_output
        norm2_output = self.layer_norm2(residual2)

        return norm2_output

class SimpleBERT(nn.Module):
    """Embedding layer followed by two stacked encoder layers.

    Args:
        vocab_size: forwarded to ``Embeddings``.
        d_model: feature width shared by the embeddings and both encoders.
        d_ff: feed-forward hidden width of each encoder layer.
        seq_len: forwarded to ``Embeddings``.
    """

    def __init__(self, vocab_size, d_model, d_ff, seq_len):
        super().__init__()
        self.embeddings = Embeddings(vocab_size, d_model, seq_len)
        self.encoder1 = EncoderLayer(d_model, d_ff)
        self.encoder2 = EncoderLayer(d_model, d_ff)

    def forward(self, input_ids):
        """Embed ``input_ids``, run both encoders in order, print each output shape."""
        hidden = self.embeddings(input_ids)
        stages = (
            ("Encoder 1 Output Shape:", self.encoder1),
            ("Encoder 2 Output Shape:", self.encoder2),
        )
        for label, encoder in stages:
            hidden = encoder(hidden)
            print(label, hidden.shape)

        return hidden

# Build the demo model from the hyper-parameters defined above and push the
# whole padded batch through it once; shapes are printed along the way.
model = SimpleBERT(
    vocab_size=vocab_size,
    d_model=d_model,
    d_ff=d_ff,
    seq_len=seq_len,
)
output = model(input_ids)

Word Embeddings Shape: torch.Size([20, 11, 786])
Position Embeddings Shape: torch.Size([20, 11, 786])
Final Input Embeddings Shape: torch.Size([20, 11, 786])


Q Shape: torch.Size([20, 11, 786])
K Shape: torch.Size([20, 11, 786])
V Shape: torch.Size([20, 11, 786])
Attention Scores Shape: torch.Size([20, 11, 786])


Residual Matrix Shape: torch.Size([20, 11, 786])
Layer Norm 1 Output Shape: torch.Size([20, 11, 786])


FFN Layer 1 Weight Shape: torch.Size([16, 786])
FFN Layer 1 Bias Shape: torch.Size([16])
FFN Layer 3 Weight Shape: torch.Size([786, 16])
FFN Layer 3 Bias Shape: torch.Size([786])
FFN Output Shape: torch.Size([20, 11, 786])


Encoder 1 Output Shape: torch.Size([20, 11, 786])
Q Shape: torch.Size([20, 11, 786])
K Shape: torch.Size([20, 11, 786])
V Shape: torch.Size([20, 11, 786])
Attention Scores Shape: torch.Size([20, 11, 786])


Residual Matrix Shape: torch.Size([20, 11, 786])
Layer Norm 1 Output Shape: torch.Size([20, 11, 786])


FFN Layer 1 Weight Shape: torch.Size([16, 7