# Model Components
1. Embedding Layer: Converts input token indices to dense vectors.
2. Positional Encoding: Adds position information to embeddings to maintain the sequence order.
3. Encoder and Decoder Layers: Core processing units in the transformer.
4. Output Layer: Converts decoder output to token probabilities for generating text.

In [9]:
! conda install pytorch torchvision torchaudio cpuonly -c pytorch

In [8]:
import math

import torch
import torch.nn as nn

ModuleNotFoundError: No module named 'torch'

## Embedding Layer

In [1]:

class Embedding(nn.Module):
    """Token embedding lookup whose output is scaled by ``sqrt(d_model)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Width of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        # Kept on the instance because forward() needs it for the scaling
        # factor; the constructor argument is not visible there.
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Return the embeddings of the indices in ``x`` times ``sqrt(d_model)``."""
        return self.embedding(x) * math.sqrt(self.d_model)


ModuleNotFoundError: No module named 'torch'

In [None]:
class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position table to the input.

    The table is precomputed once with shape ``(1, max_len, d_model)``:
    even columns hold ``sin(position * div_term)`` and odd columns hold
    ``cos(position * div_term)``.

    Args:
        d_model: Size of the last dimension of the table.
        max_len: Number of positions precomputed.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim div_term to the number of cosine slots.
        table[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # A buffer follows the module through .to()/.cuda()/.half(), which a
        # plain tensor attribute does not. persistent=False keeps it out of
        # state_dict so the set of checkpoint keys is unchanged.
        self.register_buffer("encoding", table.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the first ``x.size(1)`` rows of the table.

        Dimension 1 of ``x`` is treated as the position axis; the table is
        broadcast over dimension 0.
        """
        return x + self.encoding[:, :x.size(1)]


In [None]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute ``softmax(query @ key^T / sqrt(d_k)) @ value``.

    Args:
        query: Tensor whose last dimension ``d_k`` sets the scaling factor.
        key: Tensor whose last two dimensions are transposed before the
            product with ``query``.
        value: Tensor multiplied by the attention weights.
        mask: Optional tensor broadcastable to the score tensor. Positions
            where ``mask == 0`` are excluded from attention.

    Returns:
        A tuple ``(output, attention_weights)``.
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # Use the most negative value the score dtype can represent. A fixed
        # -1e9 is out of range for float16 and makes masked_fill overflow.
        fill_value = torch.finfo(scores.dtype).min
        scores = scores.masked_fill(mask == 0, fill_value)

    attention_weights = torch.softmax(scores, dim=-1)
    return torch.matmul(attention_weights, value), attention_weights
