<a href="https://colab.research.google.com/github/VladimirApter/ml4se_course_2025/blob/main/homeworks/ml4se_course_hw5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Модель: ChatGPT 5**

**Промпт:**
Реализуй transformer Decoder с помощью PyTorch

In [3]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

# ---------------------------
# Positional Encoding
# ---------------------------
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    The table is computed once for ``max_len`` positions and stored in a
    non-trainable buffer ``pe`` of shape ``(1, max_len, d_model)``. Even
    feature indices hold sines, odd feature indices hold cosines.

    Args:
        d_model: Size of the feature (last) dimension. May be odd.
        max_len: Number of positions the precomputed table covers.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine half must be trimmed to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` with the positional table added along dimension 1.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model``.

        Raises:
            ValueError: If the sequence is longer than the precomputed table.
        """
        seq_len = x.size(1)
        max_len = self.pe.size(1)
        if seq_len > max_len:
            # Without this check the slice below is shorter than x and the
            # addition either fails obscurely or (max_len == 1) broadcasts
            # position 0 onto every token.
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {max_len}"
            )
        return x + self.pe[:, :seq_len, :]


# ---------------------------
# Scaled Dot-Product Attention
# ---------------------------
def attention(q, k, v, mask=None):
    """Compute scaled dot-product attention.

    Args:
        q: Queries, ``(..., q_len, dim)``.
        k: Keys, ``(..., k_len, dim)``.
        v: Values, ``(..., k_len, v_dim)``.
        mask: Optional tensor broadcastable to ``(..., q_len, k_len)``.
            Positions equal to 0 are hidden from the query.

    Returns:
        A tuple ``(output, weights)``: the weighted sum of values and the
        attention weights. A query whose keys are all hidden gets all-zero
        weights and an all-zero output row.
    """
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(q.size(-1))
    if mask is None:
        attn = F.softmax(scores, dim=-1)
        return torch.matmul(attn, v), attn

    hidden = mask == 0
    scores = scores.masked_fill(hidden, float('-inf'))
    attn = F.softmax(scores, dim=-1)
    # A row consisting only of -inf softmaxes to NaN. Hidden positions already
    # carry weight 0 in every other row, so forcing them to 0 only changes the
    # fully hidden rows and keeps NaN from propagating downstream.
    attn = attn.masked_fill(hidden, 0.0)
    return torch.matmul(attn, v), attn


# ---------------------------
# Multi-Head Attention
# ---------------------------
class MultiHeadAttention(nn.Module):
    """Multi-head attention on top of the module-level ``attention`` function.

    Queries are projected from one input and keys/values from another, so the
    same module serves self-attention (both inputs equal) and cross-attention.

    Args:
        d_model: Feature size of inputs and outputs; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, (
            f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
        )
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, q_input, kv_input, mask=None):
        """Attend from ``q_input`` to ``kv_input``.

        Args:
            q_input: ``(batch, q_len, d_model)`` tensor the queries come from.
            kv_input: ``(batch, k_len, d_model)`` tensor the keys and values
                come from.
            mask: Optional mask forwarded unchanged to ``attention``; it must
                broadcast against ``(batch, num_heads, q_len, k_len)``.

        Returns:
            A tuple of the ``(batch, q_len, d_model)`` output and the
            per-head attention weights returned by ``attention``.
        """
        batch, q_len, d_model = q_input.size()

        def split_heads(x):
            # (batch, len, d_model) -> (batch, num_heads, len, head_dim)
            return x.view(batch, -1, self.num_heads, self.head_dim).transpose(1, 2)

        q = split_heads(self.q_proj(q_input))
        k = split_heads(self.k_proj(kv_input))
        v = split_heads(self.v_proj(kv_input))

        out, attn = attention(q, k, v, mask)
        # Merge heads back: transpose makes the tensor non-contiguous, so it
        # has to be made contiguous before view can flatten the head axes.
        out = out.transpose(1, 2).contiguous().view(batch, q_len, d_model)
        return self.out(out), attn


# ---------------------------
# Feed-Forward Network
# ---------------------------
class FeedForward(nn.Module):
    """Two-layer perceptron applied to the last dimension of its input.

    The pipeline is ``Linear(d_model -> hidden_dim)``, ``ReLU``,
    ``Linear(hidden_dim -> d_model)``, held in ``self.net``.

    Args:
        d_model: Input and output feature size.
        hidden_dim: Width of the intermediate layer.
    """

    def __init__(self, d_model, hidden_dim=128):
        super().__init__()
        # Built in pipeline order so parameter initialisation consumes the
        # random generator in the same sequence as the layers are applied.
        widen = nn.Linear(d_model, hidden_dim)
        activation = nn.ReLU()
        narrow = nn.Linear(hidden_dim, d_model)
        self.net = nn.Sequential(widen, activation, narrow)

    def forward(self, x):
        """Run ``x`` through the stored pipeline and return the result."""
        transformed = self.net(x)
        return transformed


# ---------------------------
# Transformer Decoder Layer
# ---------------------------
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention, feed-forward.

    Each sub-layer is wrapped as ``LayerNorm(x + sublayer(x))``, i.e. the
    residual is added first and normalisation is applied afterwards.

    Args:
        d_model: Feature size of the decoder stream.
        num_heads: Number of heads in both attention modules.
        hidden_dim: Width of the feed-forward hidden layer.
    """

    def __init__(self, d_model, num_heads, hidden_dim=128):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, hidden_dim)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, enc_out, tgt_mask=None, memory_mask=None):
        """Apply the three sub-layers in order.

        Args:
            x: Decoder stream, ``(batch, tgt_len, d_model)``.
            enc_out: Encoder output that cross-attention reads keys and
                values from, ``(batch, src_len, d_model)``.
            tgt_mask: Optional mask for self-attention (0 = hidden).
            memory_mask: Optional mask for cross-attention (0 = hidden), for
                example to hide encoder padding. ``None`` attends to every
                encoder position, which is the previous behaviour.

        Returns:
            Tensor with the same shape as ``x``.
        """
        self_ctx, _ = self.self_attn(x, x, mask=tgt_mask)
        x = self.norm1(x + self_ctx)

        cross_ctx, _ = self.cross_attn(x, enc_out, mask=memory_mask)
        x = self.norm2(x + cross_ctx)

        x = self.norm3(x + self.ffn(x))
        return x


# ---------------------------
# Transformer Decoder
# ---------------------------
class TransformerDecoder(nn.Module):
    """Token-level transformer decoder that produces vocabulary logits.

    Pipeline: embedding, positional encoding, ``num_layers`` decoder layers
    under a causal mask, then a linear projection to ``vocab_size``.

    Args:
        vocab_size: Number of distinct token ids; also the output width.
        d_model: Feature size of the decoder stream.
        num_heads: Attention heads per layer.
        num_layers: Number of stacked decoder layers.
        max_len: Number of positions the positional table covers.
        hidden_dim: Feed-forward hidden width used in every layer.
    """

    def __init__(self, vocab_size, d_model=32, num_heads=4, num_layers=2,
                 max_len=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList(
            [DecoderLayer(d_model, num_heads, hidden_dim) for _ in range(num_layers)]
        )
        self.output = nn.Linear(d_model, vocab_size)

    def forward(self, tgt, enc_out):
        """Decode ``tgt`` against ``enc_out``.

        Args:
            tgt: Integer token ids, ``(batch, seq_len)``.
            enc_out: Encoder output handed to every layer's cross-attention;
                presumably ``(batch, src_len, d_model)`` - confirm with caller.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``.
        """
        x = self.pos_encoding(self.embedding(tgt))

        seq_len = tgt.size(1)
        # Lower-triangular mask: position i may look at positions <= i only.
        # Allocated directly on the input's device to avoid a host-to-device
        # copy on every forward pass.
        causal = torch.tril(torch.ones(seq_len, seq_len, device=tgt.device))
        # Two leading singleton axes so it broadcasts over batch and heads.
        tgt_mask = causal.unsqueeze(0).unsqueeze(1)

        for layer in self.layers:
            x = layer(x, enc_out, tgt_mask=tgt_mask)

        return self.output(x)


# ---------------------------
# Usage example without an encoder
# ---------------------------
if __name__ == "__main__":
    # Toy sizes for a quick smoke run of the decoder.
    batch_size, seq_len_src, seq_len_tgt = 2, 10, 8
    vocab_size, d_model = 50, 32

    # Random tensor standing in for a real encoder's output.
    enc_out = torch.randn(batch_size, seq_len_src, d_model)
    # Random target token ids in [0, vocab_size).
    tgt = torch.randint(
        low=0, high=vocab_size, size=(batch_size, seq_len_tgt)
    )

    decoder = TransformerDecoder(vocab_size, d_model=d_model)
    out = decoder(tgt, enc_out)

    print(out)


tensor([[[-1.7912e-01,  2.5063e-01, -3.7944e-01, -9.9683e-01, -5.7249e-01,
          -5.7281e-01, -2.5439e-01, -1.8484e-01, -4.5297e-01,  2.8060e-01,
           3.0251e-02,  1.2683e+00,  4.7186e-01, -1.4607e-02,  5.0784e-01,
           5.1652e-03, -1.1180e+00,  1.7665e+00, -7.8176e-02, -1.6068e-01,
           3.5998e-01,  2.4024e-01,  1.4772e-01,  5.0254e-01, -9.8214e-02,
           1.9857e-01, -2.2004e-01, -3.5355e-01, -6.4590e-01, -7.4408e-01,
          -5.7827e-02, -8.7296e-01,  1.9951e-01,  6.8790e-01,  5.6589e-01,
           1.7189e-01,  7.3011e-01, -2.5535e-01, -7.9195e-02,  1.5903e-01,
          -3.9241e-01,  2.9310e-01,  3.4093e-01, -6.3920e-01,  6.3215e-02,
           1.5241e-01,  8.0693e-01, -7.2311e-02,  1.5255e+00,  2.4122e-01],
         [ 5.9022e-02,  1.4557e-01, -1.1977e+00, -1.6888e-01, -7.5865e-01,
           2.0378e-01,  5.3450e-02, -8.9729e-01, -9.1389e-02,  5.1869e-01,
          -5.2297e-01,  9.9579e-01, -3.1255e-02,  1.2901e-01,  2.1577e-01,
           1.3955e-01, -