In [None]:
#task 1 
import math

import torch
import torch.nn as nn

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into ``n_heads``
    heads of width ``d_model // n_heads``, attended independently, merged
    back together and passed through a final output projection.

    Args:
        d_model: Width of the input and output feature dimension.
        n_heads: Number of attention heads; must divide ``d_model`` evenly.

    Raises:
        ValueError: If ``n_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model: int, n_heads: int) -> None:
        super().__init__()
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})"
            )
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.linear_out = nn.Linear(d_model, d_model)

    def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, mask=None) -> torch.Tensor:
        """Attend from ``q`` over ``k``/``v``.

        Args:
            q: Query tensor of shape ``(batch, q_len, d_model)``.
            k: Key tensor of shape ``(batch, k_len, d_model)``.
            v: Value tensor of shape ``(batch, k_len, d_model)``.
            mask: Optional tensor; positions where ``mask == 0`` are excluded
                from attention. Accepted layouts are anything broadcastable
                to ``(batch * n_heads, q_len, k_len)`` (e.g. ``(q_len, k_len)``),
                a per-batch ``(batch, q_len, k_len)`` mask, or a 4-D mask
                broadcastable to ``(batch, n_heads, q_len, k_len)``.

        Returns:
            Tensor of shape ``(batch, q_len, d_model)``.
        """
        batch_size = q.size(0)
        q = self.split_heads(self.linear_q(q))
        k = self.split_heads(self.linear_k(k))
        v = self.split_heads(self.linear_v(v))

        # Scale by sqrt(d_head) so the logits do not grow with head width.
        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.d_head ** 0.5)

        if mask is not None:
            scores = self._mask_scores(scores, mask, batch_size)

        # NOTE: a query row whose keys are all masked yields NaN after softmax.
        attn_weights = nn.functional.softmax(scores, dim=-1)
        attn_output = torch.matmul(attn_weights, v)
        return self.linear_out(self.combine_heads(attn_output))

    def _mask_scores(self, scores: torch.Tensor, mask: torch.Tensor, batch_size: int) -> torch.Tensor:
        """Fill masked positions of ``(batch * n_heads, q_len, k_len)`` scores with -inf."""
        blocked = mask == 0
        flat_batch = scores.size(0)

        # A 3-D mask whose leading dim is the true batch size (and not the
        # flattened batch * heads size) is a per-batch mask: add a head axis.
        if blocked.dim() == 3 and blocked.size(0) == batch_size != flat_batch:
            blocked = blocked.unsqueeze(1)

        if blocked.dim() == 4:
            q_len, k_len = scores.shape[-2:]
            per_head = scores.reshape(batch_size, self.n_heads, q_len, k_len)
            per_head = per_head.masked_fill(blocked, float('-inf'))
            return per_head.reshape(flat_batch, q_len, k_len)

        return scores.masked_fill(blocked, float('-inf'))

    def split_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Reshape ``(batch, seq, d_model)`` to ``(batch * n_heads, seq, d_head)``."""
        batch_size, seq_len, _ = x.size()
        per_head = x.view(batch_size, seq_len, self.n_heads, self.d_head).transpose(1, 2)
        return per_head.contiguous().view(batch_size * self.n_heads, seq_len, self.d_head)

    def combine_heads(self, x: torch.Tensor) -> torch.Tensor:
        """Reshape ``(batch * n_heads, seq, d_head)`` back to ``(batch, seq, n_heads * d_head)``."""
        flat_batch, seq_len, _ = x.size()
        batch_size = flat_batch // self.n_heads
        per_head = x.view(batch_size, self.n_heads, seq_len, self.d_head).transpose(1, 2)
        return per_head.contiguous().view(batch_size, seq_len, -1)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence of embeddings.

    A ``(1, max_len, d_model)`` table is precomputed once and stored as the
    non-trainable buffer ``pe``: even feature columns hold sines and odd
    columns hold cosines of ``position / 10000 ** (column_pair / d_model)``.

    Args:
        d_model: Feature width of the embeddings (even or odd).
        max_len: Longest sequence length the table covers.
    """

    def __init__(self, d_model: int, max_len: int = 512) -> None:
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns,
        # so trim the angle table to fit the cosine slots.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A buffer follows the module across devices and into state_dict
        # without being treated as a learnable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``; ``seq_len`` must
                not exceed the ``max_len`` given at construction.
        """
        return x + self.pe[:, : x.size(1)]

class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Width of the input and output features.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1) -> None:
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the network to the last dimension of ``x`` and return the result."""
        hidden = nn.functional.relu(self.linear1(x))
        return self.linear2(self.dropout(hidden))

class TransformerBlock(nn.Module):
    """One post-norm transformer layer: self-attention then feed-forward.

    Each sub-layer's output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Feature width of the block's input and output.
        n_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used on both residual branches and
            inside the feed-forward sub-layer.
    """

    def __init__(self, d_model: int, n_heads: int, d_ff: int, dropout: float = 0.1) -> None:
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        # Forward the configured rate so dropout=0.0 really disables dropout
        # everywhere in the block instead of leaving the 0.1 default inside.
        self.ff = FeedForward(d_model, d_ff, dropout=dropout)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor:
        """Run self-attention and the feed-forward network over ``x``.

        Args:
            x: Input tensor; used as query, key and value of the attention.
            mask: Optional attention mask handed unchanged to the attention
                sub-layer.

        Returns:
            Tensor with the same shape as ``x``.
        """
        attended = self.attention(x, x, x, mask)
        x = self.norm1(x + self.dropout(attended))
        transformed = self.ff(x)
        return self.norm2(x + self.dropout(transformed))

class GPT2(nn.Module):
    """Stack of transformer blocks over token embeddings with a vocabulary head.

    Token ids are embedded, combined with sinusoidal position encodings,
    passed through ``n_layers`` transformer blocks and projected to
    per-token vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Embedding / hidden width.
        n_heads: Attention heads per block.
        d_ff: Hidden width of each block's feed-forward sub-layer.
        n_layers: Number of transformer blocks.
        max_len: Longest sequence length the position table supports.
    """

    def __init__(
        self,
        vocab_size: int,
        d_model: int = 768,
        n_heads: int = 12,
        d_ff: int = 3072,
        n_layers: int = 12,
        max_len: int = 512,
    ) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len)
        self.transformer_blocks = nn.ModuleList(
            [TransformerBlock(d_model, n_heads, d_ff) for _ in range(n_layers)]
        )
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor:
        """Map token ids to vocabulary logits.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.
            mask: Optional attention mask passed unchanged to every block.
                NOTE: no causal mask is built here; with ``mask=None`` every
                position attends to every other position, so callers that
                need autoregressive behaviour must supply a causal mask.

        Returns:
            Logits of shape ``(batch, seq_len, vocab_size)``.
        """
        hidden = self.positional_encoding(self.embedding(x))
        for block in self.transformer_blocks:
            hidden = block(hidden, mask)
        return self.fc(hidden)