# Model Components
1. Embedding Layer: Converts input token indices to dense vectors.
2. Positional Encoding: Adds position information to the embeddings so that the model can take token order into account.
3. Encoder and Decoder Layers: Core processing units in the transformer.
4. Output Layer: Converts decoder output to token probabilities for generating text.

In [1]:
import torch
# Smoke test for the PyTorch install: build a 5x3 tensor of uniform random
# values and print it.
x = torch.rand((5, 3))
print(x)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):
  File "/opt/anaconda3/envs/fyp-convo-prediction/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/anaconda3/envs/fyp-convo-prediction/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/anaconda3/envs/fyp-convo-prediction/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/fyp-convo-prediction/lib/python3.9/site-packages/traitlets/config/appl

tensor([[0.7222, 0.6455, 0.3649],
        [0.8126, 0.5875, 0.2426],
        [0.4509, 0.0818, 0.6368],
        [0.7041, 0.5582, 0.2735],
        [0.2231, 0.5530, 0.5896]])


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import math
import torch.nn.functional as F


## Embedding Layer and Positional Encoding

In [3]:
class Embedding(nn.Module):
    """Token embedding lookup whose output is scaled by ``sqrt(d_model)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Width of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super(Embedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Keep the width on the instance so forward() does not depend on a
        # module-level ``d_model`` existing (or matching this layer's width).
        self.d_model = d_model

    def forward(self, x):
        """Look up the embeddings for the indices in ``x`` and scale them.

        Args:
            x: Integer tensor of token indices.

        Returns:
            Tensor with one trailing dimension of size ``d_model`` appended
            to the shape of ``x``.
        """
        return self.embedding(x) * math.sqrt(self.d_model)  # Scale embeddings by sqrt(d_model)

class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to a sequence of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``. The table is precomputed once for ``max_len``
    positions and stored in the ``encoding`` buffer with shape
    ``(1, max_len, d_model)``.

    Args:
        d_model: Width of the embeddings the encoding is added to.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # (max_len, ceil(d_model / 2))

        encoding = torch.zeros(max_len, d_model)
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even column,
        # so trim the cosine block to the number of odd columns.
        encoding[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Register as a buffer so the table follows the module across
        # .to(device) / .cuda() calls. persistent=False keeps it out of the
        # state_dict, so checkpoints saved before this change still load.
        self.register_buffer("encoding", encoding.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension is ``d_model`` (the table is sliced along
                dimension 1 and broadcast over dimension 0).

        Returns:
            Tensor of the same shape as ``x``.
        """
        return x + self.encoding[:, :x.size(1)]


In [4]:
def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

    Args:
        query: Tensor whose last dimension is ``d_k``.
        key: Tensor whose last two dimensions are transposed before being
            multiplied with ``query``.
        value: Tensor multiplied by the attention weights.
        mask: Optional tensor broadcastable to the score matrix. Positions
            where ``mask == 0`` are excluded from attention.

    Returns:
        Tuple ``(output, attention_weights)``: the weighted sum of ``value``
        and the softmax weights over the last dimension of the scores.
    """
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)

    if mask is not None:
        # Use the most negative finite value of the score dtype rather than a
        # hard-coded -1e9, which is not representable in float16. Staying
        # finite (not -inf) keeps fully masked rows uniform instead of NaN.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

    attention_weights = torch.softmax(scores, dim=-1)
    return torch.matmul(attention_weights, value), attention_weights

class MultiHeadAttention(nn.Module):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Query, key and value are each passed through their own linear layer,
    split into ``num_heads`` heads of width ``d_k = d_model // num_heads``,
    attended per head, merged back and passed through an output projection.

    Args:
        num_heads: Number of attention heads; must divide ``d_model``.
        d_model: Input and output feature width.
    """

    def __init__(self, num_heads, d_model):
        super().__init__()
        assert d_model % num_heads == 0
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.linear_q = nn.Linear(d_model, d_model)
        self.linear_k = nn.Linear(d_model, d_model)
        self.linear_v = nn.Linear(d_model, d_model)
        self.fc_out = nn.Linear(d_model, d_model)

    def _split_heads(self, projected, batch_size):
        """Reshape ``(batch, seq, d_model)`` to ``(batch, heads, seq, d_k)``."""
        per_head = projected.view(batch_size, -1, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, query, key, value, mask=None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query, key, value: Tensors whose dimension 0 is the batch and
                whose last dimension is ``d_model``.
            mask: Optional mask passed unchanged to
                ``scaled_dot_product_attention``.

        Returns:
            The output projection applied to the merged heads; the attention
            weights are discarded.
        """
        batch_size = query.size(0)

        # Project first, then split each projection into heads.
        q_heads = self._split_heads(self.linear_q(query), batch_size)
        k_heads = self._split_heads(self.linear_k(key), batch_size)
        v_heads = self._split_heads(self.linear_v(value), batch_size)

        context, _ = scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)

        # Move the head axis back next to d_k and merge the two.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, -1, self.num_heads * self.d_k)

        return self.fc_out(merged)


In [5]:
class PositionwiseFeedforward(nn.Module):
    """Two-layer feed-forward network: linear, ReLU, dropout, linear.

    Args:
        d_model: Input and output feature width.
        d_ff: Width of the hidden layer.
        dropout: Dropout probability applied after the ReLU.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.dropout = nn.Dropout(p=dropout)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)

    def forward(self, x):
        """Apply the network to ``x``; the last dimension must be ``d_model``."""
        hidden = self.linear1(x)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)


In [6]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward.

    Each sub-layer output goes through dropout, is added to the sub-layer
    input, and the sum is layer-normalised.

    Args:
        d_model: Feature width.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward network.
        dropout: Dropout probability, used on the residual branches and
            inside the feed-forward network.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(num_heads, d_model)
        self.feed_forward = PositionwiseFeedforward(d_model, d_ff, dropout)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Run the block on ``src`` using ``src_mask`` for self-attention."""
        # Self-attention sub-layer: residual add, then norm.
        attended = self.attention(src, src, src, src_mask)
        src = self.layer_norm1(src + self.dropout(attended))

        # Feed-forward sub-layer: residual add, then norm.
        transformed = self.feed_forward(src)
        return self.layer_norm2(src + self.dropout(transformed))


In [7]:
class Encoder(nn.Module):
    """Embedding, positional encoding and a stack of ``EncoderLayer`` blocks.

    Args:
        vocab_size: Size of the token vocabulary.
        d_model: Feature width.
        num_layers: Number of encoder layers in the stack.
        num_heads: Attention heads per layer.
        d_ff: Hidden width of each layer's feed-forward network.
        dropout: Dropout probability passed to every layer.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout):
        super().__init__()
        self.embedding = Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(EncoderLayer(d_model, num_heads, d_ff, dropout))

    def forward(self, src, src_mask):
        """Encode token indices ``src``; ``src_mask`` is passed to every layer."""
        hidden = self.positional_encoding(self.embedding(src))
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, src_mask)
        return hidden


In [8]:
class TransformerModel(nn.Module):
    """Encoder stack followed by a linear projection to the vocabulary.

    Args:
        vocab_size: Size of the token vocabulary (also the output width).
        d_model: Feature width of the encoder.
        num_layers: Number of encoder layers.
        num_heads: Attention heads per layer.
        d_ff: Hidden width of each feed-forward network.
        dropout: Dropout probability used throughout the encoder.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout):
        super().__init__()
        self.encoder = Encoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, src_mask):
        """Return per-position scores over the vocabulary for ``src``."""
        return self.fc_out(self.encoder(src, src_mask))


In [9]:
# Model hyperparameters.
vocab_size = 10_000  # Adjust according to your dataset
d_model = 512
num_layers = 6
num_heads = 8
d_ff = 2048
dropout = 0.1

# Build the model from the hyperparameters above.
model = TransformerModel(
    vocab_size=vocab_size,
    d_model=d_model,
    num_layers=num_layers,
    num_heads=num_heads,
    d_ff=d_ff,
    dropout=dropout,
)


In [10]:
# Cross-entropy loss and an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)


In [20]:
num_epochs = 10

# One pass over train_loader per epoch; the loss is printed every 100 batches.
for epoch in range(num_epochs):
    model.train()
    # Assuming train_loader is your data loader
    for batch_idx, (input_ids, targets) in enumerate(train_loader):
        src_mask = None  # Define src_mask if required by your task

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        outputs = model(input_ids, src_mask)
        # Flatten every leading dimension so each position is one sample.
        flat_logits = outputs.view(-1, outputs.size(-1))
        flat_targets = targets.view(-1)
        loss = criterion(flat_logits, flat_targets)

        loss.backward()
        optimizer.step()

        if batch_idx % 100 != 0:
            continue
        print(f"Epoch {epoch} Batch {batch_idx} Loss {loss.item()}")


NameError: name 'train_loader' is not defined

# Data Loader