<a href="https://colab.research.google.com/github/VridhiJ/Transformer-Architecture/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer Architecture

Implement the transformer architecture from scratch as outlined in the "Attention is All You Need" paper using PyTorch.

![](https://drive.google.com/uc?export=view&id=1qqIS189ikaXOpHUuwSMY67lCYyVTElAJ)

In [1]:
!git clone https://github.com/VridhiJ/Transformer-Architecture.git

Cloning into 'Transformer-Architecture'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [2]:
%cd Transformer-Architecture

/content/Transformer-Architecture


In [3]:
# importing required libraries
import torch.nn as nn
import torch
import torch.nn.functional as F
import math,copy,re
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
warnings.simplefilter("ignore")
print(torch.__version__)

2.6.0+cu124


In [4]:
# Set the seed value
seed_value = 0

# For CPU
torch.manual_seed(seed_value)
random.seed(seed_value)
np.random.seed(seed_value)

# For GPU (if using CUDA)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True

In the "Attention is All You Need" paper, the authors used the following functions to create positional encoding. A cosine function is used for odd time steps, and a sine function is used for even time steps.

<img src="https://miro.medium.com/max/524/1*yWGV9ck-0ltfV2wscUeo7Q.png">

<img src="https://miro.medium.com/max/564/1*SgNlyFaHH8ljBbpCupDhSQ.png">

```
pos -> refers to order in the sentence
i -> refers to position along embedding vector dimension
```

<img src="https://miro.medium.com/max/906/1*B-VR6R5vJl3Y7jbMNf5Fpw.png">

In [5]:
class PositionalEncoding(nn.Module):
  def __init__(self,max_seq_len, model_dimension_embed):
    """
    Args:
      max_seq_len (int): length of the input sequence
      model_dimension_embed (int): embedding dimension of the model
    """
    super(PositionalEncoding,self).__init__()
    self.embed_dim = model_dimension_embed

    # Initialize the positional encoding matrix using the above equation
    pe = torch.zeros(max_seq_len, self.embed_dim)
    for pos in range(max_seq_len):
        for i in range(self.embed_dim // 2):
            pe[pos, 2*i] = math.sin(pos / (10000 ** (2*i / self.embed_dim)))
            pe[pos, 2*i+1] = math.cos(pos / (10000 ** (2*i / self.embed_dim)))

    pe = pe.unsqueeze(0)
    self.register_buffer('pe', pe)

  def forward(self, x):
    """
    Args:
      x: input vector
    Returns:
      x: output vector with positional embedding added
    """
    # Weight the embeddings relatively larger
    x = x * math.sqrt(self.embed_dim)

    # Add positional encoding to the input embeddings
    seq_len = x.size(1)
    x = x + torch.autograd.Variable(self.pe[:, :seq_len, :], requires_grad=False)
    return x

In [6]:
class MultiHeadAttention(nn.Module):
  def __init__(self,embed_dim=512, n_heads=8):
    """
    Args:
      embed_dim: embedding dimension
      n_heads: number of attention heads
    """
    super(MultiHeadAttention,self).__init__()

    self.embed_dim = embed_dim # 512 dim
    self.n_heads = n_heads # 8 dim
    self.single_head_dim = embed_dim // n_heads # 512 / 8 = 64, each key, query, and value head will be 64d

    # Initialize key, query and value matrices
    self.key_matrix = nn.Linear(self.embed_dim, self.embed_dim, bias = False)
    self.query_matrix = nn.Linear(self.embed_dim, self.embed_dim, bias = False)
    self.value_matrix = nn.Linear(self.embed_dim, self.embed_dim, bias = False)

    # Initialize output projection matrix
    self.out = nn.Linear(self.embed_dim, self.embed_dim)

  def forward(self, key, query, value, mask=None):
    """
    Args:
      key: key vector
      query: query vector
      value: value vector
      mask: mask to be applied to the attention scores(for decoder)

    Returns:
      output: vector from multi-head attention
    """
    batch_size = key.size(0)
    seq_length = key.size(1)
    seq_length_query = query.size(1)

    # Apply linear transformation to the key, query and value matrices
    k = self.key_matrix(key)
    q = self.query_matrix(query)
    v = self.value_matrix(value)

    # Reshape key, query and value
    k = k.view(batch_size, seq_length, self.n_heads, self.single_head_dim)
    q = q.view(batch_size, seq_length_query, self.n_heads, self.single_head_dim)
    v = v.view(batch_size, seq_length, self.n_heads, self.single_head_dim)

    # Transpose key, query and value
    k = k.transpose(1,2)
    q = q.transpose(1,2)
    v = v.transpose(1,2)

    # Compute attention score
    k_adjusted = k.transpose(-1,-2)
    product = torch.matmul(q, k_adjusted)

    if mask is not None:
      if len(mask.size()) == 4:
        product = product.masked_fill(mask == 0, float("-1e20"))
      else:
        mask = mask.unsqueeze(1) # Adds an extra dimension for the heads
        product = product.masked_fill(mask == 0, float("-1e20"))

    product = product / math.sqrt(self.single_head_dim)
    scores = F.softmax(product, dim=-1)

    # Compute weighted sum of value vectors and run it through the last layer
    scores = torch.matmul(scores, v)
    concat = scores.transpose(1,2).contiguous().view(batch_size, seq_length_query, self.single_head_dim * self.n_heads)
    output = self.out(concat)

    return output

In [7]:
class TransformerBlock(nn.Module):
  def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
        super(TransformerBlock, self).__init__()
        """
        Args:
           embed_dim: dimension of the embedding
           expansion_factor: factor determining output dimension of the linear layer
           n_heads: number of attention heads
        """
        self.attention = MultiHeadAttention(embed_dim, n_heads)

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, expansion_factor * embed_dim),
            nn.ReLU(),
            nn.Linear(expansion_factor * embed_dim, embed_dim)
        )

        self.dropout1 = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.2)

  def forward(self, key, query, value):
        """
        Args:
           key: key vector
           query: query vector
           value: value vector

        Returns:
           norm2_out: output of transformer block
        """
        # Calculate attention output using self.attention
        attention_out = self.attention(key, query, value)  # 32x10x512

        # Do dropout, add residual connection and normalize
        intermediate = self.dropout1(attention_out)
        norm1_out = self.norm1(intermediate + value)

        # Pass through feed forward layer
        feed_fwd_out = self.feed_forward(norm1_out)  # 32x10x512 -> 32x10x2048 -> 32x10x512

        # Do dropout, add residual connection and normalize
        intermediate2 = self.dropout2(feed_fwd_out)
        norm2_out = self.norm2(intermediate2 + norm1_out) # 32x10x512

        return norm2_out


class TransformerEncoder(nn.Module):
    """
    Args:
        seq_len : length of input sequence
        embed_dim: dimension of embedding
        num_layers: number of encoder layers
        expansion_factor: factor determining the number of linear layers in feed-forward layer
        n_heads: number of heads in multi-head attention

    Returns:
        out: output of the encoder
    """
    def __init__(self, seq_len, vocab_size, embed_dim, num_layers=2, expansion_factor=4, n_heads=8):
        super(TransformerEncoder, self).__init__()

        self.embedding_layer = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoder = PositionalEncoding(seq_len, embed_dim)

        self.layers = nn.ModuleList([TransformerBlock(embed_dim, expansion_factor, n_heads) for _ in range(num_layers)])

    def forward(self, x):
        # Apply the embedding layer
        embed_out = self.embedding_layer(x)  # 32x10 -> 32x10x512

        # Apply positional encoding
        out = self.positional_encoder(embed_out)  # 32x10x512

        # Pass through each TransformerBlock
        for layer in self.layers:
            out = layer(out, out, out)  # 32x10x512

        # Return the final output
        return out  # 32x10x512



In [8]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
        super(DecoderBlock, self).__init__()

        """
        Args:
           embed_dim: dimension of the embedding
           expansion_factor: factor determining output dimension of the linear layer
           n_heads: number of attention heads
        """
        self.attention = MultiHeadAttention(embed_dim, n_heads=8)
        self.norm = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(0.2)
        self.transformer_block = TransformerBlock(embed_dim, expansion_factor, n_heads)

    def forward(self, key, query, value, mask):
        """
        Args:
           key: key vector
           query: query vector
           value: value vector
           mask: mask to be given for multi-head attention

        Returns:
           out: output of transformer block
        """
        # Implement masked attention with the given mask
        attention = self.attention(query, query, query, mask=mask)  # 32x10x512
        # Do dropout, add residual connection, and then apply normalization
        intermediate = self.dropout(attention)
        query = self.norm(intermediate + query)

        # Pass through the transformer block
        out = self.transformer_block(key, query, value)

        return out

class TransformerDecoder(nn.Module):
    def __init__(self, target_vocab_size, embed_dim, seq_len, num_layers=2, expansion_factor=4, n_heads=8):
        super(TransformerDecoder, self).__init__()
        """
        Args:
           target_vocab_size: vocabulary size of the target
           embed_dim: dimension of embedding
           seq_len: length of input sequence
           num_layers: number of decoder layers
           expansion_factor: factor determining the number of linear layers in the feed-forward layer
           n_heads: number of heads in multi-head attention
        """
        self.word_embedding = nn.Embedding(target_vocab_size, embed_dim)
        self.position_embedding = PositionalEncoding(seq_len, embed_dim)

        self.layers = nn.ModuleList(
            [DecoderBlock(embed_dim, expansion_factor=4, n_heads=8) for _ in range(num_layers)]
        )
        self.fc_out = nn.Linear(embed_dim, target_vocab_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, enc_out, mask):
        """
        Args:
            x: input vector from target
            enc_out: output from encoder layer
            trg_mask: mask for decoder self-attention

        Returns:
            out: output vector
        """
        x = self.word_embedding(x)  # 32x10x512

        x = self.position_embedding(x)  # 32x10x512

        x = self.dropout(x)

        for layer in self.layers:
            x = layer(enc_out, x, enc_out, mask)

        out = F.softmax(self.fc_out(x), dim=-1)

        return out

In [9]:
class Transformer(nn.Module):
    def __init__(
        self,
        embed_dim,
        src_vocab_size,
        target_vocab_size,
        seq_length,
        num_labels,
        num_layers=2,
        expansion_factor=4,
        n_heads=8
    ):
        super(Transformer, self).__init__()

        """
        Args:
           embed_dim: dimension of embedding
           src_vocab_size: vocabulary size of source
           target_vocab_size: vocabulary size of target
           seq_length: length of input sequence
           num_layers: number of encoder layers
           expansion_factor: factor determining the number of linear layers in the feed-forward layer
           n_heads: number of heads in multi-head attention
        """

        self.target_vocab_size = target_vocab_size

        self.encoder = TransformerEncoder(
             seq_length, src_vocab_size, embed_dim,
            num_layers=num_layers, expansion_factor=expansion_factor, n_heads=n_heads
        )
        self.decoder = TransformerDecoder(
            target_vocab_size, embed_dim, seq_length,
            num_layers=num_layers, expansion_factor=expansion_factor, n_heads=n_heads
        )
        self.lm_head = nn.Linear(target_vocab_size, num_labels - 1)

    def make_trg_mask(self, trg):
        """
        Args:
            trg: target sequence

        Returns:
            trg_mask: target mask
        """
        # TODO: Implement the mask for the target sequence
        batch_size, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len), device=trg.device)).bool() # [trg_len, trg_len]
        trg_mask = trg_mask.unsqueeze(0).expand(batch_size, -1, -1) # [batch_size, trg_len, trg_len]
        return trg_mask

    def decode(self, src, trg):
        """
        Helper function for inference

        Args:
            src: input to encoder
            trg: input to decoder

        Returns:
            out_labels: final prediction of sequence
        """
        trg_mask = self.make_trg_mask(trg)
        enc_out = self.encoder(src)
        out_labels = []
        batch_size, seq_len = src.shape[0], src.shape[1]
        out = trg
        for i in range(seq_len):
            out = self.decoder(out, enc_out, trg_mask)  # bs x seq_len x vocab_dim
            out = out[:, -1, :].argmax(-1)
            out_labels.append(out.item())
            out = torch.unsqueeze(out, axis=0)
        return out_labels

    def forward(self, src, trg):
        """
        Args:
            src: input to encoder
            trg: input to decoder

        Returns:
            output: final vector which returns probabilities of each target word
        """
        # Implement the forward function for the Transformer class

        #Use src and trg to get the outputs from the encoder and decoder (make sure to pass in the mask where appropriate)
        trg_mask = self.make_trg_mask(trg)
        enc_out = self.encoder(src)
        dec_out = self.decoder(trg, enc_out, trg_mask)

        #Take the average across the sequence length of the decoder's output
        output = dec_out.mean(dim = 1)  # Pooling across the sequence length to get [batch_size, d_model]

        # Call the lm_head to get the predicted token for each batch
        output = self.lm_head(output) # Shape: [batch_size, 1]

        # Remove the extra dimension
        return output.squeeze(-1)  # Shape: [batch_size]

 We will be training the model on a binary classification problem that we will generate here

In [10]:
vocab_size = 20
num_layers = 6
seq_length= 12


  # let 0 be sos token and 1 be eos token

model = Transformer(
      embed_dim=512,
      src_vocab_size=vocab_size,
      target_vocab_size=vocab_size,
      seq_length=seq_length,
      num_labels=2,
      num_layers=num_layers,
      expansion_factor=4,
      n_heads=8
  )

In [11]:
def generate_synthetic_data(num_samples=1000, seq_length=10, vocab_size=100):
    # Ensure middle values follow a normal distribution
    middle_mean = vocab_size // 2
    middle_std = vocab_size // 4
    middle_values = torch.normal(middle_mean, middle_std, (num_samples, seq_length - 4))

    # Clip values to be within the valid range
    middle_values = torch.clamp(middle_values, min=10, max=vocab_size - 1).long()
    other_values = torch.randint(1, vocab_size, (num_samples, 2))  # Changed to vocab_size

    start = torch.zeros((num_samples, 1), dtype=torch.long)
    end = torch.ones((num_samples, 1), dtype=torch.long)

    data = torch.cat((start, other_values[:, :1], middle_values, other_values[:, 1:], end), dim=1)

    # Create target sequences by shifting source sequences to the right
    trg_data = torch.cat((start, data[:, :-1]), dim=1)

    # Generate labels: for example, label is 1 if the sequence contains a value > 18 in the middle part
    labels = (middle_values > 15).any(dim=1).long()

    return data, trg_data, labels

In [12]:
from torch.utils.data import TensorDataset, DataLoader

In [13]:
train_src, train_trg, train_labels = generate_synthetic_data(vocab_size=vocab_size)
test_src, test_trg, test_labels = generate_synthetic_data(vocab_size=vocab_size)

# Create DataLoaders
batch_size = 32
train_dataset = TensorDataset(train_src, train_trg, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_src, test_trg, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [14]:
print(torch.sum(train_labels))
print(train_labels.shape)

tensor(547)
torch.Size([1000])


In [15]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for src, trg, labels in test_loader:
      outputs = model(src, trg)  # Shape: [batch_size]
      outputs = outputs.squeeze()  # Shape: [batch_size]
      predictions = torch.round(torch.sigmoid(outputs))  # Shape: [batch_size]
      total += labels.size(0)
      correct += (predictions == labels).sum().item()

    accuracy = correct / (total)  # Multiply by sequence length to get the total number of elements
    print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.4920


In [16]:
criterion = nn.BCEWithLogitsLoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=0.00010). Accuracy came out to be 0.508
optimizer = torch.optim.Adam(model.parameters(), lr=0.00020) #Accuracy 0.821

In [17]:
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for src, trg, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(src, trg)
        loss = criterion(outputs, labels.float())  # BCEWithLogitsLoss expects float labels
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}')

Epoch 1/5, Loss: 0.6660
Epoch 2/5, Loss: 0.6415
Epoch 3/5, Loss: 0.6381
Epoch 4/5, Loss: 0.6377
Epoch 5/5, Loss: 0.6356


In [18]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for src, trg, labels in test_loader:
        outputs = model(src, trg)  # Shape: [batch_size]
        outputs = outputs.squeeze()  # Shape: [batch_size]
        predictions = torch.round(torch.sigmoid(outputs))  # Shape: [batch_size]
        total += labels.size(0)
        correct += (predictions == labels).sum().item()

    print(correct)
    accuracy = correct / (total)  # Multiply by sequence length to get the total number of elements
    print(f"Test Accuracy: {accuracy:.4f}")


821
Test Accuracy: 0.8210


# Improving Transformer Architecture with SwiGLU Activation

Replacing ReLU (Rectified Linear Unit) activation function with SwiGLU (SwiGLU: Swish-Gated Linear Unit), which has been adopted in models like LLaMA.

## Understanding Activation Functions

Activation functions are crucial components in neural networks that introduce non-linearity into the model, enabling it to learn complex patterns. The ReLU function is a popular choice due to its simplicity and effectiveness. It is defined as:

$ \text{ReLU}(x) = \max(0, x) $

While ReLU has been successful, it has limitations such as the "dying ReLU" problem where neurons can become inactive and only output zero for any input.

SwiGLU, introduced in 2020, is one such replacement activation function. The name SwiGLU is derived from two other activation functions: Swish and GLU. Let's explore these components in detail.
<hr>

<h3><strong>Swish Activation Function</strong></h3>

Swish is a non-linear activation function defined as follows:

$$ \text{Swish}(x) = x \cdot \sigma(\beta x) $$

where $\sigma$ represents the sigmoid function and $\beta$ is a learnable parameter. Swish can outperform the ReLU activation function because it provides a smoother transition around 0, potentially leading to better optimization.

<h3><strong>Gated Linear Unit (GLU)</strong></h3>

Gated Linear Units (GLUs) are neural network layers defined as the component-wise product of two linear transformations, one of which is activated by a sigmoid function. The GLU can be represented by the following equation:

$$ \text{GLU}(x) = \sigma(W_1 x + b) \otimes (V x + c) $$

GLUs have proven effective in capturing long-range dependencies in sequences, addressing some of the vanishing gradient problems associated with other gating mechanisms like those in LSTMs and GRUs.

<h3><strong>SwiGLU</strong></h3>

SwiGLU combines the concepts of Swish and GLU. Instead of using a sigmoid activation function, SwiGLU uses Swish with $\beta = 1$. The resulting formula for SwiGLU is:

$$ \text{SwiGLU}(x) = \text{Swish}(W_1 x + b) \otimes (V x + c) $$



In [26]:
# Implement forward function of the SwiGLU class
class SwiGLU(nn.Module):

    def __init__(self, w1, w2, w3) -> None:
        super().__init__()
        self.w1 = w1
        self.w2 = w2
        self.w3 = w3

    def forward(self, x):
        # Implement forward function of SwiGLU
        x1 = F.linear(x, self.w1.weight.T)
        x2 = F.linear(x, self.w2.weight)
        hidden = F.silu(x1) * x2

        return F.linear(hidden, self.w3.weight)

## Integrate SwiGLU and Compare Performance

1. **Update the TransformerBlock:** Replace the ReLU activation function in the `TransformerBlock` with the `SwiGLU` function.

2. **Update the Main Transformer Class:** Ensure that all components of the `Transformer` class are using the updated `TransformerBlock` with `SwiGLU`.

3. **Train and Evaluate:** Train your updated Transformer model on the provided dataset and evaluate its performance. Compare the performance of the Transformer model using SwiGLU with the original Transformer model using ReLU.

In [27]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
        super(TransformerBlock, self).__init__()
        """
        Args:
           embed_dim: dimension of the embedding
           expansion_factor: factor determining output dimension of the linear layer
           n_heads: number of attention heads
        """
        self.attention = MultiHeadAttention(embed_dim, n_heads)

        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)

        hidden_dim = 4 * embed_dim # 4x512
        hidden_dim = int(2 * hidden_dim / 3)
        hidden_dim = expansion_factor * ((hidden_dim + expansion_factor - 1) // expansion_factor)

        self.swi_glu = SwiGLU(
            nn.Linear(expansion_factor*embed_dim, hidden_dim, bias=False), # 2048x2048
            nn.Linear(hidden_dim, expansion_factor*embed_dim, bias=False), #2048x2048
            nn.Linear(expansion_factor*embed_dim, hidden_dim, bias=False) #2048x2048
        )

        # TODO: Use swiglu along with linear layers
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim), #2048
            #nn.ReLU(),
            self.swi_glu,
            nn.Linear(hidden_dim, embed_dim) #512

        )

        self.dropout1 = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.2)

    def forward(self, key, query, value):
        """
        Args:
           key: key vector
           query: query vector
           value: value vector

        Returns:
           norm2_out: output of transformer block
        """
        attention_out = self.attention(key, query, value)  # 32x10x512

        # TODO: Do something similar as the encoder in the previous homework
        intermediate = self.dropout1(attention_out)
        norm1_out = self.norm1(intermediate + value)

        # Use SwiGLU in feed-forward layer
        feed_fwd_out = self.feed_forward(norm1_out)

        intermediate2 = self.dropout2(feed_fwd_out)
        norm2_out = self.norm2(intermediate2 + norm1_out)

        return norm2_out


In [28]:
vocab_size = 20
num_layers = 6
seq_length= 12


# let 0 be sos token and 1 be eos token

model2 = Transformer(
    embed_dim=512,
    src_vocab_size=vocab_size,
    target_vocab_size=vocab_size,
    seq_length=seq_length,
    num_labels=2,
    num_layers=num_layers,
    expansion_factor=4,
    n_heads=8
)

In [29]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model2.parameters(), lr=0.00020) # Accuracy 0.817

In [30]:
num_epochs = 5
for epoch in range(num_epochs):
    model2.train()
    epoch_loss = 0
    for src, trg, labels in train_loader:
        optimizer.zero_grad()
        outputs = model2(src, trg)
        loss = criterion(outputs, labels.float())  # BCEWithLogitsLoss expects float labels
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/len(train_loader):.4f}')

Epoch 1/5, Loss: 0.6675
Epoch 2/5, Loss: 0.6325
Epoch 3/5, Loss: 0.6314
Epoch 4/5, Loss: 0.6334
Epoch 5/5, Loss: 0.6316


In [31]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for src, trg, labels in test_loader:
        outputs = model2(src, trg)  # Shape: [batch_size]
        outputs = outputs.squeeze()  # Shape: [batch_size]
        predictions = torch.round(torch.sigmoid(outputs))  # Shape: [batch_size]
        total += labels.size(0)
        correct += (predictions == labels).sum().item()

    print(correct)
    accuracy = correct / (total)  # Multiply by sequence length to get the total number of elements
    print(f"Test Accuracy: {accuracy:.4f}")


817
Test Accuracy: 0.8170
