### Importing Libraries

In [17]:
import numpy as np
import torch
import torch.nn as nn

### Dummy Data Generation

In [18]:
# Dimensions of the synthetic batch: N x A x B
# (N = batch_size, A = sequence_length, B = embedding_dim).
batch_size, sequence_length, embedding_dim = 100, 1024, 384

# Reproducibility setup: force deterministic cuDNN kernels with the auto-tuner
# switched off, then seed the CPU generator and all CUDA generators alike.
seed = 42
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Standard-normal samples laid out as (batch_size, sequence_length, embedding_dim).
dummy_input = torch.randn((batch_size, sequence_length, embedding_dim))

# Memory footprint: bytes per element times element count, divided by 1024**3.
data_size_gb = (dummy_input.element_size() * dummy_input.nelement()) / (1024 ** 3)

print("Data size (in GB): {:.2f} GB".format(data_size_gb))
print("Data shape:", dummy_input.size())

Data size (in GB): 0.15 GB
Data shape: torch.Size([100, 1024, 384])


In [19]:
# The tensor saved below is intended to be consumed from C++/CUDA code through LibTorch.
# NOTE(review): an earlier version of this sketch used
# `torch::Tensor t = torch::load("dummy_input.pt");`, which does not match the LibTorch
# API (`torch::load` fills an existing object: `torch::load(tensor, "file.pt")`).
# Files written by Python's `torch.save` are pickle archives and presumably need
# `torch::pickle_load` on the raw bytes — confirm against the LibTorch version in use.

# // Loading libraries
# #include <torch/torch.h>
# #include <fstream>
# #include <iterator>
# #include <vector>

# // Reading the raw bytes of the .pt file and unpickling them into a tensor
# std::ifstream file("dummy_input.pt", std::ios::binary);
# std::vector<char> bytes((std::istreambuf_iterator<char>(file)),
#                         std::istreambuf_iterator<char>());
# torch::Tensor dummy_input = torch::pickle_load(bytes).toTensor();

# // Moving the tensor to CPU
# dummy_input = dummy_input.to(torch::kCPU);

# // Moving the tensor to CUDA
# dummy_input = dummy_input.to(torch::kCUDA);

# Serialize the generated batch to "dummy_input.pt" in the current working directory.
torch.save(dummy_input, "dummy_input.pt")

In [21]:
# Scratch cell: builds a square (embedding_dim -> embedding_dim) linear layer only so
# that its repr is displayed; the instance is not bound to a name, so no later cell
# can reference it.
# NOTE(review): constructing the layer presumably still draws initial weights from the
# global RNG, so deleting this cell may change the random initialisation of layers
# created afterwards — confirm before removing.
nn.Linear(embedding_dim, embedding_dim)

Linear(in_features=384, out_features=384, bias=True)

### Single-Head Attention 

In [5]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are produced from the same input by three
    independent ``embed_dim -> embed_dim`` linear layers. Attention weights are
    ``softmax(Q @ K^T / sqrt(embed_dim))`` taken over the key axis.

    Args:
        embed_dim: Size of the last (feature) dimension of the input; also the
            size of the projected queries, keys and values.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim
        # Creation order (query, key, value) is kept so that parameter
        # initialisation consumes the RNG in the same sequence as before.
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(..., seq_length, embed_dim)``. The usual case
                is ``(batch_size, seq_length, embed_dim)``; an unbatched
                ``(seq_length, embed_dim)`` input or additional leading batch
                dimensions are accepted as well.

        Returns:
            Tuple ``(attention_output, attention_weights)`` with shapes
            ``(..., seq_length, embed_dim)`` and
            ``(..., seq_length, seq_length)``. Each row of
            ``attention_weights`` sums to 1 along the last dimension.
        """
        Q = self.query(x)  # (..., seq_length, embed_dim)
        K = self.key(x)    # (..., seq_length, embed_dim)
        V = self.value(x)  # (..., seq_length, embed_dim)

        # matmul + transpose of the last two axes instead of bmm + transpose(1, 2):
        # same computation for 3-D input, but not restricted to exactly 3 dims.
        scores = torch.matmul(Q, K.transpose(-2, -1))  # (..., seq_length, seq_length)
        scores = scores / (self.embed_dim ** 0.5)  # Scale by sqrt(embed_dim)
        attention_weights = self.softmax(scores)  # Normalise over the key axis

        # Weighted sum of values
        attention_output = torch.matmul(attention_weights, V)  # (..., seq_length, embed_dim)
        return attention_output, attention_weights

In [6]:
# Build the single-head attention module for the notebook's embedding size.
self_attention = SelfAttention(embedding_dim)

# Inference only: nothing here calls backward(), so gradient tracking is disabled.
# This avoids keeping an autograd graph over the large
# (batch, seq_length, seq_length) intermediates and makes the saved tensors
# plain data (requires_grad=False).
with torch.no_grad():
    self_attention_output, self_attention_weights = self_attention(dummy_input)

print("Output Shape:", self_attention_output.shape)
print("Attention Weights Shape:", self_attention_weights.shape)

# Persist both results next to the notebook as reference tensors.
torch.save(self_attention_output, "self_attention_output.pt")
torch.save(self_attention_weights, "self_attention_weights.pt")

Output Shape: torch.Size([100, 1024, 384])
Attention Weights Shape: torch.Size([100, 1024, 1024])


### Multi-Head Attention

In [7]:
# Three independent square projections, built in query -> key -> value order
# (the generator is consumed left to right during unpacking).
linear_query, linear_key, linear_value = (
    nn.Linear(embedding_dim, embedding_dim) for _ in range(3)
)

In [8]:
# 4-head attention; batch_first=True makes inputs/outputs (batch, seq_length, embed_dim).
multihead_attention = nn.MultiheadAttention(embed_dim=embedding_dim, num_heads=4, batch_first=True)

# Inference only: disable gradient tracking so no autograd graph is kept and the
# saved tensors are plain data (requires_grad=False).
with torch.no_grad():
    # Project the shared input into separate query / key / value tensors.
    query = linear_query(dummy_input)
    key = linear_key(dummy_input)
    value = linear_value(dummy_input)

    multihead_attention_output, multihead_attention_weights = multihead_attention(query, key, value)

# The printed weights shape has no per-head dimension: (batch, seq_length, seq_length).
print("Output Shape:", multihead_attention_output.shape)
print("Attention Weights Shape:", multihead_attention_weights.shape)

# Fix: save the multi-head results. The previous code wrote the single-head
# `self_attention_output` / `self_attention_weights` tensors into these files.
torch.save(multihead_attention_output, "multihead_attention_output.pt")
torch.save(multihead_attention_weights, "multihead_attention_weights.pt")

Output Shape: torch.Size([100, 1024, 384])
Attention Weights Shape: torch.Size([100, 1024, 1024])
