# Getting started

In [56]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [53]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}!")

Using cpu!


# Prepare data

In [54]:
path_to_data = "data/moby_dick.txt"
all_data = []
with open(path_to_data, 'r', encoding="utf-8") as f:
    for line in f:
        all_data.append(line.strip())

text = ''.join(all_data)
sentences = text.split('.')[2:] # Remove the chapter

In [55]:
sentence_word = [sentence.split() for sentence in sentences]
sentence_word.sort(key=lambda x: len(x))
sentence_word = sentence_word[1:] # Remove empty sentence

# Get maximum length of sentence
print(max(len(sentence) for sentence in sentence_word))

# Get smallest length of sentence
print(min(len(sentence) for sentence in sentence_word))

361
1


In [57]:
word2idx = {'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, '<UNK>': 3}
curr = 4
for sentence in sentence_word:
    for word in sentence:
        if word not in word2idx:
            word2idx[word] = curr
            curr += 1

idx2word = [''] * len(word2idx)
for token, idx in word2idx.items():
    idx2word[idx] = token

def process_sentence(sentence):
    output = [word2idx['<SOS>']]
    for word in sentence:
        if word not in word2idx:
            word = '<UNK>'
        output.append(word2idx[word])

    output.append(word2idx['<EOS>'])
    return output

def add_padding(X, batch_size=32):
    num_sentences = len(X)
    pad_idx = word2idx['<PAD>'] 
    for i in range(0, num_sentences, batch_size):
        idx = min(num_sentences-1, i + batch_size)
        max_length = len(proc_sentence[idx])
        for j in range(i, idx):
            missing = max_length - len(proc_sentence[j])
            proc_sentence[j].extend([pad_idx]*(missing))
    
    return X

class LMDataset(Dataset):
    def __init__(self, X):
        self.X = X
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx])

batch_size = 32
proc_sentence = [process_sentence(sentence) for sentence in sentence_word]
padded_sentences = add_padding(proc_sentence, batch_size)
dataset = LMDataset(padded_sentences)
dataloader = DataLoader(dataset, shuffle=False, batch_size=batch_size)

In [61]:
for X in dataloader:
    print(X.shape)
    break

torch.Size([32, 3])


# Architecture

In [126]:
class MultiAttentionHead(nn.Module):
    def __init__(self, num_heads, hidden_size):
        super().__init__()
        head_dim = hidden_size // num_heads

        # Weight matrices for queries, keys, values
        self.W_q = nn.Parameter(torch.randn(num_heads, hidden_size, head_dim))
        self.W_k = nn.Parameter(torch.randn(num_heads, hidden_size, head_dim))
        self.W_v = nn.Parameter(torch.randn(num_heads, hidden_size, head_dim))

        self.hidden_size = hidden_size
        self.head_dim = head_dim
        self.num_heads = num_heads

        # Output linear projection
        self.output_proj = nn.Parameter(torch.randn(hidden_size, hidden_size))

    def forward(self, X, padding_mask):
        """
        Forward pass for multi-head attention.

        Inputs:
            X            : (batch_size, seq_len, hidden_size)
            padding_mask : (batch_size, seq_len)  -> 1 for real tokens, 0 for padding
        """
        seq_len = padding_mask.shape[-1]

        # Create attention mask
        mask_matrix = padding_mask.unsqueeze(-1) * padding_mask.unsqueeze(-2)  # (batch, seq_len, seq_len)
        attention_mask = mask_matrix.unsqueeze(1).expand(-1, self.num_heads, -1, -1)
        attention_mask = torch.where(attention_mask == 1, 0.0, -float('inf'))

        # Add head dimension to input
        X_heads = X.unsqueeze(1)  # (batch, 1, seq_len, hidden_size)

        # Compute Q, K, V
        Q = torch.matmul(X_heads, self.W_q)
        K = torch.matmul(X_heads, self.W_k)
        V = torch.matmul(X_heads, self.W_v)

        # Compute attention scores
        K_transposed = K.transpose(-2, -1)
        scores = torch.matmul(Q, K_transposed)
        scaled_scores = scores / (self.head_dim ** 0.5)

        # Apply mask and softmax
        masked_scores = scaled_scores + attention_mask
        att_weights = torch.softmax(masked_scores, dim=-1)

        # Compute attention output
        att_output = torch.matmul(att_weights, V).reshape(-1, seq_len, self.num_heads * self.head_dim)

        # Final linear projection
        projected_output = torch.matmul(att_output, self.output_proj)

        return projected_output

    

class Transformer(nn.Module):
    def __init__(self, voc_size, embed_size, num_heads, depth):
        super().__init__()
        self.embed = nn.Embedding(voc_size, embed_size)
        

    def forward():
        pass

In [125]:
# Example dimensions
batch_size = 2    # number of sequences
seq_len = 5       # tokens per sequence
hidden_size = 32   # embedding dimension
h = 8 # number of heads
head = MultiAttentionHead(h, hidden_size)

# Create random input
X = torch.randn(batch_size, seq_len, hidden_size)
att_mask = torch.randint(0, 2, (batch_size, seq_len))
output = head(X, att_mask)



print("Shape of X:", X.shape)
print(output.shape)

Shape of X: torch.Size([2, 5, 32])
torch.Size([2, 5, 32])
