In [1]:
import torch
if torch.backends.mps.is_available():
    print("MPS device found. Using GPU.")
else:
    print("MPS device not found. Using CPU.")


MPS device not found. Using CPU.


In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

torch.manual_seed(42)

# Download names file
# !wget https://raw.githubusercontent.com/hackerb9/ssa-baby-names/refs/heads/main/allnames.txt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(torch.cuda.is_available())

print(device)

# Load names
with open("allnames.txt", "r") as f:
    names = f.read().splitlines()
    names = [name.lower() for name in names]

# Define alphabet and mappings
alphabet = [' '] + sorted(list(set(''.join(names)))) + ['.']
itoc = {i: c for i, c in enumerate(alphabet)}
ctoi = {c: i for i, c in enumerate(alphabet)}

encode = lambda name : [ctoi[c] for c in name]
decode = lambda tokens : ''.join([itoc[i] for i in tokens])

# Create training and validation set
n=int(0.9*len(names))
train_data, val_data = random_split(names, [n, len(names)-n])


class NameDataset(Dataset):
    def __init__(self, names):
        self.names = names
        self.ctoi = ctoi
        self.alphabet_size = len(alphabet)

    def __len__(self):
        return len(self.names)

    def __getitem__(self, idx):
        name = self.names[idx]
        x = [self.ctoi[c] for c in name]  # Convert characters to indices
        y = x[1:] + [self.ctoi[' ']]  # The next character to predict (shifted version of x)
        x = torch.tensor(x).to(device)
        y = torch.tensor(y).to(device)
        return x, y  

# Define a function to pad sequences
def pad_sequences(batch):
    max_len = max([len(x) for x, _ in batch])  # Find the max length in the batch
    padded_x = []
    padded_y = []

    for x, y in batch:
        padded_x.append(F.pad(x, (0, max_len - len(x)), "constant", ctoi[' ']))  # Pad x
        padded_y.append(F.pad(y, (0, max_len - len(x)), "constant", ctoi['.']))  # Pad y

    # Stack the padded sequences to create the batch
    return torch.stack(padded_x), torch.stack(padded_y)


True
cuda


In [3]:
torch.manual_seed(42)

train_dataset = NameDataset(train_data)
val_dataset = NameDataset(val_data)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pad_sequences)

name = next(iter(train_loader)) # Tuple of (x, target)
print(decode(name[0].tolist()[0]))
print(name[0][0]) # grab the 0th name
print(name[1][0]) # grab the 0th target

wilona   
tensor([23,  9, 12, 15, 14,  1,  0,  0,  0], device='cuda:0')
tensor([ 9, 12, 15, 14,  1,  0, 27, 27, 27], device='cuda:0')


In [4]:
n_embd=len(alphabet)

x=torch.tensor(encode('laika')).unsqueeze(0) # to add batch dimension

xenc=F.one_hot(x, num_classes=n_embd).float()
print(xenc)

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])


In [5]:
print(xenc @ xenc.transpose(-2,-1)) # (5x28) * (28x5) -> (5x5)


tensor([[[1., 0., 0., 0., 0.],
         [0., 1., 0., 0., 1.],
         [0., 0., 1., 0., 0.],
         [0., 0., 0., 1., 0.],
         [0., 1., 0., 0., 1.]]])


In [6]:
print((xenc @ xenc.transpose(-2,-1)) @ xenc) # (1x5x5) x (1x5x28) -> (1x5x28)


tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])


In [7]:
(xenc @ xenc.transpose(-2,-1)).softmax(dim=-1)


tensor([[[0.4046, 0.1488, 0.1488, 0.1488, 0.1488],
         [0.1185, 0.3222, 0.1185, 0.1185, 0.3222],
         [0.1488, 0.1488, 0.4046, 0.1488, 0.1488],
         [0.1488, 0.1488, 0.1488, 0.4046, 0.1488],
         [0.1185, 0.3222, 0.1185, 0.1185, 0.3222]]])

In [8]:
attn = (xenc @ xenc.transpose(-2,-1)).softmax(dim=-1) @ xenc
print(attn)


tensor([[[0.0000, 0.2977, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.1488, 0.0000, 0.1488, 0.4046, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.6444, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.1185, 0.0000, 0.1185, 0.1185, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.2977, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.4046, 0.0000, 0.1488, 0.1488, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.2977, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
          0.0000, 0.1488, 0.0000, 0.4046, 0.1488, 0.0000, 0.0000, 0.0000,
          0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,

In [9]:
attn.argmax(dim=-1)


tensor([[12,  1,  9, 11,  1]])

In [10]:
B, T, C = xenc.shape
dk = C

query = nn.Linear(C, dk, bias=False)
key = nn.Linear(C, dk, bias=False) 
value = nn.Linear(C, dk, bias=False) 

Q = query(xenc) # B x T x dk
K = key(xenc) # B x T x dk
V = value(xenc) # B x T x dk

In [11]:
attn = ((Q @ K.transpose(-2,-1))/(dk**0.5)).softmax(dim=-1) @ V


In [12]:
decode(attn.argmax(dim=-1)[0].tolist())


'sssss'

In [13]:
for _ in range(10):  
  attn_probs = attn.softmax(dim=-1)  # Apply softmax to get probabilities over the vocabulary
  sampled_indices = torch.multinomial(attn_probs.view(-1, attn_probs.size(-1)), 1)
  print(decode(sampled_indices.T[0].tolist()))

cftuf
jtmed
unbpc
zxfrr
jzbqz
vuykl
emlfg
nm.uk
kmckr
btoss


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by the number of heads"
        self.num_heads = num_heads
        self.dk = embed_dim // num_heads
        
        # Linear layers for query, key, and value (in the case of cross-attention, separate inputs are used)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self, q, k, v):
        B, T, C = q.shape  # Assuming q, k, v have the same shape (B: batch size, T: sequence length, C: embedding dim)
        
        # Project Q, K, V using their respective linear layers
        q = self.q_proj(q)  # Shape: (B, T, C)
        k = self.k_proj(k)  # Shape: (B, T, C)
        v = self.v_proj(v)  # Shape: (B, T, C)
        
        # Reshape into (B, num_heads, T, dk)
        q = q.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        k = k.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        v = v.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        
        # Scaled dot-product attention
        attn_weights = (q @ k.transpose(-2, -1)) / (self.dk ** 0.5)  # (B, heads, T, T)
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_output = attn_weights @ v  # (B, heads, T, dk)
        
        # Combine heads back to (B, T, C)
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)
        
        # Final linear projection
        return self.out_proj(attn_output)


m = MultiHeadAttention(28,4)
attn = m(xenc,xenc,xenc)
attn.shape

torch.Size([1, 5, 28])

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=4*28, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        
        # Multi-Head Attention
        self.self_attention = MultiHeadAttention(d_model, nhead)
        
        # Feedforward layer
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),  # First fully connected layer
            nn.ReLU(),                          # Non-linearity
            nn.Linear(dim_feedforward, d_model)  # Second fully connected layer
        )
        
        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # Self-attention block
        attn_output = self.self_attention(src, src, src)
        src = self.norm1(src + attn_output)  # Add & Norm
        
        # Feedforward block
        ff_output = self.feedforward(src)
        src = self.norm2(src + self.dropout(ff_output))  # Add & Norm

        return src

encoder_layer = TransformerEncoderLayer(28, 4)
output = encoder_layer(xenc)
output.shape

torch.Size([1, 5, 28])

In [16]:
import torch
import math

class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=16):
        super(PositionalEncoding, self).__init__()
        
        # Create a long enough "position" tensor
        position = torch.arange(0, max_len).unsqueeze(1)  # (max_len, 1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * -(math.log(10000.0) / embed_dim))  # (embed_dim / 2)
        
        # Apply the sine and cosine functions
        pe = torch.zeros(max_len, embed_dim)
        pe[:, 0::2] = torch.sin(position * div_term)  # Apply sine to even indices
        pe[:, 1::2] = torch.cos(position * div_term)  # Apply cosine to odd indices
        
        # Register the positional encoding as a buffer (no gradient updates)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: Tensor of shape (batch_size, seq_len, embed_dim)
        return x + self.pe[:x.size(1)]  # Add the positional encoding to the input tensor

m = PositionalEncoding(28)
m.forward(xenc).shape

torch.Size([1, 5, 28])

In [17]:
B, T, C = xenc.shape

print(f"Unmasked attention:\n {(xenc @ xenc.transpose(-2,-1)).softmax(dim=-1)}")

wei = xenc @ xenc.transpose(-2,-1) 
wei = wei.masked_fill(torch.tril(torch.ones(T,T)) == 0, float('-inf')) 
wei = F.softmax(wei, dim=-1) 
    
print(f"Masked attention:\n {wei}")

Unmasked attention:
 tensor([[[0.4046, 0.1488, 0.1488, 0.1488, 0.1488],
         [0.1185, 0.3222, 0.1185, 0.1185, 0.3222],
         [0.1488, 0.1488, 0.4046, 0.1488, 0.1488],
         [0.1488, 0.1488, 0.1488, 0.4046, 0.1488],
         [0.1185, 0.3222, 0.1185, 0.1185, 0.3222]]])
Masked attention:
 tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2689, 0.7311, 0.0000, 0.0000, 0.0000],
         [0.2119, 0.2119, 0.5761, 0.0000, 0.0000],
         [0.1749, 0.1749, 0.1749, 0.4754, 0.0000],
         [0.1185, 0.3222, 0.1185, 0.1185, 0.3222]]])


In [18]:
class MaskedMultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0, "Embedding dimension must be divisible by the number of heads"
        self.num_heads = num_heads
        self.dk = embed_dim // num_heads
        
        # Linear layers for query, key, and value (in the case of cross-attention, separate inputs are used)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)

    def forward(self, q, k, v):
        B, T, C = q.shape  # Assuming q, k, v have the same shape (B: batch size, T: sequence length, C: embedding dim)
        
        # Project Q, K, V using their respective linear layers
        q = self.q_proj(q)  # Shape: (B, T, C)
        k = self.k_proj(k)  # Shape: (B, T, C)
        v = self.v_proj(v)  # Shape: (B, T, C)
        
        # Reshape into (B, num_heads, T, dk)
        q = q.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        k = k.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        v = v.view(B, T, self.num_heads, self.dk).transpose(1, 2)  # (B, heads, T, dk)
        
        # Scaled dot-product attention with mask
        attn_weights = (q @ k.transpose(-2, -1)) / (self.dk ** 0.5)  # (B, heads, T, T)
        attn_weights = attn_weights.masked_fill(torch.tril(torch.ones(T,T, device=q.device)) == 0, float('-inf'))
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_output = attn_weights @ v  # (B, heads, T, dk)
        
        # Combine heads back to (B, T, C)
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)
        
        # Final linear projection
        return self.out_proj(attn_output)

m = MaskedMultiHeadAttention(28,4)
attn = m(xenc,xenc,xenc)
attn.shape

torch.Size([1, 5, 28])

In [19]:
class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1):
        super(TransformerDecoderLayer, self).__init__()
        
        # Masked Multi-Head Attention
        self.self_attention = MaskedMultiHeadAttention(d_model, nhead)
        
        # Feedforward layer
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, dim_feedforward),  # First fully connected layer
            nn.ReLU(),                          # Non-linearity
            nn.Linear(dim_feedforward, d_model)  # Second fully connected layer
        )
        
        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # Self-attention block
        attn_output = self.self_attention(src, src, src)
        src = self.norm1(src + attn_output)  # Add & Norm
        
        # Feedforward block
        ff_output = self.feedforward(src)
        src = self.norm2(src + self.dropout(ff_output))  # Add & Norm

        return src

encoder_layer = TransformerDecoderLayer(28, 4)
output = encoder_layer(xenc)
output.shape

torch.Size([1, 5, 28])

In [20]:
class RandomNameGenerator(nn.Module):
  def __init__(self, d_model, nhead, nlayers, max_length):
    super().__init__()

    self.d_model = d_model
    self.nhead = nhead
    self.embed = nn.Embedding(len(alphabet), d_model)
#    self.pe = PositionalEncoding(d_model)
    self.wpe = nn.Embedding(max_length,d_model)
    self.decoder = nn.ModuleList([TransformerDecoderLayer(d_model, nhead) for _ in range(nlayers)])

    self.linear = nn.Linear(d_model, len(alphabet))
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, x):
    B, T = x.size()

    x = self.embed(x)
    
    #x = self.pe(x)
    pos = torch.arange(0, T, dtype=torch.long, device=x.device).unsqueeze(0) # shape (1, t)
    x = x + self.wpe(pos)
    
    for layer in self.decoder:
      x = layer(x)
    x = self.linear(x)
    return x
  
  @torch.no_grad()
  def generate(self, x, max_new_tokens):
    for _ in range(max_new_tokens):
      logits = self(x)
      logits = logits[:, -1, :]
      probs = self.softmax(logits)
      next_token = torch.multinomial(probs, num_samples=1)
      if next_token == ctoi[' ']:
        break
      x = torch.cat((x, next_token), dim=1)
    return x[:,1:] # drop the first seed character

torch.manual_seed(42)
m = RandomNameGenerator(32, 4,2,16).to(device)

print(decode(m.generate(torch.tensor([0]).unsqueeze(0).to(device),8).tolist()[0]))
print(f"Model Parameters: {sum(p.numel() for p in m.parameters())}")

irwuopbn
Model Parameters: 277084


In [21]:
optimizer = torch.optim.AdamW(m.parameters(), lr=5e-4, weight_decay=0.01, betas=(0.9, 0.99), eps=1e-8)

for epoch in range(10):
  for xenc_batch, y_batch in train_loader:
    optimizer.zero_grad()

    logits = m(xenc_batch)
    logits = logits.view(-1, logits.size(-1))  # Shape: [batch_size * max_seq_len, vocab_size]
    y_batch = y_batch.view(-1)  # Shape: [batch_size * max_seq_len]

    # Compute the loss using CrossEntropyLoss
    loss = F.cross_entropy(logits, y_batch, ignore_index=ctoi['.'])
    
    # Backward pass
    m.zero_grad(set_to_none=True) # make sure ALL the gradients are set to zero
    loss.backward()

    optimizer.step()

  print(f"Epoch {epoch}, Loss: {loss}")

Epoch 0, Loss: 2.114407777786255
Epoch 1, Loss: 1.8800314664840698
Epoch 2, Loss: 1.8707400560379028
Epoch 3, Loss: 1.900682806968689
Epoch 4, Loss: 1.8413975238800049
Epoch 5, Loss: 1.9123412370681763
Epoch 6, Loss: 1.8829976320266724
Epoch 7, Loss: 1.8685030937194824
Epoch 8, Loss: 1.9657368659973145
Epoch 9, Loss: 1.8908004760742188


In [23]:
@torch.inference_mode()
def evaluate(model, dataset, batch_size=50, max_batches=None):
    model.eval()
    loader = DataLoader(dataset, shuffle=True, batch_size=batch_size, num_workers=0, collate_fn=pad_sequences)
    losses = []
    for i, batch in enumerate(loader):
        X, Y = batch
        logits = model(X)
        logits = logits.view(-1, logits.size(-1))
        Y = Y.view(-1)  # Shape: [batch_size * max_seq_len]

        # Compute the loss using CrossEntropyLoss
        loss = F.cross_entropy(logits, Y, ignore_index=ctoi['.'])

        losses.append(loss.item())
        if max_batches is not None and i >= max_batches:
            break
    mean_loss = torch.tensor(losses).mean().item()
    model.train() # reset model back to training mode
    return mean_loss

In [28]:
print(m)

RandomNameGenerator(
  (embed): Embedding(28, 32)
  (wpe): Embedding(16, 32)
  (decoder): ModuleList(
    (0-1): 2 x TransformerDecoderLayer(
      (self_attention): MaskedMultiHeadAttention(
        (q_proj): Linear(in_features=32, out_features=32, bias=False)
        (k_proj): Linear(in_features=32, out_features=32, bias=False)
        (v_proj): Linear(in_features=32, out_features=32, bias=False)
        (out_proj): Linear(in_features=32, out_features=32, bias=False)
      )
      (feedforward): Sequential(
        (0): Linear(in_features=32, out_features=2048, bias=True)
        (1): ReLU()
        (2): Linear(in_features=2048, out_features=32, bias=True)
      )
      (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (linear): Linear(in_features=32, out_features=28, bias=True)
  (softmax): Softmax(dim=-1)
)


In [29]:
@torch.no_grad()
def generate(self, x, max_new_tokens):
    for _ in range(max_new_tokens):
        logits = self(x)
        logits = logits[:, -1, :] # Look at the last predicted character
        probs = self.softmax(logits)
        # Randomly sample the next character based on the probability distribution
        next_token = torch.multinomial(probs, num_samples=1)
        
        if next_token == ctoi[' ']: # Stop if the model predicts a space
            break
        x = torch.cat((x, next_token), dim=1)
    return x[:, 1:] # Return the generated name

In [33]:
for _ in range(10):
    # Seed with character 0 (often a space or start token)
    seed = torch.tensor([0]).unsqueeze(0).to(device)
    generated_indices = m.generate(seed, 15).tolist()[0]
    print(decode(generated_indices))

erado
ider
ulannah
ayufthe
aylan
uisa
aiya
aelon
akibel
ato


In [34]:
for _ in range(10):
    # Seed with character 0 (often a space or start token)
    seed = torch.tensor([0]).unsqueeze(0).to(device)
    generated_indices = m.generate(seed, 15).tolist()[0]
    print(decode(generated_indices))

azaeleen
ake
rose
idile
ylien
ael
a
uith
adeen
ackoben
