<a href="https://colab.research.google.com/github/QasimWani/simple-transformer/blob/main/transformers/debugging/nano_GPT_debugging_solutions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Target: do each one in less than 30 minutes
# This contains solutions to https://github.com/QasimWani/simple-transformer/blob/main/transformers/debugging/nano_GPT_debugging_problems.ipynb

In [2]:
# Problem 1: Difficulty Medium
# Time taken: 35m

import torch
import torch.nn as nn
import torch.optim as optim
from einops import rearrange

# MultiHeadSelfAttention
# Bugs: 3
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)
        self.out_linear = nn.Linear(embed_dim, embed_dim)

        # Error 1 - weights initialized to zeros
        # nn.init.zeros_(self.q_linear.weight)
        # nn.init.zeros_(self.k_linear.weight)
        # nn.init.zeros_(self.v_linear.weight)

    def forward(self, x, mask=None):
        batch_size, seq_len, dk = x.shape
        # b, m, d -> b, m, n, h -> (b, n, m, h)
        # Better to just be safe and use einops
        q = rearrange(self.q_linear(x), 'b m (n h) -> b n m h', n=self.num_heads, h=self.head_dim)
        k = rearrange(self.k_linear(x), 'b m (n h) -> b n m h', n=self.num_heads, h=self.head_dim)
        v = rearrange(self.v_linear(x), 'b m (n h) -> b n m h', n=self.num_heads, h=self.head_dim)

        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5) # (b, n, q, h) x (b, n, h, k) -> (b, n, q, k) Correct!

        if mask is not None:
            # Erorr 2: masked_fill will replace all True with float('inf'). Change to -inf
            # mask = 1 is the future tokens --> we need to replace these
            # mask = 0 is past and current --> These must stay as-is
            scores = scores.masked_fill(mask, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        out = torch.matmul(attn, v) # b, n, q, h
        # Error 3 - dangeorous broadcasting happening inside view tensor
        # out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim) # b, q, n, k -> b, q, d (doing view before transpose is dangerous)
        out = rearrange(out, 'b n q h -> b q (n h)', n=self.num_heads, h=self.head_dim)
        out = self.out_linear(out)
        return out

# FeedForward
# Bugs: 1
class FeedForward(nn.Module):
    def __init__(self, embed_dim, ff_dim):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(embed_dim, ff_dim)
        self.linear2 = nn.Linear(ff_dim, embed_dim)
        self.gelu = nn.GELU() # 1 - Not really a bug, but maybe replace with a GeLU?

    def forward(self, x):
        return self.linear2(self.gelu(self.linear1(x)))

# DecoderLayer
# Bugs: 2
class DecoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ff = FeedForward(embed_dim, ff_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # attn_out = self.self_attn(x, mask)
        # x = x + self.dropout(attn_out)
        # x = self.norm1(x)
        # ff_out = self.ff(x)
        # x = x + self.dropout(ff_out)
        # x = self.norm2(x)
        # return x

        # Correct usage of pre-norm
        attn_out = self.self_attn(self.norm1(x), mask)
        x = x + self.dropout(attn_out)

        ffn_out = self.ff(self.norm2(x))
        x = x + self.dropout(ffn_out)
        return x

# PositionalEncoding
# Bugs: 1
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=5000): # Fair warning: this might be too much and is not a power of 2 which has worse lookup times in memory
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embed_dim) # m, d
        position = torch.arange(0, max_len).unsqueeze(1).float() # m, 1
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * -(torch.log(torch.tensor(10000.0)) / embed_dim)).unsqueeze(0) # k=d/2. This might lead to unwanted broadcasting. making it explicit.
        pe[:, 0::2] = torch.sin(position * div_term) # (m, 1) x (1, d/2).  = m, d/2
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0) # 1, m, d good idea to avoid broadcasting issues.
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)] # b, m, d + 1, m, d. Correct!
        return x

# TransformerLM
# Bugs: 2
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, ff_dim, max_len=5000, dropout=0.1):
        super(TransformerLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim) # weight sharing with out_linear
        self.pos_enc = PositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList([DecoderLayer(embed_dim, num_heads, ff_dim, dropout) for _ in range(num_layers)])
        self.out_linear = nn.Linear(embed_dim, vocab_size, bias=False)  # Off-by-one vocab size. Fixed ;)
        # well i'll do you one double here. let's impose a weight sharing
        self.out_linear.weight = self.embedding.weight # note: we do not need to explicitly transpose since the linear layer will do: x @ W.T + b
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, seq_len):
        # mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1) # Error 2
        # Well the causal mask is applied to query positions. so the final form must be: [b, n, m, m] -> [1, 1, m, m] (assumes MHA)
        # But the bigger problem is that we're masking out all previous token positions because triu will place zeros in all current and previous positions.
        # So either get rid of == 0 or do != 0
        # Remmber: triu will place zeros in current and previous positions and masked_fill will replace all True values!
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool() # Solution 2
        return mask.unsqueeze(0).unsqueeze(1)  # For batch and heads. seems correct!

    def forward(self, x):
        x = self.embedding(x)
        x = self.pos_enc(x) # we apply x + pos, so should be fine!
        x = self.dropout(x)
        seq_len = x.size(1)
        mask = self.generate_mask(seq_len).to(x.device) # NOTE: Causal mask, not attention mask
        for layer in self.layers:
            x = layer(x, mask)
        out = self.out_linear(x)
        return out

# Data preparation
vocab_size = 100
batch_size = 32
seq_len = 20
data = torch.randint(0, vocab_size, (1000, seq_len))  # Simple random data

# Model instantiation
embed_dim = 256
num_heads = 4
num_layers = 2
ff_dim = 512
model = TransformerLM(vocab_size, embed_dim, num_heads, num_layers, ff_dim).to('cuda')

# Train loop
# Bugs: 2
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss() # what is the padding token? usually i'd do ignore_index=-100

for epoch in range(50):
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size].to('cuda')
        # REMINDER: whenever shape shifting a tensor, contiguous MUST follow view
        inputs = batch[:, :-1] # saves computation by cutting off last token
        targets = batch[:, 1:].contiguous()  # No shift for next-token. Fixed! NOTE: memory here is no longer contiguous, because first element points to second!

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1)) # Off-by-one in vocab size
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 16.65876579284668
Epoch 2, Loss: 9.91919231414795
Epoch 3, Loss: 7.202060222625732
Epoch 4, Loss: 6.663390159606934
Epoch 5, Loss: 5.914766788482666
Epoch 6, Loss: 5.347503662109375
Epoch 7, Loss: 5.191928386688232
Epoch 8, Loss: 5.322828769683838
Epoch 9, Loss: 4.788407802581787
Epoch 10, Loss: 4.530216217041016
Epoch 11, Loss: 4.610927581787109
Epoch 12, Loss: 4.574163436889648
Epoch 13, Loss: 4.445892333984375
Epoch 14, Loss: 4.440374851226807
Epoch 15, Loss: 4.0369672775268555
Epoch 16, Loss: 4.143715858459473
Epoch 17, Loss: 3.9437336921691895
Epoch 18, Loss: 3.7424821853637695
Epoch 19, Loss: 3.8234410285949707
Epoch 20, Loss: 3.7176196575164795
Epoch 21, Loss: 3.562265157699585
Epoch 22, Loss: 3.343305826187134
Epoch 23, Loss: 3.1773416996002197
Epoch 24, Loss: 3.2731451988220215
Epoch 25, Loss: 2.8281917572021484
Epoch 26, Loss: 2.8621819019317627
Epoch 27, Loss: 2.755774736404419
Epoch 28, Loss: 2.6218812465667725
Epoch 29, Loss: 2.5086421966552734
Epoch 30, Los

In [3]:
# Problem 2: Difficulty Medium
# Time taken: 32m

import torch
import torch.nn as nn
import torch.optim as optim
from einops import rearrange


# MultiHeadSelfAttention
# Bugs: 2
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv_linear = nn.Linear(embed_dim, embed_dim * 3) # This is fine, but be careful how this works!
        self.out_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, d_k = x.shape # b, m, d
        # Better to just use einops TODO
        # qkv = self.qkv_linear(x).view(batch_size, seq_len, 3, self.num_heads, self.head_dim) # [b, m, d*3] -> [b, m, 3, n, h]
        # []: 3, 16, 384
        # 3, 16=seq_len, 384=(3 * 4 * 32)
        q, k, v = rearrange(self.qkv_linear(x), 'b m (three n h) -> three b n m h', m=seq_len, three=3, n=self.num_heads, h=self.head_dim).unbind(0)
        # q, k, v = qkv[:,:,0].transpose(1, 2), qkv[:,:,1].transpose(1, 2), qkv[:,:,2].transpose(1, 2) # [b, m, n, h] -> [b, n, m, h]
        assert q.shape == k.shape == v.shape == (batch_size, self.num_heads, seq_len, self.head_dim)

        scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5) # [b, n, q, h] x [b, n, h, k] -> [b, n, q, k]
        assert scores.shape == (batch_size, self.num_heads, seq_len, seq_len)
        if mask is not None:
            scores = scores.masked_fill(mask.bool(), float('-inf'))
            # scores += mask * -1e9  # Add instead of masked_fill for broadcasting test. Better to multiply with mask position. TODO

        attn = torch.softmax(scores, dim=-1) # [b, n, q, k]
        out = torch.matmul(attn, v) # [b, n, q, k] x [b, n, k, h] -> [b, n, q, h]
        assert out.shape == (batch_size, self.num_heads, seq_len, self.head_dim)
        out = rearrange(out, 'b n m h -> b m (n h)', n=self.num_heads, h=self.head_dim)
        # out = out.transpose(1, 2).contiguous().view(batch_size, seq_len, self.embed_dim) # TODO - convert to einops: [b, q, n, h] -> [b, q, d]
        out = self.out_linear(out)
        # Debug norm:
        # print(out.norm())
        return out

# FeedForward
# Bugs: 1
class FeedForward(nn.Module):
    def __init__(self, embed_dim, ff_dim):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(embed_dim, ff_dim)
        self.linear2 = nn.Linear(ff_dim, embed_dim)
        self.gelu = nn.GELU()

    def forward(self, x):
        return self.linear2(self.gelu(self.linear1(x))) # hmm this is correct. not sure what the error is?

# DecoderLayer
# Bugs: 2
class DecoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ff = FeedForward(embed_dim, ff_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask): # TODO add residuals
        # attn_out = self.self_attn(x, mask)
        # x = self.norm1(attn_out)  # No residual add
        # ff_out = self.ff(x)
        # x = self.norm2(x + self.dropout(ff_out))
        # return x
        attn_out = self.self_attn(self.norm1(x), mask)
        x = x + self.dropout(attn_out)

        ff_out = self.ff(self.norm2(x))
        x = x + self.dropout(ff_out)
        return x

# LearnablePositionalEncoding
# Bugs: 1
class LearnablePositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=5000):
        super(LearnablePositionalEncoding, self).__init__()
        self.pe = nn.Parameter(torch.randn(1, max_len, embed_dim) * 1/(embed_dim ** 0.5)) # Too high of random. maybe scale by sqrt(embed_dim)

    def forward(self, x):
        return x + self.pe[:, :x.size(1), :] # [b, m, d] + [1, m, d]

# TransformerLM
# Bugs: 2
class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, ff_dim, max_len=5000, dropout=0.1):
        super(TransformerLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_enc = LearnablePositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList([DecoderLayer(embed_dim, num_heads, ff_dim, dropout) for _ in range(num_layers)])
        self.out_linear = nn.Linear(embed_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, seq_len):
        mask = torch.triu(torch.ones(1, 1, seq_len, seq_len), diagonal=1)  # No ==0, broadcasts as is. Fixed!
        return mask
        # return mask.unsqueeze(0).unsqueeze(0).expand(-1, self.layers[0].self_attn.num_heads, -1, -1)

    def forward(self, x):
        x = self.embedding(x)
        x = self.pos_enc(x) # x + pos
        x = self.dropout(x)
        seq_len = x.size(1)
        mask = self.generate_mask(seq_len).to(x.device)
        for layer in self.layers:
            x = layer(x, mask)
        out = self.out_linear(x)
        return out

# Data preparation
vocab_size = 200
batch_size = 64
seq_len = 30
data = torch.randint(0, vocab_size, (200, seq_len))  # Random data

# Model instantiation
embed_dim = 64
num_heads = 8
num_layers = 2
ff_dim = 64 * 4
model = TransformerLM(vocab_size, embed_dim, num_heads, num_layers, ff_dim).to('cuda')

# Train loop
# Bugs: 3
optimizer = optim.AdamW(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()  # Wrong ignore. well what's the padding token? they're all guaranteed to be >0. so idk what token represents pad

for epoch in range(30):
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size].to('cuda')
        b = len(batch)
        inputs = batch[:, :-1]
        targets = batch[:, 1:].contiguous()  # No shift, wrong shape. Fixed! (1)

        optimizer.zero_grad()
        outputs = model(inputs)
        assert outputs.shape == (b, seq_len - 1, vocab_size)
        assert targets.shape == (b, seq_len - 1)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Generation loop (autoregressive)
# Bugs: 2
def generate(model, start_token, max_len=10, device='cpu'):
    model.eval()
    input = torch.tensor([[start_token]]).to(device)
    for _ in range(max_len):
        with torch.no_grad():
            output = model(input)
            next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)  # No [:, -1] # Okay, interestingly we're doubling. this is because we should only be making use of last token
            input = torch.cat([input, next_token], dim=1)
    return input.squeeze(0).tolist()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
generated = generate(model, data[1, 0].item(), seq_len, device)
print("Generated:", generated)
print("Original:", data[1])


# First pass - 8m
# Second pass - 4m
# Third pass - blitzkerg fixes. Alright, so we now know that we're not able to overfit. 32m
# Loss did go down. But the issue was with your causal mask. Took an additional 20m that should have been resolved first-pass in 32m. Overall, took 52m. Very poor performance!


# Lesson learned - 1: when we're generating the same token over and over again. It is almost guaranteed to be related to !causal mask!
# Lesson learned -2: Just stick with einops from the getgo. In MHA implementation, get rid of view/transpose and stick with einops. Will make your life easier!


# Okay, so i was able to get the forward pass to run properly. Now i noticed that the loss doesn't go down. Looking at it, 3.9 ~= ln(50=vocab_size). which means random chance

Epoch 1, Loss: 5.508038520812988
Epoch 2, Loss: 5.356257915496826
Epoch 3, Loss: 5.183887481689453
Epoch 4, Loss: 5.016503810882568
Epoch 5, Loss: 4.840149879455566
Epoch 6, Loss: 4.662425518035889
Epoch 7, Loss: 4.500616073608398
Epoch 8, Loss: 4.352596282958984
Epoch 9, Loss: 4.176392078399658
Epoch 10, Loss: 4.0160746574401855
Epoch 11, Loss: 3.8471202850341797
Epoch 12, Loss: 3.646458148956299
Epoch 13, Loss: 3.4854209423065186
Epoch 14, Loss: 3.2571630477905273
Epoch 15, Loss: 3.1006343364715576
Epoch 16, Loss: 2.9039833545684814
Epoch 17, Loss: 2.7305314540863037
Epoch 18, Loss: 2.5265450477600098
Epoch 19, Loss: 2.3248372077941895
Epoch 20, Loss: 2.1876890659332275
Epoch 21, Loss: 2.0111308097839355
Epoch 22, Loss: 1.822227954864502
Epoch 23, Loss: 1.6882132291793823
Epoch 24, Loss: 1.5529844760894775
Epoch 25, Loss: 1.4647681713104248
Epoch 26, Loss: 1.3355042934417725
Epoch 27, Loss: 1.2203292846679688
Epoch 28, Loss: 1.118321180343628
Epoch 29, Loss: 1.0180186033248901
Epoch 

In [4]:
# Problem 3 - Difficulty Easy
# Time taken: 20m

import torch
import torch.nn as nn
import torch.optim as optim
from einops import rearrange

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.q_linear = nn.Linear(embed_dim, embed_dim)
        self.k_linear = nn.Linear(embed_dim, embed_dim)
        self.v_linear = nn.Linear(embed_dim, embed_dim)
        self.out_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, dk = x.shape

        # TODO: use einops
        q = rearrange(self.q_linear(x), 'b m (n h) -> b n m h', n=self.num_heads, h=self.head_dim)
        k = rearrange(self.k_linear(x), 'b m (n h) -> b n m h', n=self.num_heads, h=self.head_dim)
        v = rearrange(self.v_linear(x), 'b m (n h) -> b n m h', n=self.num_heads, h=self.head_dim)

        assert q.shape == k.shape == v.shape == (batch_size, self.num_heads, seq_len, self.head_dim)

        # scores = torch.matmul(q, k.transpose(-2, -1)) / (self.head_dim ** 0.5) # Correct!
        scores = torch.einsum('bnqh,bnkh -> bnqk', q, k) / (self.head_dim ** 0.5)
        assert scores.shape == (batch_size, self.num_heads, seq_len, seq_len)

        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))

        attn = torch.softmax(scores, dim=-1) # fixed!
        out = torch.matmul(attn, v)
        assert out.shape == (batch_size, self.num_heads, seq_len, self.head_dim)
        # TODO: use einops
        out = rearrange(out, 'b n m h -> b m (n h)', h=self.head_dim, n=self.num_heads)
        # out = out.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_len, self.embed_dim)
        assert out.shape == (batch_size, seq_len, dk)
        out = self.out_linear(out)
        return out

class FeedForward(nn.Module):
    def __init__(self, embed_dim, ff_dim):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(embed_dim, ff_dim)
        self.linear2 = nn.Linear(ff_dim, embed_dim)
        self.gelu = nn.GELU() # Replace with GELU

    def forward(self, x):
        return self.linear2(self.gelu(self.linear1(x)))

class DecoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ff = FeedForward(embed_dim, ff_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # incorrect layernorm and residual connections
        # TODO
        # x = self.norm1(x)
        # attn_out = self.self_attn(x, mask)
        # x = x + self.dropout(attn_out)
        # x = self.norm2(x)
        # ff_out = self.ff(x)
        # x = x + self.dropout(ff_out)
        # return x
        attn_out = self.self_attn(self.norm1(x), mask)
        x = x + self.dropout(attn_out)

        ffn_out = self.ff(self.norm2(x))
        x = x + self.dropout(ffn_out)
        return x

class SinusoidalPositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=5000):
        super(SinusoidalPositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len).unsqueeze(1).float() # [m, 1]
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * -(torch.log(torch.tensor(10000.0)) / embed_dim)).unsqueeze(0) # [1, d/2]
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0) # [1, m, d]
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)] # [b, m, d] + [1, m, d]

class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, ff_dim, max_len=5000, dropout=0.1):
        super(TransformerLM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_enc = SinusoidalPositionalEncoding(embed_dim, max_len)
        self.layers = nn.ModuleList([DecoderLayer(embed_dim, num_heads, ff_dim, dropout) for _ in range(num_layers)])
        self.out_linear = nn.Linear(embed_dim, vocab_size, bias=False)

        # do weight sharing
        self.out_linear.weight = self.embedding.weight # no need to do transpose since linear will automatically apply x @ w.T
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, seq_len):
        # mask = torch.tril(torch.ones(seq_len, seq_len)).view(1, 1, seq_len, seq_len) # i prefer to use upper triangular
        mask = torch.triu(torch.ones(1, 1, seq_len, seq_len), diagonal=1).bool() # must conver to bool for it to work with masked_fill
        return mask

    def forward(self, x):
        x = self.embedding(x)
        x = self.pos_enc(x)
        x = self.dropout(x)
        seq_len = x.size(1)
        mask = self.generate_mask(seq_len).to(x.device)
        for layer in self.layers:
            x = layer(x, mask)
        out = self.out_linear(x)
        return out

vocab_size = 200
batch_size = 64
seq_len = 30
data = torch.randint(0, vocab_size, (2000, seq_len))
# data = torch.arange(0, seq_len).unsqueeze(0).expand(2000, -1) % vocab_size # batch_size, seq_len # [0, 1, 2, 3, ... seq_len], [0, 1, 2, 3, ... seq_len] x batch-size
# If you can generate a loss of 0.0 on the above, then clearly the network learned something!


embed_dim = 512
num_heads = 8
num_layers = 4
ff_dim = 1024
model = TransformerLM(vocab_size, embed_dim, num_heads, num_layers, ff_dim).to('cuda')

optimizer = optim.Adam(model.parameters(), lr=1e-3) # reduce lr
criterion = nn.CrossEntropyLoss()

for epoch in range(20):
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size].to('cuda')
        inputs = batch[:, :-1]
        targets = batch[:, 1:].contiguous()

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.reshape(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

def generate(model, start_token, max_len=20, device='cpu'):
    model.eval()
    input = torch.tensor([start_token]).unsqueeze(0).to(device)
    for _ in range(max_len):
        with torch.no_grad():
            output = model(input)
            next_token = output[:, -1, :].argmax(dim=-1, keepdim=True)
            input = torch.cat([input, next_token], dim=1)
    return input.squeeze(0).tolist()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
generated = generate(model, data[0, 0].item(), seq_len, device)
print("Generated:", generated)
print('data:', data[0])

# First pass - 7m
# Okay so we managed to get the loss down to 5. ln(200) = 5.2 This is still basically random.

# nice so while we did manage to get the loss down

Epoch 1, Loss: 24.584558486938477
Epoch 2, Loss: 13.734915733337402
Epoch 3, Loss: 10.068390846252441
Epoch 4, Loss: 8.234720230102539
Epoch 5, Loss: 6.857690334320068
Epoch 6, Loss: 6.384119510650635
Epoch 7, Loss: 5.696919918060303
Epoch 8, Loss: 5.006101608276367
Epoch 9, Loss: 4.768941879272461
Epoch 10, Loss: 4.523361682891846
Epoch 11, Loss: 4.285684108734131
Epoch 12, Loss: 4.05180025100708
Epoch 13, Loss: 3.895644187927246
Epoch 14, Loss: 3.545987367630005
Epoch 15, Loss: 3.055267095565796
Epoch 16, Loss: 3.0507569313049316
Epoch 17, Loss: 2.6773107051849365
Epoch 18, Loss: 2.3442561626434326
Epoch 19, Loss: 2.103731632232666
Epoch 20, Loss: 1.7644339799880981
Generated: [180, 148, 159, 172, 158, 118, 150, 48, 190, 127, 48, 156, 194, 139, 189, 25, 63, 150, 77, 25, 194, 105, 13, 128, 94, 71, 157, 76, 118, 131, 127]
data: tensor([180,  98, 127,  39,  69, 140, 133,  89, 159, 142,  73,  41, 182, 189,
         62,  57, 124, 157, 193,  91,  74,  65,  70,  12,  86,  17, 129, 183,
    

In [5]:
# Problem 4 - Difficulty Hard
# Time Taken: 41m (second pass done at 27m)

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import math
from einops import rearrange

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv = nn.Linear(embed_dim, embed_dim * 3, bias=False)
        self.out = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

        self._reset_parameters()

    def _reset_parameters(self):
        nn.init.xavier_uniform_(self.qkv.weight)
        nn.init.constant_(self.out.bias, 0.) # this is good since transformers make use of zero bias initially to make it act like the identity function initially.

    def forward(self, x, mask=None, return_attn=False):
        B, T, _ = x.shape

        # (b, m, d * 3)
        q, k, v = rearrange(self.qkv(x), 'b m (n h three) -> three b n m h', three=3, n=self.num_heads, h=self.head_dim).unbind(0)
        assert q.shape == k.shape == v.shape == (B, self.num_heads, T, self.head_dim), f"{q.shape}, {k.shape}, {v.shape} != {B, self.num_heads, T, self.head_dim}"
        # qkv = self.qkv(x).reshape(B, T, 3, self.num_heads, C // self.num_heads)
        # qkv = qkv.permute(2, 0, 3, 1, 4)
        # q, k, v = qkv[0], qkv[1], qkv[2]
        attn = torch.einsum("bnqh,bnkh -> bnqk", q, k) * (1.0 / math.sqrt(k.size(-1))) # Correct!

        assert attn.shape == (B, self.num_heads, T, T), f"{attn.shape} != {B, self.num_heads, T, T}"

        if mask is not None:
            # mask = mask.unsqueeze(1) # TODO: do we need this? look into mask construction and shape. Removed
            # shape = [1, 1, m, m]
            # okay, let's visualize the mask
            attn = attn.masked_fill(mask[:, :, :T, :T], float('-inf')) # NOTE: mask_cache once defined
            # attn = attn.masked_fill(mask == 0, -1e9)

        attn = F.softmax(attn, dim=-1)
        attn = self.dropout(attn)
        assert attn.shape == (B, self.num_heads, T, T), f"{attn.shape} != {B, self.num_heads, T, T}"
        out = torch.einsum('bnqk,bnkh -> bnqh', attn, v) # (1, 8, 1, 1) x (1, 8, 1, 32) -> (1, 8, 32, 1)
        assert out.shape == (B, self.num_heads, T, self.head_dim)
        out = rearrange(out, 'b n m h -> b m (n h)', h=self.head_dim, n=self.num_heads)
        assert out.shape == (B, T, self.embed_dim)
        # out = out.transpose(1, 2).reshape(B, T, C)
        out = self.out(out)

        if return_attn:
            return out, attn
        return out

class FeedForward(nn.Module):
    def __init__(self, embed_dim, ff_dim, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, ff_dim)
        self.fc2 = nn.Linear(ff_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = F.gelu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

class DecoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.attn = MultiHeadSelfAttention(embed_dim, num_heads, dropout)
        self.ff = FeedForward(embed_dim, ff_dim, dropout)
        self.norm1 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.norm2 = nn.LayerNorm(embed_dim, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # FIXED!
        # residual = x
        # x = self.norm1(x)
        # x = self.attn(x, mask)
        # x = self.dropout(x) + residual

        # residual = x
        # x = self.norm2(x)
        # x = self.ff(x)
        # x = self.dropout(x) + residual

        # return x
        attn_out = self.attn(self.norm1(x), mask)
        x = x + self.dropout(attn_out)

        ffn_out = self.ff(self.norm2(x))
        x = x + self.dropout(ffn_out)
        return x



class SinusoidalPositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=5000, base=10000):
        super().__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1) # [m, 1]

        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() *
                           (-math.log(base) / embed_dim)).unsqueeze(0) # [1, d/2]

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        self.register_buffer('pe', pe.unsqueeze(0)) # [1, m, d]

    def forward(self, x):
        # return x * math.sqrt(x.size(-1)) + self.pe[:, :x.size(1)] # hmm seems like a weird normalization trick
        return x + self.pe[:, :x.size(1)] # made it simpler

class TransformerLM(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, ff_dim,
                 max_len=512, dropout=0.1, pad_idx=0):
        super().__init__()
        self.embed_dim = embed_dim
        self.pad_idx = pad_idx
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.pos_enc = SinusoidalPositionalEncoding(embed_dim, max_len)
        self.dropout = nn.Dropout(dropout)

        self.layers = nn.ModuleList([
            DecoderLayer(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ])

        self.norm = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, vocab_size, bias=False)
        # weight sharing
        self.fc.weight = self.embed.weight

        self.vocab_size = vocab_size

        self.register_buffer('mask_cache', torch.empty(0, dtype=torch.bool)) # hmm, sus
        self._init_weights()

    def _init_weights(self):
        nn.init.normal_(self.embed.weight, mean=0.0, std=0.02)
        # nn.init.normal_(self.fc.weight, mean=0.0, std=0.02) # no need since we're dong weight sharing

    def _generate_square_subsequent_mask(self, sz):
        if self.mask_cache.numel() > 0 and self.mask_cache.size(-1) >= sz: # Replace size(0) with size(-1)
            return self.mask_cache[:sz, :sz]

        mask = torch.triu(torch.ones(1, 1, sz, sz), diagonal=1).bool() # much easier to represent this way
        # mask = mask.masked_fill(mask == 1, float('-inf'))
        # mask = mask.masked_fill(mask == 0, float(0.0))
        self.mask_cache = mask
        return mask

    def create_attention_mask(self, x, padding_mask=None):
        batch_size, seq_len = x.shape
        device = x.device

        causal_mask = self._generate_square_subsequent_mask(seq_len).to(device) # [1, 1, m, m]

        if padding_mask is None:
            # real = 1, pad = 0
            # but when doing mask fill 1 is masked out. So should be: padding_mask == 0, s.t. real = 0, pad = 1
            padding_mask = (x == self.pad_idx)[:, None, None, :] # convert to boolean [b, m]. need it to be [b, 1, 1, m] because we pad key tokens
            combined_mask = causal_mask | padding_mask # logical or saves the day here
            return combined_mask
        return causal_mask

    def forward(self, x, padding_mask=None):
        batch_size, seq_len = x.shape
        mask = self.create_attention_mask(x, padding_mask)

        assert x.shape == (batch_size, seq_len)
        x = self.embed(x) * (self.embed_dim ** 0.5) # Scaling trick! TRY THIS: if you comment it out, loss will not go below 6.0
        assert x.shape == (batch_size, seq_len, self.embed_dim)
        x = self.pos_enc(x)
        assert x.shape == (batch_size, seq_len, self.embed_dim)
        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, mask)
            assert x.shape == (batch_size, seq_len, self.embed_dim)

        x = self.fc(self.norm(x)) # final layer norm
        assert x.shape == (batch_size, seq_len, self.vocab_size)

        return x

def create_batch_mask(lengths, max_len):
    batch_size = len(lengths)
    mask = torch.zeros(batch_size, max_len, dtype=torch.bool)
    for i, length in enumerate(lengths):
        mask[i, :length] = 1
    return mask

vocab_size = 1000
batch_size = 32
seq_len = 128
num_epochs = 50
pad_token = 0

def create_padded_batch(batch_size, seq_len, vocab_size, pad_token=0):
    data = []
    for _ in range(batch_size):
        actual_len = torch.randint(seq_len//2, seq_len, (1,)).item()
        seq = torch.randint(1, vocab_size, (actual_len,))
        padded = torch.full((seq_len,), pad_token)
        padded[:actual_len] = seq
        data.append(padded)
    return torch.stack(data)

data = torch.cat([create_padded_batch(batch_size, seq_len, vocab_size, pad_token)
                   for _ in range(500 // batch_size + 1)], dim=0)[:500]

model = TransformerLM(vocab_size, 256, 8, 4, 1024, max_len=256, dropout=0.1, pad_idx=pad_token)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=0.01)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
criterion = nn.CrossEntropyLoss(ignore_index=pad_token)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    total_loss = 0
    num_batches = 0

    for i in range(0, len(data), batch_size):
        batch_data = data[i:min(i+batch_size, len(data))].to(device)

        inputs = batch_data[:, :-1]
        targets = batch_data[:, 1:].contiguous()

        padding_mask = (inputs != pad_token).float()

        optimizer.zero_grad()
        outputs = model(inputs, padding_mask)

        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    scheduler.step()
    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

@torch.no_grad()
def generate(model, start_tokens, max_len=50, temperature=1.0):
    model.eval()
    device = next(model.parameters()).device
    tokens = start_tokens.to(device)

    for _ in range(max_len):
        # Top-k sampling
        padding_mask = (tokens != model.pad_idx).float()
        outputs = model(tokens, padding_mask)
        logits = outputs[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, 1)
        tokens = torch.cat([tokens, next_token], dim=1)

        if next_token.item() == pad_token:
            break

    return tokens

start = data[0, :10].unsqueeze(0)
generated = generate(model, start, max_len=30)
print(f"Generated: {generated.tolist()[0]}")
print(f"Target: {data[0].tolist()}")

# 17 minutes - okay so we got the tensor shapes aligned. nan because of padding mask
# 27m - alright, so we managed to get loss below ln(1000) - 6.4. still bad
# 41m - figured out the issue. The causal mask was blowing up in size because self.cache_mask[:sq, :sq] was always false. Best to just truncate it in MHA calculation

# Lesson learned: For sanity sake in MHA always ensure to keep mask[:, :, :seq_len, :seq_len]

Epoch 1/50, Loss: 6.9482
Epoch 2/50, Loss: 6.9261
Epoch 3/50, Loss: 6.9030
Epoch 4/50, Loss: 6.8689
Epoch 5/50, Loss: 6.7863
Epoch 6/50, Loss: 6.6459
Epoch 7/50, Loss: 6.4910
Epoch 8/50, Loss: 6.3252
Epoch 9/50, Loss: 6.1542
Epoch 10/50, Loss: 5.9704
Epoch 11/50, Loss: 5.7814
Epoch 12/50, Loss: 5.5736
Epoch 13/50, Loss: 5.3486
Epoch 14/50, Loss: 5.1152
Epoch 15/50, Loss: 4.8744
Epoch 16/50, Loss: 4.6420
Epoch 17/50, Loss: 4.4035
Epoch 18/50, Loss: 4.1638
Epoch 19/50, Loss: 3.9268
Epoch 20/50, Loss: 3.7070
Epoch 21/50, Loss: 3.4950
Epoch 22/50, Loss: 3.2964
Epoch 23/50, Loss: 3.1209
Epoch 24/50, Loss: 2.9512
Epoch 25/50, Loss: 2.8006
Epoch 26/50, Loss: 2.6614
Epoch 27/50, Loss: 2.5332
Epoch 28/50, Loss: 2.4297
Epoch 29/50, Loss: 2.3146
Epoch 30/50, Loss: 2.2315
Epoch 31/50, Loss: 2.1305
Epoch 32/50, Loss: 2.0572
Epoch 33/50, Loss: 1.9849
Epoch 34/50, Loss: 1.9134
Epoch 35/50, Loss: 1.8646
Epoch 36/50, Loss: 1.8114
Epoch 37/50, Loss: 1.7629
Epoch 38/50, Loss: 1.7205
Epoch 39/50, Loss: 1.

In [6]:
# Problem 5
# Difficulty Easy
# Time Taken 14 10s

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from einops import rearrange

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        self.embed_dim=embed_dim; self.num_heads=num_heads
        self.head_dim=embed_dim//num_heads
        self.qkv = nn.Linear(embed_dim, embed_dim*3)
        self.proj = nn.Linear(embed_dim, embed_dim)
    def forward(self,x,mask=None):
        B,T,C=x.shape
        q, k, v = rearrange(self.qkv(x), 'b m (three n h) -> three b n m h', three=3, h=self.head_dim, n=self.num_heads)

        # qkv=self.qkv(x).view(B,T,3,self.num_heads,self.head_dim)
        # q,k,v=qkv[:,:,0].transpose(1,2), qkv[:,:,1].transpose(1,2), qkv[:,:,2].transpose(1,2)

        scores=q@k.transpose(-2,-1)/(self.head_dim**0.5) # Seems correct!
        assert scores.shape == (B, self.num_heads, T, T)

        if mask is not None:
            scores = scores.masked_fill(mask[:, :, :T, :T], float('-inf'))
            # scores = scores.masked_fill(mask==0,0)

        attn = F.softmax(scores,dim=-1)
        out = attn @ v # seems correct
        assert out.shape == (B, self.num_heads, T, self.head_dim)
        out = rearrange(out, 'b n m h -> b m (n h)', n=self.num_heads, h=self.head_dim)
        # out=out.transpose(1,2).reshape(B,T,C)
        return self.proj(out)

class FeedForward(nn.Module):
    def __init__(self,embed_dim,ff_dim):
        super().__init__()
        self.l1=nn.Linear(embed_dim,ff_dim)
        self.gelu = nn.GELU()
        self.l2=nn.Linear(ff_dim,embed_dim)
    def forward(self,x):
        return self.l2(self.gelu(self.l1(x)))

class DecoderLayer(nn.Module):
    def __init__(self,embed_dim,num_heads,ff_dim,dropout=0.1):
        super().__init__()
        self.attn=MultiHeadSelfAttention(embed_dim,num_heads)
        self.ff=FeedForward(embed_dim,ff_dim)
        self.norm1=nn.LayerNorm(embed_dim)
        self.norm2=nn.LayerNorm(embed_dim)
        self.drop=nn.Dropout(dropout)
    def forward(self,x,mask):
        x = x + self.drop(self.attn(self.norm1(x),mask))
        x = x + self.drop(self.ff(self.norm2(x)))
        return x

class SinusoidalPositionalEncoding(nn.Module):
    def __init__(self,embed_dim,max_len=5000):
        super().__init__()
        pe=torch.zeros(max_len,embed_dim)
        pos=torch.arange(0,max_len).unsqueeze(1).float() # [m, 1]
        div=torch.exp(torch.arange(0,embed_dim,2).float()*-(torch.log(torch.tensor(10000.0))/embed_dim)).unsqueeze(0) # [1, d/2]
        pe[:,0::2]=torch.sin(pos * div)
        pe[:,1::2]=torch.cos(pos * div)
        pe=pe.unsqueeze(0) # [1, m, d]
        self.register_buffer("pe",pe)

    def forward(self,x):
      return x + self.pe[:,:x.size(1)] # (b, m, d) + (1, m, d)

class TransformerLM(nn.Module):
    def __init__(self,vocab_size,embed_dim,num_heads,num_layers,ff_dim,max_len=512):
        super().__init__()
        self.embed=nn.Embedding(vocab_size,embed_dim)
        self.pos=SinusoidalPositionalEncoding(embed_dim,max_len)
        self.layers=nn.ModuleList([DecoderLayer(embed_dim,num_heads,ff_dim) for _ in range(num_layers)])
        self.out=nn.Linear(embed_dim,vocab_size)

    def mask(self,T):
        # return torch.tril(torch.ones(T,T))==1
        return torch.triu(torch.ones(1, 1, T, T), diagonal=1).bool()

    def forward(self,x):
        x = self.embed(x) # if std << 1.0, then scale it up
        x = self.pos(x)
        m = self.mask(x.size(1)).to(x.device)
        for l in self.layers:
          x = l(x,m)
        return self.out(x)

vocab=80; seq=25; batch=16
data=torch.randint(0,vocab,(400,seq))
# Can we learn rule of 0 thru seq
# data = torch.arange(0, seq).unsqueeze(0).expand(400, -1) # 400, seq

model=TransformerLM(vocab,128,4,2,256)
opt=optim.Adam(model.parameters(),lr=1e-3)
loss_fn=nn.CrossEntropyLoss()

for e in range(50):
    for i in range(0,len(data),batch):
        inp = data[i:i+batch]
        tgt = inp[:,1:].contiguous()
        out = model(inp)
        loss = loss_fn(out[:,:-1].contiguous().view(-1,vocab), tgt.view(-1))
        opt.zero_grad(); loss.backward(); opt.step()
    print(e,loss.item())

def generate(model,start,max_len=15):
    model.eval(); x=torch.tensor([[start]])
    for _ in range(max_len):
        with torch.no_grad():
            out = model(x)
            nxt = out[:, -1, :].argmax(-1, keepdim=True)
            x = torch.cat([x,nxt],dim=1)
    return x
print("Sample:",generate(model,data[0, 0].item(),15))
print("Target:", data[0].tolist())
# Baseline: ln(50) = 4.3
# First pass - 11m got loss down to 2.9

0 4.5227370262146
1 4.3678059577941895
2 4.271291255950928
3 4.171764373779297
4 4.046815395355225
5 3.9116032123565674
6 3.745391607284546
7 3.5111544132232666
8 3.3014843463897705
9 3.042496681213379
10 2.780313491821289
11 2.5006587505340576
12 2.216636896133423
13 2.0774731636047363
14 1.9390748739242554
15 1.6542234420776367
16 1.5063520669937134
17 1.3554095029830933
18 1.280140995979309
19 1.0645058155059814
20 1.0304051637649536
21 1.035711407661438
22 0.8601331114768982
23 0.8623487949371338
24 0.7634195685386658
25 0.730797290802002
26 0.76020747423172
27 0.6380252838134766
28 0.5974095463752747
29 0.5660734176635742
30 0.5395298600196838
31 0.50992751121521
32 0.5218378305435181
33 0.48669853806495667
34 0.5084396600723267
35 0.4684503972530365
36 0.39678478240966797
37 0.4593440294265747
38 0.46391162276268005
39 0.3630252182483673
40 0.38009557127952576
41 0.3211081326007843
42 0.35254156589508057
43 0.3483334481716156
44 0.3280523717403412
45 0.2834053635597229
46 0.37685