In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

<h2> Word Embeddings </h2>

<h2> Tokenization </h2>

### Standard tokenizer

In [2]:
class Tokenizer:

    def __init__(self):

        self.vocab = None
        self.max_len = None
        self.vocab_size = None

    def preprocess(self, text):

        # remove all non alphabetic characters
        text = ''.join(e for e in text if e.isalnum() or e.isspace())

        # convert to lowercase
        text = text.lower()

        return text

    def generate_vocab(self, texts):

        text_preprocessed = [self.preprocess(text) for text in texts]

        self.max_len = max([len(seq.split()) for seq in text_preprocessed])

        words = " ".join(text_preprocessed).split()

        vocab = list(set(words))

        vocab.sort()

        self.vocab = vocab

        self.vocab_size = len(self.vocab) + 1
    
    def tokenize(self, texts):

        total_tokens = []

        for text in texts:
    
            text_preprocessed = self.preprocess(text)

            words = text_preprocessed.split()

            tokens = []

            for word in words:
                tokens.append(self.vocab.index(word))
            
            tokens += [self.vocab_size-1 for _ in range(self.max_len - len(tokens))]

            total_tokens.append(tokens)

        return total_tokens

texts = [
    "I am a student", 
    "I am a teacher", 
    "I am a doctor", 
    "I am a programmer", 
    "The quick brown fox jumps over the lazy dog"
]

tokenizer = Tokenizer()

tokenizer.generate_vocab(texts)

print(tokenizer.vocab)

tokens = tokenizer.tokenize(texts)

print(tokens)

print(tokenizer.max_len, [len(t) for t in tokens])

['a', 'am', 'brown', 'doctor', 'dog', 'fox', 'i', 'jumps', 'lazy', 'over', 'programmer', 'quick', 'student', 'teacher', 'the']
[[6, 1, 0, 12, 15, 15, 15, 15, 15], [6, 1, 0, 13, 15, 15, 15, 15, 15], [6, 1, 0, 3, 15, 15, 15, 15, 15], [6, 1, 0, 10, 15, 15, 15, 15, 15], [14, 11, 2, 5, 7, 9, 14, 8, 4]]
9 [9, 9, 9, 9, 9]


### Character level encoder

In [31]:
class CharacterTokenizer:

    def __init__(self, text):
        self.vocab = sorted(list(set(text)))
        self.vocab_size = len(self.vocab)
        self.char_to_index = {char: i for i, char in enumerate(self.vocab)}
        self.index_to_char = {i: char for i, char in enumerate(self.vocab)}

    def encode(self, text):
        return [self.char_to_index[char] for char in text]
    
    def decode(self, encoded_text):
        return "".join([self.index_to_char[i] for i in encoded_text])
    
text = "The quick brown fox jumps over the lazy dog"
tokenizer = CharacterTokenizer(text)
encoded_text = tokenizer.encode(text)
print(encoded_text)
decoded_text = tokenizer.decode(encoded_text)
print(decoded_text)
assert text == decoded_text, "Error: decoded text is not the same as the original text"

[1, 9, 6, 0, 18, 22, 10, 4, 12, 0, 3, 19, 16, 24, 15, 0, 7, 16, 25, 0, 11, 22, 14, 17, 20, 0, 16, 23, 6, 19, 0, 21, 9, 6, 0, 13, 2, 27, 26, 0, 5, 16, 8]
The quick brown fox jumps over the lazy dog


<h2> Embedding Layer </h2>

In [3]:
class EmbeddingLayer(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):

        return self.embedding(x)
    

embedding_dim = 10

print(tokenizer.vocab_size)
print(tokens)

embedding_layer = EmbeddingLayer(tokenizer.vocab_size, embedding_dim)

tokens = torch.Tensor(tokens).long()

embeddings = embedding_layer(tokens)

print(embeddings.shape)
print(embeddings)

16
[[6, 1, 0, 12, 15, 15, 15, 15, 15], [6, 1, 0, 13, 15, 15, 15, 15, 15], [6, 1, 0, 3, 15, 15, 15, 15, 15], [6, 1, 0, 10, 15, 15, 15, 15, 15], [14, 11, 2, 5, 7, 9, 14, 8, 4]]
torch.Size([5, 9, 10])
tensor([[[-1.0304, -0.6306, -0.3745, -0.0424, -0.3442, -1.1487, -1.2837,
          -0.9136,  0.1740,  0.2564],
         [ 0.2923, -0.1173, -0.2187,  0.3942,  0.1204, -1.7418,  0.2152,
          -0.2712,  0.5643,  0.3067],
         [ 1.5491, -0.7142, -0.2190,  0.9998,  0.1721, -1.3855,  1.1591,
           1.2405,  1.0196,  0.2970],
         [ 1.2113,  0.9077, -1.1597,  0.6473, -0.6197,  2.3342, -1.0646,
          -0.8589, -0.6293, -1.7637],
         [ 1.0357,  0.8639,  0.6747,  0.7032,  0.1055,  0.6124,  0.1596,
          -0.5449,  1.5960, -1.6241],
         [ 1.0357,  0.8639,  0.6747,  0.7032,  0.1055,  0.6124,  0.1596,
          -0.5449,  1.5960, -1.6241],
         [ 1.0357,  0.8639,  0.6747,  0.7032,  0.1055,  0.6124,  0.1596,
          -0.5449,  1.5960, -1.6241],
         [ 1.0357,  0.863

<h2> The Attention Layer </h2>

In [4]:
class Attention(nn.Module):

    def __init__(self, d_model, head_size):
        super().__init__()
        
        self.d_model = d_model
        self.linear = nn.Linear(d_model, 3 * head_size)

    def forward(self, x):

        q, k, v = self.linear(x).chunk(3, dim=-1)

        # attn = softmax(Q K.T) V
        attn = torch.einsum("bnd,bkd->bnk", q, k)
        attn = attn / (self.d_model ** 0.5)
        attn = F.softmax(attn, dim=-1)
        attn = attn @ v

        return attn
    

attention_layer = Attention(embedding_dim, embedding_dim)
attn_logits = attention_layer(embeddings)
embeddings.shape, attn_logits.shape

(torch.Size([5, 9, 10]), torch.Size([5, 9, 10]))

In [5]:
# naive implementation of multi-head attention

class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.n_heads = n_heads
        self.d_model = d_model
        self.head_size = d_model // n_heads

        self.heads = nn.ModuleList([Attention(self.d_model, self.head_size) for _ in range(n_heads)])

    def forward(self, x):
        return torch.cat([head(x) for head in self.heads], dim=-1)
    
d_model = 32
n_heads = 4
seq_len = 16
batch_size = 8
shifted_x = torch.randn(batch_size, seq_len, d_model)
multi_head_attn = MultiHeadAttention(d_model, n_heads)
attn_logits = multi_head_attn(shifted_x)
attn_logits.shape

torch.Size([8, 16, 32])

In [6]:
# faster implementation of multi-head attention

class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.n_heads = n_heads
        self.d_model = d_model
        self.head_size = d_model // n_heads

        self.qkv = nn.Linear(d_model, 3*d_model)
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape
            
        qkv = self.qkv(x)
        qkv = qkv.view(batch_size, seq_len, self.n_heads, 3*self.head_size)
        qkv = qkv.permute(0, 2, 1, 3) # batch, n_heads, seq_len, 3 * head_size
        q, k, v = qkv.chunk(3, dim=-1) # batch, n_heads, seq_len, head_size

        # attn = softmax(Q @ K.T) V
        attn = torch.einsum("bhid,bhjd->bhij", q, k)
        attn /= self.head_size ** 0.5
        attn = F.softmax(attn, dim=-1)
        attn = attn @ v
        attn = attn.permute(0, 2, 1, 3).contiguous()
        attn = attn.reshape(batch_size, seq_len, -1)

        return self.fc(attn)

d_model = 32
n_heads = 4
seq_len = 16
batch_size = 8
shifted_x = torch.randn(batch_size, seq_len, d_model)
multi_head_attn = MultiHeadAttention(d_model, n_heads)
attn_logits = multi_head_attn(shifted_x)
attn_logits.shape


torch.Size([8, 16, 32])

In [7]:
# faster implementation of multi-head attention w/masking

class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.n_heads = n_heads
        self.d_model = d_model
        self.head_size = d_model // n_heads

        self.qkv = nn.Linear(d_model, 3*d_model)
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        batch_size, seq_len, _ = x.shape
            
        qkv = self.qkv(x)
        qkv = qkv.view(batch_size, seq_len, self.n_heads, 3*self.head_size)
        qkv = qkv.permute(0, 2, 1, 3) # batch, n_heads, seq_len, 3 * head_size
        q, k, v = qkv.chunk(3, dim=-1) # batch, n_heads, seq_len, head_size

        # attn = softmax(Q @ K.T) V
        attn = torch.einsum("bhid,bhjd->bhij", q, k)
        attn /= self.head_size ** 0.5
        mask = torch.full((seq_len, seq_len), float('-inf')).to(x.device)
        mask = torch.triu(mask, diagonal=1)
        attn = attn + mask
        attn = F.softmax(attn, dim=-1)
        attn = attn @ v
        attn = attn.permute(0, 2, 1, 3).contiguous()
        attn = attn.reshape(batch_size, seq_len, -1)

        return self.fc(attn)

d_model = 32
n_heads = 4
seq_len = 16
batch_size = 8
shifted_x = torch.randn(batch_size, seq_len, d_model)
multi_head_attn = MultiHeadAttention(d_model, n_heads)
attn_logits = multi_head_attn(shifted_x)
attn_logits.shape

torch.Size([8, 16, 32])

In [8]:
class AttentionBlock(nn.Module):

    def __init__(self, d_model, n_heads):
        super().__init__()

        self.attn = MultiHeadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, 4*d_model),
            nn.ReLU(),
            nn.Linear(4*d_model, d_model)
        )

    def forward(self, x):
        x = x + self.dropout(self.attn(self.norm1(x)))
        x = x + self.mlp(self.norm2(x))
        return x

In [9]:
class Transformer(nn.Module):

    def __init__(self, vocab_size, d_model, n_heads, n_layers, block_size):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(block_size, d_model)
        self.attention_blocks = nn.Sequential(*[AttentionBlock(d_model, n_heads) for _ in range(n_layers)])
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # x size: (batch_size, seq_len)
        x = self.embedding(x) # (batch_size, seq_len, d_model)
        x = x + self.pos_embedding(torch.arange(x.size(1), device=x.device))
        x = self.attention_blocks(x)
        return self.fc(x)

In [10]:
class GPT(nn.Module):

    def __init__(self, vocab_size, d_model, n_heads, n_layers, block_size):
        super().__init__()

        self.transformer = Transformer(vocab_size, d_model, n_heads, n_layers, block_size)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, x, targets=None):
        logits = self.transformer(x)
        loss = None
        if targets is not None:
            loss = self.loss_fn(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def generate(self, x, steps=100, deterministic=False):
        for _ in range(steps):
            logits = self.transformer(x)
            if deterministic:
                next_token = torch.argmax(logits[:, -1], dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(logits[:, -1], dim=-1), num_samples=1)
            x = torch.cat([x, next_token], dim=-1)
        return x

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
with open("../data/input.txt", "r") as f:
    text = f.read()

In [14]:
def get_batch(text, block_size):

    tokens = tokenizer.encode(text)

    for i in range(0, len(tokens) - block_size, block_size):
        yield tokens[i:i+block_size], tokens[i+1:i+block_size+1]

In [16]:
from tqdm import tqdm

num_epochs = 5
block_size = 256
d_model = 256
n_heads = 4
n_layers = 4
lr = 1e-4
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
model = GPT(tokenizer.vocab_size, d_model, n_heads, n_layers, block_size).to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(num_epochs):
    for batch in tqdm(get_batch(text, block_size), desc=f"Training epoch {epoch+1}", total=len(tokenizer.encode(text))//block_size):
        x, y = torch.tensor(batch[0]).unsqueeze(0).to(device), torch.tensor(batch[1]).unsqueeze(0).to(device)
        logits, loss = model(x, y)
        optim.zero_grad()
        loss.backward()
        optim.step()

Training epoch 1: 100%|██████████| 1320/1320 [01:02<00:00, 21.06it/s]
Training epoch 2: 100%|██████████| 1320/1320 [01:00<00:00, 21.80it/s]
Training epoch 3: 100%|██████████| 1320/1320 [01:01<00:00, 21.63it/s]
Training epoch 4: 100%|██████████| 1320/1320 [01:01<00:00, 21.39it/s]
Training epoch 5: 100%|██████████| 1320/1320 [01:01<00:00, 21.57it/s]


In [22]:
for epoch in range(num_epochs):
    for batch in tqdm(get_batch(text, block_size), desc=f"Training epoch {epoch+1}", total=len(tokenizer.encode(text))//block_size):
        x, y = torch.tensor(batch[0]).unsqueeze(0).to(device), torch.tensor(batch[1]).unsqueeze(0).to(device)
        logits, loss = model(x, y)
        optim.zero_grad()
        loss.backward()
        optim.step()

Training epoch 1: 100%|██████████| 1320/1320 [01:25<00:00, 15.47it/s]
Training epoch 2: 100%|██████████| 1320/1320 [01:26<00:00, 15.17it/s]
Training epoch 3: 100%|██████████| 1320/1320 [01:22<00:00, 15.90it/s]
Training epoch 4: 100%|██████████| 1320/1320 [01:19<00:00, 16.61it/s]
Training epoch 5: 100%|██████████| 1320/1320 [01:18<00:00, 16.74it/s]


In [21]:
context = "PROPSERO:" # None | str: "The quick brown fox jumps over the lazy dog"
if context:
    x = torch.tensor(tokenizer.encode(context)).unsqueeze(0).to(device)
else:
    x = torch.zeros((1, 1), dtype=torch.long).to(device)
output = model.generate(x, deterministic=False)
print(tokenizer.decode(output[0].tolist()))

PROPSERO:
MIRAN.

ANTONIO:
Upon my throne; thou art how me.

ANTONIO:
But else had been any of chosen?

ANTONZALO:
He dies my royal traffic.'
Is not change precious to begin
And still of a sweat in the rough and cracks They are or else; he appeal: his formeraces
shearing?

MARIAN: thou doth your manners of day? I flatings
