In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

<h2> Word Embeddings </h2>

<h2> Tokenization </h2>

In [12]:
class Tokenizer:

    def __init__(self):

        self.vocab = None
        self.max_len = None
        self.vocab_size = None

    def preprocess(self, text):

        # remove all non alphabetic characters
        text = ''.join(e for e in text if e.isalnum() or e.isspace())

        # convert to lowercase
        text = text.lower()

        return text

    def generate_vocab(self, texts):

        text_preprocessed = [self.preprocess(text) for text in texts]

        self.max_len = max([len(seq.split()) for seq in text_preprocessed])

        words = " ".join(text_preprocessed).split()

        vocab = list(set(words))

        vocab.sort()

        self.vocab = vocab

        self.vocab_size = len(self.vocab) + 1
    
    def tokenize(self, texts):

        total_tokens = []

        for text in texts:
    
            text_preprocessed = self.preprocess(text)

            words = text_preprocessed.split()

            tokens = []

            for word in words:
                tokens.append(self.vocab.index(word))
            
            tokens += [self.vocab_size-1 for _ in range(self.max_len - len(tokens))]

            total_tokens.append(tokens)

        return total_tokens

texts = [
    "I am a student", 
    "I am a teacher", 
    "I am a doctor", 
    "I am a programmer", 
    "The quick brown fox jumps over the lazy dog"
]

tokenizer = Tokenizer()

tokenizer.generate_vocab(texts)

print(tokenizer.vocab)

tokens = tokenizer.tokenize(texts)

print(tokens)

print(tokenizer.max_len, [len(t) for t in tokens])

['a', 'am', 'brown', 'doctor', 'dog', 'fox', 'i', 'jumps', 'lazy', 'over', 'programmer', 'quick', 'student', 'teacher', 'the']
[[6, 1, 0, 12, 15, 15, 15, 15, 15], [6, 1, 0, 13, 15, 15, 15, 15, 15], [6, 1, 0, 3, 15, 15, 15, 15, 15], [6, 1, 0, 10, 15, 15, 15, 15, 15], [14, 11, 2, 5, 7, 9, 14, 8, 4]]
9 [9, 9, 9, 9, 9]


<h2> Embedding Layer </h2>

In [13]:
class EmbeddingLayer(nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):

        return self.embedding(x)
    

embedding_dim = 10

print(tokenizer.vocab_size)
print(tokens)

embedding_layer = EmbeddingLayer(tokenizer.vocab_size, embedding_dim)

tokens = torch.Tensor(tokens).long()

embeddings = embedding_layer(tokens)

print(embeddings.shape)
print(embeddings)

16
[[6, 1, 0, 12, 15, 15, 15, 15, 15], [6, 1, 0, 13, 15, 15, 15, 15, 15], [6, 1, 0, 3, 15, 15, 15, 15, 15], [6, 1, 0, 10, 15, 15, 15, 15, 15], [14, 11, 2, 5, 7, 9, 14, 8, 4]]
torch.Size([5, 9, 10])
tensor([[[ 0.6797,  0.0868, -0.7239,  0.4557,  0.6637, -0.7207, -0.0239,
           0.4902,  0.2609, -0.5700],
         [ 0.3847, -1.1373,  1.0454,  0.3670, -0.1145,  1.9662,  1.3524,
           0.8335, -0.1486, -0.0880],
         [-0.2736,  0.7379, -1.4303,  0.6544, -0.0867, -0.4270,  0.8674,
           0.5018,  0.8120,  0.9697],
         [-0.8729,  2.0364, -1.2143, -1.0850, -0.0095,  1.3176, -0.1300,
           0.8922, -0.1097, -0.8783],
         [-0.1839,  0.2155, -0.4136, -1.6557,  0.7286, -1.0491,  0.0059,
          -0.0436, -1.4449,  0.2166],
         [-0.1839,  0.2155, -0.4136, -1.6557,  0.7286, -1.0491,  0.0059,
          -0.0436, -1.4449,  0.2166],
         [-0.1839,  0.2155, -0.4136, -1.6557,  0.7286, -1.0491,  0.0059,
          -0.0436, -1.4449,  0.2166],
         [-0.1839,  0.215

<h2> The Attention Layer </h2>

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):

    def __init__(self, d_model, head_size):
        super().__init__()
        
        self.d_model = d_model
        self.linear = nn.Linear(d_model, 3 * head_size)

    def forward(self, x):
        # split the input into Q, K, V
        q, k, v = self.linear(x).chunk(3, dim=-1) # we pass our input through a linear layer and then split it into 3 parts

        # Recall the formula for the attention mechanism
        # attn = softmax(Q K.T) V
        # Hint: Use torch.matmul() for matrix multiplication (https://pytorch.org/docs/stable/generated/torch.matmul.html)
        # Hint: Use F.softmax() to apply the softmax function (https://pytorch.org/docs/stable/generated/torch.nn.functional.softmax.html)
        # Hint: Look at the transpose function for PyTorch tensors (https://pytorch.org/docs/stable/generated/torch.transpose.html)
        ### YOUR CODE HERE
        attn = ...


        ### END YOUR CODE

        return attn
    
attention_layer = Attention(embedding_dim, embedding_dim)
attn_logits = attention_layer(embeddings)

# Ensure the shape is the same both before and after the attention layer
embeddings.shape, attn_logits.shape

(torch.Size([5, 9, 10]), torch.Size([5, 9, 10]))

In [5]:
# naive implementation of multi-head attention

class MultiHeadAttention(nn.Module):

    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0, "d_model must be divisible by n_heads"

        self.n_heads = n_heads
        self.d_model = d_model
        self.head_size = d_model // n_heads

        # Implement multi-head attention using the Attention module
        # Hint: Use nn.ModuleList to hold multiple instances of the Attention module
        # Ex: self.heads = nn.ModuleList([AttentionHead1, AttentionHead2, ...])
        ### YOUR CODE HERE ###
        self.heads = ...
        ### END YOUR CODE ###

    def forward(self, x):
        # pass the input through all the heads and concatenate the results
        # Hint: Use torch.cat() to concatenate the results of the different heads, specify the dimension using the dim argument
        ### YOUR CODE HERE ###

        return ...
        ### END YOUR CODE ###
    
d_model = 32
n_heads = 4
seq_len = 16
batch_size = 8
shifted_x = torch.randn(batch_size, seq_len, d_model)
multi_head_attn = MultiHeadAttention(d_model, n_heads)
attn_logits = multi_head_attn(shifted_x)
attn_logits.shape

torch.Size([8, 16, 32])

In [8]:
class AttentionBlock(nn.Module):

    def __init__(self, d_model, n_heads):
        super().__init__()

        ### YOUR CODE HERE ###
        self.norm = ...     # LayerNorm
        self.attn = ...     # MultiHeadAttention
        self.dropout = ...  # Dropout
        self.norm2 = ...    # LayerNorm
        self.linear = ...   # Linear layer or multiple linear layers
        ### END YOUR CODE ###

    def forward(self, x):
        # we first normalize the input
        x_attn = self.norm(x)
        # we then pass it through the multi-head attention layer and apply dropout
        x_attn = self.dropout(self.attn(x_attn))
        # we add the input to the output of the multi-head attention
        # this is called a residual connection (https://towardsdatascience.com/what-is-residual-connection-efb07cab0d55)
        x = x + x_attn


        ### YOUR CODE HERE ###
        # we pass the output through a linear layer and apply dropout
        x_linear = ...
        # we apply normalization (remember to use self.norm2, not self.norm)
        x_linear = ...
        # we add the input to the output of the linear layer
        x = ...
        ### END CODE HERE ###

        
        return x

In [9]:
class Transformer(nn.Module):

    def __init__(self, vocab_size, d_model, n_heads, n_layers, block_size):
        super().__init__()

        self.embedding = ...    # Embedding layer for encoding the input tokens
        self.pos_embedding = ... # Positional encoding
        self.attention_blocks = ... # Stack of n_layers attention blocks. Hint: use nn.Sequential (https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html)
        self.fc = ... # Final fully connected layer projecting the model output to the vocab size

    def forward(self, x):
        # x size: (batch_size, seq_len)
        # For simplicity, we have implemented the embedding and positional encoding for you
        x = self.embedding(x) # (batch_size, seq_len, d_model)
        x = x + self.pos_embedding(torch.arange(x.size(1), device=x.device))

        ### YOUR CODE HERE ###
        x = ...     # pass the input through the stack of attention blocks
        x = ...     # pass the output through the final fully connected layer
        return x

In [10]:
class GPT(nn.Module):

    def __init__(self, vocab_size, d_model, n_heads, n_layers, block_size):
        super().__init__()

        self.transformer = Transformer(vocab_size, d_model, n_heads, n_layers, block_size)

        ### YOUR CODE HERE ###
        self.loss_fn = ... # Loss function for training the model
        ### END YOUR CODE ###

    def forward(self, x, targets=None):
        logits = self.transformer(x)
        loss = None
        if targets is not None:
            loss = self.loss_fn(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    def generate(self, x, steps=100, deterministic=False):
        for _ in range(steps):
            logits = self.transformer(x)
            last_token_logits = logits[:, -1]
            if deterministic:
                next_token = torch.argmax(last_token_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(last_token_logits, dim=-1), num_samples=1)
            x = torch.cat([x, next_token], dim=-1)
        return x

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
with open("input.txt", "r") as f:
    text = f.read()

In [13]:
def get_batch(text, block_size):

    tokens = tokenizer.encode(text)

    for i in range(0, len(tokens) - block_size, block_size):
        yield tokens[i:i+block_size], tokens[i+1:i+block_size+1]

In [14]:
from tqdm import tqdm


# Define the hyperparameters for training
# In general, fewer epochs means faster training, but the model may not have enough time to learn
# A larger block size means the model can learn more context, but training will be slower
# A larger d_model, n_heads, and n_layers means the model can learn more complex patterns, but training will be slower


### YOUR CODE HERE ###
num_epochs = 10     # Number of epochs to train the model, you can change this
block_size = 256    # Length of the sequence to train the model on, you can change this (try 128, 256, 512)
d_model = 256       # Dimension of the model, you can change this
n_heads = 4         # Number of attention heads, you can change this
n_layers = 4        # Number of transformer layers, you can change this
lr = 1e-4           # Learning rate for training, you can change this (try 1e-3, 1e-4, 1e-5)
### END YOUR CODE ###


device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
model = GPT(tokenizer.vocab_size, d_model, n_heads, n_layers, block_size).to(device)
optim = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(num_epochs):
    for batch in tqdm(get_batch(text, block_size), desc=f"Training epoch {epoch+1}", total=len(tokenizer.encode(text))//block_size):
        x, y = torch.tensor(batch[0]).unsqueeze(0).to(device), torch.tensor(batch[1]).unsqueeze(0).to(device)
        logits, loss = model(x, y)
        optim.zero_grad()
        loss.backward()
        optim.step()

Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). Running this sequence through the model will result in indexing errors
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training epoch 1: 100%|██████████| 1320/1320 [01:03<00:00, 20.79it/s]
Training epoch 2: 100%|██████████| 1320/1320 [01:01<00:00, 21.53it/s]
Training epoch 3: 100%|██████████| 1320/1320 [00:59<00:00, 22.30it/s]
Training epoch 4: 100%|██████████| 1320/1320 [00:59<00:00, 22.30it/s]
Training epoch 5: 100%|██████████| 1320/1320 [00:57<00:00, 23.15it/s]
Training epoch 6: 100%|██████████| 1320/1320 [00:56<00:00, 23.21it/s]
Training epoch 7: 100%|██████████| 1320/1320 [00:57<00:00, 23.13it/s]
Training epoch 8: 100%|██████████| 1320/1

In [17]:
context = "PROPSERO:" # None | str: "The quick brown fox jumps over the lazy dog"
if context:
    x = torch.tensor(tokenizer.encode(context)).unsqueeze(0).to(device)
else:
    x = torch.zeros((1, 1), dtype=torch.long).to(device)
output = model.generate(x, deterministic=False)
print(tokenizer.decode(output[0].tolist()))

PROPSERO:
Sir when she know, if the more of Juliet's, good lord so thou
Of thy father'st prev more than it like of holy
First goodness:
O no thought
But from the bestOLANDA:
We'll draw,
To those
Was ever What.
What dogs.
She eat'd themselves and upon my ghost in his beard?

VIRGILIA:
Than to watch:
Come on his poor father? what thou
