# Quick Transformer lol

In [137]:
import torch
import torch.nn as nn

# A fully connected layer mapping 1 input feature to 2 output features.
neurons = nn.Linear(in_features=1, out_features=2)

# A lookup table of 10 vectors with 3 dimensions each.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)

# Show the layer's randomly initialised weight matrix (one row per output feature).
print(neurons.weight)



Parameter containing:
tensor([[-0.1533],
        [-0.9457]], requires_grad=True)


In [138]:
# Lookup table with 10 rows (one per index), each row a 3-dimensional vector.
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)

# Two samples holding a single index each -> shape (2, 1).
input = torch.tensor([[9], [0]], dtype=torch.long)
print('THE INPUTS \n',input)

# The full table: row i is the vector returned for index i.
print('THE EMBEDDING MATRIX \n',embedding.weight)

# Looking up the inputs selects rows 9 and 0 of the table.
print('EMBEDDINGS FOR THE GIVEN INPUTS \n',embedding(input))

THE INPUTS 
 tensor([[9],
        [0]])
THE EMBEDDING MATRIX 
 Parameter containing:
tensor([[-0.8222, -0.4863,  1.7287],
        [ 0.4299, -0.1073,  0.3182],
        [-0.8278,  0.4542,  1.5049],
        [ 0.7230,  0.9364, -1.5144],
        [ 0.8682,  0.6140,  0.4117],
        [-1.3442,  1.0153,  1.1109],
        [ 1.1355,  0.6441, -1.2329],
        [-0.2634,  0.9824, -0.2587],
        [ 1.1496, -0.6391, -0.4868],
        [-0.3935,  1.1036, -0.4481]], requires_grad=True)
EMBEDDINGS FOR THE GIVEN INPUTS 
 tensor([[[-0.3935,  1.1036, -0.4481]],

        [[-0.8222, -0.4863,  1.7287]]], grad_fn=<EmbeddingBackward0>)


nn.Embedding 

the embedding module creates a lookup table with a configurable number of rows and columns. 
- number of rows is the number of unique words in the vocabulary,
- columns is the size of the word embeddings.

 The embedding module is initialized with random values, and the embeddings are learned during training.

 

## Import the dataset

In [139]:
# Load the whole training corpus into memory as a single UTF-8 string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()


In [140]:
# Corpus size in characters, followed by a preview of its first 200 characters.
print('the length of the text is ', len(text))
print(text[0:200])

the length of the text is  1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


## OK, lets tokenize this bih

embedding layer is a lookup table, 

- input to the embedding layer is a list of indices
- output is a list of word (token) embeddings.

lets turn our dataset into a list of indices, and feed it to the embedding layer. 

#### how do we do this?
(We're doing a character level model)

- [x] we find all possible vocab words (letters) in the dataset
    - separate the dataset by letters, make a set of them, a list 
- tokenize the dataset wrt the vocab 
    - enumerate every element in the list 
    - make a function that returns index num for each string
        - and a string for each num
- pass all of these token indices into the embedding table
    - turn entire dataset into nums wrt this tokenization strat
- get back all the embeddings  

In [141]:
# BUILD THE VOCABULARY: EVERY DISTINCT CHARACTER IN THE DATASET

from pprint import pprint

# set() removes the duplicates; sorted() returns them as a list in a fixed order
vocab_letters = sorted(set(text))

print(''.join(vocab_letters))
print(len(vocab_letters))

# this is the entire vocabulary of our character-level transformer


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


##### MAKE A FUNCTION THAT ENCODES STR -> IDX AND DECODES IDX -> BACK TO STR

In [142]:
# INDEX ALL OF THE LETTERS
# LETTER ENCODER

# maps each letter to its index in the sorted vocabulary
# (despite the name, this is a dict: letter -> index)
letter_list = {letter: index for index, letter in enumerate(vocab_letters)}


def str_to_int(s):
    """Encode a string as a list of vocabulary indices, one per character.

    Args:
        s: the text to encode (any iterable of single characters).

    Returns:
        A list of ints: the index in ``vocab_letters`` of each character of ``s``.

    Raises:
        KeyError: if ``s`` contains a character that is not in the vocabulary.
    """
    return [letter_list[letter] for letter in s]


print('does our encoder work? ',str_to_int('amongus baby'))

numbered_dataset = str_to_int(text)

# here is the dataset as a list of numbers
print('our dataset turned into numbers:', numbered_dataset[:100])



does our encoder work?  [39, 51, 53, 52, 45, 59, 57, 1, 40, 39, 40, 63]
our dataset turned into numbers: [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59]


In [143]:
# DECODER - ints back to str
# just in case

# maps each index back to its letter (the inverse of the encoder's mapping)
int_to_str = {index: letter for index, letter in enumerate(vocab_letters)}

# here is the mapping of the numbers back to letters
print(int_to_str)


def decoded(i):
    """Decode an iterable of vocabulary indices back into a string.

    Args:
        i: iterable of ints, each an index into ``vocab_letters``.

    Returns:
        The string formed by joining the letter for every index in ``i``.

    Raises:
        KeyError: if ``i`` contains an index that is not in the vocabulary.
    """
    return ''.join(int_to_str[integer] for integer in i)


# lets make sure the decoder works: these are the indices the encoder
# produced for 'amongus baby'. Printed explicitly, because a bare expression
# in the middle of a cell displays nothing.
print(decoded([39, 51, 53, 52, 45, 59, 57, 1, 40, 39, 40, 63]))

decoded_dataset = decoded(numbered_dataset)

# decoding the encoded dataset must give back the original text exactly
assert decoded_dataset == text, "encode -> decode round trip changed the text"

print(decoded_dataset[:200])

{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i', 48: 'j', 49: 'k', 50: 'l', 51: 'm', 52: 'n', 53: 'o', 54: 'p', 55: 'q', 56: 'r', 57: 's', 58: 't', 59: 'u', 60: 'v', 61: 'w', 62: 'x', 63: 'y', 64: 'z'}
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


# lets make an embedding matrix 

where each character has its own embedding vector 
lets make the vectors of depth 5 (small enough to print and inspect)!

In [144]:
import torch.nn as nn 

# one row in the table per distinct character in the vocabulary
vocab_length = len(vocab_letters)

# each character gets a 5-dimensional vector; sparse=True makes the gradient
# of the weight matrix a sparse tensor
embedding_matrix = nn.Embedding(
    num_embeddings=vocab_length,
    embedding_dim=5,
    sparse=True,
)

# ok so now every single element of vocab has a respective embedding

In [145]:
# quick example: tokenize a word and embed it

word = 'amongussy'

# character -> vocabulary index, one index per letter
encoded_word = str_to_int(word)
print('this is the encoded word:' ,encoded_word)

# the embedding layer takes an integer tensor of indices, so convert the list
encoded_word_tensor = torch.tensor(encoded_word, dtype=torch.long)

# one row of the embedding table is retrieved per token
embedded_word = embedding_matrix(encoded_word_tensor)
print('this is the word embedding: \n', embedded_word)

# basically each index selects its own row of the embedding table
# reminder --> the table starts out random; its rows only turn into rich,
# meaningful embeddings as the model is trained


this is the encoded word: [39, 51, 53, 52, 45, 59, 57, 57, 63]
this is the word embedding: 
 tensor([[ 2.2615,  1.2927,  0.7182,  0.6097, -0.2294],
        [ 0.3377, -0.6146,  2.0111,  0.2768, -0.5767],
        [-1.5159, -3.4483, -0.8304, -0.3416,  1.6252],
        [ 0.9731, -0.2882, -1.0467, -0.1645,  2.4650],
        [ 1.2739, -1.1889, -1.1373, -1.1227,  0.9211],
        [ 0.9225, -0.4928,  0.5311, -1.8355,  0.9262],
        [ 0.9395, -0.5285, -1.0335, -0.1193, -2.1736],
        [ 0.9395, -0.5285, -1.0335, -0.1193, -2.1736],
        [ 0.3213,  0.6322,  1.1415,  0.4657, -1.2951]],
       grad_fn=<EmbeddingBackward0>)


# lets encode the whole damn dataset 


In [146]:
# encode the entire dataset: one vocabulary index per character
encoded_dataset = str_to_int(text)

# the same indices as a 1-D int64 tensor, the form the embedding layer consumes
data = torch.tensor(encoded_dataset, dtype=torch.long)

print('first thousand elements of the dataset ',encoded_dataset[:1000])
print(data)

print(data.shape, data.dtype)
# the first 1000 characters we looked at earlier, as the model will see them
print(data[:1000])

first thousand elements of the dataset  [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59, 1, 39, 56, 43, 1, 39, 50, 50, 1, 56, 43, 57, 53, 50, 60, 43, 42, 1, 56, 39, 58, 46, 43, 56, 1, 58, 53, 1, 42, 47, 43, 1, 58, 46, 39, 52, 1, 58, 53, 1, 44, 39, 51, 47, 57, 46, 12, 0, 0, 13, 50, 50, 10, 0, 30, 43, 57, 53, 50, 60, 43, 42, 8, 1, 56, 43, 57, 53, 50, 60, 43, 42, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 18, 47, 56, 57, 58, 6, 1, 63, 53, 59, 1, 49, 52, 53, 61, 1, 15, 39, 47, 59, 57, 1, 25, 39, 56, 41, 47, 59, 57, 1, 47, 57, 1, 41, 46, 47, 43, 44, 1, 43, 52, 43, 51, 63, 1, 58, 53, 1, 58, 46, 43, 1, 54, 43, 53, 54, 50, 43, 8, 0, 0, 13, 50, 5

In [147]:
# retrieve the embeddings for the first `sequence_len` elements of the dataset

sequence_len = 16

# slice the first sequence_len token indices and look each one up in the table:
# every output row is one token's embedding, every column one embedding feature
embedding_matrix(data[0:sequence_len])

tensor([[ 1.3145,  0.4172,  1.2170, -1.0035, -0.5640],
        [-1.5588,  0.8676,  1.0475,  1.7504,  0.0233],
        [-2.0481, -1.9609,  0.2787, -0.2928,  0.0828],
        [ 0.9395, -0.5285, -1.0335, -0.1193, -2.1736],
        [ 0.2413, -0.3848, -1.4866,  2.8223, -0.7338],
        [-0.1004,  0.2456,  0.0935, -0.0538, -0.8991],
        [-0.0486, -0.3853,  0.7480,  1.1025, -0.1715],
        [-1.5588,  0.8676,  1.0475,  1.7504,  0.0233],
        [ 0.2413, -0.3848, -1.4866,  2.8223, -0.7338],
        [-1.5588,  0.8676,  1.0475,  1.7504,  0.0233],
        [ 0.3477, -2.0663,  0.4960, -0.0407, -0.0199],
        [ 0.5980,  1.5773, -1.6572,  0.5171, -0.6409],
        [ 0.9731, -0.2882, -1.0467, -0.1645,  2.4650],
        [-0.2048,  0.0537, -0.3139,  0.0148,  1.6283],
        [-0.8620,  0.7733,  1.4342,  1.5054,  0.6243],
        [-0.2675, -0.4354,  1.2403, -0.2514, -1.7759]],
       grad_fn=<EmbeddingBackward0>)

#### Embbed the whole dataset 

In [148]:
# look up an embedding vector for every character of the dataset at once
embedded_dataset = embedding_matrix(data)

# one row per character, one column per embedding dimension
embedded_dataset.size()

torch.Size([1115394, 5])

# dataset is embedded!!

pass into transformer block

what's in the **block**??

#### 1. attention head 
#### 2. feedforward layer 

### now we gotta figure out how to do a forward pass
- what does the forward pass do??



### Scratch that

lets implement a head of attention first, then we can put this into the model
- why? bc i felt like it lel

---
# Attention
#### What does a head of attention do??

it creates 
1. queries ❓
2. keys 🔑
3. values 🗣️

we multiply k and q to get the affinities/attention scores between embeddings

softmax attention scores to get weights

mult weights & values to get the ΔEmbedding

add ΔEmbedding to Embeddings recursively 
- slowly add more and more semantic meaning to embeds

---

# Actually jk

lets get this thing to create completions.

even if they suck, lets just get it to work first

In [154]:
# A bigram model learns the probabilities of the next token based only on the
# previous one.

# Calling a model instance, `model(x)`, runs its `forward` method; this
# dispatch is part of the nn.Module functionality.

import torch
import torch.nn as nn
from torch.nn import functional as F
# Fix the random seed so the randomly initialised weights below are reproducible.
torch.manual_seed(1337)


class BigramLanguageModel(nn.Module):
    """Minimal bigram model: a single embedding table indexed by token id.

    The table has ``vocab_size`` rows and ``vocab_size`` columns, so every
    token id maps to a vector with one entry per vocabulary element.
    """

    def __init__(self, vocab_size):
        """Create the ``vocab_size`` x ``vocab_size`` embedding table."""
        super().__init__()
        # one learnable row for every element of the vocabulary
        self.token_embedding_table = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=vocab_size
        )

    def forward(self, idx, targets=None):
        """Return the embedding-table rows selected by ``idx``.

        Args:
            idx: integer tensor of token ids.
            targets: accepted but currently unused.

        Returns:
            A tensor shaped like ``idx`` with one extra trailing dimension
            of size ``vocab_size`` (the channel dimension, C).
        """
        # plain table lookup; no loss is computed here yet
        return self.token_embedding_table(idx)



model = BigramLanguageModel(vocab_length)

# here, we are embedding the first 1000 elements of the dataset, making them
# 65-dimensional vectors (one dimension per vocabulary element).
# Call the module itself rather than model.forward(...): nn.Module.__call__
# runs any registered hooks and then dispatches to forward().
embedding_tensor = model(data[:1000])

print(embedding_tensor.shape)
pprint(embedding_tensor)
    

torch.Size([1000, 65])
tensor([[-0.1935, -0.6324, -0.2059,  ..., -0.5336, -1.3621,  0.3269],
        [ 1.6515, -0.0424, -0.7355,  ...,  0.8682,  2.0593, -0.8159],
        [-0.6722,  0.2322, -0.1632,  ...,  0.1390,  0.7560,  0.4296],
        ...,
        [ 1.1407,  0.8935, -2.4000,  ...,  0.3227,  1.5431, -1.0392],
        [ 0.1808, -0.0700, -0.3596,  ...,  1.6097, -0.4032, -0.8345],
        [ 0.1808, -0.0700, -0.3596,  ...,  1.6097, -0.4032, -0.8345]],
       grad_fn=<EmbeddingBackward0>)


### Quick little recap of wtf we just did 

so we started a bigram language model 

we initialized the parent class nn.Module by calling super().__init__()

we created an embedding table of vocab_size rows by vocab_size columns

we created the forward function with limited utility 
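- right now all it does is retrieve the vector embeddings for the indices that are passed into it
- it returns a (B, T, C) tensor, where B is the batch size, T is the sequence length and C is the number of channels of the embeddings
    - in the example above we passed a single unbatched sequence, so the result is just (T, C) = (1000, 65)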

now - we don't want to run the entire sequence linearly
- we can split the sequence up into batches
    - this allows us to run many batches in parallel
    - parallelism is our friend 
    - we can do many sequences in parallel instead of one linearly 

