# Quick Transformer lol

In [324]:
import torch
import torch.nn as nn

# how to initialize neurons,
# how to initialize embedding matrix

neurons = nn.Linear(1,2)

embedding = nn.Embedding(10, 3)


print(neurons.weight)
print(embedding.weight)



Parameter containing:
tensor([[0.5006],
        [0.2748]], requires_grad=True)
Parameter containing:
tensor([[ 0.1603, -0.4229, -0.9776],
        [ 0.1618, -0.6182, -1.1653],
        [-0.0393, -1.2973,  3.2234],
        [ 1.1670, -2.0040,  0.2367],
        [ 0.1582,  0.9344,  1.2147],
        [ 0.7320,  1.2849, -0.3356],
        [-0.3271, -0.3789, -0.0440],
        [-0.9420,  0.6211, -0.1851],
        [-0.7103,  1.6472, -0.8887],
        [ 1.8583, -1.1187, -1.3211]], requires_grad=True)


In [580]:
# an Embedding matrix of 10 embeddings with 3 dimensions each
embedding = nn.Embedding(10, 3)

# a batch of 2 samples of 1 index each
input = torch.LongTensor([[9],[0]])

print('THE INPUTS \n',input)

# what does the embedding matrix look like?
print('THE EMBEDDING MATRIX \n',embedding.weight)

# what is the result of the input embedding?
print('EMBEDDINGS FOR THE GIVEN INPUTS \n',embedding(input))

THE INPUTS 
 tensor([[9],
        [0]])
THE EMBEDDING MATRIX 
 Parameter containing:
tensor([[-0.0405,  0.1501, -0.3983],
        [ 0.0072, -1.7136,  0.6234],
        [-0.0432,  0.1452, -1.0400],
        [ 0.3987,  0.0086,  0.5366],
        [ 0.3724,  1.1028, -0.5192],
        [ 0.2357, -0.9623, -0.8102],
        [-0.1859,  0.2538,  0.6283],
        [ 0.9312, -1.4137,  1.2699],
        [ 1.1944, -2.5325, -1.3760],
        [ 0.6366, -0.2231,  0.4830]], requires_grad=True)
EMBEDDINGS FOR THE GIVEN INPUTS 
 tensor([[[ 0.6366, -0.2231,  0.4830]],

        [[-0.0405,  0.1501, -0.3983]]], grad_fn=<EmbeddingBackward0>)


nn.Embedding 

the embedding module creates a lookup table with a configurable number of rows and columns. 
- number of rows is the number of unique words in the vocabulary,
- columns is the size of the word embeddings.

 The embedding module is initialized with random values, and the embeddings are learned during training.

 

## Import the dataset

In [581]:
with open('input.txt','r', encoding='utf-8') as f:
    text = f.read()


In [582]:
print('the length of the text is ',len(text))
print(text[:200])

the length of the text is  1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


## OK, lets tokenize this bih

embedding layer is a lookup table, 

- input to the embedding layer is a list of indices
- output is a list of word (token) embeddings.

lets turn our dataset into a list of indices, and feed it to the embedding layer. 

#### how do we do this?
(We're doing a character level model)

[x] we find all possible vocab words (letters) in datataset 
    - separate the dataset by letters, make a set of them, a list 
- tokenize the dataset wrt the vocab 
    - enumerate every element in the list 
    - make a function that returns index num for each string
        - and a string for each num
- pass all of these token indices into the embedding table
    - turn entire dataset into nums wrt this tokenization strat
- get back all the embeddings  

In [584]:
# FIND ALL POSSIBLE VOCAB WORDS IN DATASET 

vocab_letters  = sorted(list(set(text)))
print(''.join(vocab_letters))
print(len(vocab_letters))

# this is the entirety of our vocab for our word level transformer 


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


##### MAKE A FUNCTION THAT ENCODES STR -> IDX AND

 DECODES  IDX -> BACK TO STR

In [590]:
# INDEX ALL OF THE LETTERS
# LETTER ENCODER

# maps between the index and the letter 
letter_list = {letter:index for index, letter in enumerate(vocab_letters)}

print('here is our dictionary of letters and their indices:',letter_list)

str_to_int = lambda s: [letter_list[letter] for letter in s]

print('does our encoder work? ',str_to_int('amongus baby'))

numbered_dataset = str_to_int(text)

# here is the dataset as a list of numbers
print('our dataset turned into numbers:', numbered_dataset[:100])



here is our dictionary of letters and their indices: {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
does our encoder work?  [39, 51, 53, 52, 45, 59, 57, 1, 40, 39, 40, 63]
our dataset turned into numbers: [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43

In [601]:
# DECODER - ints back to str
# jsut in case 

int_to_str = { index: letter for index, letter in enumerate(vocab_letters)}

# here is the mapping of the numbers back to letters 
print(int_to_str)

decoded = lambda i : ''.join([int_to_str[integer] for integer in i ])

# lets make sure the decoder works 
print('DECODED MESSAGE ',decoded([39, 51, 53, 52, 45, 59, 57, 1, 40, 39, 40, 63]))

decoded_dataset = decoded(numbered_dataset)

print(decoded_dataset[:200])

{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i', 48: 'j', 49: 'k', 50: 'l', 51: 'm', 52: 'n', 53: 'o', 54: 'p', 55: 'q', 56: 'r', 57: 's', 58: 't', 59: 'u', 60: 'v', 61: 'w', 62: 'x', 63: 'y', 64: 'z'}
DECODED MESSAGE  amongus baby
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


# lets make an embedding matrix 

where each character has its own embedding vector 
lets make the vectors of depth 1000!

In [610]:
import torch.nn as nn 

vocab_length = len(vocab_letters)

embedding_matrix = nn.Embedding(vocab_length, 5, sparse=True)

# ok so now every single element of vocab has a respective embedding 

In [615]:
# quick example 

# lets tokenize a word and embbed it 

word = 'amongussy'

encoded_word = str_to_int(word)

print('this is the encoded word:' ,encoded_word)

# get the embeddings for each one of these tokens 
# turn list into a tensor (basically the same as a list)
encoded_word_tensor = torch.LongTensor(encoded_word)

embedded_word = embedding_matrix(encoded_word_tensor)

print('this is the word embedding: \n', embedded_word)

# basically we are getting the index of each one of those elements and retrieving the embedding from  it
# reminded --> the embedding class starts as a random matrix and converges to true, meaning rich emebddings for each word!!!


this is the encoded word: [39, 51, 53, 52, 45, 59, 57, 57, 63]
this is the word embedding: 
 tensor([[ 4.4324e-01,  1.0784e+00, -3.3171e-02, -1.9377e+00,  5.2010e-01],
        [ 3.8638e-01, -6.0041e-01,  4.1063e-01, -1.5800e-01, -3.6941e-01],
        [ 1.3314e+00,  1.4940e-01, -1.0414e+00, -7.3719e-01, -3.5982e-01],
        [-8.3868e-01, -6.4568e-01,  1.5569e-01, -2.0627e-02, -1.1563e+00],
        [-1.6021e-03,  1.5700e+00,  1.1495e+00, -1.1201e+00,  1.4911e+00],
        [-1.2403e+00, -1.4496e+00, -1.7733e-01, -3.7104e-02,  1.9600e+00],
        [ 8.7598e-01, -9.4798e-01, -8.7764e-02,  6.6826e-02,  4.0936e-01],
        [ 8.7598e-01, -9.4798e-01, -8.7764e-02,  6.6826e-02,  4.0936e-01],
        [-7.0437e-02,  2.4551e-01, -1.5056e+00, -2.5242e+00,  4.6545e-01]],
       grad_fn=<EmbeddingBackward0>)


# lets encode the whole damn dataset 


In [146]:
# lets encode the whole damn dataset 

encoded_dataset = str_to_int(text)

print('first thousand elements of the dataset ',encoded_dataset[:1000])

data = torch.tensor(encoded_dataset, dtype=torch.long)

print(data)

# pprint('this is the encoded dataset \n', str(encoded_dataset))
print(data.shape, data.dtype)
print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like thisb

first thousand elements of the dataset  [18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59, 1, 39, 56, 43, 1, 39, 50, 50, 1, 56, 43, 57, 53, 50, 60, 43, 42, 1, 56, 39, 58, 46, 43, 56, 1, 58, 53, 1, 42, 47, 43, 1, 58, 46, 39, 52, 1, 58, 53, 1, 44, 39, 51, 47, 57, 46, 12, 0, 0, 13, 50, 50, 10, 0, 30, 43, 57, 53, 50, 60, 43, 42, 8, 1, 56, 43, 57, 53, 50, 60, 43, 42, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 18, 47, 56, 57, 58, 6, 1, 63, 53, 59, 1, 49, 52, 53, 61, 1, 15, 39, 47, 59, 57, 1, 25, 39, 56, 41, 47, 59, 57, 1, 47, 57, 1, 41, 46, 47, 43, 44, 1, 43, 52, 43, 51, 63, 1, 58, 53, 1, 58, 46, 43, 1, 54, 43, 53, 54, 50, 43, 8, 0, 0, 13, 50, 5

In [620]:
# lets retrieve the embeddings for the first 8 elements of the dataset 

sequence_len = 16

# this pulls sequence_len amount of embeddings out of the matrix from the dataset indexes 
# the rows are the embeddings and columns are the features 
embedding_matrix(data[:sequence_len])

tensor([[ 0.1981, -0.2763,  0.2850,  0.0373,  0.8419],
        [-0.6436, -1.6836, -0.6482,  0.6465,  0.6399],
        [-0.6567,  0.7852,  0.3708, -0.4710,  0.2444],
        [ 0.8760, -0.9480, -0.0878,  0.0668,  0.4094],
        [ 1.7495, -0.8827,  0.7772,  0.3706, -0.0818],
        [-1.1026,  0.3207,  1.4355,  1.2953, -0.1130],
        [-0.8380, -0.7666, -0.0140, -1.8904, -0.7230],
        [-0.6436, -1.6836, -0.6482,  0.6465,  0.6399],
        [ 1.7495, -0.8827,  0.7772,  0.3706, -0.0818],
        [-0.6436, -1.6836, -0.6482,  0.6465,  0.6399],
        [-0.3891, -0.3347,  0.5705, -1.0866, -0.1941],
        [ 0.5354,  0.1426,  0.3261,  0.1197, -0.6784],
        [-0.8387, -0.6457,  0.1557, -0.0206, -1.1563],
        [-0.3537,  0.4702,  0.7982, -0.4346, -2.1039],
        [ 0.1819, -0.4138,  0.1301, -0.2231, -0.6040],
        [-0.8845, -1.8248, -0.4857,  0.6976,  0.8490]],
       grad_fn=<EmbeddingBackward0>)

#### Embbed the whole dataset 

In [621]:
embedded_dataset= embedding_matrix(data)

# now each letter has its own embedding vector 
embedded_dataset.shape

torch.Size([1115394, 5])

# dataset is embedded!!

pass into transformer block

what's in the **block**??

#### 1. attention head 
#### 2. feedforward layer 

### now we gotta figure out how to do a foward pass
- what does the forward pass do??



### Scratch that

lets implement a head of attention first, then we can put this into the model
- why? bc i felt like it lel

---
# Attention
#### What does a head of attention do??

it creates 
1. queries ❓
2. keys 🔑
3. values 🗣️

we multiply k and q to get the affinities/attention scores between embeddings

softmax attention scores to get weights

mult weights & values to get the ΔEmbedding

add ΔEmbedding to Embeddings recursively 
- slowly add more and more semantic meaning to embeds

---

# Actually jk

lets get this thing to create completions.

even if they suck, lets just get it to work first

In [628]:
#  the bigram learns probabilities of the next token based on the previous one 

# the model's forward function gets called when you call the model
#  this is part of the nn.Module functionality 

import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size): 
        # initialize the nn.Module parent class
        super().__init__()
        # initialize the embedding table for every element in the vocab
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        # get the logits of the last character 
        # measure how far off we where from our targets
        embeddings = self.token_embedding_table(idx) # --> returns indexes + embeddings 
        # basically (B,T, C) adding the channel, C dimension
        return embeddings

model = BigramLanguageModel(vocab_length)

# INITALIZED EMBEDDING table 
token_embedding_table = model.token_embedding_table
print('EMBEDDING MATRIX \n',token_embedding_table)
print(token_embedding_table.weight)

print('---------------------------')

# embed first 1000 elements of data, making them 65-dimensional vectors 
embedding_tensor = model.forward(data[:1000])
print('EMBEDDING THE DATA (first 1000) \n', embedding_tensor.shape)
print(embedding_tensor)
    

EMBEDDING MATRIX 
 Embedding(65, 65)
Parameter containing:
tensor([[ 0.1808, -0.0700, -0.3596,  ...,  1.6097, -0.4032, -0.8345],
        [ 0.5978, -0.0514, -0.0646,  ..., -1.4649, -2.0555,  1.8275],
        [ 1.3035, -0.4501,  1.3471,  ...,  0.1910, -0.3425,  1.7955],
        ...,
        [ 0.4222, -1.8111, -1.0118,  ...,  0.5462,  0.2788,  0.7280],
        [-0.8109,  0.2410, -0.1139,  ...,  1.4509,  0.1836,  0.3064],
        [-1.4322, -0.2810, -2.2789,  ..., -0.5551,  1.0666,  0.5364]],
       requires_grad=True)
---------------------------
EMBEDDING THE DATA (first 1000) 
 torch.Size([1000, 65])
tensor([[-0.1935, -0.6324, -0.2059,  ..., -0.5336, -1.3621,  0.3269],
        [ 1.6515, -0.0424, -0.7355,  ...,  0.8682,  2.0593, -0.8159],
        [-0.6722,  0.2322, -0.1632,  ...,  0.1390,  0.7560,  0.4296],
        ...,
        [ 1.1407,  0.8935, -2.4000,  ...,  0.3227,  1.5431, -1.0392],
        [ 0.1808, -0.0700, -0.3596,  ...,  1.6097, -0.4032, -0.8345],
        [ 0.1808, -0.0700, -0.35

### Quick little recap of wtf we just did 

so we started a bigram language model 

we initalized the super() parent module nn.Module 

we created an embedding table of vocab_size by n-dimensions

we created the forward function with limited functionality  
- right now all it does is retrieve the vector embeddings for the indices (idx) that are passed into it 
- returning a B, T, C tensor where T is sequence length and C is the channels of the embeddings 

#### Optimization

now- we dont want to run the entire sequence linearly 
- we can strip out the sequence into batches 
    - this allows us to run many batches n parallel
    - parallelism is our friend - GPUs can compute many things simultaneously
        - we want to encourage these capabilities in our program  
    - we can do many sequences in parallel instead of one linearly 

### Lets understand the feedforward mechanism 

sorry i cant do this one from first principles

that shit is hard to keep all in mind simultaneously

i know a few things 
- this is a classic neural net
    - has weights, biases
    - and non-linear layers 
        - the non-linearity allow for the model to inch towards different weights to optimize itself with respect to a goal 
        - imagine a machine that retrieves a delta between a ball throw and the target it wanted, the non-linearity allows the machine to adjust the force (wweight) of the throw with respect to the target (loss)

so we'll just copy paste it and break it down

In [720]:
class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
        )

    def forward(self, x):
        return self.net(x)

# here, n_embd is the number of dimensions in the embedding
# the first linear layer projects the embeddings to a higher-dimensional space
# the second linear layer brings it back down to the original dimensionality
# the ReLU activation function is applied in between the two linear layers

ffw = FeedFoward(65).forward(embedding_tensor)

print(ffw.shape)
print(ffw)

n_embd = 65

print(nn.Linear(n_embd, 4 * n_embd))


torch.Size([1000, 65])
tensor([[-0.1063,  0.0056,  0.0715,  ...,  0.2231, -0.0992,  0.1521],
        [-0.1040,  0.2804,  0.0873,  ..., -0.2492, -0.1800,  0.1106],
        [ 0.0151, -0.1253,  0.2796,  ..., -0.2357, -0.0340, -0.0983],
        ...,
        [ 0.1486, -0.1871, -0.1081,  ..., -0.1200, -0.3288, -0.0159],
        [ 0.1479, -0.1909, -0.0126,  ...,  0.0750, -0.0729,  0.1843],
        [ 0.1479, -0.1909, -0.0126,  ...,  0.0750, -0.0729,  0.1843]],
       grad_fn=<AddmmBackward0>)
Linear(in_features=65, out_features=260, bias=True)


### notes on the FeedFoward layer

so the linear layer
- n_embed is the size/dimensions of the embedding vectors (# of features/token)
projects the input to a higher-dimensional space, then back down
This allows for more complex transformations of the data
- the expansion and contraction create a "bottleneck" architecture

Process flow:

Input (65 dimensions) -> Expand to 260 -> Apply ReLU -> Reduce back to 65


Physical interpretation:

- Each of the 65 input "neurons" connects to multiple neurons in the 260-dimensional space
- It's not exactly "each neuron has four output dimensions", but rather a full connection where each of the 65 inputs influences all 260 outputs
- ReLU then acts on these 260 values, potentially setting some to zero
- The 260 values are then combined in various ways to produce the 65 output values


eedForward layer expands the input to a higher dimension, applies non-linearity, and then projects it back to the original dimension, all while maintaining the sequence length.

---
---

basically what is happening here is that the C param of each (B, T, C) is being multiplied
- higher embedding vector length 
- creates a richer area to optimize/apply relu/bias onto 

When an `nn.Linear` object is created, it randomly initializes a weight matrix and a bias vector. The size of the weight matrix is `out_features` x `in_features`, and the size of the bias vector is `out_features`.

another thing that this linear layuer is doing
turning ine output into multiple outputs, how?
- weighting that one output out into a sum of one across all the output projections 
- adding a bias/relu 



In [785]:
m = nn.Linear(5, 6)

input = torch.randn(4, 5)

output = m(input)

print(input.shape)
print(output.shape)

print(input)
print(output)

print(m.bias)

print(sum([ 0.9070,  2.0752, -0.2229, -0.4429,  1.2246]))
print(sum([-0.4813,  0.4688, -0.8751, -1.2857,  0.1232,  0.9951]))



torch.Size([4, 5])
torch.Size([4, 6])
tensor([[-4.2169e-01,  2.0878e+00,  1.0460e-01, -2.3480e+00, -1.5604e+00],
        [ 4.2957e-01,  7.5407e-01, -1.4370e+00, -6.0126e-01, -4.5125e-01],
        [ 8.6857e-01,  7.9275e-01,  6.0557e-01,  2.3086e-04,  7.8297e-03],
        [-1.2700e+00,  5.9990e-01, -7.4589e-01,  3.0727e-01, -7.2420e-01]])
tensor([[-0.9884, -1.4970, -0.5441,  2.3051, -0.8877,  2.0593],
        [-0.3017,  0.0825, -0.3103,  0.4518, -1.0844,  0.8471],
        [-0.7126,  0.0435,  1.0352,  0.1520, -0.6396, -0.1819],
        [ 0.1801,  0.2077, -0.4750,  0.3845, -0.4263,  0.9890]],
       grad_fn=<AddmmBackward0>)
Parameter containing:
tensor([-0.2998,  0.1982,  0.1845, -0.0683, -0.3465,  0.0981],
       requires_grad=True)
3.5410000000000004
-1.0550000000000002
