In [1]:
from dataclasses import dataclass
import torch
import torch.nn as nn
import numpy as np
import requests
import re


In [3]:
@dataclass
class Config:
    d_model:int
    d_vocab:int
    d_hidden:int
    max_seq_len:int
    numTrans:int

In [4]:
class MLP(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.fc1 = nn.Linear(config.d_model, config.d_hidden)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(config.d_hidden, config.d_model)

    def forward(self, x):
        x = self.fc2(self.act(self.fc1(x)))
        return x

In [5]:
class Attention(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.Wqk = nn.Parameter(torch.rand(config.d_model, config.d_model))
        self.Wov = nn.Parameter(torch.rand(config.d_model, config.d_model))

        mask = torch.triu(torch.ones(config.max_seq_len, config.max_seq_len),
                          diagonal=1
                          )
        mask = mask.masked_fill(mask==1, -float('inf'))
        self.register_buffer("M", mask)

    
    def forward(self, x): 
        T = x.size(0)
        temp = x @ self.Wqk @ x.T + self.M[:T, :T]
        scores = torch.softmax(temp,dim=-1)
        scores = scores @ x @ self.Wov

        return scores


        

In [6]:
class Transformer(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.attn = Attention(config)
        self.mlp = MLP(config)
        self.ln1 = nn.LayerNorm(config.d_model)
        self.ln2 = nn.LayerNorm(config.d_model)

    def forward(self, x):
        #res = self.mlp(x) + self.attn(x) + x
        x_norm = self.ln1(x)
        attn_out = self.attn(x_norm)
        x = x+attn_out
        x_norm = self.ln2(x)
        mlp_out = self.mlp(x_norm)
        x = x+mlp_out

        return x

In [7]:
class LanguageModel(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.config = config
        self.embedding = nn.Embedding(config.d_vocab, config.d_model)
        self.tbs = nn.ModuleList([Transformer(config) for i in range(config.numTrans)])
        self.lm_head = nn.Linear(config.d_model, config.d_vocab)
        #self.t1 = Transformer(config)
    
    def forward(self, x_tokens):
        x = self.embedding(x_tokens)
        #print("print:", x)
        #print(x.shape)
        #x = self.tbs[0](x)
        temp = x
        for i in range(self.config.numTrans):
            temp = self.tbs[i](temp)

        #X = torch.stack(x)
        logits = self.lm_head(temp)
        
        return logits


In [8]:
config = Config(d_model=30, d_vocab=100, d_hidden=128, max_seq_len=3, numTrans=3)
model = LanguageModel(config)
x = torch.tensor([1, 5, 24])
#print(x)
res = model(x)
#print(res)

In [10]:
mask = torch.triu(torch.ones(10, 10),
                          diagonal=1
                          )
print(mask)
mask = mask.masked_fill(mask==1, -float('inf'))
print(mask)

tensor([[0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 1., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf],
        [0.

In [16]:
url = "https://www.gutenberg.org/files/1342/1342-0.txt" # Just a demo dataset, we can think about more creative datasets.
r = requests.get(url)
text = r.text

In [35]:
from processing import get_recipes_text
text = get_recipes_text()

ImportError: cannot import name 'get_recipes_text' from 'processing' (c:\Users\amsba\Desktop\Coding\Class\LLMs\a1_transformer\processing.py)

In [None]:
text = text.lower().replace("\n", " ")
tokens = text.split()
tokens = re.findall(r"\b\w+\b", text.lower()) # I just used normal tokenization like word level token (every word is a token). We need to think about more advance tokenization techniques



vocab = list(set(tokens))
vocab.sort()

token2id = {token: idx for idx, token in enumerate(vocab)}
id2token = {idx: tok for tok, idx in token2id.items()}
print(len(vocab))

7030


In [20]:
config = Config(d_model=64, d_vocab=len(vocab), d_hidden=128, max_seq_len=1024, numTrans=2)  

token_ids = [token2id[tok] for tok in tokens]

print(len(token_ids))


128769


In [21]:
# Testing....
embedding = nn.Embedding(num_embeddings=config.d_vocab, embedding_dim=config.d_model)

token_ids_tensor = torch.tensor(token_ids[:config.max_seq_len])
#print(token_ids_tensor)

x = embedding(token_ids_tensor)
print(x.shape)
print(x)

torch.Size([1024, 64])
tensor([[-0.3203,  0.5543, -0.0435,  ...,  1.1255,  1.8135, -0.7516],
        [ 0.5108, -0.7017,  1.3207,  ..., -0.9895, -0.5102,  1.1979],
        [-1.5709,  0.4920,  0.8093,  ..., -1.2583,  0.8982,  0.8086],
        ...,
        [-0.8477,  0.7623, -0.1941,  ...,  0.0026,  1.6839, -1.6065],
        [ 0.2184, -0.3358, -1.8773,  ..., -0.9616,  0.3345,  1.2210],
        [-0.4483,  1.4829,  0.8466,  ..., -0.4188,  0.2324, -0.5937]],
       grad_fn=<EmbeddingBackward0>)


In [22]:
x_ids = torch.tensor(token_ids[:config.max_seq_len])
y_ids = torch.tensor(token_ids[1:config.max_seq_len+1])

print(x_ids.shape)

torch.Size([1024])


In [24]:
model = LanguageModel(config)
logits = model(x_ids)
logits

tensor([[-1.7186, -0.1856,  1.7579,  ...,  0.4096, -0.7695,  1.5261],
        [-1.3910,  0.8565,  1.6679,  ..., -0.1610, -2.6980,  3.8181],
        [-1.9061,  0.6609,  1.6998,  ...,  0.5690, -0.5054,  2.8198],
        ...,
        [ 1.0073,  0.1663,  1.3406,  ...,  0.1250, -0.6013,  3.8738],
        [-1.4835, -1.8537, -0.6857,  ..., -3.0837, -0.1971,  0.5632],
        [-2.4108, -0.0838, -0.6624,  ...,  0.4469, -1.1086,  2.2518]],
       grad_fn=<AddmmBackward0>)

In [25]:
print(logits.shape)
print(y_ids.shape)

torch.Size([1024, 7030])
torch.Size([1024])


In [26]:
targets = y_ids
print(targets.shape)

torch.Size([1024])


In [27]:
loss_fn = nn.CrossEntropyLoss()
loss = loss_fn(logits, targets)
print("Loss: ", loss.item())

Loss:  10.669713020324707


In [28]:
### Training Loop ###
model = LanguageModel(config)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

for step in range(1000):  # number of training steps
    # sample a random chunk of text
    start = np.random.randint(0, len(token_ids) - config.max_seq_len - 1)
    x_ids = torch.tensor(token_ids[start:start+config.max_seq_len])
    y_ids = torch.tensor(token_ids[start+1:start+config.max_seq_len+1])
    logits = model(x_ids)
    targets = y_ids
    loss = loss_fn(logits, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 50 == 0:
        print(f"step {step}, loss = {loss.item():.4f}")

step 0, loss = 10.4195
step 50, loss = 7.2960
step 100, loss = 6.4542
step 150, loss = 6.2343
step 200, loss = 6.2475
step 250, loss = 6.0763
step 300, loss = 6.1092
step 350, loss = 5.9576
step 400, loss = 5.8933
step 450, loss = 5.9033
step 500, loss = 5.9237
step 550, loss = 5.6171
step 600, loss = 5.2216
step 650, loss = 5.4718
step 700, loss = 5.5188
step 750, loss = 5.7507
step 800, loss = 5.8355
step 850, loss = 5.5377
step 900, loss = 5.5768
step 950, loss = 5.9387


In [None]:
from processing import get_recipe_arr, get_vocab

recipe_arr = get_recipe_arr()

vocab, token2id, id2token = get_vocab()

config = Config(d_model=64, d_vocab=len(vocab), d_hidden=128, max_seq_len=1024, numTrans=2)  

for recipe in recipe_arr:

    model = LanguageModel(config)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    for step in range(1000):  # number of training steps
        # sample a random chunk of text
        start = np.random.randint(0, len(token_ids) - config.max_seq_len - 1)
        x_ids = torch.tensor(token_ids[start:start+config.max_seq_len])
        y_ids = torch.tensor(token_ids[start+1:start+config.max_seq_len+1])
        logits = model(x_ids)
        targets = y_ids
        loss = loss_fn(logits, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 50 == 0:
            print(f"step {step}, loss = {loss.item():.4f}")

'Here is how to make Chinese Five-Spice Steak with Oranges and Sesame Broccolini. You need 3 small oranges (about 1 lb.), 1 1/2 tsp. Chinese five-spice powder, 1 tsp. light brown sugar, 2 1/4 tsp. kosher salt, divided, 1 hanger steak (about 1 1/4 lb.), cut in half lengthwise, center gristle removed, 2 Tbsp. vegetable oil, 3 bunches broccolini (about 1 1/2 lb.), trimmed, halved lengthwise if large, 2 Tbsp. toasted sesame oil, 1/2 tsp. crushed red pepper flakes, 1 tsp. toasted sesame seeds, plus more for serving, 3 scallions, thinly sliced, Flaky sea salt, Steamed rice and hot sauce (for serving; optional). Finely grate 2 tsp. orange zest from 1 orange into a small bowl. Cut all oranges in half; set aside.\nAdd five-spice powder, brown sugar, and 2 tsp. kosher salt to bowl with zest and stir to combine. Rub steak all over with spice mixture.\nHeat vegetable oil in a large heavy skillet (preferably cast iron) over high. Cook steak, turning often, until browned on all sides and an instant-

In [None]:
text

In [30]:
max_num_tokens = 50
prompt_text = "But the greatness of Mr Collins could not have been so satisfactorily"

for i in range(max_num_tokens):
    prompt_tokens = [token2id[tok] for tok in prompt_text.lower().split()]
    prompt_tensor = torch.tensor(prompt_tokens)

    with torch.no_grad():
        logits = model(prompt_tensor)
    
    last_logits = logits[-1]
    prob = torch.softmax(last_logits, dim=-1)
    next_token_id = torch.argmax(prob).item()
    next_token = id2token[next_token_id]
    print(next_token, end=' ')

    # append to prompt
    prompt_text += " " + next_token

    

allen not be of the house had been a great of the house was to the whole to the whole to the whole to the whole to the whole to the whole to the whole to the whole to the whole to the whole to the whole to the whole 