<h1>This is the primary notebook for the generative pre-trained transformer.</h1>
<br>
This file contains our cleaned, revised work, and experimental or older code should be put elsewhere

**Authors:** Jesan Ahammed Ovi, Stone Amsbaugh

**Instructor:** Michael Ivanitsky

In [1]:
#get the necessary imports and define our config class
from dataclasses import dataclass
import torch
import torch.nn as nn
import numpy as np
import requests
import re

#our custom utility files
from vocab_utility import get_token_arr, get_dictionaries
from recipe_utility import *

@dataclass
class Config:
    d_model:int
    d_vocab:int
    d_hidden:int
    max_seq_len:int
    n_transformers:int

In [2]:
#this cell defines our MLP, Attention head, transformer that combines these, as well as the language model containing these

#Multi layer perceptron module, just a NN
class MLP(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.fc1 = nn.Linear(config.d_model, config.d_hidden)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(config.d_hidden, config.d_model)

    def forward(self, x):
        x = self.fc2(self.act(self.fc1(x)))
        return x
    
#'secret sauce' attention head. Allows the model to look back at previous tokens indefinitely, and select what is important
class Attention(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        #initialize our parameters to be random
        self.Wqk = nn.Parameter(torch.rand(config.d_model, config.d_model))
        self.Wov = nn.Parameter(torch.rand(config.d_model, config.d_model))

        #create the mask, which isn't a model parameter but we still need it
        mask = torch.triu(torch.ones(config.max_seq_len, config.max_seq_len), diagonal=1)
        mask = mask.masked_fill(mask==1, -float('inf'))
        self.register_buffer("M", mask)

    
    def forward(self, x): 
        T = x.size(0)
        temp = x @ self.Wqk @ x.T + self.M[:T, :T]
        scores = torch.softmax(temp,dim=-1)
        scores = scores @ x @ self.Wov

        return scores
    
class Transformer(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.attn = Attention(config)
        self.mlp = MLP(config)
        self.mlp_normalizer = nn.LayerNorm(config.d_model)
        self.attn_normalizer = nn.LayerNorm(config.d_model)

    def forward(self, x):
        attn_out = self.attn(self.attn_normalizer(x))
        mlp_out = self.mlp(self.mlp_normalizer(x))

        return x+attn_out+mlp_out
    
#compile multiple transformers, embedding layer, and our output layer, as well as our overall configurations into the language model
class LanguageModel(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.config = config
        self.embedding = nn.Embedding(self.config.d_vocab, self.config.d_model)
        self.tbs = nn.ModuleList([Transformer(self.config) for _ in range(self.config.n_transformers)])
        self.lm_head = nn.Linear(self.config.d_model, self.config.d_vocab)
    
    def forward(self, x_tokens):
        temp = self.embedding(x_tokens)
        #look that propagates this through the transformer layers
        for i in range(self.config.n_transformers):
            temp = self.tbs[i](temp)

        logits = self.lm_head(temp)
        
        return logits

Now we have our architecture built out, we will implement a training loop. This first demonstration will be on the Gutenberg sample

In [21]:
#Some utility functions for processing vocabulary
def get_dictionaries(tokens):
    # takes array of words(tokens), makes forward and backward dictionaries from words to their token identifiers
    forward_dict = {} #get token ID
    backward_dict = {} #get english token
    i = 0
    for token in tokens:
        if token in forward_dict:
            continue
        #if new token, give it an ID
        forward_dict[token] = i
        backward_dict[i] = token
        i+=1
    
    return forward_dict, backward_dict

def get_token_arr(text):
    #return(["A"])
    #takes text and makes more standardized tokens
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\.? \n]', '', text)
    #add a space to make the punctuation their own tokens
    text.replace("."," .")
    text.replace(","," ,")
    text.replace("?"," ?")
    text.replace("!"," !")
    
    token_arr = text.split()
    return token_arr

In [13]:
# Demo 1 - Gutenberg - gather text
url = "https://www.gutenberg.org/files/1342/1342-0.txt" 
r = requests.get(url)

In [19]:
# make tokens, token IDs
tokens = get_token_arr(r.text) # TODO: Find a better way to tokenize this. Michael suggested we treat standard punctuation as their own tokens, split on whitespace, and drop other characters

token_to_id, id_to_token = get_dictionaries(tokens) #get unique IDs for all the tokens
d_vocab = len(token_to_id)

config = Config(d_model=64, d_vocab=d_vocab, d_hidden=128, max_seq_len=1024, n_transformers=2)  

token_ids = [token_to_id[tok] for tok in tokens]

In [20]:
### Training Loop ###
model = LanguageModel(config)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

for step in range(1000):  # number of training steps
    # sample a random chunk of text
    start = np.random.randint(0, len(token_ids) - config.max_seq_len - 1)
    x_ids = torch.tensor(token_ids[start:start+config.max_seq_len])
    y_ids = torch.tensor(token_ids[start+1:start+config.max_seq_len+1])
    logits = model(x_ids)
    targets = y_ids
    loss = loss_fn(logits, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 50 == 0:
        print(f"step {step}, loss = {loss.item():.4f}")

step 0, loss = 10.7569
step 50, loss = 7.6163
step 100, loss = 6.7221
step 150, loss = 6.2426
step 200, loss = 6.7349
step 250, loss = 6.2635
step 300, loss = 6.0314
step 350, loss = 5.5470
step 400, loss = 6.1993
step 450, loss = 5.9281
step 500, loss = 6.2200
step 550, loss = 5.8801
step 600, loss = 5.7898
step 650, loss = 5.6240
step 700, loss = 5.7769
step 750, loss = 5.0353
step 800, loss = 5.2699
step 850, loss = 5.4781
step 900, loss = 5.1874
step 950, loss = 5.5779


In [31]:
max_num_tokens = 50
prompt_text = "You shall not"

for i in range(max_num_tokens):
    prompt_tokens = [token_to_id[tok] for tok in prompt_text.lower().split()]
    prompt_tensor = torch.tensor(prompt_tokens)

    with torch.no_grad():
        logits = model(prompt_tensor)
    
    last_logits = logits[-1]
    prob = torch.softmax(last_logits, dim=-1)
    next_token_id = torch.argmax(prob).item()
    next_token = id_to_token[next_token_id]
    print(next_token, end=' ')

    # append to prompt
    prompt_text += " " + next_token

be a smart and the ladies to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be to be 

In [40]:
import requests

url = "https://en.wikipedia.org/api/rest_v1/page/random/summary"
headers = {
    "User-Agent": "samsbaugh (samsbaugh@mines.edu)"
}

resp = requests.get(url, headers=headers)
#print(resp.status_code)
#print(resp.json()["title"])
data = resp.json()

#print(data["title"])
print(data["extract"])
#print(data["content_urls"]["desktop"]["page"])

The Academie Brochu is a historic school at 29 Pine Street in Southbridge, Massachusetts. Built in 1899, it is one of the city's most imposing Colonial Revival buildings, and a significant element of the development of its Franco-American community. The building was listed on the National Register of Historic Places on June 22, 1989. It was gifted to Harrington Memorial Hospital and now houses Harrington Health System offices.
