In [None]:
print("Hello world!")

In [3]:
from dataclasses import dataclass
import torch
import torch.nn as nn
import numpy as np
import requests
import re

@dataclass
class Config:
    d_model:int
    d_vocab:int
    d_hidden:int
    max_seq_len:int
    numTrans:int

In [None]:

class MLP(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.fc1 = nn.Linear(config.d_model, config.d_hidden)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(config.d_hidden, config.d_model)

    def forward(self, x):
        x = self.fc2(self.act(self.fc1(x)))
        return x
    
class Attention(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.Wqk = nn.Parameter(torch.rand(config.d_model, config.d_model))
        self.Wov = nn.Parameter(torch.rand(config.d_model, config.d_model))

        # mask = torch.triu(torch.ones(config.max_seq_len, config.max_seq_len),
        #                   diagonal=1
        #                   )
        # mask = mask.masked_fill(mask==1, -float('inf'))
        # self.register_buffer("M", mask)

    def get_mask(self, n):
        mask = torch.triu(torch.ones(n, n), diagonal=1)
        mask = mask.masked_fill(mask==1, -float('inf'))
        return mask
    
    def forward(self, x): # x -> 
        temp = x @ self.Wqk @ x.T + self.get_mask(x.shape(0))
        scores = torch.softmax(temp, dim=1)

        scores = scores @ x @ self.Wov

        return scores
    
class Transformer(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.attn = Attention(config)
        self.mlp = MLP(config)

    def forward(self, x):
        res = self.mlp(x) + self.attn(x) + x
        return res
    
class LanguageModel(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.config = config
        self.embedding = nn.Embedding(config.d_vocab, config.d_model)
        self.tbs = nn.ModuleList([Transformer(config) for i in range(self.config.numTrans)])
        #self.t1 = Transformer(config)
    
    def forward(self, x_tokens):
        x = self.embedding(x_tokens)
        temp = x
        for i in range(self.config.numTrans):
            temp = self.tbs[i](temp)
        return x

In [99]:
# test no. 1
config = Config(d_model=30, d_vocab=100, d_hidden=128, max_seq_len=3, numTrans=3)
model = LanguageModel(config)
x = torch.tensor([1, 5, 24])
res = model(x)
res

tensor([[-1.4443,  1.4568,  0.3347, -0.3722, -1.4085, -1.2414, -0.6048,  0.0476,
         -0.8957, -0.5576, -1.6263,  1.4795,  1.0423,  1.4978, -2.2204,  0.1678,
         -0.2253,  0.0305, -1.3987, -1.0215,  0.7041,  0.9197,  0.7740,  0.7017,
          0.4842,  2.2864, -0.1840,  1.1123,  1.2535,  1.1048],
        [-0.3318,  0.7217, -1.9149,  0.0842, -0.8393,  0.7559, -2.2055, -0.9364,
          1.4806,  0.4718,  0.4114,  0.7480, -0.5534,  0.5398,  0.6972, -0.1158,
          0.5376, -0.9454,  0.0379,  0.8882, -0.4611,  1.3352,  0.4513, -0.4222,
          1.9414,  0.1637, -0.3131, -0.4753,  0.3018,  2.4414],
        [-0.2408, -0.5617, -0.6802,  0.5548, -0.1123, -1.4520,  0.7677, -0.5034,
         -0.0312,  0.5439,  1.1484,  1.1314,  1.2265, -1.5124, -0.1712, -1.6632,
          1.0540, -0.1720, -0.9873, -0.1587, -0.5318, -0.0286,  1.1374,  0.2203,
         -2.0199, -2.1378,  0.4648,  2.5427, -0.6818, -0.4421]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
# get a dictionary with each of the 1000 most common english words. Swap out the file with other .txt files that just have words if you want.
def get_common_word_dict(f_name = 'texts/words1000.txt'):
    word_dict = {}
    with open(f_name,'r') as f:
        lines = f.readlines()
        i = 0
        for line in lines:
            word_dict[line.strip()] = i
            i+=1
    print(f"Created dictionary with {i} words.")
    return word_dict

#get a 1d torch tensor of tokens from a sequence of words
# if you give it an empty dictionary it will create one for you with the words from the sentence.
def tokenize_sentence(sentence, dictionary={}):
    #sentence_arr = re.split('-|\\. |, | |\n', sentence) #split on any of these possible delimiters we may see
    sentence_arr = re.split('\\. |, | |\n|\t', sentence) #split on any of these possible delimiters we may see
    tokens = [-1 for _ in range(len(sentence_arr))]
    if len(sentence_arr)>21: 
        print(sentence_arr[:20])
    for i, word in enumerate(sentence_arr):
        word = word.lower()
        #get rid of non alphanumeric characters for now
        # if not(word.isalnum()):
        #     pattern = r'[^a-zA-Z0-9]' 
        #     replacement = ''
        #     word = re.sub(pattern, replacement, word)
        token = dictionary.get(word, -1)
        # if we don't know this word, add it to dictionary
        if token == -1:
            token = len(dictionary)
            dictionary[word] = token
        tokens[i] = token
    # make it a 1d tensor
    tokens = torch.tensor(tokens)
    return tokens

def tokenize_many_sentences(sentences, dictionary={}):
    token_batches = [[] for _ in sentences]
    for i, sentence in enumerate(sentences):
        token_batches[i] = tokenize_sentence(sentence)

def tokenize_file(f_name, dictionary = {}):
    with open(f_name,'r',encoding='utf-8') as f:
        tokens = tokenize_sentence(f.read(), dictionary=dictionary)
    return tokens

In [98]:
# example of using these functions
my_dict = get_common_word_dict()
word2test = 'language'
print(f"Getting token for word '{word2test}':",my_dict[word2test]) #the course is MATH498: Large Language Modles and the 498th most common word is apparently language which is funny
print("---------")
sentence = "typically this is completely random, but\nsometimes it could be learned." #excerpt from a lecture I was in when writing this
my_dict = get_common_word_dict('texts/google-10000-english-usa-no-swears.txt') #use bigger dictionary
print(f"Sentence to tokenize: '{sentence}'")
tokens = tokenize_sentence(sentence, my_dict)
print(f"Tokenized sentence: {tokens}")
print("---------")
sentence = "now we will tokenize a sentence without a dictionary" #excerpt from a lecture I was in when writing this
tokens = tokenize_sentence(sentence) #tokenize without a dictionary just assigns tokens to words
print(f"Tokenized sentence: {tokens}")


Created dictionary with 1000 words.
Getting token for word 'language': 498
---------
Created dictionary with 9884 words.
Sentence to tokenize: 'typically this is completely random, but
sometimes it could be learned.'
Tokenized sentence: tensor([3836,   11,    7, 2318, 1853,   42, 1724,   15,  206,   18, 9884])
---------
Tokenized sentence: tensor([0, 1, 2, 3, 4, 5, 6, 4, 7])


In [85]:
my_dict = get_common_word_dict('texts/google-10000-english-usa-no-swears.txt')
tokens = tokenize_file('texts/recipes.txt', dictionary=my_dict)
print(tokens[:20])

Created dictionary with 9884 words.
['Here', 'is', 'how', 'to', 'make', 'Miso-Butter', 'Roast', 'Chicken', 'With', 'Acorn', 'Squash', 'Panzanella', 'You', 'need', '1', '(3½–4-lb.)', 'whole', 'chicken', '2¾', 'tsp']
tensor([  69,    7,   86,    3,  131, 9884, 9885, 3570,   12, 9886, 9887, 9888,
          14,  181, 9889, 9890,  936, 3570, 9891, 9892])


In [90]:
# test no. 2 (run after cell above)
from processing import get_recipe_arr

recipes = get_recipe_arr()
# config = Config(d_model=30, d_vocab=len(my_dict), d_hidden=128, max_seq_len=len(tokens), numTrans=3)
# model = LanguageModel(config)
# res = model(tokens)
# res