In [17]:
import torch
import tiktoken

In [18]:
# needed to pull txt file from github
import os
import urllib.request

# Download the corpus once; later runs reuse the local copy.
file_path = "one-foggy-night.txt"
url = "https://raw.githubusercontent.com/Valentino-Cheek/CS-290-LLMs/refs/heads/main/one-foggy-night.txt"

if not os.path.exists(file_path):
    # Fetch into a temporary name first and rename only after the transfer
    # finished, so an interrupted download cannot leave a truncated file that
    # the existence check above would accept on the next run.
    partial_path = file_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

In [19]:
# Read the whole text file into memory and preview the first 50 characters.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the downloaded file is UTF-8 - confirm if decoding fails.
with open("one-foggy-night.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print(raw_text[:50])

A PLUME of smoke drifted under the great glass dom


In [20]:
# Initialize the GPT-2 tokenizer from tiktoken and encode the full text into
# a list of token ids.
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)

In [21]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. Windows of ``max_length`` token ids are then
    taken every ``stride`` tokens; the target of each window is the same
    window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and build all input/target windows.

        Args:
            txt: Text to tokenize.
            tokenizer: Object with an ``encode(text, allowed_special=...)``
                method returning a sequence of token ids.
            max_length: Number of token ids in each input and target window.
            stride: Distance, in tokens, between consecutive window starts.

        Raises:
            AssertionError: If the text yields ``max_length`` tokens or fewer,
                since no window would have room for a shifted target.
        """
        self.input_ids = []
        self.target_ids = []

        # Encode the complete text in one pass
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        n_tokens = len(ids)
        assert n_tokens > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # A window may only start where max_length inputs plus one extra
        # token (for the shifted target) are still available.
        for start in range(0, n_tokens - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(ids[start:stop]))
            self.target_ids.append(torch.tensor(ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [22]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches for ``txt``.

    Args:
        txt: Text to tokenize with the GPT-2 encoding.
        batch_size: Number of windows per batch.
        max_length: Number of token ids per window.
        stride: Distance, in tokens, between consecutive window starts.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    # The dataset tokenizes the text itself; it only needs the encoder.
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [23]:
# Build a dataloader with a max length of 4 and a stride of 1, so consecutive
# windows start one token apart and overlap.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

# Pull the first two batches; each one holds an input tensor and a target
# tensor, where the target is the input window shifted right by one token.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)

print(second_batch)

[tensor([[   32,  9297, 38340,   286]]), tensor([[ 9297, 38340,   286,  7523]])]
[tensor([[ 9297, 38340,   286,  7523]]), tensor([[38340,   286,  7523, 38648]])]


In [24]:
# Build a dataloader with a batch size of 2, a max length of 6 and a stride of 6.
# Because the stride equals the max length, consecutive input windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text, batch_size=2, max_length=6, stride=6, shuffle=False
)
data_iter = iter(dataloader)

# Unpack the first batch into its input and target tensors
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   32,  9297, 38340,   286,  7523, 38648],
        [  739,   262,  1049,  5405, 29500,   286]])

Targets:
 tensor([[ 9297, 38340,   286,  7523, 38648,   739],
        [  262,  1049,  5405, 29500,   286,  3454]])


In [25]:
# Build a dataloader whose batch size, max length and stride are all 4.
# Because the stride equals the max length, consecutive input windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text, batch_size=4, max_length=4, stride=4, shuffle=False
)
data_iter = iter(dataloader)

# Unpack the first batch into its input and target tensors
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   32,  9297, 38340,   286],
        [ 7523, 38648,   739,   262],
        [ 1049,  5405, 29500,   286],
        [ 3454,   363, 17913,  9327]])

Targets:
 tensor([[ 9297, 38340,   286,  7523],
        [38648,   739,   262,  1049],
        [ 5405, 29500,   286,  3454],
        [  363, 17913,  9327,   355]])


In [26]:
# Decode the token ids back into text: every input row first, then every
# target row, so the one-token shift between the two shows up as words
# rather than as ids.
for batch in (inputs, targets):
    for row in batch:
        print(tokenizer.decode(row.tolist()))

A PLUME of
 smoke drifted under the
 great glass dome of
 Slagborough Station
 PLUME of smoke
 drifted under the great
 glass dome of Sl
agborough Station as


In [27]:
# Create an embedding layer with a vocab size of 8 and an output dimension of 4.
#
# An embedding layer is a lookup table holding one row of `output_dim` values
# per token id. Those rows are trainable parameters (note requires_grad=True
# in the printed weight), which is why embeddings matter for LLMs:
#
# - Raw token ids are arbitrary integers and carry no meaning by themselves.
# - The embedding gives every token a vector of numbers that training can
#   adjust, which lets the model pick up contextual information.
# - This step is the foundation for the training ("learning") of an LLM.
vocab_size = 8
output_dim = 4

# The weights are initialized randomly; seed the generator first so that
# re-running the notebook reproduces the same weights and downstream values.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[-0.2827,  0.5502,  1.5729,  0.5980],
        [-0.6337,  0.4333, -1.5340, -0.5352],
        [ 0.6825, -0.4818, -0.4992,  0.8141],
        [ 1.1403,  0.9638, -0.6608,  0.3767],
        [ 1.9081,  0.7504, -0.0339,  0.5945],
        [-0.4364, -0.1453,  0.4935, -1.0386],
        [-0.7417, -0.7000,  0.4368,  0.7527],
        [-1.1524, -0.3119,  0.7796,  1.0384]], requires_grad=True)


In [28]:
# Take the embedding weights without autograd tracking (drops requires_grad=True).
# `.detach()` is used instead of `.data`: both share storage with the parameter,
# but in-place changes to a detached tensor are still checked by autograd,
# whereas changes made through `.data` go unnoticed and can silently produce
# wrong gradients.
# NOTE: this rebinds `inputs`, which earlier cells used for token-id batches;
# re-run those cells before decoding token ids again.
inputs = embedding_layer.weight.detach()
print(inputs)

tensor([[-0.2827,  0.5502,  1.5729,  0.5980],
        [-0.6337,  0.4333, -1.5340, -0.5352],
        [ 0.6825, -0.4818, -0.4992,  0.8141],
        [ 1.1403,  0.9638, -0.6608,  0.3767],
        [ 1.9081,  0.7504, -0.0339,  0.5945],
        [-0.4364, -0.1453,  0.4935, -1.0386],
        [-0.7417, -0.7000,  0.4368,  0.7527],
        [-1.1524, -0.3119,  0.7796,  1.0384]])


In [29]:
# Compute all attention scores with a single matrix multiplication.
# Multiplying `inputs` by its transpose takes the dot product of every row
# with every row (itself included), which yields a square similarity matrix.
attention_scores = inputs @ inputs.T
attention_scores

tensor([[ 3.2142, -2.3152, -0.7563, -0.6061,  0.1757,  0.1985,  0.9617,  2.0013],
        [-2.3152,  3.2288, -0.3113,  0.5070, -1.1503,  0.0125, -0.9061, -1.1564],
        [-0.7563, -0.3113,  1.6099,  0.9504,  1.4417, -1.3197,  0.2257, -0.1800],
        [-0.6061,  0.5070,  0.9504,  2.8077,  3.1454, -1.3550, -1.5255, -1.7386],
        [ 0.1757, -1.1503,  1.4417,  3.1454,  4.5587, -1.5759, -1.5078, -1.8420],
        [ 0.1985,  0.0125, -1.3197, -1.3550, -1.5759,  1.5337, -0.1407, -0.1455],
        [ 0.9617, -0.9061,  0.2257, -1.5255, -1.5078, -0.1407,  1.7973,  2.1951],
        [ 2.0013, -1.1564, -0.1800, -1.7386, -1.8420, -0.1455,  2.1951,  3.1112]])

In [30]:
# Normalize the attention scores into attention weights.
# Softmax over the last dimension turns each row into positive values that sum
# to 1. Larger scores receive disproportionately larger weights, which makes
# each row "focus" on its highest-scoring (most relevant) tokens.
attention_weights = torch.softmax( attention_scores, dim = -1)
attention_weights

tensor([[0.6476, 0.0026, 0.0122, 0.0142, 0.0310, 0.0317, 0.0681, 0.1926],
        [0.0033, 0.8476, 0.0246, 0.0557, 0.0106, 0.0340, 0.0136, 0.0106],
        [0.0305, 0.0476, 0.3254, 0.1682, 0.2750, 0.0174, 0.0815, 0.0543],
        [0.0121, 0.0367, 0.0572, 0.3663, 0.5134, 0.0057, 0.0048, 0.0039],
        [0.0095, 0.0025, 0.0338, 0.1858, 0.7636, 0.0017, 0.0018, 0.0013],
        [0.1307, 0.1085, 0.0286, 0.0276, 0.0222, 0.4967, 0.0931, 0.0926],
        [0.1270, 0.0196, 0.0609, 0.0106, 0.0107, 0.0422, 0.2930, 0.4360],
        [0.1797, 0.0076, 0.0203, 0.0043, 0.0038, 0.0210, 0.2181, 0.5452]])

In [31]:
# Check that each row of the attention weights sums to 1, which confirms that
# softmax normalized along the intended (last) dimension. The assertion makes
# the check fail loudly instead of relying on reading the printed values.
row_sums = attention_weights.sum(dim=-1)
assert torch.allclose(row_sums, torch.ones_like(row_sums)), "each row of attention_weights must sum to 1"
row_sums

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [32]:
# Calculate the context vectors from the attention weights and the inputs.
# Each output row is a weighted sum of the rows of `inputs`, using the
# matching row of `attention_weights` as the weights.
context_vectors = attention_weights @ inputs
context_vectors

tensor([[-0.3872,  0.2762,  1.1936,  0.6379],
        [-0.4746,  0.4012, -1.3135, -0.4184],
        [ 0.7691,  0.1727, -0.2214,  0.5842],
        [ 1.3991,  0.7279, -0.3173,  0.4791],
        [ 1.6842,  0.7403, -0.1518,  0.5568],
        [-0.4049, -0.0178,  0.3638, -0.2826],
        [-0.7124, -0.2799,  0.6207,  0.7548],
        [-0.8287, -0.2263,  0.7885,  0.8322]])