In [18]:
import torch
import tiktoken

In [19]:
# needed to pull txt file from github
import os
import urllib.request

# Download the sample text once; later runs reuse the local copy.
# The path and URL are bound before the check so the existence test and the
# download always refer to the same file name, and so both names are defined
# whether or not a download actually happens.
file_path = "one-foggy-night.txt"
url = ("https://raw.githubusercontent.com/Valentino-Cheek/CS-290-LLMs/refs/heads/main/one-foggy-night.txt")

if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

In [20]:
# Read the whole text file into memory and preview the first 50 characters.
# The encoding is pinned to UTF-8 so the decoded text (and therefore the token
# IDs produced later) does not depend on the platform's default encoding.
with open("one-foggy-night.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print(raw_text[:50])

A PLUME of smoke drifted under the great glass dom


In [21]:
# Initialize the GPT-2 tokenizer and encode the full text into token IDs
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)

In [22]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Every window of ``max_length`` token IDs becomes
    an input sample, and the same window shifted right by one token becomes its
    target, so each target position holds the token that follows the
    corresponding input position.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of tokens in every input/target window. Must be > 0.
        stride: Number of tokens the window start advances between consecutive
            samples. Must be > 0.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
        AssertionError: If the tokenized text has ``max_length`` or fewer tokens.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # A non-positive stride would either fail inside range() (0) or silently
        # produce an empty dataset (negative); a non-positive max_length would
        # produce empty windows. Reject both up front with a clear message.
        if max_length <= 0:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Slide a window of max_length tokens over the text. The last start index
        # is chosen so the shifted target window still fits inside token_ids.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [23]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    Args:
        txt: Raw text; it is tokenized with the GPT-2 encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens in each window (forwarded to GPTDatasetV1).
        stride: Step between consecutive window starts (forwarded to GPTDatasetV1).
        shuffle: Forwarded to DataLoader.
        drop_last: Forwarded to DataLoader.
        num_workers: Forwarded to DataLoader.

    Returns:
        A DataLoader over a GPTDatasetV1 built from ``txt``.
    """
    # The GPT-2 encoding turns the text into token IDs inside the dataset
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windows, **loader_options)

In [24]:
# Windows of 4 tokens advanced one token at a time, one window per batch,
# so consecutive batches overlap in all but one token.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)
data_iter = iter(dataloader)

# Pull the first two batches, then show them in order
first_batch = next(data_iter)
second_batch = next(data_iter)
for batch in (first_batch, second_batch):
    print(batch)

[tensor([[   32,  9297, 38340,   286]]), tensor([[ 9297, 38340,   286,  7523]])]
[tensor([[ 9297, 38340,   286,  7523]]), tensor([[38340,   286,  7523, 38648]])]


In [25]:
# Two windows per batch; stride equals max_length (6), so windows do not overlap
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=2,
    max_length=6,
    stride=6,
    shuffle=False,
)
data_iter = iter(dataloader)

# Each batch is an (inputs, targets) pair; targets are the inputs shifted by one token
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   32,  9297, 38340,   286,  7523, 38648],
        [  739,   262,  1049,  5405, 29500,   286]])

Targets:
 tensor([[ 9297, 38340,   286,  7523, 38648,   739],
        [  262,  1049,  5405, 29500,   286,  3454]])


In [26]:
# Batch size, window length and stride are all 4: four non-overlapping windows per batch
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=4,
    max_length=4,
    stride=4,
    shuffle=False,
)
data_iter = iter(dataloader)

# Each batch is an (inputs, targets) pair; targets are the inputs shifted by one token
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   32,  9297, 38340,   286],
        [ 7523, 38648,   739,   262],
        [ 1049,  5405, 29500,   286],
        [ 3454,   363, 17913,  9327]])

Targets:
 tensor([[ 9297, 38340,   286,  7523],
        [38648,   739,   262,  1049],
        [ 5405, 29500,   286,  3454],
        [  363, 17913,  9327,   355]])


In [27]:
# Decode each row of token IDs back into text so the windows can be read as words:
# first every input row, then every target row (the same text shifted by one token).
for batch in (inputs, targets):
    for row in batch:
        print(tokenizer.decode(row.tolist()))

A PLUME of
 smoke drifted under the
 great glass dome of
 Slagborough Station
 PLUME of smoke
 drifted under the great
 glass dome of Sl
agborough Station as


In [28]:
# Create an embedding layer with a vocabulary size of 8 and an output dimension of 4.
#
# An embedding layer is a lookup table: row i of its weight matrix is the vector
# that represents token ID i. The rows start out as random numbers and are
# trainable parameters (note requires_grad=True in the printed output).
#
# Token IDs on their own are arbitrary integers that carry no notion of
# similarity. Mapping each ID to a vector of adjustable numbers gives training
# something to optimize, so tokens used in similar contexts can end up with
# similar vectors.
#
# This step is the foundation for the training and "learning" of an LLM.

# Seed the generator so the random initial weights, and everything computed
# from them in the following cells, are the same on every run.
torch.manual_seed(123)

vocab_size = 8
output_dim = 4

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.6806, -0.4945, -1.7836, -1.4530],
        [ 1.4918,  0.3406,  1.1078,  0.8309],
        [ 0.2917, -1.3365, -0.4469,  0.2935],
        [-1.1407,  0.4209, -0.0612, -0.6397],
        [-0.9546,  0.2700, -0.3316,  0.0858],
        [-0.2956, -0.0960, -0.7541,  0.7398],
        [-1.8088, -1.9085,  0.7073,  0.1320],
        [ 0.9494, -0.1660, -0.3525, -0.3045]], requires_grad=True)


In [29]:
# Take the embedding weights as a plain tensor without gradient tracking
# (drops the requires_grad=True flag). detach() is used instead of the legacy
# .data attribute; the result still shares storage with the layer's weights.
# Note: this rebinds `inputs`, which previously held a batch of token IDs.
inputs = embedding_layer.weight.detach()
print(inputs)

tensor([[ 0.6806, -0.4945, -1.7836, -1.4530],
        [ 1.4918,  0.3406,  1.1078,  0.8309],
        [ 0.2917, -1.3365, -0.4469,  0.2935],
        [-1.1407,  0.4209, -0.0612, -0.6397],
        [-0.9546,  0.2700, -0.3316,  0.0858],
        [-0.2956, -0.0960, -0.7541,  0.7398],
        [-1.8088, -1.9085,  0.7073,  0.1320],
        [ 0.9494, -0.1660, -0.3525, -0.3045]])


In [30]:
# Attention scores: the dot product of every embedding row with every other row.
# Multiplying by the transpose computes them all at once; entry (i, j) is the
# similarity between row i and row j.
attention_scores = torch.matmul(inputs, inputs.T)
attention_scores

tensor([[ 5.9999, -2.3362,  1.2300,  0.0541, -0.3164,  0.1164, -1.7406,  1.7994],
        [-2.3362,  4.2590, -0.2712, -2.1575, -1.6281, -0.6943, -2.4552,  0.7163],
        [ 1.2300, -0.2712,  2.1572, -1.0557, -0.4660,  0.5962,  1.7457,  0.5669],
        [ 0.0541, -2.1575, -1.0557,  1.8913,  1.1680, -0.1303,  1.1322, -0.9365],
        [-0.3164, -1.6281, -0.4660,  1.1680,  1.1014,  0.5697,  0.9881, -0.8604],
        [ 0.1164, -0.6943,  0.5962, -0.1303,  0.5697,  1.2125,  0.2820, -0.2241],
        [-1.7406, -2.4552,  1.7457,  1.1322,  0.9881,  0.2820,  7.4318, -1.6901],
        [ 1.7994,  0.7163,  0.5669, -0.9365, -0.8604, -0.2241, -1.6901,  1.1459]])

In [31]:
# Normalize each row of scores with softmax so the row becomes a set of weights.
# Softmax also accentuates the largest scores, which makes the model "focus"
# on the most relevant tokens.
attention_weights = attention_scores.softmax(dim=-1)
attention_weights

tensor([[9.6960e-01, 2.3243e-04, 8.2234e-03, 2.5373e-03, 1.7517e-03, 2.7005e-03,
         4.2165e-04, 1.4532e-02],
        [1.2972e-03, 9.4898e-01, 1.0229e-02, 1.5509e-03, 2.6334e-03, 6.6998e-03,
         1.1517e-03, 2.7460e-02],
        [1.4801e-01, 3.2987e-02, 3.7410e-01, 1.5053e-02, 2.7149e-02, 7.8532e-02,
         2.4790e-01, 7.6269e-02],
        [6.7086e-02, 7.3474e-03, 2.2113e-02, 4.2122e-01, 2.0436e-01, 5.5790e-02,
         1.9718e-01, 2.4913e-02],
        [5.7594e-02, 1.5513e-02, 4.9592e-02, 2.5412e-01, 2.3776e-01, 1.3971e-01,
         2.1228e-01, 3.3431e-02],
        [9.7099e-02, 4.3163e-02, 1.5688e-01, 7.5869e-02, 1.5278e-01, 2.9055e-01,
         1.1458e-01, 6.9074e-02],
        [1.0306e-04, 5.0434e-05, 3.3662e-03, 1.8227e-03, 1.5781e-03, 7.7889e-04,
         9.9219e-01, 1.0840e-04],
        [4.0851e-01, 1.3831e-01, 1.1911e-01, 2.6487e-02, 2.8583e-02, 5.4002e-02,
         1.2467e-02, 2.1253e-01]])

In [32]:
# Sanity check: every row of the softmax output should sum to 1,
# confirming the normalization ran along the intended dimension
attention_weights.sum(dim=-1)

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])

In [33]:
# Context vectors: each row is the attention-weighted sum of all input rows
context_vectors = torch.matmul(attention_weights, inputs)
context_vectors

tensor([[ 0.6703, -0.4923, -1.7404, -1.4100],
        [ 1.4373,  0.3029,  1.0295,  0.7856],
        [-0.1832, -1.0416, -0.3153, -0.0176],
        [-0.9620, -0.2135, -0.1263, -0.2771],
        [-0.8336, -0.3424, -0.1691, -0.0772],
        [-0.2837, -0.4278, -0.4132,  0.1144],
        [-1.7972, -1.8970,  0.6989,  0.1313],
        [ 0.6249, -0.3595, -0.7465, -0.4813]])