In [29]:
import torch
import tiktoken

In [30]:
# Read the whole story into memory. The encoding is passed explicitly so the
# decoded text does not depend on the platform's default encoding.
with open("one-foggy-night.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Preview the first 50 characters to confirm the file loaded as expected.
print(raw_text[:50])

A PLUME of smoke drifted under the great glass dom


In [31]:
# Load tiktoken's "gpt2" encoding; it is reused below to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")

In [32]:
# Encode the full text into a list of integer token ids.
enc_text = tokenizer.encode(raw_text)

In [33]:
# Peek at the first 20 token ids.
print(enc_text[:20])

[32, 9297, 38340, 286, 7523, 38648, 739, 262, 1049, 5405, 29500, 286, 3454, 363, 17913, 9327, 355, 262, 8342, 10604]


In [34]:
# Decode the first two token ids back to text, then show the total token count.
print( tokenizer.decode( enc_text[:2]))
len( enc_text)

A PL


5513

In [35]:
# Show how next-token training pairs are formed: each growing prefix of the
# text is the input and the token that immediately follows it is the target.
for split in range(1, 10):
    context = tokenizer.decode(enc_text[:split])
    next_token = tokenizer.decode([enc_text[split]])
    print("Input:", context, "Target:", next_token)

Input: A Target:  PL
Input: A PL Target: UME
Input: A PLUME Target:  of
Input: A PLUME of Target:  smoke
Input: A PLUME of smoke Target:  drifted
Input: A PLUME of smoke drifted Target:  under
Input: A PLUME of smoke drifted under Target:  the
Input: A PLUME of smoke drifted under the Target:  great
Input: A PLUME of smoke drifted under the great Target:  glass


In [36]:
# Record the installed PyTorch version alongside the results below.
print("PyTorch version:", torch.__version__)

PyTorch version: 2.2.2


In [37]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once and cut into windows of ``max_length`` token
    ids. Each target window is its input window shifted one position to the
    right, so position ``k`` of the target is the token that follows
    position ``k`` of the input.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a list of integer token ids.
        max_length: Number of token ids in every input/target window.
        stride: Offset, in tokens, between the starts of consecutive windows.

    Raises:
        AssertionError: If the text yields ``max_length`` tokens or fewer, so
            not even one full input/target pair can be built.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Encode the full text once; every window is sliced from this list.
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Window start offsets. Stopping at len(ids) - max_length guarantees
        # the shifted target window never runs past the end of the ids.
        starts = range(0, len(ids) - max_length, stride)
        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [38]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    Args:
        txt: Raw text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: Number of windows per batch.
        max_length: Number of token ids in each window.
        stride: Offset, in tokens, between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``; whether to shuffle the windows.
        drop_last: Forwarded to ``DataLoader``; whether to drop a final
            batch that is smaller than ``batch_size``.
        num_workers: Forwarded to ``DataLoader``; number of loader workers.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    # The tokenizer is created here so callers only need to pass raw text.
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [39]:
# Load the story text with an explicit UTF-8 encoding so decoding does not
# depend on the platform default.
with open("one-foggy-night.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [40]:
# stride == max_length == 4 gives non-overlapping 4-token windows, and
# shuffle=False keeps them in document order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=4, shuffle=False
)

# Pull the first batch: a pair of tensors where the second (targets) is the
# first (inputs) shifted by one token.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[   32,  9297, 38340,   286]]), tensor([[ 9297, 38340,   286,  7523]])]


In [41]:
# The next batch starts one stride (4 tokens) after the first.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 7523, 38648,   739,   262]]), tensor([[38648,   739,   262,  1049]])]


In [42]:
# Same non-overlapping 4-token windows, now grouped eight per batch.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

# Unpack the first batch into its input and target tensors.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   32,  9297, 38340,   286],
        [ 7523, 38648,   739,   262],
        [ 1049,  5405, 29500,   286],
        [ 3454,   363, 17913,  9327],
        [  355,   262,  8342, 10604],
        [ 1625,  6364,   284,   257],
        [ 1302, 24219,    13,   632],
        [  373,  1972,  2739,   783]])

Targets:
 tensor([[ 9297, 38340,   286,  7523],
        [38648,   739,   262,  1049],
        [ 5405, 29500,   286,  3454],
        [  363, 17913,  9327,   355],
        [  262,  8342, 10604,  1625],
        [ 6364,   284,   257,  1302],
        [24219,    13,   632,   373],
        [ 1972,  2739,   783,    11]])


In [43]:
# Inspect the type and shape of the inputs batch: (batch_size, max_length) = (8, 4).
print(type(inputs))
print(inputs.shape)

<class 'torch.Tensor'>
torch.Size([8, 4])


In [44]:
# First row: the token ids of the first sequence in the batch.
inputs[0]

tensor([   32,  9297, 38340,   286])

In [45]:
# First column: the first token id of every sequence in the batch.
inputs[:,0]

tensor([  32, 7523, 1049, 3454,  355, 1625, 1302,  373])

In [46]:
# Decode each row of token ids back into text to see the words behind the ids.
for row in inputs:
    print(tokenizer.decode ( row.tolist()))

A PLUME of
 smoke drifted under the
 great glass dome of
 Slagborough Station
 as the Northern Express
 came slowly to a
 standstill. It
 was getting late now


In [47]:

# Reload the text and use stride=1 so consecutive windows overlap: each one
# starts a single token after the previous one.
with open("one-foggy-night.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)
second_batch = next(data_iter)
print(second_batch)

[tensor([[   32,  9297, 38340,   286]]), tensor([[ 9297, 38340,   286,  7523]])]
[tensor([[ 9297, 38340,   286,  7523]]), tensor([[38340,   286,  7523, 38648]])]


In [48]:
# Toy sizes for demonstrating an embedding layer: 6 token ids, 3 values per vector.
vocab_size = 6
output_dim = 3

In [49]:
# Seed the global RNG first: Embedding weights are randomly initialised, so
# without a fixed seed the printed values change on every run.
torch.manual_seed(123)
# Lookup table with one output_dim-sized row for each of the vocab_size ids.
embedding = torch.nn.Embedding(vocab_size, output_dim)

In [50]:
# One row per token id, one column per embedding dimension.
print(embedding.weight)

Parameter containing:
tensor([[-0.1503,  0.9886, -0.7420],
        [ 0.8194, -0.4790,  0.2558],
        [ 1.8690,  0.9179, -0.1486],
        [-0.0701,  2.0119,  1.3039],
        [-1.4388, -0.2981, -0.3882],
        [ 0.9299,  0.4336,  0.3698]], requires_grad=True)


In [51]:
# View the raw weight values without the requires_grad flag in the repr.
embedding.weight.data

tensor([[-0.1503,  0.9886, -0.7420],
        [ 0.8194, -0.4790,  0.2558],
        [ 1.8690,  0.9179, -0.1486],
        [-0.0701,  2.0119,  1.3039],
        [-1.4388, -0.2981, -0.3882],
        [ 0.9299,  0.4336,  0.3698]])

In [None]:
# Alternative: detach() also shows the weights without the requires_grad flag.
print(embedding.weight.detach())

tensor([[-0.1503,  0.9886, -0.7420],
        [ 0.8194, -0.4790,  0.2558],
        [ 1.8690,  0.9179, -0.1486],
        [-0.0701,  2.0119,  1.3039],
        [-1.4388, -0.2981, -0.3882],
        [ 0.9299,  0.4336,  0.3698]])


In [53]:
# Full-size table: one 256-value vector for every id in the tokenizer's vocabulary.
embeddingb = torch.nn.Embedding( tokenizer.n_vocab, 256)

In [54]:
# Display the (truncated) weight matrix of the larger embedding.
embeddingb.weight

Parameter containing:
tensor([[-1.0513,  0.0978, -0.7628,  ...,  1.0831,  0.5312,  0.6050],
        [ 0.7677, -0.6192,  0.4671,  ..., -0.3672, -0.4005, -1.4050],
        [ 1.5926, -0.2443, -0.0318,  ..., -0.1702,  0.3452, -0.8612],
        ...,
        [ 0.0069, -0.0329, -1.6368,  ...,  0.4549,  1.0701,  2.6172],
        [-1.1016,  0.8737,  1.5565,  ..., -0.5927, -0.9203, -0.8780],
        [-0.1298,  0.8040,  1.3808,  ...,  1.2625, -1.5140,  0.0861]],
       requires_grad=True)

In [55]:
# Build two small 1-D float tensors from Python lists.
x = torch.tensor([2.2,1.5])
y = torch.tensor([1.4,2.3])
print(x,y)

tensor([2.2000, 1.5000]) tensor([1.4000, 2.3000])


In [56]:
# Dot product of the two vectors: 2.2*1.4 + 1.5*2.3 = 6.53.
torch.dot(x,y)

tensor(6.5300)