## Master Handling Raw Texts !!

### Converting Our Raw Text Into Tokens ??

In [1]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [2]:
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [3]:
len(raw_text)

20479

In [4]:
## Split By Words ??

import re

text = "Hello, World. This is a test"
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'World.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [5]:
## Split With Special Chars (, .) and Words ??

import re

text = "Hello, World. This is a test"
result = re.split(r'([,.]|\s)', text)

print(result)

['Hello', ',', '', ' ', 'World', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [7]:
## Remove the Spaces ??

result = [i for i in result if i.strip()]

print(result)

['Hello', ',', 'World', '.', 'This', 'is', 'a', 'test']


In [10]:
## More Take Care about Special Tokens ??

text = "Hello, World. This -- is a _Test."

result = re.split(r'([,.:;?_"()\']|--|\s)', text)

result = [item for item in result if item.strip()]

print(result)

['Hello', ',', 'World', '.', 'This', '--', 'is', 'a', '_', 'Test', '.']


In [12]:
## Let's Try Our Raw Text ??

text = "Hello, World. This -- is a _Test."

result = re.split(r'([,.:;?_"()\']|--|\s)', raw_text)

preprocessed = [item for item in result if item.strip()]

print(preprocessed)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"', 'Of', 'course', 'it', "'", 's', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'", 'way', 'up', ';', 'but', 'I', 'don', "'", 't', 'think', 'of', 'that', ',

In [13]:
len(preprocessed)

4666

In [14]:
preprocessed[:10]

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

### Converting Tokens Into Tokens IDs ??

In [15]:
## Filter Tokens Taking Only The Unique One's

all_tokens = sorted(set(preprocessed))

all_tokens

['"',
 "'",
 '(',
 ')',
 ',',
 '--',
 '.',
 ':',
 ';',
 '?',
 'A',
 'Ah',
 'Among',
 'And',
 'Are',
 'Arrt',
 'As',
 'At',
 'Be',
 'Begin',
 'Burlington',
 'But',
 'By',
 'Carlo',
 'Chicago',
 'Claude',
 'Come',
 'Croft',
 'Destroyed',
 'Devonshire',
 'Don',
 'Dubarry',
 'Emperors',
 'Florence',
 'For',
 'Gallery',
 'Gideon',
 'Gisburn',
 'Gisburn!',
 'Gisburns',
 'Grafton',
 'Greek',
 'Grindle',
 'Grindles',
 'HAD',
 'Had',
 'Hang',
 'Has',
 'He',
 'Her',
 'Hermia',
 'His',
 'How',
 'I',
 'If',
 'In',
 'It',
 'Jack',
 'Jack!',
 'Jove',
 'Jove!',
 'Just',
 'Lord',
 'Made',
 'Miss',
 'Money',
 'Monte',
 'Moon-dancers',
 'Mr',
 'Mrs',
 'My',
 'Never',
 'No',
 'Now',
 'Nutley',
 'Of',
 'Oh',
 'On',
 'Once',
 'Only',
 'Or',
 'Perhaps',
 'Poor',
 'Professional',
 'Renaissance',
 'Rickham',
 'Rickham!',
 'Riviera',
 'Rome',
 'Russian',
 'Sevres',
 'She',
 'Stroud',
 'Stroud!',
 'Strouds',
 'Suddenly',
 'That',
 'The',
 'Then',
 'There',
 'They',
 'This',
 'Those',
 'Though',
 'Thwing',
 'Thw

In [16]:
len(all_tokens)

1148

In [18]:
vocab_size = len(all_tokens)

vocab_size

1148

In [19]:
## Mapping All Token Into Key : Value Pairs and This IS Simply The Tokens Process ??

vocabs = {token:integer for integer, token in enumerate(all_tokens)}

vocabs

{'"': 0,
 "'": 1,
 '(': 2,
 ')': 3,
 ',': 4,
 '--': 5,
 '.': 6,
 ':': 7,
 ';': 8,
 '?': 9,
 'A': 10,
 'Ah': 11,
 'Among': 12,
 'And': 13,
 'Are': 14,
 'Arrt': 15,
 'As': 16,
 'At': 17,
 'Be': 18,
 'Begin': 19,
 'Burlington': 20,
 'But': 21,
 'By': 22,
 'Carlo': 23,
 'Chicago': 24,
 'Claude': 25,
 'Come': 26,
 'Croft': 27,
 'Destroyed': 28,
 'Devonshire': 29,
 'Don': 30,
 'Dubarry': 31,
 'Emperors': 32,
 'Florence': 33,
 'For': 34,
 'Gallery': 35,
 'Gideon': 36,
 'Gisburn': 37,
 'Gisburn!': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jack!': 58,
 'Jove': 59,
 'Jove!': 60,
 'Just': 61,
 'Lord': 62,
 'Made': 63,
 'Miss': 64,
 'Money': 65,
 'Monte': 66,
 'Moon-dancers': 67,
 'Mr': 68,
 'Mrs': 69,
 'My': 70,
 'Never': 71,
 'No': 72,
 'Now': 73,
 'Nutley': 74,
 'Of': 75,
 'Oh': 76,
 'On

In [20]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [22]:
tokenizer = SimpleTokenizerV1(vocabs)

text = """"
It's the last he painted, you know," 
Mrs. Gisburn said with pardonable pride.
"""

ids = tokenizer.encode(text)

print(ids)

[0, 56, 1, 861, 1001, 609, 538, 756, 4, 1144, 603, 4, 0, 69, 6, 37, 862, 1123, 764, 804, 6]


In [23]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

### Adding Special Context Tokens ??

In [24]:
tokenizer = SimpleTokenizerV1(vocabs)

text = "Hello, do you like tea. Is this-- a test?"

tokenizer.encode(text)

KeyError: 'Hello'

- The above produces an error because the word "Hello" is not contained in the vocabulary
- To deal with such cases, we can add special tokens like "<|unk|>" to the vocabulary to represent unknown words
- Since we are already extending the vocabulary, let's add another token called "<|endoftext|>" which is used in GPT-2 training to denote the end of a text (and it's also used between concatenated text, like if our training datasets consists of multiple articles, books, etc.)

In [26]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [29]:
len(vocab)

1150

In [31]:
vocab[5:]

TypeError: unhashable type: 'slice'

In [32]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1145)
('your', 1146)
('yourself', 1147)
('<|endoftext|>', 1148)
('<|unk|>', 1149)


In [33]:
## Second Class Handles the Not Found Tokens ??
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [34]:
tokenizer = SimpleTokenizerV2(vocab)

In [35]:
tokenizer.encode(text)

[1149, 4, 360, 1144, 635, 988, 6, 1149, 1012, 5, 120, 1149, 9]

In [37]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea. <|unk|> this -- a <|unk|>?'

### Byte Pair Encoding ??
- GPT-2 used BytePair encoding (BPE) as its tokenizer
- it allows the model to break down words that aren't in its predefined vocabulary into smaller subword units or even individual characters, enabling it to handle out-of-vocabulary words
- For instance, if GPT-2's vocabulary doesn't have the word "unfamiliarword," it might tokenize it as ["unfam", "iliar", "word"] or some other subword breakdown, depending on its trained BPE merges
- The original BPE tokenizer can be found here: https://github.com/openai/gpt-2/blob/master/src/encoder.py
- In this chapter, we are using the BPE tokenizer from OpenAI's open-source tiktoken library, which implements its core algorithms in Rust to improve computational performance
- I created a notebook in the ./bytepair_encoder that compares these two implementations side-by-side (tiktoken was about 5x faster on the sample text)

In [38]:
import tiktoken

tiktoken.__version__

'0.11.0'

In [39]:
tokenizer = tiktoken.get_encoding("gpt2")

In [40]:
tokenizer.encode("Hello World")

[15496, 2159]

In [41]:
tokenizer.decode(tokenizer.encode("Hello World"))

'Hello World'

In [44]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)

tokens = tokenizer.encode(text, allowed_special={'<|endoftext|>'})

tokens

[15496,
 11,
 466,
 345,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 1659,
 617,
 34680,
 27271,
 13]

### Data sampling with a sliding window

In [45]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

encoded_text = tokenizer.encode(raw_text)

encoded_text

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11,
 290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,


In [46]:
len(encoded_text)

5145

In [49]:
encoded_sample = encoded_text[50:]

encoded_sample

[290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,
 286,
 526,
 383,
 1573,
 11,
 319,
 9074,
 13,
 536,
 5469,
 338,
 11914,
 11,
 33096,
 663,
 4808,
 3808,
 62,
 355,
 996,
 484,
 547,
 12548,
 287,
 281,
 13079,
 410,
 12523,
 286,
 22353,
 13,
 843,
 340,
 373,
 407,
 691,
 262,
 9074,
 13,
 536,
 48819,
 508,
 25722,
 276,
 13,
 11161,
 407,
 262,
 40123,
 18113,


In [48]:
context_size = 4

x = encoded_sample[:context_size]
y = encoded_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [50]:
for i in range(1, context_size + 1):
    context = encoded_sample[:i]
    desired = encoded_sample[i]
    
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [54]:
for i in range(1, context_size + 1):
    context = encoded_sample[:i]
    desired = encoded_sample[i]
    
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [55]:
import torch

torch.__version__

'2.8.0+cpu'

- We will take care of the next-word prediction in a later chapter after we covered the attention mechanism
- For now, we implement a simple data loader that iterates over the input dataset and returns the inputs and targets shifted by one

In [56]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [57]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, 
                        stride=128, shuffle=True, drop_last=True,
                        num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [58]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [59]:
data_loader = create_dataloader_v1(
    txt=raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(data_loader)

first_batch = next(data_iter)

print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [60]:
second_batch = next(data_iter)

print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


- We can also create batched outputs
- Note that we **increase** the **stride** here so that we don't have overlaps between the batches, since more overlap could lead to increased overfitting

In [61]:
dataloader = create_dataloader_v1(txt=raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [62]:
next_inputs, next_targets = next(data_iter)

print("Inputs:\n", next_inputs)
print("\nTargets:\n", next_targets)

Inputs:
 tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]])

Targets:
 tensor([[  262,  6001,   286,   465],
        [13476,    11,   339,   550],
        [ 5710,   465, 12036,    11],
        [ 6405,   257,  5527, 27075],
        [   11,   290,  4920,  2241],
        [  287,   257,  4489,    64],
        [  319,   262, 34686, 41976],
        [   13,   357, 10915,   314]])


### Creating Token Embeddings ??

In [63]:
## Suppose we have the following four input examples with input ids 2, 3, 5, and 1 (after tokenization):

input_ids = torch.tensor([2, 3, 5, 1])

- For the sake of simplicity, suppose we have a small vocabulary of only 6 words and we want to create embeddings of size 3:

In [65]:
vocab_size = 6
output_dim = 3

torch.manual_seed(42)

embedding_layer =  torch.nn.Embedding(vocab_size, output_dim)

embedding_layer

Embedding(6, 3)

In [66]:
embedding_layer.weight

Parameter containing:
tensor([[ 1.9269,  1.4873, -0.4974],
        [ 0.4396, -0.7581,  1.0783],
        [ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [-0.2316,  0.0418, -0.2516],
        [ 0.8599, -0.3097, -0.3957]], requires_grad=True)

In [67]:
embedding_layer(torch.tensor([0]))

tensor([[ 1.9269,  1.4873, -0.4974]], grad_fn=<EmbeddingBackward0>)

In [68]:
embedding_layer(torch.tensor([1]))

tensor([[ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)

In [69]:
embedding_layer(torch.tensor([2]))

tensor([[0.8008, 1.6806, 0.3559]], grad_fn=<EmbeddingBackward0>)

In [70]:
embedding_layer(input_ids)

tensor([[ 0.8008,  1.6806,  0.3559],
        [-0.6866,  0.6105,  1.3347],
        [ 0.8599, -0.3097, -0.3957],
        [ 0.4396, -0.7581,  1.0783]], grad_fn=<EmbeddingBackward0>)

### Encoding Word Positions ??

In [73]:
# vocab_size = tokenizer.n_vocab
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [75]:
max_length=  4

dataloader = create_dataloader_v1(
    txt=raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)


data_iter = iter(dataloader)
inputs, targets = next(data_iter)


print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [77]:
token_embeddings = token_embedding_layer(inputs)

token_embeddings.shape

torch.Size([8, 4, 256])

In [78]:
token_embeddings[0, 0]

tensor([-1.6311e+00,  1.0768e+00,  3.5037e-01, -1.1459e+00, -1.1873e+00,
         4.9456e-01,  6.1900e-01, -1.4419e-01, -4.8149e-01,  9.9837e-01,
         2.9892e-01, -1.0865e-01, -7.6192e-01,  3.0956e-01,  3.3328e-01,
        -6.5021e-01, -4.4607e-02, -6.7148e-02,  4.5217e-01, -1.3183e+00,
        -1.3610e-01, -1.1859e+00,  7.0825e-01,  1.8976e-01, -5.2505e-01,
         6.6413e-01,  1.0247e+00, -1.0226e+00, -2.5561e-01,  1.0574e+00,
         7.7255e-01, -2.3293e-01,  1.4470e-01,  2.4755e-01,  2.7673e+00,
        -8.7606e-01,  1.4913e-01, -1.1731e-01,  6.8149e-01, -4.6548e-01,
        -6.1932e-01, -5.6322e-01,  2.5859e-01,  1.0003e+00,  8.2677e-01,
        -1.1263e+00, -1.4417e+00,  1.1668e+00,  5.0012e-01,  3.4664e-01,
         1.3220e+00,  7.5988e-02, -2.1000e+00, -1.7296e+00, -8.3492e-01,
         7.8889e-01, -5.8008e-01, -2.4468e-01,  8.7869e-01,  1.0423e+00,
         6.7644e-01,  4.7097e-01,  4.3847e-01, -1.3032e+00, -7.9606e-01,
         6.6295e-01, -3.2373e+00, -1.0817e+00, -4.5

In [79]:
context_length = max_length

pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [82]:
pos_embedding_layer.weight

Parameter containing:
tensor([[-1.6906,  1.4235, -1.1807,  ..., -0.5013, -1.8112,  0.8139],
        [-0.2355, -0.9599, -1.9416,  ...,  0.1938, -0.6389,  1.4994],
        [-0.8247, -0.0990,  0.4568,  ..., -0.1388, -0.9858, -0.4023],
        [ 0.0946,  1.2584,  0.9178,  ...,  0.5571, -0.0555,  0.4133]],
       requires_grad=True)

In [80]:
torch.arange(max_length)

tensor([0, 1, 2, 3])

In [83]:
pos_embedding_layer(torch.arange(max_length))

tensor([[-1.6906,  1.4235, -1.1807,  ..., -0.5013, -1.8112,  0.8139],
        [-0.2355, -0.9599, -1.9416,  ...,  0.1938, -0.6389,  1.4994],
        [-0.8247, -0.0990,  0.4568,  ..., -0.1388, -0.9858, -0.4023],
        [ 0.0946,  1.2584,  0.9178,  ...,  0.5571, -0.0555,  0.4133]],
       grad_fn=<EmbeddingBackward0>)

In [81]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))

print(pos_embeddings.shape)

torch.Size([4, 256])


In [84]:
token_embeddings.shape

torch.Size([8, 4, 256])

In [85]:
pos_embeddings.shape

torch.Size([4, 256])

In [86]:
token_embeddings[0] + pos_embeddings[0]

tensor([[-3.3217,  2.5003, -0.8304,  ...,  0.5256, -2.9149,  0.3768],
        [-0.6779,  2.1142, -1.1894,  ..., -0.7713, -0.9675, -0.8805],
        [-1.5022,  2.5453, -1.6560,  ..., -0.7514, -1.0629,  1.4081],
        [-2.2974, -2.2064, -0.7850,  ..., -0.3458, -3.2211,  2.3953]],
       grad_fn=<AddBackward0>)

In [87]:
token_embeddings[0] + pos_embeddings

tensor([[-3.3217,  2.5003, -0.8304,  ...,  0.5256, -2.9149,  0.3768],
        [ 0.7773, -0.2691, -1.9503,  ..., -0.0762,  0.2047, -0.1950],
        [-0.6363,  1.0228, -0.0185,  ..., -0.3889, -0.2375,  0.1919],
        [-0.5122, -2.3715,  1.3135,  ...,  0.7126, -1.4655,  1.9947]],
       grad_fn=<AddBackward0>)

In [88]:
token_embeddings + pos_embeddings

tensor([[[-3.3217,  2.5003, -0.8304,  ...,  0.5256, -2.9149,  0.3768],
         [ 0.7773, -0.2691, -1.9503,  ..., -0.0762,  0.2047, -0.1950],
         [-0.6363,  1.0228, -0.0185,  ..., -0.3889, -0.2375,  0.1919],
         [-0.5122, -2.3715,  1.3135,  ...,  0.7126, -1.4655,  1.9947]],

        [[-1.2377,  0.1138, -1.8319,  ..., -1.9485, -1.8134, -0.6321],
         [ 1.5197, -1.9227, -0.9065,  ...,  0.2602, -1.2787,  0.2797],
         [-0.4078,  0.5751, -1.0158,  ..., -1.3934, -0.8358, -0.7707],
         [-0.9811,  1.6607,  0.4198,  ...,  0.3754, -2.1474, -0.1933]],

        [[-1.9892, -0.1741, -2.0083,  ..., -1.6731, -0.9430,  1.2402],
         [-1.4311, -0.8608, -3.8426,  ...,  0.0089,  0.1472,  2.3237],
         [ 0.0674,  1.4740,  1.2798,  ..., -0.9884, -0.8819,  1.5094],
         [-1.8497,  2.0809, -0.0266,  ...,  0.3238,  0.9423, -0.3340]],

        ...,

        [[-1.6714,  1.1834,  1.1910,  ...,  0.4821, -2.6470,  1.1042],
         [-0.5286, -1.1202, -1.6385,  ...,  1.5283, -0.09

In [89]:
input_embeddings = token_embeddings + pos_embeddings

input_embeddings.shape

torch.Size([8, 4, 256])