In [1]:
import os
import urllib.request

In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text_data = f.read()

In [3]:
text_data

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [4]:
len(text_data)

20479

In [5]:
#tokenize the raw data
import re

In [6]:
text_token = re.split(r'([,.:;?_!"()\']|--|\s)', text_data)
text_token = [item.strip() for item in text_token if item.strip()]
print(text_token)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"', 'Of', 'course', 'it', "'", 's', 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'", 'way', 'up', ';', 'but', 'I', 'don', "'", 't', 'think', 'of', 'that', ',

In [7]:
len(text_token)

4690

Converting Raw Text(Tokens in this case) into Token IDs

In [8]:
# Get rid of dupilates and sort the text
all_vars = sorted(set(text_token))
unique_var = len(all_vars)
print(unique_var)

1130


In [9]:
# build the vocabulary
vocab = {token:integer for integer, token in enumerate(all_vars)}
vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'A': 11,
 'Ah': 12,
 'Among': 13,
 'And': 14,
 'Are': 15,
 'Arrt': 16,
 'As': 17,
 'At': 18,
 'Be': 19,
 'Begin': 20,
 'Burlington': 21,
 'But': 22,
 'By': 23,
 'Carlo': 24,
 'Chicago': 25,
 'Claude': 26,
 'Come': 27,
 'Croft': 28,
 'Destroyed': 29,
 'Devonshire': 30,
 'Don': 31,
 'Dubarry': 32,
 'Emperors': 33,
 'Florence': 34,
 'For': 35,
 'Gallery': 36,
 'Gideon': 37,
 'Gisburn': 38,
 'Gisburns': 39,
 'Grafton': 40,
 'Greek': 41,
 'Grindle': 42,
 'Grindles': 43,
 'HAD': 44,
 'Had': 45,
 'Hang': 46,
 'Has': 47,
 'He': 48,
 'Her': 49,
 'Hermia': 50,
 'His': 51,
 'How': 52,
 'I': 53,
 'If': 54,
 'In': 55,
 'It': 56,
 'Jack': 57,
 'Jove': 58,
 'Just': 59,
 'Lord': 60,
 'Made': 61,
 'Miss': 62,
 'Money': 63,
 'Monte': 64,
 'Moon-dancers': 65,
 'Mr': 66,
 'Mrs': 67,
 'My': 68,
 'Never': 69,
 'No': 70,
 'Now': 71,
 'Nutley': 72,
 'Of': 73,
 'Oh': 74,
 'On': 75,
 'Once': 76,
 'Only': 77,
 '

In [10]:
#print(dir(all_vars))

### Use the vocabulary created above to convert the training text into token ids

In [11]:
# Create a tokenizer class
class SimpleTokenizer:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        text_token = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        text_token = [
            item.strip() for item in text_token if item.strip()
        ]
        ids = [self.str_to_int[s] for s in text_token]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        #replace spacies before specified punctuations
        text = re.sub(r'\s+([,.?_!"()\'])', r'\1', text)
        return text

        

In [12]:
tokenizer = SimpleTokenizer(vocab)

Expand Vocabulary with additional Special Tokens

In [13]:
all_tokens = sorted(list(set(text_token)))
all_tokens.extend(["<endoftext|>", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [14]:
len(vocab.items())

1132

In [15]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<endoftext|>', 1130)
('<|unk|>', 1131)


In [16]:
# Create a tokenizer class
class SimpleTokenizer_1:
    def __init__(self,vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s, i in vocab.items()}

    def encode(self, text):
        text_token = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        text_token = [
            item.strip() for item in text_token if item.strip()
        ]
        text_token = [
            item if item in sself.str_to_int
            else "<|unk|>" for item in text_token
        ]
        ids = [self.str_to_int[s] for s in text_token]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        #replace spacies before specified punctuations
        text = re.sub(r'\s+([,.?_!"()\'])', r'\1', text)
        return text

        

In [17]:
tokenizer = SimpleTokenizer_1(vocab)

## Byte Pair Encoding

In [18]:
import tiktoken 

In [19]:
tiktoken.__version__

'0.9.0'

In [20]:
tokenizer = tiktoken.get_encoding("gpt2") 


In [21]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text_data = f.read()

enc_text = tokenizer.encode(text_data)
print(len(enc_text))

5145


## Remove the first 50 tokens for demonstration Purposes

In [22]:
enc_examp = enc_text[50:]

In [23]:
context_size = 4

x = enc_examp[:context_size]
y = enc_examp[1:context_size]

print(f"x: {x}")
print(f"y:    {y}")

x: [290, 4920, 2241, 287]
y:    [4920, 2241, 287]


In [24]:
for i in range (1, context_size+1):
    context = enc_examp[:i]
    desired = enc_examp[i]

    print(tokenizer.decode(context), "----->", tokenizer.decode([desired]))

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


In [25]:
import torch
from torch.utils.data import Dataset, DataLoader

In [26]:
class GPTDataset_V1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokens the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftxt|>"})

        # Use a sliding window to chunk the book into toverlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length +1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
                return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [27]:
def create_dataloader_V1(txt, batch_size=2,max_length=256, stride=128,
                        shuffle = True, drop_last=True, num_workers=0):
    #Initialize tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    #Create the dataset
    data_set = GPTDataset_V1(txt, tokenizer, max_length, stride)

    #Create dataloader
    dataloader = DataLoader(
        data_set,
        batch_size, 
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers= num_workers
     )

    return dataloader

In [28]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    text_data = f.read()

In [29]:
dataloader = create_dataloader_V1(text_data, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
batch_one = next(data_iter)
print(batch_one)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [30]:
batch_two = next(data_iter)
print(batch_two)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [31]:
dataloader = create_dataloader_V1(text_data, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## Create Token Embeddings

In [32]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [33]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


## Implementing Positional Encoding

In [34]:
vocab_size = 50257
output_dim = 256

token_embed_layer = torch.nn.Embedding(vocab_size, output_dim)

In [35]:
max_legth = 4
dataloader = create_dataloader_V1(
    text_data, batch_size=8, max_length=max_legth, stride=max_legth, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [36]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [37]:
token_embed = token_embed_layer(inputs)
token_embed.shape

torch.Size([8, 4, 256])

In [38]:
# creating positional embedding layer
context_length = max_legth
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [39]:
torch.arange(max_legth)

tensor([0, 1, 2, 3])

In [40]:
pos_embedding_layer.weight

Parameter containing:
tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        [-0.6754, -0.4628,  1.4323,  ...,  0.8139, -0.7088,  0.4827]],
       requires_grad=True)

In [41]:
pos_embedding_layer(torch.arange(max_legth))

tensor([[ 1.7375, -0.5620, -0.6303,  ..., -0.2277,  1.5748,  1.0345],
        [ 1.6423, -0.7201,  0.2062,  ...,  0.4118,  0.1498, -0.4628],
        [-0.4651, -0.7757,  0.5806,  ...,  1.4335, -0.4963,  0.8579],
        [-0.6754, -0.4628,  1.4323,  ...,  0.8139, -0.7088,  0.4827]],
       grad_fn=<EmbeddingBackward0>)

In [42]:
pos_embedding = pos_embedding_layer(torch.arange(max_legth))
print(pos_embedding.shape)

torch.Size([4, 256])


In [43]:
token_embed.shape

torch.Size([8, 4, 256])

In [44]:
pos_embedding.shape

torch.Size([4, 256])

In [45]:
input_embeddings = token_embed + pos_embedding
print(input_embeddings)

tensor([[[ 2.2288,  0.5619,  0.8286,  ..., -0.6272, -0.2987,  0.8900],
         [ 2.0903, -0.4664, -0.0593,  ...,  0.9115, -1.0493, -1.6473],
         [-0.7158, -0.8304,  1.2494,  ...,  2.3952,  1.8773,  0.8051],
         [ 0.2703,  0.4029,  3.0514,  ...,  0.3595, -1.4548,  0.8310]],

        [[ 3.2835,  1.1749, -1.4150,  ..., -0.3281,  2.4332,  0.6924],
         [-0.2199, -0.9114, -0.1750,  ...,  1.5337, -0.1998,  0.1462],
         [ 1.5197, -1.4240,  0.4391,  ...,  1.0494, -1.4318,  2.3057],
         [ 0.2893,  0.8346, -0.1884,  ...,  1.9602,  0.8709,  0.8796]],

        [[ 0.9662,  0.0952, -0.4640,  ..., -1.0320,  1.6290,  1.7771],
         [ 2.4468, -0.2154,  1.4984,  ...,  1.8766,  0.5595, -0.1423],
         [-0.3856, -2.5393,  1.1556,  ...,  3.6157,  1.3267,  0.4944],
         [-0.2487, -0.5275,  2.0009,  ...,  0.2930,  0.5977,  1.3300]],

        ...,

        [[ 0.1219,  0.3991, -3.2740,  ..., -1.1921,  2.6637,  2.6728],
         [ 1.2438, -1.6436, -1.1101,  ..., -0.7464, -0.98