## **CREATING TOKENS**

In [None]:
# Read the complete short story into one string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
  raw_text = f.read()

# Sanity check: corpus size and the first 99 characters.
print("Total number of characters:", len(raw_text))
print(raw_text[:99])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [None]:
import re

# Sample sentence for experimenting with regex-based splitting.
sent = "Hello, I am waiting for my turn since morning."

# Split on every whitespace character; the capturing group keeps each
# separator in the resulting list.
res = re.compile(r'(\s)').split(sent)
print(res)

['Hello,', ' ', 'I', ' ', 'am', ' ', 'waiting', ' ', 'for', ' ', 'my', ' ', 'turn', ' ', 'since', ' ', 'morning.']


In [None]:
# A comma or period followed by a space is captured together with that space;
# any other whitespace character is still a separator. A period with no
# trailing space (e.g. at the very end) stays attached to its word.
res = re.compile(r'([,.] |\s)').split(sent)
print(res)

['Hello', ', ', 'I', ' ', 'am', ' ', 'waiting', ' ', 'for', ' ', 'my', ' ', 'turn', ' ', 'since', ' ', 'morning.']


In [None]:
# Keep only the pieces that contain something other than whitespace.
result = list(filter(lambda piece: piece.strip(), res))
print(result)

['Hello', ', ', 'I', 'am', 'waiting', 'for', 'my', 'turn', 'since', 'morning.']


# **Removing whitespaces**

In [None]:
sentence  = " Wonderful ! , you have made a   beautiful picture -- . Could you make it for me ? "

# Separators: a single punctuation mark, a double dash, or one whitespace
# character. The square brackets are escaped inside the character class;
# unescaped, the first "]" closed the class early and punctuation was only
# recognised when followed by the literal text "{}]".
token_pattern = re.compile(r'([,."!?;:\'_()\[\]{}]|--|\s)')
res_re = token_pattern.split(sentence)
# Strip each piece and drop the ones that are empty after stripping.
res_y = [item.strip() for item in res_re if item.strip()]
print(res_y)

['Wonderful', '!', ',', 'you', 'have', 'made', 'a', 'beautiful', 'picture', '--', '.', 'Could', 'you', 'make', 'it', 'for', 'me', '?']


# **Applying this method to the entire "the-verdict"**

In [None]:
# Split the whole story on punctuation, whitespace and "--", keeping the
# separators as their own pieces. Quotation marks are not separators here, so
# they stay attached to the neighbouring word.
preprocessed = re.split(r'([.,!?;:_(){}\[\]\s]|--)', raw_text)
# Strip every piece and discard those that end up empty.
res_pre = [piece for piece in map(str.strip, preprocessed) if piece]
print(res_pre[:200])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"The', 'height', 'of', 'his', 'glory"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring', 'his', 'unaccountable', 'abdication', '.', '"Of', 'course', "it's", 'going', 'to', 'send', 'the', 'value', 'of', 'my', 'picture', "'way", 'up', ';', 'but', 'I', "don't", 'think', 'of', 'that', ',', 'Mr', '.', 'Rickham', '--', '

In [None]:
# Total number of tokens produced from the story.
print(len(res_pre))

4435


# **Creating Token IDs**

### **Creating a list of unique tokens & sorting them alphabetically to determine vocabulary size**

In [None]:
# De-duplicate the tokens, then sort them to obtain the vocabulary entries.
words = list(set(res_pre))
words.sort()
print(len(words))

1183


In [None]:
# Map every unique token to its position in the sorted token list.
vocab = dict(zip(words, range(len(words))))
print(vocab)

{'!': 0, '"': 1, '"Ah': 2, '"Be': 3, '"Begin': 4, '"By': 5, '"Come': 6, '"Destroyed': 7, '"Don\'t': 8, '"Gisburns"': 9, '"Grindles': 10, '"Hang': 11, '"Has': 12, '"How': 13, '"I': 14, '"I\'d': 15, '"If': 16, '"It': 17, '"It\'s': 18, '"Jack': 19, '"Money\'s': 20, '"Moon-dancers"': 21, '"Mr': 22, '"Mrs': 23, '"My': 24, '"Never': 25, '"Of': 26, '"Oh': 27, '"Once': 28, '"Only': 29, '"Or': 30, '"That': 31, '"The': 32, '"Then': 33, '"There': 34, '"This': 35, '"We': 36, '"Well': 37, '"What': 38, '"When': 39, '"Why': 40, '"Yes': 41, '"You': 42, '"but': 43, '"deadening': 44, '"dragged': 45, '"effects"': 46, '"interesting"': 47, '"lift': 48, '"obituary"': 49, '"strongest': 50, '"strongly"': 51, '"sweetly"': 52, "'": 53, "'Are": 54, "'It's": 55, "'coming'": 56, "'done'": 57, "'subject": 58, "'technique'": 59, "'way": 60, '(': 61, ')': 62, ',': 63, '--': 64, '.': 65, ':': 66, ';': 67, '?': 68, 'A': 69, 'Among': 70, 'And': 71, 'Arrt': 72, 'As': 73, 'At': 74, 'Burlington': 75, 'But': 76, 'By': 77, '


*   Creating a tokenizer class with 2 methods
*   Method 1 | Encode: converts text into token IDs
*   Method 2 | Decode: converts token IDs back into text



In [None]:
class tokens_encode_decode:
  """Word-level tokenizer backed by a fixed vocabulary.

  Args:
    vocab: dict mapping each token string to its integer ID.
  """
  def __init__(self, vocab):
    self.str_2_int = vocab # token -> ID
    self.int_2_str = {index:token for token,index in vocab.items()} # ID -> token

  def encode(self, text):
    """Split ``text`` into tokens and return the list of their IDs.

    The split pattern is the one used on the corpus when the vocabulary was
    built, so a given piece of text is always cut into the same tokens
    (including "!" and "?", and words that contain an apostrophe).

    Raises:
      KeyError: if a token is not present in the vocabulary.
    """
    pre_pro = re.split(r'([.,!?;:_(){}\[\]\s]|--)', text)
    pre_pro = [item.strip() for item in pre_pro if item.strip()]
    ids  = [self.str_2_int[token] for token in pre_pro]
    return ids

  def decode(self, ids):
    """Turn a sequence of token IDs back into a single string.

    Tokens are joined with single spaces, then the space in front of a
    punctuation mark is removed so that "Hello , you" becomes "Hello, you".

    Raises:
      KeyError: if an ID is not present in the vocabulary.
    """
    text = " ".join([self.int_2_str[token_id] for token_id in ids])
    # The previous substitution replaced every match with itself (a no-op).
    text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
    return text



In [None]:
# Encode a short sample made only of words that occur in the story.
txt = "Chicago is a  place to"

sol = tokens_encode_decode(vocab)
ids = sol.encode(txt)
print(ids)


[79, 637, 165, 826, 1065]


In [None]:
# Round trip: map the token IDs back to text.
back_to_text = sol.decode(ids)
print(back_to_text)

Chicago is a place to


# **Adding Special text Tokens**

In [None]:
# Same sorted vocabulary as before, with two special tokens appended at the end.
all_tokens = sorted(set(res_pre)) + ['<|unk|>', '<|endoftext|>']

vocab = dict(zip(all_tokens, range(len(all_tokens))))
print(len(vocab))  # the length was 1183 before; the 2 special tokens bring it to 1185

1185


In [None]:
# Print every (token, id) pair of the extended vocabulary.
print(vocab.items())

dict_items([('!', 0), ('"', 1), ('"Ah', 2), ('"Be', 3), ('"Begin', 4), ('"By', 5), ('"Come', 6), ('"Destroyed', 7), ('"Don\'t', 8), ('"Gisburns"', 9), ('"Grindles', 10), ('"Hang', 11), ('"Has', 12), ('"How', 13), ('"I', 14), ('"I\'d', 15), ('"If', 16), ('"It', 17), ('"It\'s', 18), ('"Jack', 19), ('"Money\'s', 20), ('"Moon-dancers"', 21), ('"Mr', 22), ('"Mrs', 23), ('"My', 24), ('"Never', 25), ('"Of', 26), ('"Oh', 27), ('"Once', 28), ('"Only', 29), ('"Or', 30), ('"That', 31), ('"The', 32), ('"Then', 33), ('"There', 34), ('"This', 35), ('"We', 36), ('"Well', 37), ('"What', 38), ('"When', 39), ('"Why', 40), ('"Yes', 41), ('"You', 42), ('"but', 43), ('"deadening', 44), ('"dragged', 45), ('"effects"', 46), ('"interesting"', 47), ('"lift', 48), ('"obituary"', 49), ('"strongest', 50), ('"strongly"', 51), ('"sweetly"', 52), ("'", 53), ("'Are", 54), ("'It's", 55), ("'coming'", 56), ("'done'", 57), ("'subject", 58), ("'technique'", 59), ("'way", 60), ('(', 61), (')', 62), (',', 63), ('--', 64), 

In [None]:
# Show the last five vocabulary entries (the loop index was never used,
# so the entries are iterated directly).
for item in list(vocab.items())[-5:]:
  print(item)

('younger', 1180)
('your', 1181)
('yourself', 1182)
('<|unk|>', 1183)
('<|endoftext|>', 1184)


In [None]:
class SimpleTokenizerX:
  """Word-level tokenizer that maps out-of-vocabulary tokens to '<|unk|>'.

  Args:
    vocab: dict mapping token strings to integer IDs; it must contain the
      '<|unk|>' entry for unknown tokens to be encodable.
  """
  def __init__(self,vocab):
    self.str_2_int = vocab
    self.int_2_str = dict((idx, tok) for tok, idx in vocab.items())

  def encode(self,text):
    """Split ``text`` on punctuation, whitespace and '--' and return token IDs."""
    pieces = [chunk.strip() for chunk in re.split(r'([.,!?;:_(){}\[\]\s]|--)',text)]
    ids = []
    for piece in pieces:
      if not piece:
        continue  # empty or whitespace-only fragment
      # Anything the vocabulary does not know is encoded as the unknown token.
      key = piece if piece in self.str_2_int else '<|unk|>'
      ids.append(self.str_2_int[key])
    return ids

  def decode(self,ids):
    """Join the tokens for ``ids`` with spaces and remove the space before punctuation."""
    joined = " ".join(self.int_2_str[i] for i in ids)
    return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [None]:
text_1 = "Hello, do you like Tea?"
text_2 = "In the sunlit terraces of the palace"

# Two unrelated texts joined by the end-of-text marker.
text = " <|endoftext|> ".join([text_1, text_2])

tokenizer = SimpleTokenizerX(vocab)
print(text)

Hello, do you like Tea? <|endoftext|> In the sunlit terraces of the palace


In [None]:
# Encode the combined text; tokens missing from the vocabulary map to <|unk|>.
ids = tokenizer.encode(text)
print(ids)

[1183, 63, 404, 1177, 683, 1183, 68, 1184, 108, 1035, 1006, 1031, 775, 1035, 1183]


In [None]:
# Decode the IDs back to text; out-of-vocabulary words come back as <|unk|>.
code = tokenizer.decode(ids)
print(code)

<|unk|>, do you like <|unk|>? <|endoftext|> In the sunlit terraces of the <|unk|>


In [None]:
# Encode/decode round trip in one expression; its value is shown as the cell output.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like <|unk|>? <|endoftext|> In the sunlit terraces of the <|unk|>'

## Byte Pair Encoding

Reference: [openai/tiktoken](https://github.com/openai/tiktoken)

In [None]:
!pip3 install tiktoken  #tiktoken is a fast BPE tokeniser for use with OpenAI's models.

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [None]:
# The metadata submodule has to be imported explicitly; a bare
# `import importlib` does not guarantee that `importlib.metadata` is loaded.
import importlib.metadata
import tiktoken

print("tiktoken version :", importlib.metadata.version('tiktoken'))

tiktoken version : 0.9.0


In [None]:
# Rebind `tokenizer` to tiktoken's "gpt2" encoding (it previously held the SimpleTokenizerX instance).
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
sent = "I don't like coffee or tea <|endoftext|> I have studied history all night, Are there anyunknownplaces here"
# The end-of-text marker is passed via allowed_special so it is encoded as a special token.
text = tokenizer.encode(sent, allowed_special = {'<|endoftext|>'})
print(text)
# With word-level tokenization the made-up word "anyunknownplaces" would be out of vocabulary and get no token ID.
# BPE breaks it into sub-word pieces instead and encodes those, so no unknown token is needed.

# Note: <|endoftext|> is assigned token ID 50256.


[40, 836, 470, 588, 6891, 393, 8887, 220, 50256, 314, 423, 9713, 2106, 477, 1755, 11, 4231, 612, 597, 34680, 23625, 994]


In [None]:
# Decode the token IDs back into a string (`text` holds the ID list here, not a string).
strings = tokenizer.decode(text)
print(strings)

I don't like coffee or tea <|endoftext|> I have studied history all night, Are there anyunknownplaces here


In [None]:
#Example
integers  = tokenizer.encode("Alfasm oops canghs")
strings = tokenizer.decode(integers)
print(strings)
print(integers)

Alfasm oops canghs
[32, 1652, 8597, 267, 2840, 460, 456, 82]


# Creating input-target pairs

In [None]:
enc_text = tokenizer.encode(raw_text) # encode the whole story with the BPE tokenizer
print(len(enc_text))  # number of BPE token IDs

5145


In [None]:
# Skip the first 50 token IDs and work with the remainder.
enc_sample  = enc_text[50:]

In [None]:
context_size = 4 # number of tokens (not words) in each input window
x = enc_sample[:context_size]
y = enc_sample[1:context_size + 1] # the same window shifted one token to the right
print(f'x : {x}')
print(f'y : {y}')
# Read as next-token prediction: given [290] the target is 4920; given [290, 4920] the target is 2241;
# given [290, 4920, 2241] the target is 287; given [290, 4920, 2241, 287] the target is 257.

x : [290, 4920, 2241, 287]
y : [4920, 2241, 287, 257]


In [None]:
#another representation of the above code
for i in range(1, context_size +1):
  context = enc_sample[:i]
  desired = enc_sample[i]
  print(context ,'----->', desired )

[290] -----> 4920
[290, 4920] -----> 2241
[290, 4920, 2241] -----> 287
[290, 4920, 2241, 287] -----> 257


In [None]:
# Same context/target pairs, decoded from token IDs to text.
for i in range(1, context_size +1):
  context = enc_sample[:i]
  desired = enc_sample[i]
  # decode() takes a list of IDs, hence the single target is wrapped in a list.
  print(tokenizer.decode(context) ,'----->', tokenizer.decode([desired]))

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


## Importing DataLoader

In [None]:
import torch  # needed for torch.tensor below; previously only imported in a later cell
from torch.utils.data import Dataset , DataLoader
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-ID tensors.

  The text is tokenized once. A window of ``max_length`` tokens is then moved
  over the token IDs in steps of ``stride``; each window is an input and the
  same window shifted one position to the right is its target.

  Args:
    txt: text to tokenize.
    tokenizer: object with an ``encode(text, allowed_special=...)`` method
      that returns a list of token IDs.
    max_length: number of tokens in every input/target chunk (must be > 0).
    stride: number of positions the window advances between chunks (must be > 0).

  Raises:
    ValueError: if ``max_length`` or ``stride`` is not positive.
  """
  def __init__(self, txt , tokenizer, max_length, stride):
    # Fail early: a non-positive value would otherwise yield empty or
    # wrongly-sized chunks without any error.
    if max_length <= 0:
      raise ValueError(f"max_length must be positive, got {max_length}")
    if stride <= 0:
      raise ValueError(f"stride must be positive, got {stride}")

    self.input_ids = []
    self.target_ids = []

    #Tokenizing the entire text
    token_ids = tokenizer.encode(txt, allowed_special = {"<|endoftext|>"})

    # Stop while i + max_length is still a valid index, so the shifted target
    # chunk always has max_length tokens.
    for i in range(0, len(token_ids) - max_length , stride):
      input_chunk = token_ids[i:i + max_length]
      target_chunk = token_ids[i + 1 : i + max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    """Return the number of (input, target) pairs."""
    return len(self.input_ids)

  def __getitem__(self,idx):
    """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]


In [None]:
def create_dataloader_v1(txt, batch_size = 4, max_length = 256, stride = 128 , shuffle = True,
                         drop_last = True, num_workers = 0):
  """Build a DataLoader over ``txt`` tokenized with tiktoken's "gpt2" encoding.

  ``max_length`` and ``stride`` are passed to GPTDatasetV1; ``batch_size``,
  ``shuffle``, ``drop_last`` and ``num_workers`` are passed to DataLoader.
  """
  dataset = GPTDatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
  loader_options = dict(batch_size=batch_size, shuffle=shuffle,
                        drop_last=drop_last, num_workers=num_workers)
  return DataLoader(dataset, **loader_options)

In [None]:
# Read the story into raw_text (the same file that is loaded at the top of the notebook).
with open("the-verdict.txt", "r",encoding = "utf-8") as f:
  raw_text = f.read()

In [None]:
import torch
print("PyTorch Version:" , torch.__version__)
# One window per batch, 4 tokens per window, window advancing one token at a time, no shuffling.
dataloader = create_dataloader_v1( raw_text , batch_size = 1, max_length = 4, stride = 1, shuffle = False)
data_iter = iter(dataloader)
first_batch = next(data_iter) # [inputs, targets] of the first batch
print(first_batch)

PyTorch Version: 2.6.0+cu124
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [None]:
# Next batch from the same iterator; with stride 1 its window starts one token later.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [None]:
# stride equals max_length here, so consecutive windows do not overlap; 8 windows per batch.
dataloader = create_dataloader_v1(raw_text, batch_size = 8, max_length = 4, stride = 4, shuffle = False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs : \n", inputs)
print("\nTargets : \n", targets)

Inputs : 
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets : 
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
