In [53]:
pip install tiktoken

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [40]:
import re
import tiktoken
import torch

In [18]:
# Read the whole text file into a single string and report its length in characters.
with open("the-verdict.txt", mode="r", encoding="utf-8") as source_file:
    raw_text = source_file.read()

print(len(raw_text))

20480


In [19]:
# Preview the first 100 characters of the loaded text.
print(raw_text[:100])


I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [20]:
# Split the raw text into tokens.
# The capture group keeps the delimiters (punctuation and "--") as tokens of their
# own; whitespace-only and empty pieces are discarded afterwards.

split_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [piece.strip() for piece in split_pieces if piece.strip()]
print(preprocessed[:20])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was']


In [21]:
# Build the base vocabulary: every distinct token, in sorted order.
unique_tokens = set(preprocessed)
all_words = sorted(unique_tokens)
vocab_size = len(all_words)
print(vocab_size)


1130


In [22]:
# Map each token to its index in the sorted vocabulary, then show the first
# 21 (token, id) pairs.
vocab = dict(zip(all_words, range(len(all_words))))
for i, item in enumerate(list(vocab.items())[:21]):
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)


In [23]:
# Add special tokens: "<|endoftext|>" and "<|unk|>" are appended after the sorted
# corpus tokens, so they receive the two highest IDs.
all_tokens = [*sorted(set(preprocessed)), "<|endoftext|>", "<|unk|>"]
vocab = dict((token, integer) for integer, token in enumerate(all_tokens))
print(len(vocab))

1132


In [24]:
# Show the last five vocabulary entries, which include the two special tokens.
tail_entries = list(vocab.items())[-5:]
for i, item in enumerate(tail_entries):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [25]:
class SimpleTokenizer:
    """Word- and punctuation-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and the characters ``. , : ; ? _ ! " ( ) '``.
    Each resulting token is looked up in the vocabulary; tokens that are not
    present are replaced by the ``<|unk|>`` entry.

    Args:
        vocab: mapping from token string to integer ID. It should contain
            ``"<|unk|>"`` if texts with out-of-vocabulary tokens will be encoded.
    """

    UNK_TOKEN = "<|unk|>"

    # Compiled once for the class instead of being rebuilt on every call.
    _SPLIT_RE = re.compile(r'([.,:;?_!"()\']|--|\s)')
    _SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([.,:;?!"()\'])')

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping for decode(); named token_id to avoid shadowing builtin `id`.
        self.int_to_str = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Raises:
            KeyError: if a token is not in the vocabulary and the vocabulary has
                no ``<|unk|>`` entry to fall back on.
        """
        pieces = (piece.strip() for piece in self._SPLIT_RE.split(text))
        tokens = [piece for piece in pieces if piece]

        ids = []
        for token in tokens:
            if token in self.str_to_int:
                ids.append(self.str_to_int[token])
            elif self.UNK_TOKEN in self.str_to_int:
                ids.append(self.str_to_int[self.UNK_TOKEN])
            else:
                # Same exception type as before, but naming the offending token
                # rather than only the missing fallback.
                raise KeyError(
                    f"token {token!r} is not in the vocabulary and the vocabulary "
                    f"has no {self.UNK_TOKEN!r} fallback entry"
                )
        return ids

    def decode(self, ids):
        """Convert token IDs back into text.

        Tokens are joined with single spaces, then whitespace directly before
        the characters ``. , : ; ? ! " ( ) '`` is removed.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[token_id] for token_id in ids])
        return self._SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)

In [26]:
# Round-trip two sample sentences, joined by the end-of-text marker, through the
# vocabulary-based tokenizer.
tokenizer = SimpleTokenizer(vocab)

text1 = "Hello, I am a girl."
text2 = "I was surprised to meet them."
text = " <|endoftext|> ".join([text1, text2])
print(text)

# NOTE: despite its name, `embedding` holds integer token IDs, not embedding vectors.
embedding = tokenizer.encode(text)
print(embedding)
print(tokenizer.decode(embedding))

Hello, I am a girl. <|endoftext|> I was surprised to meet them.
[1131, 5, 53, 150, 115, 1131, 7, 1130, 53, 1077, 962, 1016, 1131, 990, 7]
<|unk|>, I am a <|unk|>. <|endoftext|> I was surprised to <|unk|> them.


In [27]:
# Rebinds `tokenizer` (previously the SimpleTokenizer instance) to tiktoken's "gpt2"
# encoding; the cells below call encode/decode on this object.
tokenizer=tiktoken.get_encoding("gpt2")

In [28]:
# Encode the joined sample text; "<|endoftext|>" is explicitly allowed as a special token.
IDs = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(IDs)

[15496, 11, 314, 716, 257, 2576, 13, 220, 50256, 314, 373, 6655, 284, 1826, 606, 13]


In [29]:
# Decode the IDs back to text to check the round trip.
print(tokenizer.decode(IDs))

Hello, I am a girl. <|endoftext|> I was surprised to meet them.


In [30]:
# Exercise 2.1: Byte pair encoding of unknown words

string = "Akwirw ier"
encoding = tokenizer.encode(string)
print(encoding)

# Decode each ID on its own to see the individual pieces the string was split into,
# then decode the full sequence to confirm it reproduces the input.
for token_id in encoding:
    piece = tokenizer.decode([token_id])
    print(piece)
print(tokenizer.decode(encoding))

[33901, 86, 343, 86, 220, 959]
Ak
w
ir
w
 
ier
Akwirw ier


In [32]:
# Tokenize the whole text, then skip the first 50 token IDs for the examples below.
enc_text = tokenizer.encode(raw_text)
enc_sample = enc_text[50:]

In [33]:
# x is the first `context_size` token IDs; y is the same window shifted one position
# to the right, so y[k] is the token that follows x[k].
context_size = 4
x, y = enc_sample[:context_size], enc_sample[1:context_size + 1]
print(x)
print(y)

[11, 290, 4920, 2241]
[290, 4920, 2241, 287]


In [37]:
# For each prefix length 1..context_size, show the prefix and the token that follows
# it, first as IDs and then as decoded text.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "---->", desired)
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

[11] ----> 290
, ---->  and
[11, 290] ----> 4920
, and ---->  established
[11, 290, 4920] ----> 2241
, and established ---->  himself
[11, 290, 4920, 2241] ----> 287
, and established himself ---->  in


In [41]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is encoded once with ``tokenizer.encode``. Windows of ``max_length``
    token IDs are taken every ``stride`` positions; each target is the matching
    input window shifted one position to the right.

    Args:
        txt: text to encode.
        tokenizer: object exposing ``encode(txt)`` that returns a sequence of token IDs.
        max_length: number of token IDs per window.
        stride: step between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt)
        # Start offsets stop early enough that every target window (shifted by one)
        # still lies fully inside token_ids.
        starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [43]:
# NOTE(review): create_dataloader_v1 is defined in a cell further down this notebook
# (execution count In [42]); on a fresh kernel that cell must be run before this one.
# Fetch the first (inputs, targets) batch with a window of 4 tokens and stride 1.
dataloader = create_dataloader_v1(raw_text,batch_size=1,max_length=4,stride=1,shuffle=False)
data_iter = iter(dataloader)
print(next(data_iter))

[tensor([[ 198,   40,  367, 2885]]), tensor([[  40,  367, 2885, 1464]])]


In [51]:
# Exercise 2.2: Data loaders with different strides and context sizes
# NOTE(review): relies on create_dataloader_v1, which is defined in a later cell of
# this notebook (execution count In [42]); run that cell first on a fresh kernel.

dataloader_test=create_dataloader_v1(raw_text,batch_size=1,max_length=2,stride=2,shuffle=False)
data_iter_test=iter(dataloader_test)
print(next(data_iter_test))

[tensor([[198,  40]]), tensor([[ 40, 367]])]


In [42]:
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window token chunks of ``txt``.

    The text is tokenized with tiktoken's ``gpt2`` encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: text to tokenize.
        batch_size: passed through to ``DataLoader``.
        max_length: window length passed to ``GPTDatasetV1``.
        stride: window step passed to ``GPTDatasetV1``.
        shuffle: passed through to ``DataLoader``.
        drop_last: passed through to ``DataLoader``.
        num_workers: passed through to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding('gpt2')
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [52]:
# Sampling with a batch size greater than 1.
# stride equals max_length here, so consecutive input windows do not overlap.

dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs: \n", inputs)
print("Targets:\n", targets)

Inputs: 
 tensor([[  198,    40,   367,  2885],
        [ 1464,  1807,  3619,   402],
        [  271, 10899,  2138,   257],
        [ 7026, 15632,   438,  2016],
        [  257,   922,  5891,  1576],
        [  438,   568,   340,   373],
        [  645,  1049,  5975,   284],
        [  502,   284,  3285,   326]])
Targets:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])


In [54]:
# At this point vocab_size still holds the count of unique tokens computed from the
# text earlier in the notebook; the next cell overwrites it.
vocab_size

1130

In [55]:
# Token embedding table sized for the GPT-2 BPE vocabulary (50257 entries) rather
# than the small corpus vocabulary built earlier; each token ID maps to a
# 256-dimensional vector.
vocab_size = 50257
output_dim = 256

# Embedding weights are randomly initialized; seed the RNG first so the values are
# reproducible across kernel restarts.
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [56]:
# Look up an embedding vector for every token ID in the `inputs` batch; the result
# gains a trailing dimension of size output_dim.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [None]:
# Adding positional embeddings: one vector of size output_dim for each position
# 0 .. context_length - 1.

context_length = 4
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)
print(pos_embeddings.shape)


torch.Size([4, 256])


In [59]:
# Add the positional vectors to the token vectors. pos_embeddings has no batch
# dimension, so it is broadcast across every sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
