# Data Preprocessing Pipeline

<div class="alert alert-block alert-info">

1.tokenizer  --> word based / <b>subword based (BPE tokenizer)</b> / character based

2.token embeddings --> converting token IDs to vectors

3.position embeddings --> encoding information about position

<b>4.input embeddings = token embeddings + position embeddings</b>

</div>

## word-based tokenizer

In [3]:
# Read the whole text file into memory as a single string.
# The path is relative, so it resolves against the notebook's working directory.
with open("../data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# len() on a str counts characters, not bytes.
print("Total number of charaters:", len(raw_text))

Total number of charaters: 20479


In [4]:
import re

# Split on punctuation, double dashes and whitespace. The capturing group keeps
# the delimiters in the result, so punctuation marks become tokens of their own.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Drop the empty and whitespace-only pieces produced by the split.
preprocessed = [item for item in preprocessed if item.strip()]
print(preprocessed[:30])

# Unique tokens in sorted order; the two special tokens are appended afterwards,
# so they receive the two highest IDs.
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

# Token string -> integer ID, numbered in list order.
vocab = {token: integer for integer, token in enumerate(all_tokens)}

# Show the last five (token, id) pairs, which include the special tokens.
for item in list(vocab.items())[-5:]:
    print(item)

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [5]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double dashes and whitespace; every resulting
    piece is looked up in the vocabulary, and pieces that are not present are
    replaced by the ``"<|unk|>"`` token.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup.

        Args:
            vocab: Mapping from token string to integer ID.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: The string to tokenize.

        Returns:
            A list of integer IDs, one per non-whitespace piece of ``text``.

        Raises:
            KeyError: If a piece is unknown and the vocabulary has no
                ``"<|unk|>"`` entry to fall back on.
        """
        # Split the argument itself (not a notebook-level global). The capturing
        # group keeps punctuation and "--" as separate tokens.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # Skip empty / whitespace-only pieces and map unknown ones to "<|unk|>".
        preprocessed = [
            item if item in self.str_to_int else "<|unk|>"
            for item in preprocessed
            if item.strip()
        ]

        return [self.str_to_int[s] for s in preprocessed]

    def decode(self, ids):
        """Convert a sequence of token IDs back into text.

        Every token is prefixed with a single space unless it is one of the
        punctuation marks below, which are attached directly to the preceding
        text. As a consequence the result starts with a space whenever the
        first token is not punctuation.

        Args:
            ids: Iterable of integer token IDs.

        Returns:
            The reconstructed string.

        Raises:
            KeyError: If an ID is not part of the vocabulary.
        """
        puncts = {',', '.', ':', ';', '?', '_', '!', '"', '(', ')', "'"}
        tokens = [self.int_to_str[i] for i in ids]
        text = ''.join(f' {token}' if token not in puncts else token for token in tokens)
        return text


In [6]:
# Build the word-based tokenizer from the vocabulary and encode the full text.
tokenizer = SimpleTokenizer(vocab)
# Despite its name, `result_text` holds the list of integer token IDs.
result_text = tokenizer.encode(raw_text)

# Preview the first 30 token IDs.
print(result_text[:30])

[53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709, 508, 961, 1016, 663, 1016, 535, 987, 5, 568]


## Subword-based tokenizer

In [7]:
import importlib
import importlib.metadata
import tiktoken


# Print the version of the installed tiktoken package.
print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [8]:
# Rebinds `tokenizer`: from here on it is tiktoken's "gpt2" encoding,
# no longer the SimpleTokenizer instance created in the word-based section.
tokenizer = tiktoken.get_encoding("gpt2")

In [9]:
# Sample passage for the BPE tokenizer. The triple-quoted literal keeps its line
# breaks and leading/trailing spaces, so those are tokenized as well.
text = (
    '''
    (1) Tokenization: Word based, Subword based (BPE tokenizer), Character based
    (2) Token embeddings
    (3) Positional embeddings
    (4) Input embeddings = Token embeddings + Positional embeddings
    The key reference book which this video series very closely follows is Build a Large Language Model from Scratch by Manning Publications. All schematics and their descriptions are borrowed from this incredible book!
    This book serves as a comprehensive guide to understanding and building large language models, covering key concepts, techniques, and implementations.    
    '''
)

# "<|endoftext|>" is passed as an allowed special token; this sample text does
# not actually contain it.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[198, 220, 220, 220, 357, 16, 8, 29130, 1634, 25, 9678, 1912, 11, 3834, 4775, 1912, 357, 33, 11401, 11241, 7509, 828, 15684, 1912, 198, 220, 220, 220, 357, 17, 8, 29130, 11525, 67, 654, 198, 220, 220, 220, 357, 18, 8, 18574, 1859, 11525, 67, 654, 198, 220, 220, 220, 357, 19, 8, 23412, 11525, 67, 654, 796, 29130, 11525, 67, 654, 1343, 18574, 1859, 11525, 67, 654, 198, 220, 220, 220, 383, 1994, 4941, 1492, 543, 428, 2008, 2168, 845, 7173, 5679, 318, 10934, 257, 13601, 15417, 9104, 422, 1446, 36722, 416, 15281, 40865, 13, 1439, 3897, 6759, 873, 290, 511, 16969, 389, 22546, 422, 428, 8082, 1492, 0, 198, 220, 220, 220, 770, 1492, 9179, 355, 257, 9815, 5698, 284, 4547, 290, 2615, 1588, 3303, 4981, 11, 9505, 1994, 10838, 11, 7605, 11, 290, 25504, 13, 220, 220, 220, 220, 198, 220, 220, 220, 220]


In [10]:
# Label -> tiktoken encoding object.
# NOTE(review): "gpt3" is mapped to p50k_base here; tiktoken appears to associate
# that encoding with later GPT-3-era models rather than the original GPT-3 —
# confirm the label is the intended one.
encodings = {
    "gpt2": tiktoken.get_encoding("gpt2"),
    "gpt3": tiktoken.get_encoding("p50k_base"),
    "gpt4": tiktoken.get_encoding("cl100k_base")
}

# Label -> vocabulary size reported by each encoding (`n_vocab`).
vocab_size = {model:encoding.n_vocab for model,encoding in encodings.items()}

for model, size in vocab_size.items():
    print(f"The vocabulary size for {model.upper()} is: {size}")

The vocabulary size for GPT2 is: 50257
The vocabulary size for GPT3 is: 50281
The vocabulary size for GPT4 is: 100277


## Dataset & DataLoader

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token sequence in steps of ``stride``; each window is an input and
    the same window shifted one token to the right is its target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and pre-build every input/target tensor.

        Args:
            txt: Text to tokenize.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``
                that returns a list of integer token IDs.
            max_length: Number of tokens in each input and target window.
            stride: Offset between the starts of consecutive windows.
        """
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # A window may only start where a full, shifted target still fits, so
        # the start index stops short of len(token_ids) - max_length. If there
        # are not more than max_length tokens, no samples are produced.
        start_limit = len(token_ids) - max_length
        for start in range(0, start_limit, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [12]:
def create_dataloader(txt, max_length=256, stride=128,
                         batch_size=4, shuffle=False, drop_last=True,
                         num_workers=0):
    """Tokenize ``txt`` with the GPT-2 encoding and wrap it in a DataLoader.

    Args:
        txt: Text to turn into training samples.
        max_length: Number of tokens per input/target window.
        stride: Offset between the starts of consecutive windows.
        batch_size: Number of samples per batch.
        shuffle: Forwarded to ``DataLoader`` as ``shuffle``.
        drop_last: Forwarded to ``DataLoader`` as ``drop_last``.
        num_workers: Forwarded to ``DataLoader`` as ``num_workers``.

    Returns:
        A ``DataLoader`` over a ``GPTDataset`` built from ``txt``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDataset(txt, gpt2_encoding, max_length, stride)

    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [13]:
print(torch.__version__)
# With stride=1 each window starts one token after the previous one, so
# consecutive rows of a batch overlap in all but one token.
dataloader = create_dataloader(
    raw_text, max_length=4, stride=1, batch_size=8 
)

# Pull only the first batch; `inputs` and `targets` are reused by later cells.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Toke IDs:\n", inputs)
print("\nTargets:\n", targets)
print("\nInputs shape:", inputs.shape)

2.6.0+cu124
Toke IDs:
 tensor([[   40,   367,  2885,  1464],
        [  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 2885,  1464,  1807,  3619],
        [ 1464,  1807,  3619,   402],
        [ 1807,  3619,   402,   271],
        [ 3619,   402,   271, 10899],
        [  402,   271, 10899,  2138],
        [  271, 10899,  2138,   257],
        [10899,  2138,   257,  7026]])

Inputs shape: torch.Size([8, 4])


## token embedding

In [14]:
# Rebinds `vocab_size`: it was a dict of per-model sizes earlier, here it is the
# single integer 50257 (the GPT-2 vocabulary size printed above).
vocab_size = 50257
output_dim = 256
# Matches the max_length=4 used when the batch `inputs` was created.
context_length = 4

# Lookup table with one output_dim-sized vector per token ID.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# Each ID in `inputs` is replaced by its vector, adding a trailing dimension
# of size output_dim.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


## positional embedding

In [15]:
# One embedding vector per position 0 .. context_length-1.
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Look up the vectors for every position index in order.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [16]:
# Broadcasting: the (context_length, output_dim) positional embeddings are added
# to every sample of the (batch, context_length, output_dim) token embeddings.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
