In [2]:
# Exercise 1.1

In [3]:
import tiktoken

# Load tiktoken's "gpt2" encoding; the cells below use it to encode and decode text.
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
# Encode the sample string into a list of token ids and show them.
integers = tokenizer.encode("Akwirw ier")
print(integers)

[33901, 86, 343, 86, 220, 959]


In [5]:
# Decode every id on its own to show which piece of text it stands for.
for token_id in integers:
    piece = tokenizer.decode([token_id])
    print(f"{token_id} -> {piece}")

33901 -> Ak
86 -> w
343 -> ir
86 -> w
220 ->  
959 -> ier


In [6]:
# Encode the piece "Ak" on its own to compare with its id in the full encoding above.
tokenizer.encode("Ak")

[33901]

In [7]:
# Same check for the piece "w".
tokenizer.encode("w")

[86]

In [8]:
# Same check for the piece "ir".
tokenizer.encode("ir")

[343]

In [9]:
# "w" appears twice in the sample string; this repeats the check for its second occurrence.
tokenizer.encode("w")

[86]

In [10]:
# Same check for the single space between the two words.
tokenizer.encode(" ")

[220]

In [11]:
# Same check for the final piece "ier".
tokenizer.encode("ier")

[959]

In [12]:
# Round trip: decode the full id list from the first cell back into text.
tokenizer.decode([33901, 86, 343, 86, 220, 959])

'Akwirw ier'

# Exercise 1.2

In [13]:
import tiktoken
import torch
from torch.utils.data import DataLoader, Dataset

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once; every sample is a window of ``max_length``
    token ids paired with the same window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and cut it into input/target windows.

        :param txt: text to tokenize
        :param tokenizer: object whose ``encode(txt, allowed_special=...)``
            returns a sequence of token ids
        :param max_length: number of tokens in each window
        :param stride: offset between the starts of consecutive windows
        """
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stopping max_length tokens before the end guarantees that the
        # shifted target window is always full length.
        window_starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        self.tar_ids = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.tar_ids[idx]
    
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDatasetV1.

    :param txt: text to tokenize
    :param batch_size: number of windows per batch
    :param max_length: number of tokens in each window
    :param stride: offset between the starts of consecutive windows
    :param shuffle: passed through to DataLoader
    :param drop_last: passed through to DataLoader
    :param num_workers: passed through to DataLoader
    :return: DataLoader over the GPTDatasetV1 built from ``txt``
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        # Honor the caller's setting instead of always forcing 0 workers.
        num_workers=num_workers,
    )

In [14]:
# Read the whole text file into one string.
with open(r'../../../Data/the-verdict.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()
    
# Tokenize the full text with the "gpt2" encoding.
# NOTE(review): encoded_text is not used by any cell visible here.
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

vocab_size = 50257  # presumably the size of the "gpt2" vocabulary; confirm against tiktoken
output_dim = 256  # width of each embedding vector
max_len = 4
context_length = max_len  # number of rows in the positional embedding table

# Lookup tables: one row per token id, and one row per position.
# NOTE(review): the dataloader cells below use max_length 2 and 8, not max_len.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [16]:
# Windows of 2 tokens with stride 2, so consecutive windows do not overlap.
# shuffle defaults to True, so the rows displayed differ from run to run.
dataloader = create_dataloader_v1(raw_text, batch_size=4, max_length=2, stride=2)

# Take only the first batch. next(iter(...)) raises StopIteration on an empty
# loader instead of silently leaving x and y bound to values from an earlier cell.
batch = next(iter(dataloader))
x, y = batch

x

tensor([[  339,  2993],
        [  438,   292],
        [17972,    13],
        [ 7721,   257]])

In [17]:
# Windows of 8 tokens with stride 2, so consecutive windows overlap by 6 tokens.
# shuffle defaults to True, so the rows displayed differ from run to run.
dataloader = create_dataloader_v1(raw_text, batch_size=4, max_length=8, stride=2)

# Take only the first batch. next(iter(...)) raises StopIteration on an empty
# loader instead of silently leaving x and y bound to values from an earlier cell.
batch = next(iter(dataloader))
x, y = batch

x

tensor([[1654,  345,  760,  810,  345,  821, 2406,  503],
        [ 651,  749,  503,  286,  340,   26,  290, 3619],
        [ 262, 1711,   13,  383, 7099, 6802,  373,  531],
        [2588,  856,  607, 2781,  526,  198,  198,    1]])