In [9]:
# Now we will do Byte Pair Encoding (BPE) on a text file using the tiktoken library-used by OpenAI.
import importlib
import tiktoken



In [10]:
#implementing a dataloader
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, text,tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1:i + 1 + max_length]
            self.input_ids.append(input_chunk)
            self.target_ids.append(target_chunk)

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    
    
    


In [11]:
def create_dataloader(text, batch_size = 4, max_length = 256, stride = 128, shuffle = True, drop_last = True, num_workers = 0):
    
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = TextDataset(text, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    return dataloader


In [12]:
# Now we will create input target pairs for training. This is the last step before creating vector embeddings.
#given a sentence, the input goes to the LLM and the target will be the last token to be predicted. the input increase over time as targets of the previous step are added to the input.
import os
data_folder = "data"
raw_texts = []
for filename in os.listdir(data_folder):
    file_path = os.path.join(data_folder, filename)
    if os.path.isfile(file_path):
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                raw_texts.append(f.read())
        except UnicodeDecodeError:
            print(f"Skipped non-text file: {filename}")

raw_text = "\n".join(raw_texts)


Skipped non-text file: .DS_Store


In [13]:
dataloader = create_dataloader(raw_text, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print("First batch input tokens:", first_batch)

First batch input tokens: [[tensor([ 257,  438,  373, 1200]), tensor([8258,  372, 1900,   13]), tensor([ 804, 7586,   13, 3244]), tensor([ 290, 1021, 2399,  339]), tensor([ 1139, 26834,    59,  1816]), tensor([ 59, 257, 198, 656]), tensor([  198,    59, 43561,   262]), tensor([11246,   198,  2913, 22634]), tensor([2618,   73, 1817, 3650]), tensor([3382, 1942,  581,  284]), tensor([ 284,  774, 4714,   59]), tensor([ 2740, 34967,   262,   198]), tensor([  284,   355,  1109, 17846]), tensor([683, 673, 326, 257]), tensor([ 2354, 24178,   339, 43836]), tensor([   13,   656,  2900, 38987]), tensor([705, 607, 510, 438]), tensor([3237, 2151,  287,  273]), tensor([ 826,  379, 2968, 3737]), tensor([4032,  262,   59,  691]), tensor([1139, 3420,  198,  257]), tensor([10018,    13,  2118,  5166]), tensor([  88,   59, 2899,  286]), tensor([  290,   198,  1187, 47291]), tensor([ 6140,    59,   351, 12163]), tensor([ 59, 198, 607, 438]), tensor([  198, 27202,   290,  6058]), tensor([ 1462, 22461,    1