In [1]:
import os
import sentencepiece as spm

In [2]:
# Path to the cleaned corpus text file (relative to the working directory).
data_path = os.path.join("..", "data", "cleaned_text.txt")
# Path prefix of the SentencePiece model; ".model" is appended when loading.
tokenizer_path = os.path.join("..", "tokenizer_model", "hadith_unigram")

In [3]:
# Read the whole corpus file into memory as one UTF-8 decoded string.
with open(data_path, "r", encoding="utf-8") as corpus_file:
    data = corpus_file.read()

In [4]:
data[:50]

'التزات النبوي صحيح البخارية للإماراتي عبد الله محم'

In [5]:
# Create a SentencePiece processor and load the model file found at
# `tokenizer_path + ".model"`.
sp = spm.SentencePieceProcessor()
sp.load(tokenizer_path + ".model" )

True

In [6]:
# Tokenize the entire corpus string with the loaded SentencePiece processor.
enc_data = sp.encode(data)

In [7]:
len(enc_data)

627394

In [8]:
# Keep the first 50 token ids as a small sample for the walkthrough cells below.
enc_example = enc_data[:50]

In [9]:
# enc_data1


In [10]:
# Inputs are the first `context_size` token ids of the sample; targets are the
# same window shifted one position to the right.
context_size = 4
x, y = enc_example[:context_size], enc_example[1:context_size + 1]


In [11]:
x

[833, 16801, 7333, 1933]

In [12]:
y

[16801, 7333, 1933, 838]

In [13]:
# For each prefix length 1..context_size, print the decoded prefix of the
# sample followed by the decoded token that comes right after that prefix.
for i in range(1, context_size+1):
    context = enc_example[:i]
    desired = enc_example[i]

    print(sp.Decode(context), "---->", sp.Decode([desired]))

الت ----> زات
التزات ----> النبوي
التزات النبوي ----> صحيح
التزات النبوي صحيح ----> البخاري


In [14]:
import torch
from torch.utils.data import Dataset , DataLoader


In [15]:
torch.__version__

'2.8.0+cpu'

In [16]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once, then cut into windows of ``max_length`` token
    ids whose starts are ``stride`` tokens apart. Every item is an
    ``(input_ids, target_ids)`` pair of 1-D tensors of length ``max_length``,
    where ``target_ids`` is the input window shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``Encode(text)`` that returns a sequence of
            token ids (e.g. a loaded ``SentencePieceProcessor``).
        max_length: Number of tokens per window; must be at least 1.
        stride: Distance between the starts of consecutive windows; must be
            at least 1. ``stride < max_length`` gives overlapping windows.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
        AssertionError: If the tokenized text has fewer than
            ``max_length + 1`` tokens.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject degenerate windowing parameters up front: a negative stride
        # would otherwise silently produce an empty dataset, and a zero
        # max_length would produce empty chunks.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.Encode(txt)
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Slide a window over the token ids. The last valid start is
        # len(token_ids) - max_length - 1, so the shifted target window never
        # runs past the end of the sequence.
        for start in range(0, len(token_ids) - max_length, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [19]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0, tokenizer=None):
    """Build a ``DataLoader`` of sliding-window (input, target) token batches.

    Args:
        txt: Raw text to tokenize and window.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Distance between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``; whether to shuffle the windows.
        drop_last: Forwarded to ``DataLoader``; whether to drop a final
            incomplete batch.
        num_workers: Forwarded to ``DataLoader``.
        tokenizer: Optional tokenizer exposing ``Encode(text)``. When omitted,
            a ``SentencePieceProcessor`` is created and loaded from
            ``tokenizer_path + ".model"``. Passing an already-loaded
            tokenizer avoids re-reading the model file on every call.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    if tokenizer is None:
        # Default path: load the model named by the notebook-level
        # `tokenizer_path` prefix.
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load(tokenizer_path + ".model")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [20]:
# Smoke-test the loader on the first 50 characters of the corpus. With
# max_length == stride the input windows do not overlap, and shuffle=False
# keeps them in text order.
dataloader = create_dataloader_v1(
    data[:50], batch_size=1, max_length=4, stride=4, shuffle=False
)

# Pull a single batch and print it: a pair of (inputs, targets) tensors.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  833, 16801,  7333,  1933]]), tensor([[16801,  7333,  1933,   838]])]
