In [1]:
from datasets import load_dataset, DatasetDict

In [2]:
from json import load


data_files = {
    "train": "dataset/v2/tamil_train.txt",
    "test": "dataset/v2/tamil_test.txt",
}

dataset = load_dataset(
    "minimalist-ai/TamilDataset",
)

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [4]:
from transformers import PreTrainedTokenizerFast

context_length = 1000
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="bpe_tamil_tokenizer/tamil_bpe_tokenizer.json"
)

outputs = tokenizer(
    dataset["train"][:2]['text'],
    truncation=True,
    max_length=context_length,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

input IDs length: 2
Input chunk lengths: [229, 0]
Chunk mapping: [0, 1]


In [6]:
def tokenize(element):
    outputs = tokenizer(
        element['text'],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,
        return_length=True,
    )

    input_batch = []
    for length, input_ids in zip(outputs['length'], outputs['input_ids']):

        if length == context_length:
            input_batch.append(input_ids)
    return {
        'input_ids': input_batch
    }

tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    num_proc=4,
)

tokenized_dataset

Map (num_proc=4):   0%|          | 0/23273243 [00:00<?, ? examples/s]

ArrowInvalid: Column 1 named input_ids expected length 1000 but got length 26

In [7]:
def tokenize(element):
    outputs = tokenizer(
        element['text'],
        truncation=True,
        max_length=context_length,
        padding="max_length",  # Add padding to ensure consistent length
        return_overflowing_tokens=True,
        return_length=True,
    )

    input_batch = []
    for input_ids in outputs['input_ids']:
        input_batch.append(input_ids)
    return {
        'input_ids': input_batch
    }

tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    num_proc=4,
    remove_columns=dataset['train'].column_names  # Remove original columns
)

tokenized_dataset

Map (num_proc=4):   0%|          | 0/23273243 [00:00<?, ? examples/s]

ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [5]:
dataset['train'].column_names

['text']

In [9]:
def find_min_length():
    for element in dataset['train']:
        yield len(element['text'])



In [None]:
texts_lens = []
for length in find_min_length():
    texts_lens.append(length)