In [1]:
# Notebook configuration.
# Paths use forward slashes so the notebook runs on Windows, Linux and macOS alike
# (Python's file APIs accept "/" on Windows; a literal "\\" is only a separator there).

# Common prefix of the shard files; the shard index and ".txt" are appended to it.
base_file_path = "open_web_text_lines_files/open_web_text_dataset_file"
# Hugging Face checkpoint whose tokenizer is used to encode every line.
checkpoint = 'bert-base-cased'
# Folder the finished DatasetDict is saved to.
dataset_folder_path = "../custom_datasets/open_web_text_dataset"

In [2]:
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm
from transformers import AutoTokenizer

In [3]:
# Load the tokenizer for the configured checkpoint. It is kept as a module-level
# name because open_web_text_dataset_gen reads `tokenizer` as a global.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)



In [4]:
def open_web_text_dataset_gen(shards, dataset_name):
    """Yield one tokenized example per line of every shard file.

    Relies on the module-level ``tokenizer`` being defined before the
    generator is consumed.

    Args:
        shards: Paths of UTF-8 text files holding one text per line.
        dataset_name: Name of the split being generated (e.g. ``"train"``);
            only used to label the progress bar.

    Yields:
        dict: ``{"idx": int, "text": list[int]}`` where ``idx`` is a 0-based
        counter running across all shards and ``text`` holds the token ids of
        the line with the first and last id removed (presumably the
        [CLS]/[SEP] special tokens of the BERT tokenizer — confirm if the
        checkpoint changes). Encoding is truncated to 512 ids before that
        stripping.
    """
    idx = 0
    # The context manager guarantees the bar is closed even if the consumer
    # stops iterating early or a shard fails to open.
    with tqdm(total=len(shards), desc=f"{dataset_name} shards") as progress_bar:
        for shard in shards:
            with open(shard, "r", encoding="utf-8") as file:
                for text in file:
                    # Strip only the line terminator. Slicing off the last
                    # character unconditionally would eat a real character
                    # from a final line that has no trailing newline.
                    line = text.rstrip("\n")
                    input_ids = tokenizer(line, max_length=512, truncation=True)["input_ids"]
                    yield {"idx": idx, "text": input_ids[1:-1]}
                    idx += 1
            # One tick per fully processed shard.
            progress_bar.update()

In [5]:
def build_open_web_text_dataset_dict(num_shards=84):
    """Build a DatasetDict with a single "train" split from the shard files.

    Shard paths are formed as ``base_file_path + <index> + ".txt"`` for
    indices ``0 .. num_shards - 1`` and handed to
    ``open_web_text_dataset_gen`` through ``Dataset.from_generator``.

    Args:
        num_shards: Number of consecutive shard files to read. Defaults to
            84, the value previously hard-coded here; pass a smaller number
            for a quick trial run.

    Returns:
        DatasetDict: mapping ``"train"`` to the generated dataset.
    """
    train_shards = [f"{base_file_path}{i}.txt" for i in range(num_shards)]
    train_split = Dataset.from_generator(
        open_web_text_dataset_gen,
        gen_kwargs={"shards": train_shards, "dataset_name": "train"},
    )
    return DatasetDict({"train": train_split})

In [6]:
# Build the DatasetDict from the shard files. This is the expensive step of the
# notebook: the recorded run below produced 166,005,717 rows.
open_web_text_dataset_dict = build_open_web_text_dataset_dict()

Generating train split: 0 examples [00:00, ? examples/s]

  0%|          | 0/84 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/143 [00:00<?, ?it/s]

In [7]:
# Sanity check: show the splits, their features and row counts.
print(open_web_text_dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['idx', 'text'],
        num_rows: 166005717
    })
})


In [8]:
# Persist the tokenized dataset under dataset_folder_path so later runs can
# reload it instead of re-tokenizing the shards.
open_web_text_dataset_dict.save_to_disk(dataset_folder_path)

Saving the dataset (0/143 shards):   0%|          | 0/166005717 [00:00<?, ? examples/s]