In [2]:
#Definição das Variáveis Básicas do Script

default_label_non_masked_token = -100
train_max_lines = 100000
validation_max_lines = 25000
test_max_lines = 50000
bookcorpus_dataset_path = '..\\custom_datasets\\bookcorpus_lines_dataset'
wikipedia_dataset_path = '..\\custom_datasets\\wikipedia_lines_dataset'
tokenized_bookcorpus_dataset_path = '..\\custom_datasets\\tokenized_bookcorpus_lines_dataset'
tokenized_wikipedia_dataset_path = '..\\custom_datasets\\tokenized_wikipedia_lines_dataset'
checkpoint = 'bert-base-cased'

In [3]:
import random
import torch
from datasets import load_from_disk, Dataset, DatasetDict
from transformers import AutoTokenizer
from tqdm.auto import tqdm

In [4]:
bookcorpus_dataset = load_from_disk(bookcorpus_dataset_path)
wikipedia_dataset = load_from_disk(wikipedia_dataset_path)

FileNotFoundError: Directory ..\custom_datasets\wikipedia_lines_dataset is neither a `Dataset` directory nor a `DatasetDict` directory.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def apply_mask(token_sequence):
    masked_sequence = []
    label_sequence = []
    for i, token in enumerate(token_sequence):
        if(token != 101 and token != 102):
            probability = random.random()

            if(probability <= 0.15):
                probability /= 0.15

                if(probability <= 0.8):
                    masked_sequence.append(103)
                elif(probability <= 0.9):
                    masked_sequence.append(random.randrange(len(tokenizer.vocab)))
                else:
                    masked_sequence.append(token)

                label_sequence.append(token)
                continue
        
        masked_sequence.append(token)
        label_sequence.append(default_label_non_masked_token)
    return masked_sequence, label_sequence

In [None]:
def generate_dataset_entry(dataset, section_name, entry_index, max_dataset_lines):
    probability = random.random()

    if(probability < 0.5):
        next_sentence_label = 1
        second_sentence_index = entry_index
        while second_sentence_index == entry_index or second_sentence_index == entry_index + 1:
            second_sentence_index = random.randrange(0, max_dataset_lines)
    else:
        next_sentence_label = 0
        second_sentence_index = entry_index + 1

    tokenized_entry = tokenizer(
        dataset[section_name][entry_index]['line'],
        dataset[section_name][second_sentence_index]['line'],
        add_special_tokens=True,
        return_tensors='pt'
    )
    tokenized_entry['next_sentence_label'] = next_sentence_label
    
    return tokenized_entry

In [None]:
def tokenized_dataset_generator(dataset, section_name, max_lines=-1):
    new_dataset_length = dataset.num_rows[section_name] if max_lines < 0 else max_lines * 2
    for index in range(0, new_dataset_length, 2):
        dataset_entry = generate_dataset_entry(dataset, section_name, index, new_dataset_length)
        
        dataset_entry['input_ids'], dataset_entry['labels'] = apply_mask(dataset_entry['input_ids'][0])
        yield dataset_entry

In [15]:
tokenized_bookcorpus_dataset = DatasetDict({
    "train": Dataset.from_generator(
        tokenized_dataset_generator,
        gen_kwargs={"dataset": bookcorpus_dataset, "section_name": "train", "max_lines": train_max_lines},
    ),
    "validation": Dataset.from_generator(
        tokenized_dataset_generator,
        gen_kwargs={"dataset": bookcorpus_dataset, "section_name": "validation", "max_lines": validation_max_lines}
    ),
    "test": Dataset.from_generator(
        tokenized_dataset_generator,
        gen_kwargs={"dataset": bookcorpus_dataset, "section_name": "test", "max_lines": test_max_lines}
    ),
})

tokenized_wikipedia_dataset = DatasetDict({
    "train": Dataset.from_generator(
        tokenized_dataset_generator,
        gen_kwargs={"dataset": wikipedia_dataset, "section_name": "train", "max_lines": train_max_lines}
    ),
    "validation": Dataset.from_generator(
        tokenized_dataset_generator,
        gen_kwargs={"dataset": wikipedia_dataset, "section_name": "validation", "max_lines": validation_max_lines}
    ),
    "test": Dataset.from_generator(
        tokenized_dataset_generator,
        gen_kwargs={"dataset": wikipedia_dataset, "section_name": "test", "max_lines": test_max_lines}
    ),
})

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (801 > 512). Running this sequence through the model will result in indexing errors


In [17]:
tokenized_bookcorpus_dataset.save_to_disk(tokenized_bookcorpus_dataset_path)
tokenized_wikipedia_dataset.save_to_disk(tokenized_wikipedia_dataset_path)

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/100000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]