#####  1. Dataset preparation

Common large-scale datasets for pre-training language models can be downloaded and loaded directly with the Hugging Face `datasets` library. For example, the BookCorpus and English Wikipedia corpora can be obtained with the following code:

In [3]:
import os

# Single cache location shared by all Hugging Face libraries.
# NOTE(review): this is a machine-specific absolute path; adjust it for your environment.
cache_dir = 'E:\\huggingface_model'

# Point every cache-related environment variable at the same directory.
for env_var in (
    "HF_DATASETS_CACHE",
    "HF_HOME",
    "HUGGINGFACE_HUB_CACHE",
    "TRANSFORMERS_CACHE",
):
    os.environ[env_var] = cache_dir

# Make sure the directory exists. exist_ok=True replaces the separate
# exists()/makedirs() pair, which could fail if the directory appeared
# between the check and the creation.
os.makedirs(cache_dir, exist_ok=True)

# Import the required libraries (kept after the cache environment variables are set above)
from datasets import concatenate_datasets, load_dataset

# Load the datasets.
# NOTE(review): trust_remote_code=True allows the dataset loading scripts
# fetched from the Hub to execute locally; only use it with trusted datasets.
bookcorpus = load_dataset("bookcorpus", split="train", cache_dir=cache_dir, trust_remote_code=True)
wiki = load_dataset("wikipedia", "20220301.en", split="train", cache_dir=cache_dir, trust_remote_code=True)
# Keep only the 'text' column of the Wikipedia dataset before concatenating.
wiki = wiki.remove_columns([col for col in wiki.column_names if col != 'text'])

# Merge both corpora into one dataset.
dataset = concatenate_datasets([bookcorpus, wiki])

# Split into train and test sets (10% test). A fixed seed makes the split
# reproducible across kernel restarts, so train.txt/test.txt are stable.
d = dataset.train_test_split(test_size=0.1, seed=42)

# Print a summary to confirm the result.
print(d)


Downloading data: 100%|██████████| 41/41 [00:36<00:00,  1.12files/s]
Generating train split: 100%|██████████| 6458670/6458670 [00:40<00:00, 157701.13 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 72416608
    })
    test: Dataset({
        features: ['text'],
        num_rows: 8046290
    })
})


Next, the training and test splits are each saved to their own local text file.

In [5]:
def dataset_to_text(dataset, output_filename = "data.txt"):
    """Write every entry of ``dataset["text"]`` to a text file.

    Each entry is written followed by a newline. The file is always encoded
    as UTF-8 so that non-ASCII text does not depend on the platform's default
    encoding (which is often not UTF-8 on Windows and can raise
    ``UnicodeEncodeError``).

    Args:
        dataset: Object for which ``dataset["text"]`` yields an iterable of
            text entries.
        output_filename: Path of the file to create or overwrite.
    """
    with open(output_filename, 'w', encoding="utf-8") as f:
        for t in dataset["text"]:
            print(t, file=f)

# Export each split to its own text file (train first, then test).
for split_name, target_file in {"train": "train.txt", "test": "test.txt"}.items():
    dataset_to_text(d[split_name], target_file)

#####  2. Training the Tokenizer

BERT uses WordPiece tokenization, which decides whether to split a complete word into multiple subword tokens according to word frequencies in the training corpus.

In [None]:
# Special tokens to reserve in the vocabulary.
special_tokens = [
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]",
    "<S>",
    "<T>",
]

# Corpus file(s) for the tokenizer; only the training split is listed.
files = ["train.txt"]
# Target vocabulary size — presumably passed to the tokenizer trainer below.
vocab_size = 30_522
# Maximum sequence length — presumably in tokens; confirm against later cells.
max_length = 512
# NOTE(review): usage is not visible in this section; the name suggests it
# controls whether over-long samples are truncated.
truncate_longer_sample = False
