In [5]:
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer

In [6]:
# Read the "train" split from the local ./datasets/clue-ner directory and
# display the resulting dataset summary.
dataset = load_dataset(path="./datasets/clue-ner", split="train")
dataset

Dataset({
    features: ['text', 'entities'],
    num_rows: 10748
})

In [16]:
# Load the tokenizer from the local bert-base-chinese checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(
    "./pretrained_models/bert-base-chinese"
)


def encode(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the strings to tokenize
            (a batch of rows when called through ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The tokenizer's output for those texts, produced with
        ``truncation=True`` and ``padding="max_length"``.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Apply encode to the dataset batch by batch; the values it returns are stored
# as additional columns alongside the existing ones.
new_dataset = dataset.map(function=encode, batched=True)
new_dataset

Dataset({
    features: ['text', 'entities', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10748
})

In [17]:
def _copy_entities_to_labels(batch):
    """Return a "labels" column holding the same values as the batch's "entities"."""
    return {"labels": batch["entities"]}


# Add the "labels" column; "entities" itself is kept unchanged.
new_dataset = new_dataset.map(_copy_entities_to_labels, batched=True)
new_dataset

Map: 100%|██████████| 10748/10748 [00:00<00:00, 38063.33 examples/s]


Dataset({
    features: ['text', 'entities', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10748
})

In [21]:
# Switch the dataset's output format to torch tensors for the tokenizer
# columns, then peek at the first formatted row.
# NOTE(review): "labels" is not in this column list, so it is not part of the
# formatted rows - confirm that is intended.
tensor_columns = ["input_ids", "token_type_ids", "attention_mask"]
new_dataset.set_format(type="torch", columns=tensor_columns)
next(iter(new_dataset))

{'input_ids': tensor([ 101, 3851, 1555, 7213, 6121,  821,  689,  928, 6587, 6956, 1383, 5439,
         3424, 1300, 1894, 1156,  794, 1369,  671,  702, 6235, 2428, 2190,  758,
         6887, 7305, 3546, 6822, 6121,  749, 6237, 6438,  511, 1383, 5439, 3424,
         6371,  711, 8024, 2190, 4680, 1184, 1744, 1079, 1555,  689, 7213, 6121,
         5445, 6241, 8024,  102,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0, 

# 自定义 Dataset

### List创建

In [25]:
import numpy as np
# Load data from a list: each element is one row (row-oriented input).
data1 = dict(data=np.random.random(), label=np.random.randint(0, 3))
data2 = dict(data=np.random.random(), label=np.random.randint(0, 3))
rows = [data1, data2]
ds = Dataset.from_list(rows)
ds

Dataset({
    features: ['data', 'label'],
    num_rows: 2
})

### Dict创建

In [27]:
# Load data from a dict: each key is one column (column-oriented input).
data = np.random.random(size=10)
label = np.random.randint(0, 3, size=10)
columns = {"data": data, "label": label}
ds = Dataset.from_dict(columns)
ds

Dataset({
    features: ['data', 'label'],
    num_rows: 10
})

### Generator创建

In [28]:
# Load through a generator function (suited to large datasets).
def gen(nums, seed=None):
    """Yield ``nums`` random example rows, one dict at a time.

    Args:
        nums: Number of rows to produce.
        seed: Optional seed. When given, values are drawn from a private
            ``np.random.RandomState(seed)`` so the same seed always yields the
            same rows. When ``None`` (the default), NumPy's global random
            state is used, exactly as before.

    Yields:
        dict: ``{"data": <float in [0, 1)>, "label": <int in {0, 1, 2}>}``.
    """
    # The np.random module and a RandomState instance expose the same
    # random()/randint() calls, so either can act as the source below.
    source = np.random if seed is None else np.random.RandomState(seed)
    for _ in range(nums):
        yield {"data": source.random(), "label": source.randint(0, 3)}

# Build the dataset by consuming gen; gen_kwargs supplies its arguments.
generator_kwargs = {"nums": 5}
ds = Dataset.from_generator(gen, gen_kwargs=generator_kwargs)
ds

Generating train split: 5 examples [00:00, 2266.70 examples/s]


Dataset({
    features: ['data', 'label'],
    num_rows: 5
})