# 加载本地自己的数据
## 加载文件作为数据集

In [1]:
from datasets import *

In [3]:
# Load the hotel-review CSV that is also used in the model exercise (../03-model).
# No `split` is requested, and the result here is a DatasetDict holding a
# single "train" split.
dataset = load_dataset(
    "csv",
    data_files="../03-model/comment_classification/ChnSentiCorp_htl_all.csv",
)
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 7766
    })
})

In [4]:
# Dataset.from_csv reads the same file but gives back a plain Dataset
# instead of a DatasetDict.
dataset = Dataset.from_csv(
    "../03-model/comment_classification/ChnSentiCorp_htl_all.csv"
)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['label', 'review'],
    num_rows: 7766
})

## 以文件夹进行加载

In [7]:
# The review CSV was copied three times into the all_data folder beforehand;
# loading the folder yields one dataset containing the rows of all three
# copies (3 x 7766 = 23298).
dataset = load_dataset(
    "csv",
    data_dir="./all_data",
    split="train",
)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 23298
})

## 先通过其它方式（pandas、list）加载数据，再转换为 Dataset

In [9]:
import pandas as pd

# Read one copy of the review CSV into a DataFrame first, then wrap the
# DataFrame as a Dataset; the result has the features 'label' and 'review'.
data = pd.read_csv("all_data/ChnSentiCorp_htl_all_1.csv")
dataset = Dataset.from_pandas(data)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7766
})

In [10]:
# from_list needs a list of dicts: a Dataset is organized by named features,
# so every record has to state which field each value belongs to.
# data = ["abc", "def"]       # a bare list of strings cannot be loaded
data = [{"text": "abc"}, {"text": "def"}]
Dataset.from_list(data)

Dataset({
    features: ['text'],
    num_rows: 2
})

# DataCollator

In [2]:
from transformers import DataCollatorWithPadding

In [3]:
# Reload one copy of the reviews and keep only the rows whose "review" field
# is not None (this removes a single row here: 7766 -> 7765).
dataset = load_dataset(
    "csv",
    data_files="all_data/ChnSentiCorp_htl_all_1.csv",
    split="train",
).filter(lambda row: row["review"] is not None)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/7766 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [4]:
from transformers import AutoTokenizer
# Tokenizer for the "bert-base-chinese" checkpoint; the tokenization function
# and the padding collator in the following cells both use it.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

In [7]:
def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch mapping with a "review" entry (the review texts)
            and a "label" entry (the matching class labels).

    Returns:
        The tokenizer output for the reviews, truncated to at most 128
        tokens, with the labels stored under the "labels" key. No padding
        is requested here; it is left to the data collator.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

In [8]:
# Tokenize in batches and drop the original columns ('label', 'review'), so
# that only the tokenizer outputs and "labels" remain.
tokenized_data = dataset.map(
    process_function,
    batched=True,
    remove_columns=dataset.column_names,
)
tokenized_data

Map:   0%|          | 0/7765 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 7765
})

In [11]:
# Without padding, the tokenized sequences have different lengths, and every
# attention_mask entry is 1.
print(tokenized_data[:2])

{'input_ids': [[101, 6655, 4895, 2335, 3763, 1062, 6662, 6772, 6818, 117, 852, 3221, 1062, 769, 2900, 4850, 679, 2190, 117, 1963, 3362, 3221, 107, 5918, 7355, 5296, 107, 4638, 6413, 117, 833, 7478, 2382, 7937, 4172, 119, 2456, 6379, 4500, 1166, 4638, 6662, 5296, 119, 2791, 7313, 6772, 711, 5042, 1296, 119, 102], [101, 1555, 1218, 1920, 2414, 2791, 8024, 2791, 7313, 2523, 1920, 8024, 2414, 3300, 100, 2160, 8024, 3146, 860, 2697, 6230, 5307, 3845, 2141, 2669, 679, 7231, 106, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [13]:
# Padding was skipped during tokenization above; it is applied here instead,
# by the collator that assembles each batch.
from torch.utils.data import DataLoader

collator = DataCollatorWithPadding(tokenizer=tokenizer)
dl = DataLoader(
    tokenized_data,
    batch_size=4,
    shuffle=True,
    collate_fn=collator,
)

In [16]:
# The printed sizes show that not every batch is padded to 128: a batch whose
# sequences are all shorter is padded dynamically, only up to the longest
# sequence in that batch.
for batch in dl:
    print(batch["input_ids"].shape)

torch.Size([4, 48])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 125])
torch.Size([4, 128])
torch.Size([4, 126])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 80])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 119])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 93])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 79])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 92])
torch.Size([4, 128])
torch.Size([4, 88])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 114])
torch.Size([4, 128])
torch.Size([4, 121])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 65])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 78])
torch.Size([4, 76])
torch.Size([4, 128])
torch.Size([4, 128])
torch.Size([4, 83])
to