In [1]:
file_path = "annotated_data.jsonl"
utf8_file_path = "annotated_data_utf8.jsonl"

with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

with open(utf8_file_path, "w", encoding="utf-8") as f:
    f.writelines(lines)

In [9]:
import pandas as pd
import json

file_path = "annotated_data_utf8.jsonl"
data = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)

# 檢查字段類型
print(df.dtypes)

# 如果需要轉換類型，例如將 'labels' 轉為字串
df["text"] = df["text"].astype(str)
df["label"] = df["label"].astype(str)
df["Comments"] = df["Comments"].astype(str)

print(df.dtypes)

# 保存修復後的數據
df.to_json("annotated_data_corrected.jsonl", orient="records", lines=True, force_ascii=False)


id           int64
text        object
label       object
Comments    object
dtype: object
id           int64
text        object
label       object
Comments    object
dtype: object


In [10]:
from datasets import Dataset

file_path = "annotated_data_corrected.jsonl"
data = []

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

# 創建 Hugging Face Dataset
dataset = Dataset.from_list(data)
print(dataset)


Dataset({
    features: ['id', 'text', 'label', 'Comments'],
    num_rows: 105
})


In [12]:
from datasets import DatasetDict, Dataset

# 分離訓練和測試數據
train_data = data[:int(0.8 * len(data))]  # 前 80% 為訓練集
test_data = data[int(0.8 * len(data)):]   # 後 20% 為測試集

# 創建 DatasetDict
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "test": Dataset.from_list(test_data)
})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'Comments'],
        num_rows: 84
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'Comments'],
        num_rows: 21
    })
})


In [14]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")

# 標籤映射
labels = dataset["train"].unique("label") if "label" in dataset["train"].column_names else None
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

In [18]:
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["token"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(example["label"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # 對應的 word ID
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # 忽略的標籤
            elif word_idx != previous_word_idx:  # 新的詞
                label_ids.append(label2id[label[word_idx]])
            else:  # 相同詞的後續部分
                label_ids.append(label2id[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["label"] = labels
    return tokenized_inputs

In [19]:
# 應用 Tokenization
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/84 [00:00<?, ? examples/s]


KeyError: 'token'