In [8]:
# 安装必要库
!pip install --upgrade pip
!pip install transformers datasets seqeval accelerate evaluate -q



In [9]:
import pandas as pd
import numpy as np
from ast import literal_eval
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import torch
import os
import evaluate


In [10]:
# Folder that holds train.csv / test.csv and receives the saved "ner_model" directory.
DATASET_PATH: str = "/content/drive/MyDrive/COMP4211/PROJECT"

In [11]:
# 1. Data loading and preprocessing
def load_data(file_path):
    """Read a labelled CSV and parse its list-valued columns.

    Args:
        file_path: Path of a CSV file containing 'Sentence' and 'NER Tag' columns
            whose cells are Python list literals stored as strings.

    Returns:
        The DataFrame read from ``file_path`` with both columns converted from
        their string form into real Python objects via ``literal_eval``.
    """
    frame = pd.read_csv(file_path)
    # Each cell holds the textual repr of a list; turn it back into an actual list.
    for column in ('Sentence', 'NER Tag'):
        frame[column] = frame[column].apply(literal_eval)
    return frame

train_df = load_data(os.path.join(DATASET_PATH, "train.csv"))
# test_df = load_data(os.path.join(DATASET_PATH, "test.csv"))

# Hold out 20% of the labelled data as a validation split (fixed seed for repeatability)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Convert to HuggingFace Dataset format
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
# test_ds = Dataset.from_pandas(test_df)

# 2. Label handling
# Collect tags from BOTH splits: a tag that only occurs in the validation rows
# would otherwise be missing from label2id and raise KeyError while tokenizing val_ds.
label_list = sorted({
    tag
    for split_df in (train_df, val_df)
    for tags in split_df['NER Tag']
    for tag in tags
})
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

# 3. Load the tokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# 4. Preprocessing function
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch with a "Sentence" entry (one list of words per example)
            and a "NER Tag" entry (one list of tag strings per example).

    Returns:
        The tokenizer output for the batch (truncated/padded to 128 tokens) with
        an extra "labels" entry. For each example, the first sub-token of every
        word carries ``label2id`` of that word's tag; tokens without a word
        (``word_ids`` gives None) and the remaining sub-tokens of a word carry -100.
    """
    encodings = tokenizer(
        examples["Sentence"],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    aligned = []
    for row, word_tags in enumerate(examples["NER Tag"]):
        row_labels = []
        last_word = None
        for current_word in encodings.word_ids(batch_index=row):
            # Only the first sub-token of a real word receives that word's tag id.
            starts_new_word = current_word is not None and current_word != last_word
            if starts_new_word:
                row_labels.append(label2id[word_tags[current_word]])
            else:
                row_labels.append(-100)
            last_word = current_word
        aligned.append(row_labels)

    encodings["labels"] = aligned
    return encodings

# Apply the preprocessing to the training and validation splits (in that order)
tokenized_train, tokenized_val = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_ds, val_ds)
)
# tokenized_test = test_ds.map(tokenize_and_align_labels, batched=True)

OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [6]:
# 5. Create the model: pretrained checkpoint with a token-classification head
#    sized to the label set and carrying both label mappings in its config.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label_list),
)

# 6. Evaluation metric
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Turn raw model outputs into overall seqeval scores.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` is reduced with an
            argmax over axis 2, ``labels`` holds label ids with -100 at
            positions that must be ignored.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        ``overall_*`` entries computed by ``metric``.
    """
    scores_per_token, gold_ids = p
    predicted_ids = np.argmax(scores_per_token, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not the -100 sentinel.
        kept = [(guess, gold) for guess, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[guess] for guess, _ in kept])
        true_labels.append([label_list[gold] for _, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# 7. Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the F1 returned from compute_metrics instead of
    # the default (validation loss); the two can disagree from one epoch to the next.
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir='./logs',
    report_to="none"
)

# 8. Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# 9. Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0991,0.0942,0.825725,0.823767,0.824745,0.970415
2,0.0772,0.087333,0.83401,0.841606,0.837791,0.972633
3,0.0603,0.089071,0.836319,0.847858,0.842049,0.973004


  _warn_prf(average, modifier, msg_start, len(result))


('./ner_model/tokenizer_config.json',
 './ner_model/special_tokens_map.json',
 './ner_model/vocab.txt',
 './ner_model/added_tokens.json',
 './ner_model/tokenizer.json')

In [None]:
# Fine-tune, then write the model weights/config and the tokenizer files side by side
# so the directory can be reloaded later with from_pretrained().
save_dir = "./ner_model"
trainer.train()

model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [7]:
# Report the model size: number of elements summed over all parameter tensors, in millions
total_params = sum(map(lambda tensor: tensor.numel(), model.parameters()))
print(f"Total parameters: {total_params / 1e6:.1f}M")

Total parameters: 107.7M


In [10]:
!mv "ner_model" "/content/drive/MyDrive/COMP4211/PROJECT"

In [None]:
def load_test_data(file_path):
    """Read the test CSV and parse its 'Sentence' column.

    Args:
        file_path: Path of a CSV file with a 'Sentence' column whose cells are
            Python list literals stored as strings.

    Returns:
        The DataFrame with 'Sentence' converted via ``literal_eval`` and without
        a 'NER Tag' column (it is dropped when the file happens to contain one).
    """
    frame = pd.read_csv(file_path)
    frame['Sentence'] = frame['Sentence'].apply(literal_eval)
    # The test set may not have a 'NER Tag' column; only drop it when it exists.
    has_tags = 'NER Tag' in frame.columns
    return frame.drop(columns=['NER Tag']) if has_tags else frame

# Load the unlabelled test split and wrap it as a HuggingFace Dataset
test_csv_path = os.path.join(DATASET_PATH, "test.csv")
test_df = load_test_data(test_csv_path)
test_ds = Dataset.from_pandas(test_df)

In [None]:
def _word_level_labels(word_ids, token_preds, id2label, num_words, fill_label):
    """Collapse per-token predicted ids into one label string per original word."""
    word_labels = []
    previous_word_idx = None
    for word_idx, pred in zip(word_ids, token_preds):
        if word_idx is None:
            # Token that does not belong to any word (special / padding token).
            continue
        if word_idx != previous_word_idx:
            # First sub-token of a word decides the word's label.
            word_labels.append(id2label[int(pred)])
        previous_word_idx = word_idx
    # Words cut off by truncation never got a token; give them the fallback label
    # so every sentence still has exactly one label per word.
    word_labels.extend([fill_label] * (num_words - len(word_labels)))
    return word_labels


def predict_and_save(model_path, test_ds, output_file="submission.csv", fill_label="O"):
    """Predict NER tags for ``test_ds`` with a saved model and write a submission CSV.

    Args:
        model_path: Directory containing the saved model and tokenizer.
        test_ds: Dataset with an "id" column and a "Sentence" column holding
            one list of words per example.
        output_file: Path of the CSV to write; it gets the columns "id" and
            "Predicted NER Tag".
        fill_label: Label assigned to words that were dropped by truncation to
            128 tokens and therefore have no prediction.

    Returns:
        None. The predictions are written to ``output_file``.
    """
    # Load the trained model and its tokenizer
    model = AutoModelForTokenClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Use the id -> label mapping stored with the checkpoint, so this function does
    # not depend on a global built by the training cells.
    id2label = model.config.id2label

    # Preprocessing (no labels needed)
    def tokenize(examples):
        encoded = tokenizer(
            examples["Sentence"],
            truncation=True,
            is_split_into_words=True,
            padding="max_length",
            max_length=128
        )
        # word_ids() is a method of the tokenizer output, not one of its fields, so
        # it must be stored explicitly to be readable from the mapped dataset later.
        encoded["word_ids"] = [
            encoded.word_ids(batch_index=i) for i in range(len(encoded["input_ids"]))
        ]
        return encoded

    tokenized_test = test_ds.map(tokenize, batched=True)

    # Inference
    trainer = Trainer(model=model, tokenizer=tokenizer)
    predictions = trainer.predict(tokenized_test)
    preds = np.argmax(predictions.predictions, axis=2)

    # Align token-level predictions with the original words
    final_preds = [
        _word_level_labels(
            row["word_ids"], preds[i], id2label, len(row["Sentence"]), fill_label
        )
        for i, row in enumerate(tokenized_test)
    ]

    # Save the result, built from the dataset that was passed in
    submission_df = pd.DataFrame({
        "id": list(test_ds["id"]),
        "Predicted NER Tag": final_preds,
    })
    submission_df.to_csv(output_file, index=False)
    print(f"Submission saved to {output_file}")

# Run inference with the model directory that was saved under DATASET_PATH
ner_model_dir = os.path.join(DATASET_PATH, "ner_model")
predict_and_save(ner_model_dir, test_ds)