In [1]:
__shop__ = "AINLP"
__link__ = "https://shop128061183.taobao.com/"
__date__ = 2023 / 5 / 4

### 1. 安装所需要的包

In [2]:
# !pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install torch -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install scikit_learn -i https://pypi.tuna.tsinghua.edu.cn/simple

In [1]:
import json
import logging
import random
from collections import OrderedDict

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn import preprocessing
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm
from transformers import AdamW, BertForSequenceClassification, BertTokenizer

logging.basicConfig(
    level=logging.INFO,
    filename="bert.log",
    filemode="w",
    format="%(asctime)s - %(message)s",
)
# logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')

In [2]:
SEED = 9999
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### 2.分词器，wordpiece

In [3]:
# huggingface
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

### 3.读入数据

In [4]:
def get_evidence_num(json_file):
    f_json = json.load(open(json_file))
    evidences = []
    for k, v in f_json.items():
        evidences.extend(v["evidences"])
    return set(evidences)

In [9]:
train_evidences = get_evidence_num("project-data/train-claims.json")
dev_evidences = get_evidence_num("project-data/dev-claims.json")

In [10]:
len(dev_evidences)

463

In [12]:
all_evidences = list(train_evidences | dev_evidences)
len(all_evidences)

3443

In [13]:
def get_evidence_data(json_file):
    f_json = json.load(open(json_file))
    return f_json

In [14]:
evidence_dict = get_evidence_data("project-data/evidence.json")

In [15]:
len(evidence_dict)

1208827

In [16]:
def read_data(file):
    texts = []
    labels = []
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)
        for k, v in data.items():
            text1 = str(v["claim_text"]).strip()
            for evi in v["evidences"]:
                text2 = str(evidence_dict[evi]).strip()
                label = 1
                texts.append([text1, text2])
                labels.append(label)
                ran = random.choice(all_evidences)
                while ran in v["evidences"]:
                    ran = random.choice(all_evidences)
                texts.append([text1, evidence_dict[ran]])
                labels.append(0)
    assert len(texts) == len(labels)
    return texts, labels

In [17]:
train_texts, train_labels = read_data("project-data/train-claims.json")
val_texts, val_labels = read_data("project-data/dev-claims.json")

In [22]:
len(train_texts), len(val_texts)

(8244, 982)

### 5.查看text和label

In [23]:
train_texts[:3]

[['Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.',
  'At very high concentrations (100 times atmospheric concentration, or greater), carbon dioxide can be toxic to animal life, so raising the concentration to 10,000 ppm (1%) or higher for several hours will eliminate pests such as whiteflies and spider mites in a greenhouse.'],
 ['Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.',
  '"It\'s a fact: climate change made Hurricane Harvey more deadly".'],
 ['Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.',
  'Plants can grow as much as 50 percent faster in concentrations of 1,000 ppm CO 2 when compared with ambient conditions, though this assumes no change in climate and no limi

In [19]:
train_labels[:3]

[1, 0, 1]

### 6.求最大长度，为后面分词做准备

In [24]:
max_len = max([len(item) for item in train_texts])
print(max_len)

max_len = max([len(item) for item in val_texts])
print(max_len)

2
2


### 7. label和id进行映射

In [30]:
label2id = {
    item: idx for idx, item in enumerate(sorted(set(train_labels + val_labels)))
}
id2label = {v: k for k, v in label2id.items()}

In [31]:
label2id, id2label

({0: 0, 1: 1}, {0: 0, 1: 1})

### 8.训练集和验证集 分词

In [None]:
train_encodings = tokenizer(
    [i[0] for i in train_texts],
    [i[1] for i in train_texts],
    truncation=True,
    padding="max_length",
    max_length=256,
)
val_encodings = tokenizer(
    [i[0] for i in val_texts],
    [i[1] for i in val_texts],
    truncation=True,
    padding="max_length",
    max_length=256,
)

### 9.创建Dataset

In [38]:
class CuDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        idx = int(idx)
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [39]:
train_dataset = CuDataset(train_encodings, train_labels)
val_dataset = CuDataset(val_encodings, val_labels)

### 10.创建Dataloader

In [41]:
batch_size = 2
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

### 11.加载模型

In [43]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(label2id)
)

device = (
    torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
)  # 使用cpu或者gpu
model.to(device)
model.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

### 12.计算Accuracy，Precision，Recall，F1 score

In [44]:
def compute_metrics(labels, preds):
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average="macro")
    recall = recall_score(labels, preds, average="macro")
    f1 = f1_score(labels, preds, average="macro")
    logging.info(f"accuracy: {accuracy}")
    logging.info(f"precision: {precision}")
    logging.info(f"recall: {recall}")
    logging.info(f"f1: {f1}\n")
    return accuracy, precision, recall, f1

### 13.评估模型

In [45]:
@torch.no_grad()
def eval_model(model, eval_loader):
    model.eval()
    labels = []
    preds = []
    for idx, batch in enumerate(eval_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        labels.extend(batch["labels"].numpy())
        outputs = model(
            input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
        )  # 输出所有概率
        preds.extend(torch.argmax(outputs[0], dim=-1).cpu().numpy())  # 拿到标签
    macro_f = compute_metrics(labels, preds)
    model.train()
    return macro_f

### 14.训练模型

In [47]:
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "gamma", "beta"]
optimizer_grouped_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay_rate": 0.01,
    },
    {
        "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        "weight_decay_rate": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

step = 0
best_acc = 0
epoch = 2
writer = SummaryWriter(log_dir="model_best")
for epoch in tqdm(range(epoch)):
    for idx, batch in tqdm(
        enumerate(train_loader), total=len(train_texts) // batch_size, leave=False
    ):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        loss = outputs[0]  # 计算Loss
        logging.info(f"Epoch-{epoch}, Step-{step}, Loss: {loss.cpu().detach().numpy()}")
        step += 1
        loss.backward()
        optimizer.step()
        writer.add_scalar("train_loss", loss.item(), step)
    logging.info(f"Epoch {epoch}, present best acc: {best_acc}, start evaluating.")
    accuracy, precision, recall, f1 = eval_model(model, eval_loader)  # 评估模型
    writer.add_scalar("dev_accuracy", accuracy, step)
    writer.add_scalar("dev_precision", precision, step)
    writer.add_scalar("dev_recall", recall, step)
    writer.add_scalar("dev_f1", f1, step)
    if accuracy > best_acc:
        model.save_pretrained("model_best")  # 保存模型
        tokenizer.save_pretrained("model_best")
        best_acc = accuracy

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/4122 [00:00<?, ?it/s]

: 

: 