In [13]:
import warnings
warnings.filterwarnings("ignore")

tag2idx = {
    "O": 0,
    "B-NAME": 1,
    "M-NAME": 2,
    "E-NAME": 3,
    "S-NAME": 4,
    "B-CONT": 5,
    "M-CONT": 6,
    "E-CONT": 7,
    "S-CONT": 8,
    "B-EDU": 9,
    "M-EDU": 10,
    "E-EDU": 11,
    "S-EDU": 12,
    "B-TITLE": 13,
    "M-TITLE": 14,
    "E-TITLE": 15,
    "S-TITLE": 16,
    "B-ORG": 17,
    "M-ORG": 18,
    "E-ORG": 19,
    "S-ORG": 20,
    "B-RACE": 21,
    "M-RACE": 22,
    "E-RACE": 23,
    "S-RACE": 24,
    "B-PRO": 25,
    "M-PRO": 26,
    "E-PRO": 27,
    "S-PRO": 28,
    "B-LOC": 29,
    "M-LOC": 30,
    "E-LOC": 31,
    "S-LOC": 32,
    "START": 33, 
    "STOP": 34
}
sort_labels = [
    "O",
    "B-NAME",
    "M-NAME",
    "E-NAME",
    "S-NAME",
    "B-CONT",
    "M-CONT",
    "E-CONT",
    "S-CONT",
    "B-EDU",
    "M-EDU",
    "E-EDU",
    "S-EDU",
    "B-TITLE",
    "M-TITLE",
    "E-TITLE",
    "S-TITLE",
    "B-ORG",
    "M-ORG",
    "E-ORG",
    "S-ORG",
    "B-RACE",
    "M-RACE",
    "E-RACE",
    "S-RACE",
    "B-PRO",
    "M-PRO",
    "E-PRO",
    "S-PRO",
    "B-LOC",
    "M-LOC",
    "E-LOC",
    "S-LOC",
]

#保存数据
def data2txt(predict, path):
    with open(path, "w", encoding="utf-8") as file:
        for item in predict:
            for i in range(len(item[0])):
                file.write("{} {}\n".format(item[0][i], item[1][i]))
            file.write("\n")

# 数据预处理

In [2]:
import torch
from torch.utils.data import Dataset

def DataProcess(path):
    data = []
    sentence = []
    tag = []
    s = []
    t = []
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            if line != "\n":
                line = line.rstrip().split()
                sentence.append(line[0])
                tag.append(line[1])
            else:
                data.append([sentence, tag])
                s.append(sentence)
                t.append(tag)
                sentence = []
                tag = []
    return data

def Idx2tag():
    idx2tag = {}
    for key, value in tag2idx.items():
        idx2tag[value] = key
    return idx2tag

def char_list(data):
    character = []
    for i in range(len(data)):
        for t in data[i][0]:
            character.append(t)
    return list(set(character))

def CharDict(path):
    data = DataProcess(path)
    char2idx = {'PADDING': 0, 'UNKNOWN': 1}
    idx2char = {0: 'PADDING', 1: 'UNKNOWN'}
    l = char_list(data)
    for i, v in enumerate(l):
        char2idx[v] = i+2
        idx2char[i+2] = v
    return char2idx,idx2char

class Mydataset(Dataset):
    def __init__(self, file_path, idx2tag, chardict):
        self.data = DataProcess(file_path)
        self.label_map = tag2idx
        self.label_map_inv = idx2tag
        self.char2idx, self.idx2char = chardict
        self.examples = []
        for text, label in self.data:
            t = [self.char2idx.get(t, self.char2idx['UNKNOWN']) for t in text]
            l = [self.label_map[l] for l in label]
            self.examples.append([t, l])

    def __getitem__(self, item):
        return self.examples[item]

    def __len__(self):
        return len(self.data)

    def collect_fn(self, batch):
        text = [t for t, _ in batch]
        label = [l for _, l in batch]
        seq_len = [len(i) for i in text]
        max_len = max(seq_len)
        # 由于LSTM需要保证一个batch的输入长度一致
        # 因此需要将所有将所有句子填充到最大长度，用PADDING补齐，标签用'O'补齐
        text = [t + [self.char2idx['PADDING']] * (max_len - len(t)) for t in text]
        label = [l + [self.label_map['O']] * (max_len - len(l)) for l in label]

        text = torch.tensor(text, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.long)
        seq_len = torch.tensor(seq_len, dtype=torch.long)

        return text, label, seq_len


In [3]:
print(tag2idx)
print(Idx2tag())

{'O': 0, 'B-NAME': 1, 'M-NAME': 2, 'E-NAME': 3, 'S-NAME': 4, 'B-CONT': 5, 'M-CONT': 6, 'E-CONT': 7, 'S-CONT': 8, 'B-EDU': 9, 'M-EDU': 10, 'E-EDU': 11, 'S-EDU': 12, 'B-TITLE': 13, 'M-TITLE': 14, 'E-TITLE': 15, 'S-TITLE': 16, 'B-ORG': 17, 'M-ORG': 18, 'E-ORG': 19, 'S-ORG': 20, 'B-RACE': 21, 'M-RACE': 22, 'E-RACE': 23, 'S-RACE': 24, 'B-PRO': 25, 'M-PRO': 26, 'E-PRO': 27, 'S-PRO': 28, 'B-LOC': 29, 'M-LOC': 30, 'E-LOC': 31, 'S-LOC': 32, 'START': 33, 'STOP': 34}
{0: 'O', 1: 'B-NAME', 2: 'M-NAME', 3: 'E-NAME', 4: 'S-NAME', 5: 'B-CONT', 6: 'M-CONT', 7: 'E-CONT', 8: 'S-CONT', 9: 'B-EDU', 10: 'M-EDU', 11: 'E-EDU', 12: 'S-EDU', 13: 'B-TITLE', 14: 'M-TITLE', 15: 'E-TITLE', 16: 'S-TITLE', 17: 'B-ORG', 18: 'M-ORG', 19: 'E-ORG', 20: 'S-ORG', 21: 'B-RACE', 22: 'M-RACE', 23: 'E-RACE', 24: 'S-RACE', 25: 'B-PRO', 26: 'M-PRO', 27: 'E-PRO', 28: 'S-PRO', 29: 'B-LOC', 30: 'M-LOC', 31: 'E-LOC', 32: 'S-LOC', 33: 'START', 34: 'STOP'}


In [4]:
char2idx,idx2char = CharDict("../NER/Chinese/train.txt")
print(char2idx)
print(idx2char)

{'PADDING': 0, 'UNKNOWN': 1, '汨': 2, '欣': 3, '另': 4, '则': 5, '单': 6, '谭': 7, '跨': 8, '脏': 9, '好': 10, '判': 11, '水': 12, '整': 13, '肇': 14, '定': 15, '江': 16, '萧': 17, '曹': 18, '过': 19, '城': 20, '谷': 21, '青': 22, '饪': 23, '吉': 24, '塑': 25, '正': 26, '印': 27, '暨': 28, '齐': 29, '；': 30, '锋': 31, '泽': 32, '相': 33, '丁': 34, '挂': 35, '套': 36, '护': 37, '批': 38, '丽': 39, '科': 40, '薛': 41, '勃': 42, '陆': 43, '改': 44, '仁': 45, '卓': 46, '年': 47, '数': 48, '监': 49, '用': 50, '益': 51, '涌': 52, '昊': 53, '皖': 54, '册': 55, '肃': 56, '壳': 57, '折': 58, '滔': 59, '赴': 60, '未': 61, '徐': 62, '伦': 63, '观': 64, '癀': 65, '启': 66, '控': 67, '询': 68, '苏': 69, '贴': 70, '强': 71, '纤': 72, '狮': 73, '炉': 74, '莞': 75, '零': 76, '找': 77, '迎': 78, '矩': 79, '禽': 80, '看': 81, '彭': 82, '。': 83, '湖': 84, '臣': 85, '沿': 86, '务': 87, '宽': 88, '得': 89, '富': 90, 'b': 91, '高': 92, '省': 93, '炯': 94, '啤': 95, '熊': 96, '鉴': 97, '荷': 98, '鸿': 99, '隆': 100, '琦': 101, '人': 102, '段': 103, '鸽': 104, '礼': 105, '感': 106, '保': 107, '稻': 108, '然': 10

# BiLSTM-CRF

In [5]:
import torch
import torch.nn as nn


def argmax(vec):
    _, idx = torch.max(vec, 1)
    return idx.item()


def log_sum_exp(loss):
    max_score, _ = torch.max(loss, dim=-1)
    max_score_broadcast = max_score.unsqueeze(-1).repeat_interleave(loss.shape[-1], dim=-1)
    return max_score + torch.log(torch.sum(torch.exp(loss - max_score_broadcast), dim=-1))


class BiLSTM_CRF(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab, label_map, device='cpu'):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim  # 词向量维度
        self.hidden_dim = hidden_dim  
        self.vocab_size = len(vocab)  # 词表大小
        self.tag_size = len(label_map)  # 标签个数
        self.device = device
        self.state = 'train'  # 模型有'train'、'eval'、'pred'三种状态

        self.word_embeds = nn.Embedding(self.vocab_size, embedding_dim)
        self.dropout = nn.Dropout(p=0.5, inplace=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=2, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, self.tag_size, bias=True)
        self.crf = CRF(label_map, device)
        self.layer_norm = nn.LayerNorm(self.hidden_dim)

    def _get_lstm_features(self, sentence, seq_len):
        embeds = self.word_embeds(sentence)
        self.dropout(embeds)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embeds, seq_len.to('cpu'), batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.lstm(packed)
        seq_unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        seqence_output = self.layer_norm(seq_unpacked)
        lstm_features = self.hidden2tag(seqence_output)
        return lstm_features

    def forward(self, sentence, seq_len, tags=''):
        features = self._get_lstm_features(sentence, seq_len)
        if self.state == 'train':
            loss = self.crf.criterion(features, tags, seq_len)
            return loss
        elif self.state == 'eval':
            tag = []
            for i, feat in enumerate(features):
                tag.append(self.crf._viterbi(feat[:seq_len[i]])[1])
            return tag
        else:
            return self.crf._viterbi(features[0])[1]

class CRF:
    def __init__(self, label_map, device='cpu'):
        self.label_map = label_map
        self.label_map_inv = {v: k for k, v in label_map.items()}
        self.tag_size = len(self.label_map)
        self.device = device

        # 转移概率矩阵
        self.transitions = nn.Parameter(
            torch.randn(self.tag_size, self.tag_size)).to(self.device)

        # 增加开始和结束标志
        self.START_TAG = "START"
        self.STOP_TAG = "STOP"
        self.transitions.data[self.label_map[self.START_TAG], :] = -10000
        self.transitions.data[:, self.label_map[self.STOP_TAG]] = -10000

    def _forward(self, feats, seq_len):
        init_alphas = torch.full((self.tag_size,), -10000.)
        init_alphas[self.label_map[self.START_TAG]] = 0.

        forward = torch.zeros(feats.shape[0], feats.shape[1] + 1, feats.shape[2], dtype=torch.float32,
                                  device=self.device)
        forward[:, 0, :] = init_alphas

        transitions = self.transitions.unsqueeze(0).repeat(feats.shape[0], 1, 1)
        for seq_i in range(feats.shape[1]):
            emit_score = feats[:, seq_i, :]
            tag_var = (
                    forward[:, seq_i, :].unsqueeze(1).repeat(1, feats.shape[2], 1)  # (batch_size, tag_size, tag_size)
                    + transitions
                    + emit_score.unsqueeze(2).repeat(1, 1, feats.shape[2])
            )
            cloned = forward.clone()
            cloned[:, seq_i + 1, :] = log_sum_exp(tag_var)
            forward = cloned

        forward = forward[range(feats.shape[0]), seq_len, :]
        last = forward + self.transitions[self.label_map[self.STOP_TAG]].unsqueeze(0).repeat(feats.shape[0], 1)
        alpha = log_sum_exp(last)
        return alpha

    def _score(self, feats, tags, seq_len):
        score = torch.zeros(feats.shape[0], device=self.device)
        start = torch.tensor([self.label_map[self.START_TAG]], device=self.device).unsqueeze(0).repeat(feats.shape[0], 1)
        tags = torch.cat([start, tags], dim=1)
        for batch_i in range(feats.shape[0]):
            score[batch_i] = torch.sum(
                self.transitions[tags[batch_i, 1:seq_len[batch_i] + 1], tags[batch_i, :seq_len[batch_i]]]) \
                             + torch.sum(feats[batch_i, range(seq_len[batch_i]), tags[batch_i][1:seq_len[batch_i] + 1]])
            score[batch_i] += self.transitions[self.label_map[self.STOP_TAG], tags[batch_i][seq_len[batch_i]]]
        return score

    def _viterbi(self, feats):
        path = []
        init_vvars = torch.full((1, self.tag_size), -10000., device=self.device)
        init_vvars[0][self.label_map[self.START_TAG]] = 0

        forward = init_vvars
        for feat in feats:
            tag_path = [] 
            score_path = [] 
            for next_tag in range(self.tag_size):
                next_tag_var = forward + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                tag_path.append(best_tag_id)
                score_path.append(next_tag_var[0][best_tag_id].view(1))
            forward = (torch.cat(score_path) + feat).view(1, -1)
            path.append(tag_path)

        last = forward + self.transitions[self.label_map[self.STOP_TAG]]
        best_tag_id = argmax(last)
        path_score = last[0][best_tag_id]

        best_path = [best_tag_id]
        for tag_path in reversed(path):
            best_tag_id = tag_path[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()
        assert start == self.label_map[self.START_TAG]
        best_path.reverse()
        return path_score, best_path

    def criterion(self, feats, tags, seq_len):
        forward_score = self._forward(feats, seq_len)
        gold_score = self._score(feats, tags, seq_len)
        return torch.mean(forward_score - gold_score)

# 训练

In [6]:
from tqdm import tqdm
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn import metrics

torch.manual_seed(904)

embedding_size = 128
hidden_dim = 768
epochs = 10
batch_size = 64
device = "cuda" if torch.cuda.is_available() else "cpu"

chardict = CharDict("../NER/Chinese/train.txt")
valid_data = DataProcess("../NER/Chinese/validation.txt")
train_dataset = Mydataset("../NER/Chinese/train.txt", Idx2tag(), chardict)
valid_dataset = Mydataset("../NER/Chinese/validation.txt", Idx2tag(), chardict)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0, pin_memory=True, shuffle=True,
                              collate_fn=train_dataset.collect_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, num_workers=0, pin_memory=False, shuffle=False,
                              collate_fn=valid_dataset.collect_fn)
model = BiLSTM_CRF(embedding_size, hidden_dim, train_dataset.char2idx, train_dataset.label_map, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)


In [9]:

def train():
    best_score = 0
    for epoch in tqdm(range(epochs)):
        model.train()
        model.state = 'train'
        train_loss = []
        for (text, label, seq_len) in tqdm(train_dataloader):
            model.zero_grad()
            text = text.to(device)
            label = label.to(device)
            seq_len = seq_len.to(device)

            loss = model(text, seq_len, label)

            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())

        train_loss = sum(train_loss) / len(train_loss)
        print(f"第{epoch + 1}个epoch训练结果: 损失 = {train_loss:.5f}")

        # 每周期验证一次，保存最优参数
        score = evaluate()
        if score > best_score:
            print(f"第{epoch + 1}个epoch验证结果: F1 score = {score:.5f} -> 最高分数")
            print("保存参数")
            best_score = score
            torch.save(model.state_dict(), "zh_best_parameter.ckpt")
        else:
            print(f"第{epoch + 1}个epoch验证结果: F1 score = {score:.5f}")

# 计算当前得分
def evaluate():
    idx2tag = Idx2tag()
    model.eval()
    model.state = 'pred'
    predict_tag = []
    with torch.no_grad():
        for sentence in tqdm(valid_data):
            text = sentence[0]
            text = [char2idx.get(t, char2idx['UNKNOWN']) for t in text]
            seq_len = torch.tensor(len(text), dtype=torch.long).unsqueeze(0)
            seq_len = seq_len.to(device)
            text = torch.tensor(text, dtype=torch.long).unsqueeze(0)
            text = text.to(device)
            batch_tag = model(text, seq_len)
            pred = [idx2tag[t] for t in batch_tag]
            predict_tag.append([sentence[0],pred])
    print("开始计算结果")
    data2txt(predict_tag, "./my_Chinese_result.txt")
    return check(gold_path="../NER/Chinese/validation.txt",
        my_path="./my_Chinese_result.txt",if_print=False)

def check(gold_path, my_path, if_print):
    y_true = []
    y_pred = []
    with open(gold_path, "r", encoding="utf-8") as g_f, open(my_path, "r", encoding="utf-8") as m_f:
        g_lines = g_f.readlines()
        m_lines = m_f.readlines()
        assert len(g_lines) == len(m_lines), "Length is Not Equal."
        for i in tqdm(range(len(g_lines))):
            if g_lines[i] == "\n":
                continue
            g_word, g_tag = g_lines[i].strip().split(" ")
            m_word, m_tag = m_lines[i].strip().split(" ")
            y_true.append(g_tag)
            y_pred.append(m_tag)
    if if_print: print(metrics.classification_report(y_true=y_true, y_pred=y_pred, labels=sort_labels[1:], digits=4))
    return metrics.f1_score(y_true=y_true, y_pred=y_pred, average='micro', labels=sort_labels[1:])


In [10]:
print(device)
train()

  0%|          | 0/60 [00:00<?, ?it/s]

cuda


100%|██████████| 60/60 [00:16<00:00,  3.69it/s]


第1个epoch训练结果: 损失 = 6.77394


100%|██████████| 14344/14344 [00:00<00:00, 843210.88it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

开始计算结果
第1个epoch验证结果: F1 score = 0.94069 -> 最高分数
保存参数


100%|██████████| 60/60 [00:16<00:00,  3.71it/s]


第2个epoch训练结果: 损失 = 5.31858


100%|██████████| 14344/14344 [00:00<00:00, 831062.35it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

开始计算结果
第2个epoch验证结果: F1 score = 0.93952


100%|██████████| 60/60 [00:16<00:00,  3.68it/s]


第3个epoch训练结果: 损失 = 4.41065


100%|██████████| 14344/14344 [00:00<00:00, 755381.27it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

开始计算结果
第3个epoch验证结果: F1 score = 0.95258 -> 最高分数
保存参数


100%|██████████| 60/60 [00:16<00:00,  3.70it/s]


第4个epoch训练结果: 损失 = 3.43873


100%|██████████| 14344/14344 [00:00<00:00, 834289.19it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

开始计算结果
第4个epoch验证结果: F1 score = 0.95051


100%|██████████| 60/60 [00:17<00:00,  3.50it/s]


第5个epoch训练结果: 损失 = 2.89506


100%|██████████| 14344/14344 [00:00<00:00, 862788.38it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

开始计算结果
第5个epoch验证结果: F1 score = 0.95207


100%|██████████| 60/60 [00:16<00:00,  3.67it/s]


第6个epoch训练结果: 损失 = 2.61127


100%|██████████| 14344/14344 [00:00<00:00, 847570.50it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

开始计算结果
第6个epoch验证结果: F1 score = 0.93926


100%|██████████| 60/60 [00:16<00:00,  3.66it/s]


第7个epoch训练结果: 损失 = 2.23477


100%|██████████| 14344/14344 [00:00<00:00, 851035.40it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

开始计算结果
第7个epoch验证结果: F1 score = 0.95392 -> 最高分数
保存参数


100%|██████████| 60/60 [00:16<00:00,  3.74it/s]


第8个epoch训练结果: 损失 = 1.84466


100%|██████████| 14344/14344 [00:00<00:00, 787598.79it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

开始计算结果
第8个epoch验证结果: F1 score = 0.95320


100%|██████████| 60/60 [00:16<00:00,  3.58it/s]


第9个epoch训练结果: 损失 = 1.56825


100%|██████████| 14344/14344 [00:00<00:00, 806962.60it/s]
  0%|          | 0/60 [00:00<?, ?it/s]

开始计算结果
第9个epoch验证结果: F1 score = 0.95566 -> 最高分数
保存参数


100%|██████████| 60/60 [00:16<00:00,  3.61it/s]


第10个epoch训练结果: 损失 = 1.40962


100%|██████████| 14344/14344 [00:00<00:00, 794861.89it/s]

开始计算结果
第10个epoch验证结果: F1 score = 0.95004





In [14]:
check(gold_path="../NER/Chinese/validation.txt", my_path="./my_Chinese_result.txt", if_print=True)

100%|██████████| 14344/14344 [00:00<00:00, 724054.02it/s]


              precision    recall  f1-score   support

      B-NAME     0.9903    1.0000    0.9951       102
      M-NAME     0.9146    1.0000    0.9554        75
      E-NAME     0.9439    0.9902    0.9665       102
      S-NAME     1.0000    1.0000    1.0000         8
      B-CONT     1.0000    1.0000    1.0000        33
      M-CONT     1.0000    1.0000    1.0000        64
      E-CONT     1.0000    1.0000    1.0000        33
      S-CONT     0.0000    0.0000    0.0000         0
       B-EDU     0.9720    0.9811    0.9765       106
       M-EDU     0.9672    1.0000    0.9833       177
       E-EDU     0.9811    0.9811    0.9811       106
       S-EDU     0.0000    0.0000    0.0000         0
     B-TITLE     0.9070    0.9347    0.9207       689
     M-TITLE     0.9058    0.9358    0.9205      1479
     E-TITLE     0.9787    0.9985    0.9885       689
     S-TITLE     0.0000    0.0000    0.0000         0
       B-ORG     0.9419    0.9636    0.9527       522
       M-ORG     0.9513    

0.9500410460888941

# 面试

In [43]:
print(device)
test_data = DataProcess("./NER/Chinese/chinese_test.txt")
model.load_state_dict(torch.load("./zh_best_parameter.ckpt"))
model.eval()
model.state = 'pred'
predict_tag = []
idx2tag = Idx2tag()
print("开始测试")
with torch.no_grad():
    for sentence in tqdm(test_data):
        text = sentence[0]
        text = [char2idx.get(t, char2idx['UNKNOWN']) for t in text]
        seq_len = torch.tensor(len(text), dtype=torch.long).unsqueeze(0)
        seq_len = seq_len.to(device)
        text = torch.tensor(text, dtype=torch.long).unsqueeze(0)
        text = text.to(device)
        batch_tag = model(text, seq_len)
        pred = [idx2tag[t] for t in batch_tag]
        predict_tag.append([sentence[0],pred])

data2txt(predict_tag, "./my_Chinese_test_result.txt")

cuda
开始测试


100%|██████████| 476/476 [02:22<00:00,  3.35it/s]
