In [2]:
import warnings
warnings.filterwarnings("ignore")

tag2idx = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-ORG": 3,
    "I-ORG": 4,
    "B-LOC": 5,
    "I-LOC": 6,
    "B-MISC": 7,
    "I-MISC": 8,
    "START": 9, 
    "STOP": 10
}
sort_labels = [
    "O",
    "B-PER",
    "I-PER",
    "B-ORG",
    "I-ORG",
    "B-LOC",
    "I-LOC",
    "B-MISC",
    "I-MISC",
]

def data2txt(predict, path):
    with open(path, "w", encoding="utf-8") as file:
        for item in predict:
            for i in range(len(item[0])):
                file.write("{} {}\n".format(item[0][i], item[1][i]))
            file.write("\n")

# 数据预处理

In [3]:
import torch
from torch.utils.data import Dataset

def DataProcess(path):
    data = []
    sentence = []
    tag = []
    s = []
    t = []
    with open(path, "r", encoding="utf-8") as file:
        for line in file:
            if line != "\n":
                line = line.rstrip().split()
                sentence.append(line[0])
                tag.append(line[1])
            else:
                data.append([sentence, tag])
                s.append(sentence)
                t.append(tag)
                sentence = []
                tag = []
    return data

def Idx2tag():
    idx2tag = {}
    for key, value in tag2idx.items():
        idx2tag[value] = key
    return idx2tag

def char_list(data):
    character = []
    for i in range(len(data)):
        for t in data[i][0]:
            character.append(t)
    return list(set(character))

def CharDict(path):
    data = DataProcess(path)
    char2idx = {'PADDING': 0, 'UNKNOWN': 1}
    idx2char = {0: 'PADDING', 1: 'UNKNOWN'}
    l = char_list(data)
    for i, v in enumerate(l):
        char2idx[v] = i+2
        idx2char[i+2] = v
    return char2idx, idx2char

class Mydataset(Dataset):
    def __init__(self, file_path, idx2tag, chardict):
        self.data = DataProcess(file_path)
        self.label_map = tag2idx
        self.label_map_inv = idx2tag
        self.char2idx, self.idx2char = chardict
        self.examples = []
        for text, label in self.data:
            t = [self.char2idx.get(t, self.char2idx['UNKNOWN']) for t in text]
            l = [self.label_map[l] for l in label]
            self.examples.append([t, l])

    def __getitem__(self, item):
        return self.examples[item]

    def __len__(self):
        return len(self.data)

    def collect_fn(self, batch):
        text = [t for t, _ in batch]
        label = [l for _, l in batch]
        seq_len = [len(i) for i in text]
        max_len = max(seq_len)
        # 由于LSTM需要保证一个batch的输入长度一致
        # 因此需要将所有将所有句子填充到最大长度，用PADDING补齐，标签用'O'补齐
        text = [t + [self.char2idx['PADDING']] * (max_len - len(t)) for t in text]
        label = [l + [self.label_map['O']] * (max_len - len(l)) for l in label]

        text = torch.tensor(text, dtype=torch.long)
        label = torch.tensor(label, dtype=torch.long)
        seq_len = torch.tensor(seq_len, dtype=torch.long)

        return text, label, seq_len

In [4]:
print(tag2idx)
print(Idx2tag())

{'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8, 'START': 9, 'STOP': 10}
{0: 'O', 1: 'B-PER', 2: 'I-PER', 3: 'B-ORG', 4: 'I-ORG', 5: 'B-LOC', 6: 'I-LOC', 7: 'B-MISC', 8: 'I-MISC', 9: 'START', 10: 'STOP'}


In [5]:
char2idx,idx2char = CharDict("../NER/English/train.txt")
print(char2idx)
print(idx2char)



# BiLSTM-CRF

In [6]:
import torch
import torch.nn as nn


def argmax(vec):
    _, idx = torch.max(vec, 1)
    return idx.item()


def log_sum_exp(loss):
    max_score, _ = torch.max(loss, dim=-1)
    max_score_broadcast = max_score.unsqueeze(-1).repeat_interleave(loss.shape[-1], dim=-1)
    return max_score + torch.log(torch.sum(torch.exp(loss - max_score_broadcast), dim=-1))

# https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
class BiLSTM_CRF(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab, label_map, device='cpu'):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim  # 词向量维度
        self.hidden_dim = hidden_dim  
        self.vocab_size = len(vocab)  # 词表大小
        self.tag_size = len(label_map)  # 标签个数
        self.device = device
        self.state = 'train'  # 模型有'train'、'eval'、'pred'三种状态

        self.word_embeds = nn.Embedding(self.vocab_size, embedding_dim)
        self.dropout = nn.Dropout(p=0.5, inplace=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=2, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Linear(hidden_dim, self.tag_size, bias=True)
        self.crf = CRF(label_map, device)
        self.layer_norm = nn.LayerNorm(self.hidden_dim)

    def _get_lstm_features(self, sentence, seq_len):
        embeds = self.word_embeds(sentence)
        self.dropout(embeds)
        packed = torch.nn.utils.rnn.pack_padded_sequence(embeds, seq_len.to('cpu'), batch_first=True, enforce_sorted=False)
        lstm_out, _ = self.lstm(packed)
        seq_unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(lstm_out, batch_first=True)
        seqence_output = self.layer_norm(seq_unpacked)
        lstm_features = self.hidden2tag(seqence_output)
        return lstm_features

    def forward(self, sentence, seq_len, tags=''):
        features = self._get_lstm_features(sentence, seq_len)
        if self.state == 'train':
            loss = self.crf.criterion(features, tags, seq_len)
            return loss
        elif self.state == 'eval':
            tag = []
            for i, feat in enumerate(features):
                tag.append(self.crf._viterbi(feat[:seq_len[i]])[1])
            return tag
        else:
            return self.crf._viterbi(features[0])[1]

class CRF:
    def __init__(self, label_map, device='cpu'):
        self.label_map = label_map
        self.label_map_inv = {v: k for k, v in label_map.items()}
        self.tag_size = len(self.label_map)
        self.device = device

        # 转移概率矩阵
        self.transitions = nn.Parameter(
            torch.randn(self.tag_size, self.tag_size)).to(self.device)

        # 增加开始和结束标志
        self.START_TAG = "START"
        self.STOP_TAG = "STOP"
        self.transitions.data[self.label_map[self.START_TAG], :] = -10000
        self.transitions.data[:, self.label_map[self.STOP_TAG]] = -10000

    def _forward(self, feats, seq_len):
        init_alphas = torch.full((self.tag_size,), -10000.)
        init_alphas[self.label_map[self.START_TAG]] = 0.

        forward = torch.zeros(feats.shape[0], feats.shape[1] + 1, feats.shape[2], dtype=torch.float32,
                                  device=self.device)
        forward[:, 0, :] = init_alphas

        transitions = self.transitions.unsqueeze(0).repeat(feats.shape[0], 1, 1)
        for seq_i in range(feats.shape[1]):
            emit_score = feats[:, seq_i, :]
            tag_var = (
                    forward[:, seq_i, :].unsqueeze(1).repeat(1, feats.shape[2], 1)  # (batch_size, tag_size, tag_size)
                    + transitions
                    + emit_score.unsqueeze(2).repeat(1, 1, feats.shape[2])
            )
            cloned = forward.clone()
            cloned[:, seq_i + 1, :] = log_sum_exp(tag_var)
            forward = cloned

        forward = forward[range(feats.shape[0]), seq_len, :]
        last = forward + self.transitions[self.label_map[self.STOP_TAG]].unsqueeze(0).repeat(feats.shape[0], 1)
        alpha = log_sum_exp(last)
        return alpha

    def _score(self, feats, tags, seq_len):
        score = torch.zeros(feats.shape[0], device=self.device)
        start = torch.tensor([self.label_map[self.START_TAG]], device=self.device).unsqueeze(0).repeat(feats.shape[0], 1)
        tags = torch.cat([start, tags], dim=1)
        for batch_i in range(feats.shape[0]):
            score[batch_i] = torch.sum(
                self.transitions[tags[batch_i, 1:seq_len[batch_i] + 1], tags[batch_i, :seq_len[batch_i]]]) \
                             + torch.sum(feats[batch_i, range(seq_len[batch_i]), tags[batch_i][1:seq_len[batch_i] + 1]])
            score[batch_i] += self.transitions[self.label_map[self.STOP_TAG], tags[batch_i][seq_len[batch_i]]]
        return score

    def _viterbi(self, feats):
        path = []
        init_vvars = torch.full((1, self.tag_size), -10000., device=self.device)
        init_vvars[0][self.label_map[self.START_TAG]] = 0

        forward = init_vvars
        for feat in feats:
            tag_path = [] 
            score_path = [] 
            for next_tag in range(self.tag_size):
                next_tag_var = forward + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                tag_path.append(best_tag_id)
                score_path.append(next_tag_var[0][best_tag_id].view(1))
            forward = (torch.cat(score_path) + feat).view(1, -1)
            path.append(tag_path)

        last = forward + self.transitions[self.label_map[self.STOP_TAG]]
        best_tag_id = argmax(last)
        path_score = last[0][best_tag_id]

        best_path = [best_tag_id]
        for tag_path in reversed(path):
            best_tag_id = tag_path[best_tag_id]
            best_path.append(best_tag_id)
        start = best_path.pop()
        assert start == self.label_map[self.START_TAG]
        best_path.reverse()
        return path_score, best_path

    def criterion(self, feats, tags, seq_len):
        forward_score = self._forward(feats, seq_len)
        gold_score = self._score(feats, tags, seq_len)
        return torch.mean(forward_score - gold_score)

# 训练

In [7]:
from tqdm import tqdm
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn import metrics

# 设置torch随机种子
torch.manual_seed(904)

embedding_size = 128
hidden_dim = 768
epochs = 10
batch_size = 64
device = "cuda" if torch.cuda.is_available() else "cpu"
valid_data = DataProcess("../NER/English/validation.txt")

# 建立词表，扫描训练集所有字符得到，'PAD'在batch填充时使用，'UNK'用于替换字表以外的新字符
chardict = CharDict("../NER/English/train.txt")
train_dataset = Mydataset("../NER/English/train.txt", Idx2tag(), chardict)
valid_dataset = Mydataset("../NER/English/validation.txt", Idx2tag(), chardict)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, num_workers=0, pin_memory=True, shuffle=True,
                              collate_fn=train_dataset.collect_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=1, num_workers=0, pin_memory=False, shuffle=False,
                              collate_fn=valid_dataset.collect_fn)
model = BiLSTM_CRF(embedding_size, hidden_dim, train_dataset.char2idx, train_dataset.label_map, device).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)


In [8]:
def train():
    best_score = 0
    for epoch in range(epochs):
        model.train()
        model.state = 'train'
        train_loss = []
        for (text, label, seq_len) in tqdm(train_dataloader):
            model.zero_grad()
            text = text.to(device)
            label = label.to(device)
            seq_len = seq_len.to(device)

            loss = model(text, seq_len, label)

            loss.backward()
            optimizer.step()
            
            train_loss.append(loss.item())

        train_loss = sum(train_loss) / len(train_loss)
        print(f"第{epoch + 1}个epoch训练结果: 损失 = {train_loss:.5f}")

        # 每epoch验证一次，保存最优参数
        print("开始验证")
        score = evaluate()
        if score > best_score:
            print(f"第{epoch + 1}个epoch训练结果: F1 score = {score:.5f} -> 最高分数")
            print("保存参数")
            best_score = score
            torch.save(model, "en_best_parameter.pt")
        else:
            print(f"第{epoch + 1}个epoch训练结果: F1 score = {score:.5f}")


def evaluate():
    idx2tag = Idx2tag()
    model.eval()
    model.state = 'pred'
    predict_tag = []
    with torch.no_grad():
        for sentence in tqdm(valid_data):
            text = sentence[0]
            text = [char2idx.get(t, char2idx['UNKNOWN']) for t in text]
            seq_len = torch.tensor(len(text), dtype=torch.long).unsqueeze(0)
            seq_len = seq_len.to(device)
            text = torch.tensor(text, dtype=torch.long).unsqueeze(0)
            text = text.to(device)
            batch_tag = model(text, seq_len)
            pred = [idx2tag[t] for t in batch_tag]
            predict_tag.append([sentence[0],pred])
    print("开始计算结果")
    data2txt(predict_tag, "./my_English_result.txt")
    return check(gold_path="../NER/English/validation.txt",
        my_path="./my_English_result.txt", if_print=False)

def check(gold_path, my_path, if_print):
    y_true = []
    y_pred = []
    with open(gold_path, "r", encoding="utf-8") as g_f, open(my_path, "r", encoding="utf-8") as m_f:
        g_lines = g_f.readlines()
        m_lines = m_f.readlines()
        assert len(g_lines) == len(m_lines), f"Length is Not Equal. len(g_lines)={len(g_lines)}, len(m_lines)={len(m_lines)}"
        for i in range(len(g_lines)):
            if g_lines[i] == "\n":
                continue
            g_word, g_tag = g_lines[i].strip().split(" ")
            m_word, m_tag = m_lines[i].strip().split(" ")
            y_true.append(g_tag)
            y_pred.append(m_tag)
    if if_print: print(metrics.classification_report(y_true=y_true, y_pred=y_pred, labels=sort_labels[1:], digits=4))
    return metrics.f1_score(y_true=y_true, y_pred=y_pred, average='micro', labels=sort_labels[1:])


In [9]:
print(device)
train()

cuda


100%|██████████| 220/220 [00:17<00:00, 12.37it/s]


第1个epoch训练结果: 损失 = 9.15788
开始验证


100%|██████████| 3250/3250 [00:35<00:00, 90.60it/s] 


开始计算结果
第1个epoch训练结果: F1 score = 0.49913 -> 最高分数
保存参数


100%|██████████| 220/220 [00:17<00:00, 12.23it/s]


第2个epoch训练结果: 损失 = 5.65812
开始验证


100%|██████████| 3250/3250 [00:36<00:00, 88.49it/s] 


开始计算结果
第2个epoch训练结果: F1 score = 0.65432 -> 最高分数
保存参数


100%|██████████| 220/220 [00:18<00:00, 11.97it/s]


第3个epoch训练结果: 损失 = 4.36557
开始验证


100%|██████████| 3250/3250 [00:37<00:00, 87.54it/s] 


开始计算结果
第3个epoch训练结果: F1 score = 0.72599 -> 最高分数
保存参数


100%|██████████| 220/220 [00:18<00:00, 12.14it/s]


第4个epoch训练结果: 损失 = 3.51835
开始验证


100%|██████████| 3250/3250 [00:35<00:00, 92.20it/s] 


开始计算结果
第4个epoch训练结果: F1 score = 0.76209 -> 最高分数
保存参数


100%|██████████| 220/220 [00:17<00:00, 12.63it/s]


第5个epoch训练结果: 损失 = 2.84865
开始验证


100%|██████████| 3250/3250 [00:35<00:00, 91.95it/s] 


开始计算结果
第5个epoch训练结果: F1 score = 0.76259 -> 最高分数
保存参数


100%|██████████| 220/220 [00:17<00:00, 12.72it/s]


第6个epoch训练结果: 损失 = 2.40353
开始验证


100%|██████████| 3250/3250 [00:34<00:00, 93.45it/s] 


开始计算结果
第6个epoch训练结果: F1 score = 0.79302 -> 最高分数
保存参数


100%|██████████| 220/220 [00:17<00:00, 12.79it/s]


第7个epoch训练结果: 损失 = 1.99573
开始验证


100%|██████████| 3250/3250 [00:34<00:00, 92.89it/s] 


开始计算结果
第7个epoch训练结果: F1 score = 0.80369 -> 最高分数
保存参数


100%|██████████| 220/220 [00:17<00:00, 12.62it/s]


第8个epoch训练结果: 损失 = 1.72893
开始验证


100%|██████████| 3250/3250 [00:36<00:00, 89.41it/s] 


开始计算结果
第8个epoch训练结果: F1 score = 0.80778 -> 最高分数
保存参数


100%|██████████| 220/220 [00:17<00:00, 12.61it/s]


第9个epoch训练结果: 损失 = 1.49605
开始验证


100%|██████████| 3250/3250 [00:34<00:00, 93.86it/s] 


开始计算结果
第9个epoch训练结果: F1 score = 0.82345 -> 最高分数
保存参数


100%|██████████| 220/220 [00:17<00:00, 12.41it/s]


第10个epoch训练结果: 损失 = 1.26767
开始验证


100%|██████████| 3250/3250 [00:36<00:00, 89.60it/s] 


开始计算结果
第10个epoch训练结果: F1 score = 0.83011 -> 最高分数
保存参数


In [11]:
check(gold_path="../NER/English/test.txt", my_path="./my_English_result.txt", if_print=True)


AssertionError: Length is Not Equal. len(g_lines)=49888, len(m_lines)=54612

In [12]:
# model.cpu()
# torch.save(model.state_dict(), "./eh_best_parameter.ckpt")
# model.to(device)

# 面试

In [13]:
test_data = DataProcess("../NER/English/test.txt")
model = torch.load('./en_best_parameter.pt').to(device)
model.eval()
model.state = 'pred'
predict_tag = []
idx2tag = Idx2tag()
with torch.no_grad():
    for sentence in tqdm(test_data):
        text = sentence[0]
        text = [char2idx.get(t, char2idx['UNKNOWN']) for t in text]
        seq_len = torch.tensor(len(text), dtype=torch.long).unsqueeze(0)
        seq_len = seq_len.to(device)
        text = torch.tensor(text, dtype=torch.long).unsqueeze(0)
        text = text.to(device)
        batch_tag = model(text, seq_len)
        pred = [idx2tag[t] for t in batch_tag]
        predict_tag.append([sentence[0],pred])

data2txt(predict_tag, "./my_English_test_result.txt")

100%|██████████| 3453/3453 [00:33<00:00, 102.74it/s]


In [14]:
check(gold_path="../NER/English/test.txt", my_path="./my_English_test_result.txt", if_print=True)


              precision    recall  f1-score   support

       B-PER     0.8986    0.7013    0.7878      1617
       I-PER     0.9141    0.7362    0.8155      1156
       B-ORG     0.8429    0.6689    0.7459      1661
       I-ORG     0.8692    0.6683    0.7556       835
       B-LOC     0.9073    0.7332    0.8110      1668
       I-LOC     0.8394    0.6304    0.7200       257
      B-MISC     0.8139    0.6353    0.7136       702
      I-MISC     0.7546    0.5694    0.6491       216

   micro avg     0.8756    0.6913    0.7726      8112
   macro avg     0.8550    0.6679    0.7498      8112
weighted avg     0.8751    0.6913    0.7724      8112



0.7726114210925121