# bi-lstm+crf，使用pytorch-crf库实现crf，可cuda加速。

数据集说明：

1: B-BANK 代表银行实体的开始

2: I-BANK 代表银行实体的内部

3: B-PRODUCT 代表产品实体的开始

4: I-PRODUCT 代表产品实体的内部

5: O 代表不属于标注的范围

6: B-COMMENTS_N 代表用户评论（名词）实体的开始

7: I-COMMENTS_N 代表用户评论（名词）实体的内部

8: B-COMMENTS_ADJ 代表用户评论（形容词）实体的开始

9: I-COMMENTS_ADJ 代表用户评论（形容词）实体的内部

In [1]:
import pandas as pd
import sys
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from torchcrf import CRF
# Seed PyTorch's default random generator so runs are repeatable.
torch.manual_seed(1)
# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('{}能用'.format(device))

cuda能用


In [2]:
# Labelled training set; the 'Unnamed: 0' column (presumably a serialized
# row index -- confirm against the CSV) is discarded right after loading.
train_data = pd.read_csv('./train_data_public.csv').drop(columns='Unnamed: 0')
# Test set, loaded as-is.
test_data = pd.read_csv('./test_public.csv')

In [3]:
# Print the column names, non-null counts and dtypes of both frames.
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        10000 non-null  object
 1   BIO_anno    10000 non-null  object
 2   class       10000 non-null  int64 
 3   bank_topic  7636 non-null   object
dtypes: int64(1), object(3)
memory usage: 312.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5093 entries, 0 to 5092
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5093 non-null   int64 
 1   text    5093 non-null   object
dtypes: int64(1), object(1)
memory usage: 79.7+ KB


In [4]:
# Preview the first three training rows.
train_data.head(3)

Unnamed: 0,text,BIO_anno,class,bank_topic
0,交行14年用过，半年准备提额，却直接被降到1Ｋ，半年期间只T过一次三千，其它全部真实消费，第...,B-BANK I-BANK O O O O O O O O O O B-COMMENTS_N...,0,建设银行
1,单标我有了，最近visa双标返现活动好,B-PRODUCT I-PRODUCT O O O O O O B-PRODUCT I-PR...,1,建设银行
2,建设银行提额很慢的……,B-BANK I-BANK I-BANK I-BANK B-COMMENTS_N I-COM...,0,建设银行


In [5]:
# Turn each space-separated BIO annotation string into a list of tags.
train_data['BIO_anno'] = train_data['BIO_anno'].map(lambda anno: anno.split(' '))
# Pair every sentence (as a list of characters) with its tag list; each
# entry is a two-element list [characters, tags].
train_data['training_data'] = pd.Series(
    [[list(text), anno]
     for text, anno in zip(train_data['text'], train_data['BIO_anno'])],
    index=train_data.index,
)
# The test set has no tags: store only the list of characters.
test_data['testing_data'] = test_data['text'].map(list)



In [6]:
# Flag rows whose character sequence is not a list (sanity check on the
# preprocessing); isinstance() is the idiomatic type test.
num = train_data['training_data'].apply(lambda x: not isinstance(x[0], list))

In [7]:
# Count the flagged rows.
num.sum()

0

In [8]:
# Pull the prepared columns out of the frames as plain Python lists.
training_data_txt = train_data['training_data'].tolist()
testing_data_txt = test_data['testing_data'].tolist()
n_train, n_test = len(training_data_txt), len(testing_data_txt)
print('训练集大小：', n_train, '测试集大小：', n_test)

训练集大小： 10000 测试集大小： 5093


In [9]:
# 定义一些工具函数

# 句子转idx
def prepare_sequence(seq, word2idx):
    """Encode a sequence of tokens as a 1-D ``torch.long`` tensor of indices.

    Args:
        seq: Iterable of tokens (single characters in this notebook).
        word2idx: Dict mapping each token to its integer index.

    Returns:
        A ``torch.long`` tensor holding one index per token of ``seq``.

    Raises:
        KeyError: If a token is missing from ``word2idx`` and the mapping
            has no ``'<UNK>'`` entry to fall back on.
    """
    unk_idx = word2idx.get('<UNK>')
    if unk_idx is None:
        # No unknown-token entry: keep the strict lookup so that a missing
        # token still surfaces as a KeyError.
        idxs = [word2idx[w] for w in seq]
    else:
        # Out-of-vocabulary tokens share the '<UNK>' index.
        idxs = [word2idx.get(w, unk_idx) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def argmax(vec):
    """Return, as a Python int, the column index of the largest entry.

    The maximum is taken along dimension 1 and converted with ``.item()``,
    so ``vec`` must contain exactly one row.
    """
    best = torch.max(vec, dim=1)
    return best.indices.item()

# Numerically stable log-sum-exp, used by the CRF forward algorithm.
# The forward algorithm keeps accumulating earlier results, so a plain sum of
# exponentials can grow past the largest representable float and turn into
# inf (whose log is inf as well). To avoid that, a common factor exp(clip) is
# pulled out of the sum so that no single term becomes too large:
# SUM = log(exp(s1)+exp(s2)+...+exp(s100))
#     = log{exp(clip)*[exp(s1-clip)+exp(s2-clip)+...+exp(s100-clip)]}
#     = clip + log[exp(s1-clip)+exp(s2-clip)+...+exp(s100-clip)]
# where clip = max(s)
def log_sum_exp(vec):
    """Return log(sum(exp(vec))) for a single-row 2-D tensor ``vec``."""
    peak = vec[0, argmax(vec)]
    # Subtract the row maximum before exponentiating.
    shifted = vec - peak.view(1, -1).expand(1, vec.size()[1])
    return peak + torch.exp(shifted).sum().log()

In [10]:
# 定义网络结构：bi-lstm + crf
class BiLSTM_CRF(nn.Module):
    """BiLSTM tagger with a hand-written CRF layer, processing one sentence per call.

    The class reads the module-level names ``START_TAG`` and ``STOP_TAG``
    (both must be keys of ``tag2idx``) and the helpers ``argmax`` and
    ``log_sum_exp``.

    Args:
        vocab_size: Number of rows in the embedding table.
        tag2idx: Dict mapping tag name to index, including START/STOP tags.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the concatenated BiLSTM output. Each direction
            gets ``hidden_dim // 2`` units, so an odd value breaks the
            reshape in ``_get_lstm_features``.
    """

    def __init__(self, vocab_size, tag2idx, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.vocab_size = vocab_size
        self.tag2idx = tag2idx
        self.tagset_size = len(tag2idx)
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.word_embeds = nn.Embedding(vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            # Two directions are concatenated, giving hidden_dim.
                            hidden_size=self.hidden_dim // 2,
                            num_layers=1,
                            bidirectional=True)
        # Linear projection of the BiLSTM output to per-tag emission scores.
        self.hidden2tag = nn.Linear(self.hidden_dim, self.tagset_size)

        # transitions[i, j] is the score of moving from tag j to tag i.
        self.transitions = nn.Parameter(torch.randn(self.tagset_size, self.tagset_size))

        # Nothing may transition into START_TAG, and nothing may transition
        # out of STOP_TAG: both get a very low score.
        self.transitions.data[tag2idx[START_TAG], :] = -10000
        self.transitions.data[:, tag2idx[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a random ``(h0, c0)`` pair of shape (2, 1, hidden_dim // 2).

        The leading 2 is num_layers * num_directions and the 1 is the batch
        size. The tensors are created on the same device as the parameters.

        NOTE(review): this is also called on every forward pass, so repeated
        predictions on the same input start from different random states.
        """
        device = self.transitions.device
        h0 = torch.randn(2, 1, self.hidden_dim // 2, device=device)
        c0 = torch.randn(2, 1, self.hidden_dim // 2, device=device)
        return (h0, c0)

    def _get_lstm_features(self, sentence):
        """Return emission scores of shape (len(sentence), tagset_size).

        Args:
            sentence: 1-D tensor of token indices.
        """
        self.hidden = self.init_hidden()
        # (seq_len, embedding_dim) -> (seq_len, batch=1, embedding_dim)
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        # Drop the batch dimension of size 1.
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the score of one given tag path as a tensor of shape (1,).

        Args:
            feats: Emission scores, one row per time step.
            tags: 1-D ``torch.long`` tensor with the tag index of each step.
        """
        score = torch.zeros(1, device=feats.device)
        start = torch.tensor([self.tag2idx[START_TAG]], dtype=torch.long,
                             device=tags.device)
        # Prepend START so that tags[i] is the predecessor of tags[i + 1].
        tags = torch.cat([start, tags])
        for i, feat in enumerate(feats):
            # Path score accumulates the transition plus the emission score.
            score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag2idx[STOP_TAG], tags[-1]]
        return score

    def _forward_alg(self, feats):
        """Return the log-sum-exp of the scores of all tag paths (forward algorithm).

        Args:
            feats: Emission scores, one row per time step.
        """
        # At the virtual START step only START_TAG is possible: score 0 for
        # it, -10000 for every other tag.
        init_alphas = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_alphas[0][self.tag2idx[START_TAG]] = 0.

        previous = init_alphas

        for obs in feats:
            # Forward variables of the current time step, one per tag.
            alphas_t = []
            for next_tag in range(self.tagset_size):
                # The emission score of next_tag does not depend on the
                # previous tag, so it is broadcast over all of them.
                emit_score = obs[next_tag].view(1, -1).expand(1, self.tagset_size)
                # Scores of moving from every previous tag to next_tag.
                trans_score = self.transitions[next_tag].view(1, -1)
                # Previous score + transition + emission, per previous tag.
                next_tag_var = previous + trans_score + emit_score
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            previous = torch.cat(alphas_t).view(1, -1)
        # Finally account for the transition into STOP_TAG.
        terminal_var = previous + self.transitions[self.tag2idx[STOP_TAG]]
        scores = log_sum_exp(terminal_var)
        return scores

    def _viterbi_decode(self, feats):
        """Return ``(path_score, best_path)`` for the highest-scoring tag path.

        Args:
            feats: Emission scores, one row per time step.

        Returns:
            The score of the best path and the path itself as a list of tag
            indices (START_TAG excluded), one per time step.
        """
        backpointers = []

        init_vvars = torch.full((1, self.tagset_size), -10000., device=feats.device)
        init_vvars[0][self.tag2idx[START_TAG]] = 0

        previous = init_vvars
        for obs in feats:
            # Best predecessor of every tag at this time step.
            bptrs_t = []
            # Viterbi variables of this time step (before adding emissions).
            viterbivars_t = []

            for next_tag in range(self.tagset_size):
                # The best predecessor depends only on the previous scores
                # and the transition into next_tag, not on the emission
                # score of next_tag, which is added afterwards.
                next_tag_var = previous + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Add the emission scores of the current step.
            previous = (torch.cat(viterbivars_t) + obs).view(1, -1)
            backpointers.append(bptrs_t)

        # Account for the transition into STOP_TAG.
        terminal_var = previous + self.transitions[self.tag2idx[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back-pointers from the last tag to recover the path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # The walk ends on START_TAG, which is not part of the result.
        start = best_path.pop()
        assert start == self.tag2idx[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """Return the CRF loss: log-sum-exp of all path scores minus the gold path score.

        Args:
            sentence: 1-D tensor of token indices.
            tags: 1-D ``torch.long`` tensor of gold tag indices.
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        """Return ``(score, tag_seq)`` of the Viterbi-decoded best path.

        Args:
            sentence: 1-D tensor of token indices.
        """
        # Emission scores from the BiLSTM.
        lstm_feats = self._get_lstm_features(sentence)
        # Best path given emission and transition scores.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [11]:
class Model(nn.Module):
    """LSTM encoder followed by a ``torchcrf.CRF`` layer.

    Args:
        config: Configuration object. The attributes read here are
            ``vocab_len``, ``embedding_dim``, ``word2idx`` (must contain
            ``'<PAD>'``), ``bidirectional``, ``hidden_size``, ``num_layers``,
            ``tag2idx`` and ``device``.
    """

    def __init__(self, config):
        super(Model, self).__init__()
        self.config = config
        # NOTE: pretrained embeddings are not wired in; the table is always
        # initialised from scratch.
        # The '<PAD>' row is registered as padding_idx.
        self.embedding = nn.Embedding(self.config.vocab_len, self.config.embedding_dim,
                                      padding_idx=self.config.word2idx['<PAD>'])

        self.num_directions = 2 if self.config.bidirectional else 1

        self.rnn = nn.LSTM(input_size=self.config.embedding_dim,
                           hidden_size=self.config.hidden_size,
                           num_layers=self.config.num_layers,
                           batch_first=True,
                           bidirectional=self.config.bidirectional)

        # Read the tag mapping from the config passed in, not from a
        # notebook-level global.
        self.tag2idx = self.config.tag2idx

        self.tagset_size = len(self.tag2idx)
        # Projects the LSTM output to one emission score per tag.
        self.hidden2tag = nn.Linear(self.config.hidden_size * self.num_directions,
                                    self.tagset_size)
        self.crf = CRF(num_tags=self.tagset_size, batch_first=True)

    def _forward_alg(self, feats):
        """Legacy hand-written forward algorithm (partition function).

        Not called by ``forward`` or ``neg_log_likelihood``. It reads
        ``self.transitions``, which ``__init__`` does not define, and the
        module-level ``START_TAG`` / ``STOP_TAG``.
        """
        init_alphas = self._make_tensor(torch.full((1, self.tagset_size), -10000.))
        # START_TAG holds all of the initial score.
        init_alphas[0][self.tag2idx[START_TAG]] = 0.

        forward_var = init_alphas

        # Iterate over the sentence.
        for feat in feats:
            alphas_t = []  # the forward tensor at this timestep
            for next_tag in range(self.tagset_size):
                # Broadcast the emission score: it is the same regardless of
                # the previous tag.
                emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
                # Entry i of trans_score is the score of moving from i to next_tag.
                trans_score = self.transitions[next_tag].view(1, -1)
                # Entry i of next_tag_var is the value of the edge
                # (i -> next_tag) before the log-sum-exp.
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable of this tag is the log-sum-exp of all scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag2idx[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _score_sentence(self, feats, tags):
        """Legacy scorer for one given tag sequence.

        Not called by ``forward`` or ``neg_log_likelihood``. It reads
        ``self.transitions``, which ``__init__`` does not define, and the
        module-level ``START_TAG`` / ``STOP_TAG``.
        """
        score = self._make_tensor(torch.zeros(1))
        tags = self._make_tensor(torch.cat([self._make_tensor(torch.tensor([self.tag2idx[START_TAG]], dtype=torch.long)), tags]))
        for i, feat in enumerate(feats):
            score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag2idx[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """Legacy hand-written Viterbi decoder.

        Not called by ``forward`` (which uses ``self.crf.decode``). It reads
        ``self.transitions``, which ``__init__`` does not define, and the
        module-level ``START_TAG`` / ``STOP_TAG``.
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = self._make_tensor(torch.full((1, self.tagset_size), -10000.))
        init_vvars[0][self.tag2idx[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] is the viterbi variable of tag i at the
                # previous step plus the score of moving from i to next_tag.
                # Emission scores are left out here because the argmax does
                # not depend on them; they are added below.
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Add the emission scores and store the result as the new forward_var.
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag2idx[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (it is not returned to the caller).
        start = best_path.pop()
        assert start == self.tag2idx[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def _get_lstm_features(self, x):
        """Return emission scores with the first two dimensions of the LSTM output swapped.

        Args:
            x: Tensor of token indices; assumed 1-D of length B (one
                sentence) -- TODO confirm against the data pipeline.

        Returns:
            For a 1-D ``x`` of length B, a tensor of shape (1, B, tagset_size).

        NOTE(review): ``self.rnn`` is ``batch_first`` and receives
        (B, 1, embedding_dim), so each position is run as its own length-1
        sequence and the LSTM sees no neighbouring tokens. Confirm whether
        (1, B, embedding_dim) was intended.
        """
        x = self.embedding(x)  # (B,) -> (B, embedding_dim)
        x = x.unsqueeze(1)  # (B, embedding_dim) -> (B, 1, embedding_dim)
        h_0, c_0 = self._init_hidden(batchs=x.size(0))
        # out: (B, 1, num_directions * hidden_size); final states are unused.
        out, _ = self.rnn(x, (h_0, c_0))
        out = self.hidden2tag(out)  # -> (B, 1, tagset_size)
        out = out.transpose(0, 1)  # -> (1, B, tagset_size)
        return out

    def neg_log_likelihood(self, x, tags):
        """Return the negated value of ``self.crf(feats, tags)``, used as the loss.

        Args:
            x: Token indices, passed to ``_get_lstm_features``.
            tags: Gold tag indices; a leading dimension of size 1 is added
                before they are handed to the CRF layer.
        """
        tags = tags.unsqueeze(0)
        feats = self._get_lstm_features(x)
        return -self.crf(feats, tags)

    def _init_hidden(self, batchs):
        """Return zero-initialised ``(h_0, c_0)`` on ``config.device``.

        Each has shape (num_layers * num_directions, batchs, hidden_size).
        Unlike a GRU, an LSTM also needs the cell state ``c_0``.
        """
        shape = (self.config.num_layers * self.num_directions, batchs,
                 self.config.hidden_size)
        h_0 = torch.zeros(*shape)
        c_0 = torch.zeros(*shape)
        return self._make_tensor(h_0), self._make_tensor(c_0)

    def _make_tensor(self, tensor):
        """Return ``tensor`` moved to ``self.config.device``."""
        tensor_ret = tensor.to(self.config.device)
        return tensor_ret

    def forward(self, x):
        """Return the output of ``self.crf.decode`` on the emission scores of ``x``.

        Args:
            x: Token indices, passed to ``_get_lstm_features``.
        """
        lstm_feats = self._get_lstm_features(x)  # emission scores

        out = self.crf.decode(lstm_feats)
        return out

In [12]:
from utils.param_configs import Configs
configs = Configs()

# Work on a shallow copy of the full training set (take a smaller slice here
# for quick debugging runs); the test set is used as-is.
training_data = training_data_txt[:]
testing_data = testing_data_txt

# ------------------- vocabulary: character -> index ---------------------------
# Each unseen character receives the next free index; training sentences are
# scanned first, then test sentences.
word2idx = {}
for sentence, tags in training_data:
    for word in sentence:
        word2idx.setdefault(word, len(word2idx))
for sentence in testing_data:
    for word in sentence:
        word2idx.setdefault(word, len(word2idx))

# The two special tokens take the last two indices.
for special in ('<UNK>', '<PAD>'):
    word2idx[special] = len(word2idx)

configs.word2idx = word2idx
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------

In [13]:
# Character count and tag count of the first sample (this value is not
# displayed, only the last expression of the cell is).
len(training_data_txt[0][0]),len(training_data_txt[0][1])
# Number of training samples.
len(training_data_txt)

10000

In [15]:
from utils.data_process import create_data_loader
# Build the training data loader with the project helper.
train_data_loader = create_data_loader(training_data_txt, configs)
# test_data_loader = create_data_loader(testing_data_txt, configs) # The unlabelled test set is not built this way because it has no labels.

In [15]:
# testing_data_txt

In [16]:
# Print only the first batch; `sample` stays bound to it and is inspected
# by the following cells.
for sample in train_data_loader:
    print(sample)
    break


{'sentence': ['交 行 1 4 年 用 过 ， 半 年 准 备 提 额 ， 却 直 接 被 降 到 1 Ｋ ， 半 年 期 间 只 T 过 一 次 三 千 ， 其 它 全 部 真 实 消 费 ， 第 六 个 月 的 时 候 为 了 增 加 评 分 提 额 ， 还 特 意 分 期 两 万 ， 但 降 额 后 电 话 投 诉 ， 申 请 提 . . . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>', '单 标 我 有 了 ， 最 近 v i s a 双 标 返 现 活 动 好 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>', '建 设 银 行 提 额 很 慢 的 … … <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <

In [17]:
# Length of the training loader.
len(train_data_loader)

625

In [17]:

# Look up the vocabulary index of one character.
configs.word2idx['交']

0

In [18]:
# Sentences of the batch, shown as space-separated characters with '<PAD>' filling.
sample['sentence']

['交 行 1 4 年 用 过 ， 半 年 准 备 提 额 ， 却 直 接 被 降 到 1 Ｋ ， 半 年 期 间 只 T 过 一 次 三 千 ， 其 它 全 部 真 实 消 费 ， 第 六 个 月 的 时 候 为 了 增 加 评 分 提 额 ， 还 特 意 分 期 两 万 ， 但 降 额 后 电 话 投 诉 ， 申 请 提 . . . <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>',
 '单 标 我 有 了 ， 最 近 v i s a 双 标 返 现 活 动 好 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>',
 '建 设 银 行 提 额 很 慢 的 … … <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> 

In [19]:
# Tag strings of the batch, one space-separated string per sentence.
sample['label']


['B-BANK I-BANK O O O O O O O O O O B-COMMENTS_N I-COMMENTS_N O O O O O B-COMMENTS_ADJ I-COMMENTS_ADJ O O O O O O O O O O O O O O O O O O O O O B-COMMENTS_N I-COMMENTS_N O O O O O O O O O O B-COMMENTS_N I-COMMENTS_N O O B-COMMENTS_N I-COMMENTS_N O O O O B-PRODUCT I-PRODUCT O O O O B-COMMENTS_ADJ O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'B-PRODUCT I-PRODUCT O O O O O O B-PRODUCT I-PRODUCT I-PRODUCT I-PRODUCT B-PRODUCT I-PRODUCT B-COMMENTS_N I-COMMENTS_N I-COMMENTS_N I-COMMENTS_N B-COMMENTS_ADJ O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'B-BANK I-BANK I-BANK I-BANK B-COMMENTS_N I-COMMENTS_N B-COMMENTS_ADJ I-COMMENTS_ADJ O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'O O O O O B-BANK I-BANK I-BANK I-BANK I-BANK

In [19]:
# Index tensor of the batch, one row per sentence.
sample['sentence_tensor']

tensor([[   0,    1,    2,  ..., 2621, 2621, 2621],
        [  63,   64,   65,  ..., 2621, 2621, 2621],
        [  79,   80,   81,  ..., 2621, 2621, 2621],
        ...,
        [  65,  111,   79,  ..., 2621, 2621, 2621],
        [ 251,  252,  140,  ..., 2621, 2621, 2621],
        [ 247,  258,  105,  ..., 2621, 2621, 2621]])

In [17]:
# Shape of the first sentence's mask.
sample['mask_tensor'][0].shape

torch.Size([100])

In [19]:
# Tag-index tensor of the batch, one row per sentence.
sample['label_tensor']

tensor([[1, 2, 0,  ..., 0, 0, 0],
        [3, 4, 0,  ..., 0, 0, 0],
        [1, 2, 2,  ..., 0, 0, 0],
        ...,
        [0, 0, 1,  ..., 0, 0, 0],
        [3, 4, 4,  ..., 0, 0, 0],
        [7, 8, 0,  ..., 0, 0, 0]])

In [None]:
# Names of the virtual tags that open and close every tag sequence.
START_TAG = "<START>"
STOP_TAG = "<STOP>"
# Sizes used for the hand-written BiLSTM-CRF.
EMBEDDING_DIM = 11
HIDDEN_DIM = 6

# Work on a shallow copy of the full training set (take a smaller slice here
# for quick debugging runs); the test set is used as-is.
training_data = training_data_txt[:]
testing_data = testing_data_txt

# ------------------- vocabulary: character -> index ---------------------------
# Each unseen character receives the next free index; training sentences are
# scanned first, then test sentences.
word2idx = {}
for sentence, tags in training_data:
    for word in sentence:
        word2idx.setdefault(word, len(word2idx))
for sentence in testing_data:
    for word in sentence:
        word2idx.setdefault(word, len(word2idx))
# ------------------------------------------------------------------------------

# Tag name -> index, numbered in the order listed; the two virtual tags come last.
tag2idx = {
    tag: idx
    for idx, tag in enumerate((
        "O", "B-BANK", "I-BANK", "B-PRODUCT", 'I-PRODUCT',
        'B-COMMENTS_N', 'I-COMMENTS_N', 'B-COMMENTS_ADJ',
        'I-COMMENTS_ADJ', START_TAG, STOP_TAG,
    ))
}

In [None]:
# Instantiate the hand-written BiLSTM-CRF over the character vocabulary.
model = BiLSTM_CRF(len(word2idx), tag2idx, EMBEDDING_DIM, HIDDEN_DIM)
# Plain SGD over all model parameters, with weight decay.
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

In [None]:
# 检查下模型输入输出
precheck_sent = prepare_sequence(training_data[0][0], word2idx)
precheck_tags = torch.tensor([tag2idx[t] for t in training_data[0][1]], dtype=torch.long)
print(model(precheck_sent))

In [None]:
import time
t = time.time()

# Encode every sample once: the index tensors are the same in every epoch,
# so building them inside the epoch loop repeated identical work 100 times.
# prepare_sequence must already be defined by an earlier cell.
encoded_training_data = [
    (prepare_sequence(sentence, word2idx),
     torch.tensor([tag2idx[tag] for tag in tags], dtype=torch.long))
    for sentence, tags in training_data
]

for epoch in range(100):
    print('the',epoch,' epoch')
    print(f'Time Taken: {round(time.time()-t)} seconds')
    for sentence_in, targets in encoded_training_data:
        # Gradients accumulate in PyTorch, so clear them before each step.
        model.zero_grad()

        # Forward pass: the CRF negative log-likelihood is the loss.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Back-propagate and let the optimizer update the parameters.
        loss.backward()
        optimizer.step()


In [None]:
# After training, inspect a prediction to see whether the model has learned.
# Index -> tag name, derived from tag2idx so the two mappings cannot drift apart.
idx2tag = {idx: tag for tag, idx in tag2idx.items()}
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[4][0], word2idx)
    # Run the model once and reuse the result: each call draws a new random
    # initial hidden state, so a second call could decode a different path
    # from the one printed.
    a = model(precheck_sent)  # (score, tag_seq)
    print(a)
    print('句子为：', ''.join(training_data[4][0]))
    print('实体标注结果为：', ' '.join([idx2tag[i] for i in a[1]]))
