# bi-lstm+crf，使用pytorch-crf库实现crf，可cuda加速。

数据集说明：

1: B-BANK 代表银行实体的开始

2: I-BANK 代表银行实体的内部

3: B-PRODUCT 代表产品实体的开始

4: I-PRODUCT 代表产品实体的内部

5: O 代表不属于标注的范围

6: B-COMMENTS_N 代表用户评论（名词）

7: I-COMMENTS_N 代表用户评论（名词）实体的内部

8: B-COMMENTS_ADJ 代表用户评论（形容词）

9: I-COMMENTS_ADJ 代表用户评论（形容词）实体的内部

In [1]:
import pandas as pd
import sys
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from torchcrf import CRF
torch.manual_seed(1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('{}能用'.format(device))

cuda能用


In [2]:
from sklearn.model_selection import train_test_split
data = pd.read_csv('./train_data_public.csv')
data.drop('Unnamed: 0', axis=1, inplace=True)
train_data, valid_data = train_test_split(data, test_size = 0.2, random_state=42)
train_data.index = list(range(len(train_data)))
valid_data.index = list(range(len(valid_data)))
test_data = pd.read_csv('./test_public.csv')

In [3]:
train_data.info()
valid_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8000 entries, 0 to 7999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        8000 non-null   object
 1   BIO_anno    8000 non-null   object
 2   class       8000 non-null   int64 
 3   bank_topic  6124 non-null   object
dtypes: int64(1), object(3)
memory usage: 312.5+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   text        2000 non-null   object
 1   BIO_anno    2000 non-null   object
 2   class       2000 non-null   int64 
 3   bank_topic  1512 non-null   object
dtypes: int64(1), object(3)
memory usage: 78.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5093 entries, 0 to 5092
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5093 non-null 

In [4]:
train_data.head(3)

Unnamed: 0,text,BIO_anno,class,bank_topic
0,封卡只是鸡的原因吗？,B-COMMENTS_N I-COMMENTS_N O O O O O O O O,2,交通银行
1,销卡后45天后等征信更新再申请,B-COMMENTS_N I-COMMENTS_N O O O O O O B-PRODUC...,2,建设银行
2,有8w多的，招行的,O O O O O O B-BANK I-BANK O,2,


In [5]:
# 把text和标注按单个字分隔开，放进列表
train_data['BIO_anno'] = train_data['BIO_anno'].apply(lambda x:x.split(' '))
valid_data['BIO_anno'] = valid_data['BIO_anno'].apply(lambda x:x.split(' '))
# 将text和标注组合存进元组
train_data['training_data'] = train_data.apply(lambda row: [list(row['text']), row['BIO_anno']], axis=1)
valid_data['validating_data'] = valid_data.apply(lambda row: [list(row['text']), row['BIO_anno']], axis=1)
test_data['testing_data'] = test_data.apply(lambda row: list(row['text']), axis=1)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value inste

In [6]:
num = train_data['training_data'].apply(lambda x:type(x[0])!=type([]))

In [7]:
num.sum()

0

In [8]:
training_data_txt = train_data['training_data'].to_list()
validating_data_txt = valid_data['validating_data'].to_list()
testing_data_txt = test_data['testing_data'].to_list()
print('训练集大小：',len(training_data_txt))
print('验证集大小：',len(validating_data_txt))
print('测试集大小：',len(testing_data_txt))

训练集大小： 8000
验证集大小： 2000
测试集大小： 5093


In [9]:
# 定义一些工具函数

# 句子转idx
def prepare_sequence(seq, word2idx):
    idxs = [word2idx[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def argmax(vec):
    # return the argmax as a python int
    # 返回vec的dim为1维度上的最大值索引
    _, idx = torch.max(vec,axis=1)
    return idx.item()

# Compute log sum exp in a numerically stable way for the forward algorithm
# 前向算法是不断累积之前的结果，这样就会有个缺点
# 指数和累积到一定程度后，会超过计算机浮点值的最大值，变成inf，这样取log后也是inf
# 为了避免这种情况，用一个合适的值clip去提指数和的公因子，这样就不会使某项变得过大而无法计算
# SUM = log(exp(s1)+exp(s2)+...+exp(s100))
#     = log{exp(clip)*[exp(s1-clip)+exp(s2-clip)+...+exp(s100-clip)]}
#     = clip + log[exp(s1-clip)+exp(s2-clip)+...+exp(s100-clip)]
# where clip=max
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [41]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, configs):
        super(BiLSTM_CRF, self).__init__()
        print(configs.word2idx['<PAD>'])
        self.configs = configs
        # if config.embedding_pretrained is not None:
        #     self.embedding = nn.Embedding.from_pretrained(configs.embedding_pretrained,
        #                                                   freeze=False)  # 表示训练过程词嵌入向量会更新
        # else:

        self.embedding = nn.Embedding(configs.vocab_len, configs.embedding_dim,
                                      padding_idx=configs.word2idx['<PAD>'])  # PAD索引填充

        if configs.bidirectional:
            self.num_directions = 2
        else:
            self.num_directions = 1


        self.rnn = nn.LSTM(input_size=configs.embedding_dim,
                           hidden_size=configs.hidden_size,
                           num_layers=configs.num_layers,
                           batch_first=True,
                           bidirectional=configs.bidirectional)

        self.tag2idx = configs.tag2idx

        # 转换参数矩阵 输入i,j是得分从j转换到i
        self.tagset_size = len(self.tag2idx)
        # 将lstm的输出映射到标记空间
        self.hidden2tag = nn.Linear(configs.hidden_size*self.num_directions, self.tagset_size)  # -> (B, num_class+2)  加上了START END
        self.crf = CRF(num_tags=self.tagset_size,batch_first=True)

    def _init_hidden(self, batchs):  # 初始化h_0和c_0 与GRU不同的是多了c_0（细胞状态）
        h_0 = torch.zeros(self.configs.num_layers*self.num_directions, batchs,  self.configs.hidden_size)
        c_0 = torch.zeros(self.configs.num_layers*self.num_directions, batchs, self.configs.hidden_size)
        return self._make_tensor(h_0), self._make_tensor(c_0)

    def _get_lstm_features(self, x):
        # x.shape: (bs, num_words)
        x = self.embedding(x)
        # x.shape: (bs, num_words, embedding_dim)
        h_0, c_0 = self._init_hidden(batchs=x.size(0))
        out, (hidden, c) = self.rnn(x,(h_0, c_0))
        # out.shape: (bs, num_words, hidden_size*2)
        out = self.hidden2tag(out)  # (B,num_directions*hidden_size) -> (B, num_class)
        # out.shape: (bs, num_words, tagset_size)
        return out

    def neg_log_likelihood(self, sentence_tensor=None, label_tensor=None, mask_tensor=None):  # 损失函数
        tags = label_tensor.unsqueeze(0)
        feats = self._get_lstm_features(sentence_tensor)
        return -self.crf(emissions=feats, tags=label_tensor, mask=mask_tensor)


    def _make_tensor(self, tensor):
        # 函数说明： 将传入的tensor转移到cpu或gpu内
        tensor_ret = tensor.to(self.configs.device)
        return tensor_ret


    def forward(self, sentence_tensor=None, mask_tensor=None):
        # 数据预处理时，x被处理成是一个tuple,其内容是: (word, label).
        # x:b_size
        lstm_feats = self._get_lstm_features(sentence_tensor)  # 获取BiLSTM的emission分数

        # Returns: List of list containing the best tag sequence for each batch.
        # 返回列表组成的标签
        out = self.crf.decode(emissions=lstm_feats,
                              mask=mask_tensor)
        return out

In [11]:
from utils.param_configs import Configs
configs = Configs()

# 将训练集汉字使用数字表示
# 为了方便调试，先用100条数据进行训练，调试好后可用全量数据进行训练
# training_data_txt = training_data_txt[:]
# --------------------------建立字典，字: idx-------------------------------------
word2idx = {}
# 训练集的
for sentence, tags in training_data_txt:
    for word in sentence:
        if word not in word2idx:
            word2idx[word] = len(word2idx)

# 验证集的
for sentence, tags in validating_data_txt:
    for word in sentence:
        if word not in word2idx:
            word2idx[word] = len(word2idx)

# 测试集的
testing_data = testing_data_txt
for sentence in testing_data:
    for word in sentence:
        if word not in word2idx:
            word2idx[word] = len(word2idx)

# 加2个特殊字符
word2idx['<UNK>'] = len(word2idx)
word2idx['<PAD>'] = len(word2idx)

configs.word2idx = word2idx
configs.vocab_len = len(word2idx)
# ------------------------------------------------------------------------------

In [12]:
len(training_data_txt[0][0]),len(training_data_txt[0][1])
len(training_data_txt)

8000

In [13]:
from utils.data_process import create_data_loader
train_data_loader = create_data_loader(training_data_txt, configs)
valid_data_loader = create_data_loader(validating_data_txt, configs)
# test_data_loader = create_data_loader(testing_data_txt, configs) # 没有标签的测试集就不这样构建，因为没有label

In [14]:
len(train_data_loader),len(valid_data_loader)

(500, 125)

In [15]:
for sample in train_data_loader:
    print(sample)
    break


{'sentence': ['封 卡 只 是 鸡 的 原 因 吗 ？ <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>', '销 卡 后 4 5 天 后 等 征 信 更 新 再 申 请 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <P

In [16]:
len(train_data_loader)

500

In [17]:

configs.word2idx['封']

0

In [18]:
sample['sentence']

['封 卡 只 是 鸡 的 原 因 吗 ？ <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>',
 '销 卡 后 4 5 天 后 等 征 信 更 新 再 申 请 <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <P

In [19]:
sample['label']


['B-COMMENTS_N I-COMMENTS_N O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'B-COMMENTS_N I-COMMENTS_N O O O O O O B-PRODUCT I-PRODUCT O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'O O O O O O B-BANK I-BANK O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O',
 'O O B-COMMENTS_N I-COMMENTS_N O O O O O O O O O B-COMMENTS_N I-COMMENTS_N O O O O O O O O O O O B-COMMENTS_N I-COMMENTS_N O O O B-PRODUCT I-PRODUCT B-COMMENTS_N I-COMMENTS_N O O O O O O O O O O O O O O O O O O O O O O O B-PRODUCT I-PRODUCT B-COMMENTS_N I-COMMENTS_N O O O O O O O O O O O O O O O O O O O O O O O O O 

In [20]:
sample['sentence_tensor']

tensor([[   0,    1,    2,  ..., 2621, 2621, 2621],
        [  10,    1,   11,  ..., 2621, 2621, 2621],
        [  23,   24,   25,  ..., 2621, 2621, 2621],
        ...,
        [ 198,  199,   49,  ..., 2621, 2621, 2621],
        [ 211,  212,  213,  ..., 2621, 2621, 2621],
        [ 120,  161,  167,  ..., 2621, 2621, 2621]])

In [21]:
sample['mask_tensor'][0]

tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False])

In [22]:
sample['label_tensor'][-6]

tensor([1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0])

# <font color='red'>Model Training

In [23]:
model = BiLSTM_CRF(configs).to(device)
# 优化器
optimizer = optim.Adam(model.parameters(), lr=0.001)
# 学习率指数衰减  每个epoch: lr = gamma*lr
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
# 损失函数。这个倒是不用定义了，因为pytorch-crf的crf自带loss的计算
# loss_fn = nn.CrossEntropyLoss()

2621


In [42]:
model = BiLSTM_CRF(configs).to(device)
for sample in train_data_loader:
    print(sample.keys())
    sentence_tensor = sample['sentence_tensor'].to(device)
    mask_tensor = sample['mask_tensor'].to(device)
    label_tensor = sample['label_tensor'].to(device)
    out = model(sentence_tensor, mask_tensor)
    loss = model.neg_log_likelihood(sentence_tensor=sentence_tensor,
                                    label_tensor=label_tensor,
                                    mask_tensor=mask_tensor)
    break

2621
dict_keys(['sentence', 'label', 'sentence_tensor', 'label_tensor', 'mask_tensor'])


In [25]:
configs.idx2tag

{0: 'O',
 1: 'B-BANK',
 2: 'I-BANK',
 3: 'B-PRODUCT',
 4: 'I-PRODUCT',
 5: 'B-COMMENTS_N',
 6: 'I-COMMENTS_N',
 7: 'B-COMMENTS_ADJ',
 8: 'I-COMMENTS_ADJ',
 9: '<START>',
 10: '<STOP>'}

In [43]:
loss

tensor(1182.7781, device='cuda:0', grad_fn=<NegBackward>)

In [34]:
sample['sentence_tensor'][2]

tensor([  23,   24,   25,   26,    5,   27,   28,   29,    5, 2621, 2621, 2621,
        2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621,
        2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621,
        2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621,
        2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621,
        2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621,
        2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621,
        2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621, 2621,
        2621, 2621, 2621, 2621])

In [36]:
predict_tag = []
for l in out:
    temp = []
    for i in l:
        temp.append(configs.idx2tag[i])
    predict_tag.append(temp)


In [37]:
predict_tag

[['I-BANK',
  'O',
  'B-BANK',
  '<STOP>',
  'B-COMMENTS_ADJ',
  'I-COMMENTS_N',
  'I-COMMENTS_N',
  'I-COMMENTS_N',
  'I-COMMENTS_N',
  'B-COMMENTS_ADJ'],
 ['I-COMMENTS_N',
  'I-COMMENTS_N',
  'I-COMMENTS_N',
  'I-BANK',
  '<STOP>',
  '<STOP>',
  '<STOP>',
  '<STOP>',
  '<STOP>',
  'I-COMMENTS_N',
  'I-PRODUCT',
  '<STOP>',
  'I-COMMENTS_ADJ',
  'B-BANK',
  '<STOP>'],
 ['I-COMMENTS_N',
  'B-PRODUCT',
  'I-BANK',
  'I-BANK',
  'I-BANK',
  'I-BANK',
  '<START>',
  'I-PRODUCT',
  'B-COMMENTS_N'],
 ['<STOP>',
  '<STOP>',
  '<STOP>',
  '<STOP>',
  '<STOP>',
  '<STOP>',
  'B-COMMENTS_ADJ',
  'B-PRODUCT',
  'B-COMMENTS_ADJ',
  'I-COMMENTS_N',
  'B-COMMENTS_ADJ',
  'I-COMMENTS_N',
  'I-COMMENTS_N',
  'I-COMMENTS_N',
  'I-COMMENTS_N',
  'B-PRODUCT',
  'I-BANK',
  '<START>',
  'I-BANK',
  'B-COMMENTS_ADJ',
  'B-COMMENTS_N',
  '<STOP>',
  '<STOP>',
  '<STOP>',
  '<STOP>',
  'B-COMMENTS_ADJ',
  'O',
  'O',
  'I-BANK',
  'O',
  '<STOP>',
  '<STOP>',
  'B-COMMENTS_ADJ',
  'I-COMMENTS_N',
  'I-COMME