## 数据处理

读取数据

In [1]:
import torch
import numpy as np
import random
import re
def readfile(filename):
    """Read a one-token-per-line tagged corpus into sentences and label lists.

    Each non-blank line is split on single spaces; the first field is taken as
    the token (lower-cased) and the last field as its label. Blank lines and
    lines starting with ``-DOCSTART`` end the current sentence.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists. ``sentences[k]``
        is the list of lower-cased tokens of sentence ``k`` and ``labels[k]``
        the list of label strings for the same positions.
    """
    sentences = []
    labels = []
    sentence = []
    label = []
    # The context manager guarantees the handle is closed, even on error.
    with open(filename) as f:
        for line in f:
            # A sentence boundary: flush whatever has been collected so far.
            if not line.strip('\n') or line.startswith('-DOCSTART'):
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                continue
            splits = line.rstrip('\n').split(' ')
            sentence.append(splits[0].lower())
            label.append(splits[-1])

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
        labels.append(label)
    return sentences, labels

In [2]:
# Read the three splits; every readfile call yields a (sentences, labels) pair.
(
    (train_sentences, train_labels),
    (val_sentences, val_labels),
    (test_sentences, test_labels),
) = map(readfile, ('Data/train.txt', 'Data/valid.txt', 'Data/test.txt'))

构建词表

In [None]:
def get_vocab(data):
    """Collect the distinct words that occur in a collection of sentences.

    Args:
        data: iterable of sentences, each an iterable of hashable words.

    Returns:
        A ``set`` containing every word seen at least once.
    """
    return {word for text in data for word in text}

In [None]:
# The vocabulary is built over all three splits together.
vocab = get_vocab([*train_sentences, *val_sentences, *test_sentences])

In [None]:
# Notebook display: number of distinct tokens collected into `vocab`.
len(vocab)

In [None]:
# Real words are numbered from 1 in the iteration order of `vocab`;
# index 0 is reserved for the '<unk>' placeholder in both directions.
word_to_idx = dict(zip(vocab, range(1, len(vocab) + 1)))
word_to_idx['<unk>'] = 0
idx_to_word = dict(enumerate(vocab, start=1))
idx_to_word[0] = '<unk>'

文本数据序列化

In [None]:
def text_to_sequence(data):
    """Convert tokenized sentences into lists of vocabulary indices.

    Args:
        data: iterable of sentences, each an iterable of word strings.

    Returns:
        A list with one list of integer indices per sentence, looked up in the
        module-level ``word_to_idx`` mapping. A word that is missing from the
        mapping is encoded with the index of the reserved ``'<unk>'`` entry
        instead of raising ``KeyError``.
    """
    unk_idx = word_to_idx['<unk>']
    return [[word_to_idx.get(word, unk_idx) for word in line] for line in data]

In [None]:
# Encode the tokens of every split as vocabulary indices.
train_data, val_data, test_data = map(
    text_to_sequence, (train_sentences, val_sentences, test_sentences))

In [None]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
# Tag ids follow the position in this tuple: the B-/I- entity tags and "O"
# first, then the two sentinel tags.
tag_to_ix = {
    tag: ix
    for ix, tag in enumerate((
        "B-PER", "B-LOC", "B-ORG", "B-MISC",
        "I-PER", "I-LOC", "I-ORG", "I-MISC",
        "O",
        START_TAG, STOP_TAG,
    ))
}
# Reverse lookup: tag id -> tag string.
idx2Label = {ix: tag for tag, ix in tag_to_ix.items()}

In [None]:
def label_to_sequence(data):
    """Convert per-sentence tag strings into per-sentence tag ids.

    Args:
        data: iterable of sentences, each an iterable of tag strings that are
            keys of the module-level ``tag_to_ix`` mapping.

    Returns:
        A list with one list of integer tag ids per sentence.

    Raises:
        KeyError: if a tag is not present in ``tag_to_ix``.
    """
    return [[tag_to_ix[tag] for tag in line] for line in data]

In [None]:
# Encode the tag strings of every split as tag ids.
y_train, y_val, y_test = map(
    label_to_sequence, (train_labels, val_labels, test_labels))

划分数据batch，将长度一致的sentence放在一个batch里

In [None]:
def createBatches(data,labels):
    """Reorder sentences so that sentences of equal length are contiguous.

    Sentences are grouped by length in a single pass (the previous version
    rescanned the whole dataset once per distinct length). Groups are emitted
    in ascending length order, and sentences keep their original relative
    order inside a group.

    Args:
        data: list of sentences (each a sequence).
        labels: list parallel to ``data``; ``labels[j]`` belongs to ``data[j]``.

    Returns:
        A ``(batches, target, batch_len)`` tuple:
        ``batches`` is the reordered list of sentences, ``target`` the label
        lists in the same order, and ``batch_len`` holds the cumulative end
        offset of each length group, so group ``k`` occupies
        ``batches[batch_len[k-1]:batch_len[k]]`` (starting at 0 for ``k == 0``).
    """
    # length -> (sentences of that length, their labels)
    groups = {}
    for j, sentence in enumerate(data):
        group_x, group_y = groups.setdefault(len(sentence), ([], []))
        group_x.append(sentence)
        group_y.append(labels[j])

    batches = []
    target = []
    batch_len = []
    for length in sorted(groups):
        group_x, group_y = groups[length]
        batches.extend(group_x)
        target.extend(group_y)
        batch_len.append(len(batches))
    return batches,target,batch_len

In [None]:
# Regroup every split by sentence length. Note that y_train / y_val / y_test are
# both an input and an output here: each name is rebound to the reordered label
# list. Re-running this cell without first re-running the label-encoding cell
# would therefore pair the sentences with labels that were already reordered.
X_train,y_train,train_batch_len=createBatches(train_data,y_train)
X_val,y_val,val_batch_len=createBatches(val_data,y_val)
X_test,y_test,test_batch_len=createBatches(test_data,y_test)

加载词向量

In [None]:
import torch
import gensim
# One-off conversion recipe (kept for reference) that turns the GloVe text file
# into the word2vec text format loaded below:
#from gensim.test.utils import datapath, get_tmpfile
#from gensim.models import KeyedVectors
# Existing GloVe word vectors
#glove_file = datapath("D:/NLP Programme/NLP_beginner_TASK_4/glove.6B.50d.txt")
# Location of the file after conversion to word2vec format
#tmp_file = get_tmpfile("D:/NLP Programme/NLP_beginner_TASK_4/task4_word2vec.txt")
#from gensim.scripts.glove2word2vec import glove2word2vec
#glove2word2vec(glove_file, tmp_file)
wvmodel = gensim.models.KeyedVectors.load_word2vec_format('task4_word2vec.txt', binary=False, encoding='utf-8')
# One extra row because index 0 is reserved for '<unk>' in word_to_idx.
vocab_size = len(vocab) + 1
embed_size = 50
# Rows stay all-zero for '<unk>' and for words without a pretrained vector.
weight = np.zeros((vocab_size, embed_size))
for word in vocab:
    # Membership is tested on the KeyedVectors object itself: the `.vocab`
    # attribute used previously no longer exists in gensim 4, while `in`
    # works on both gensim 3.x and 4.x.
    if word in wvmodel:
        weight[word_to_idx[word],:] = wvmodel[word]
weight = torch.from_numpy(weight)

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
from torchcrf import CRF

class LSTM_CRF(nn.Module):
    """Sequence tagger: embedding -> single-layer LSTM -> linear -> CRF.

    The CRF layer comes from the ``torchcrf`` package imported by this file.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim,embedding_matrix):
        """Build the layers and load the pretrained embedding weights.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping from tag string to tag id; its size is used as
                the number of tags.
            embedding_dim: width of the word embeddings.
            hidden_dim: hidden size of the LSTM.
            embedding_matrix: tensor copied into the embedding weights; it must
                be copy-compatible with a (vocab_size, embedding_dim) weight.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        # Embedding table initialised from the pretrained matrix.
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeds.weight.data.copy_(embedding_matrix)

        # Unidirectional, single-layer LSTM over batch-first input.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=1,
            batch_first=True,
        )

        # Projects every LSTM state onto one score per tag.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)
        self.crf = CRF(self.tagset_size, batch_first=True)

    def forward(self, sentence,tags):
        """Score the given tags and decode the best tag sequence.

        Args:
            sentence: tensor of word indices fed to the embedding layer.
            tags: tensor of reference tag ids passed to the CRF layer.

        Returns:
            A ``(loss, tag_seq)`` tuple: the negated value returned by the CRF
            layer for ``tags`` (presumably the log-likelihood, which would make
            this the negative log-likelihood; confirm against the torchcrf
            docs), and the result of ``CRF.decode`` on the same emissions.
        """
        hidden_states, _ = self.lstm(self.word_embeds(sentence))
        emissions = self.hidden2tag(hidden_states)
        crf_value = self.crf(emissions, tags)
        best_paths = self.crf.decode(emissions)
        return -crf_value, best_paths

In [None]:
def compute_precision(predictions, correct):
    """Compute chunk-level precision of ``predictions`` against ``correct``.

    A chunk is a ``B-*`` tag followed by any run of ``I-*`` tags in the
    predicted sequence. It counts as correctly found only if every tag of the
    chunk equals the reference tag at the same position and the reference
    chunk does not continue (with an ``I-*`` tag) right after it.

    Calling the function with the two arguments swapped yields recall.

    Args:
        predictions: list of sentences, each a list of predicted tag ids.
        correct: list of sentences, each a list of reference tag ids; every
            sentence must be as long as its predicted counterpart.

    Returns:
        ``correctly found chunks / predicted chunks`` as a float, or ``0`` when
        no chunk was predicted at all.
    """
    pred_tags = [[idx2Label[tag_id] for tag_id in seq] for seq in predictions]
    gold_tags = [[idx2Label[tag_id] for tag_id in seq] for seq in correct]

    n_hits = 0
    n_chunks = 0

    for sent_no in range(len(pred_tags)):
        guess = pred_tags[sent_no]
        gold = gold_tags[sent_no]
        assert (len(guess) == len(gold))
        length = len(guess)
        pos = 0
        while pos < length:
            # Only a B- tag opens a predicted chunk.
            if guess[pos][0] != 'B':
                pos += 1
                continue
            n_chunks += 1

            # A wrong first tag makes the whole chunk wrong; move on.
            if guess[pos] != gold[pos]:
                pos += 1
                continue

            pos += 1
            matched = True
            # Walk through the I- continuation of the predicted chunk.
            while pos < length and guess[pos][0] == 'I':
                if guess[pos] != gold[pos]:
                    matched = False
                pos += 1

            # The reference chunk runs on past the predicted one.
            if pos < length and gold[pos][0] == 'I':
                matched = False

            if matched:
                n_hits += 1

    if n_chunks > 0:
        return float(n_hits) / n_chunks
    return 0

In [None]:
def get_batch(data,label,batch_len,i):
    """Slice length group ``i`` out of the grouped data and tensorize it.

    Args:
        data: list of index sequences ordered so that each group is contiguous.
        label: list of tag-id sequences in the same order as ``data``.
        batch_len: cumulative end offsets of the groups.
        i: index of the group to fetch.

    Returns:
        A ``(x, y)`` pair of ``torch.long`` tensors built from
        ``data[start:stop]`` and ``label[start:stop]``, where ``stop`` is
        ``batch_len[i]`` and ``start`` is ``batch_len[i-1]`` (0 for ``i == 0``).
    """
    start = batch_len[i - 1] if i != 0 else 0
    stop = batch_len[i]
    x = torch.tensor(data[start:stop], dtype=torch.long)
    y = torch.tensor(label[start:stop], dtype=torch.long)
    return x, y

In [None]:
# Train for 30 epochs, evaluate on the validation split after every epoch and
# keep the weights of the epoch with the best chunk-level F1 in "lstm_crf.pth".
model = LSTM_CRF(vocab_size, tag_to_ix, 50, 50,weight)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
best_f1 = -1
merics = None  # [precision, recall, f1] of the best epoch so far
for epoch in range(30):
    model.train()
    # One optimisation step per length group (all sentences of one length).
    for i in range(len(train_batch_len)):
        model.zero_grad()
        x,y = get_batch(X_train,y_train,train_batch_len,i)
        loss,y_pred = model(x,y)
        loss.backward()
        optimizer.step()

    model.eval()
    predictions = []
    correct = []
    # No gradients are needed for evaluation; skipping them saves memory.
    with torch.no_grad():
        for i in range(len(val_batch_len)):
            x,y = get_batch(X_val,y_val,val_batch_len,i)
            _,y_pred = model(x,y)
            predictions += y_pred
            correct += y.tolist()
    precision = compute_precision(predictions,correct)
    # Recall is precision with the roles of prediction and reference swapped.
    recall = compute_precision(correct,predictions)
    # Guard against 0/0 when no chunk is predicted or matched at all.
    f1 = 2.0 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    if f1 > best_f1:
        best_f1 =f1
        merics = [precision,recall, f1]
        torch.save(model.state_dict(), "lstm_crf.pth")
    print('epoch: {}, precision: {:.6f}, recall: {:.6f}, f1: {:.6f}'.format(epoch,precision,recall, f1))
print(merics)
    
    

In [None]:
# Evaluate the best checkpoint on the test split.
# The path matches the file written by torch.save in the training cell.
model.load_state_dict(torch.load("lstm_crf.pth"))
model.eval()
# Start from empty lists: reusing the lists left over from the last validation
# pass would mix validation sentences into the test metrics.
predictions = []
correct = []
with torch.no_grad():
    for i in range(len(test_batch_len)):
        x,y = get_batch(X_test,y_test,test_batch_len,i)
        _,y_pred = model(x,y)
        predictions += y_pred
        correct += y.tolist()
precision = compute_precision(predictions,correct)
# Recall is precision with the roles of prediction and reference swapped.
recall = compute_precision(correct,predictions)
# Guard against 0/0 when no chunk is predicted or matched at all.
f1 = 2.0 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
print('testb:precision: {:.6f}, recall: {:.6f}, f1: {:.6f}'.format(precision,recall, f1))

#### My version(unfinished)

In [None]:
class LSTM_CRF(nn.Module):
    """LSTM tagger with a hand-written, batched CRF layer.

    All CRF routines work on emission scores of shape
    ``(batch, seq_len, tagset_size)`` in which every sentence of the batch has
    the same length. ``tag_to_ix`` must contain the module-level ``START_TAG``
    and ``STOP_TAG`` keys.
    """

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim,embedding_matrix):
        """Build the layers and load the pretrained embedding weights.

        Args:
            vocab_size: number of rows of the embedding table.
            tag_to_ix: mapping from tag string to tag id, including the start
                and stop tags.
            embedding_dim: width of the word embeddings.
            hidden_dim: hidden size of the LSTM.
            embedding_matrix: tensor copied into the embedding weights.
        """
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeds.weight.data.copy_(embedding_matrix)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=1,batch_first = True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

    def _forward_alg(self, feats):
        """Return the batch-averaged log partition function.

        Args:
            feats: emission scores of shape (batch, seq_len, tagset_size).

        Returns:
            A scalar tensor: the mean over the batch of the log-sum-exp of the
            scores of all tag paths.
        """
        batch_size, seq, _ = feats.shape
        # new_full keeps the dtype and device of the emissions.
        forward_var = feats.new_full((batch_size, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        forward_var[:, self.tag_to_ix[START_TAG]] = 0.

        for i in range(seq):
            emit_score = feats[:, i, :].unsqueeze(2)  # (batch, next, 1)
            # tag_var[b, next, prev] = alpha[b, prev] + trans[next, prev] + emit[b, next]
            tag_var = forward_var.unsqueeze(1) + self.transitions + emit_score
            # logsumexp is the numerically stable form of log(sum(exp(.))).
            forward_var = torch.logsumexp(tag_var, dim=2)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = torch.logsumexp(terminal_var, dim=1).sum()

        return alpha / batch_size

    def _get_lstm_features(self, sentence):
        """Return the emission scores produced by embedding, LSTM and linear layer."""
        embeds = self.word_embeds(sentence)
        lstm_out,_= self.lstm(embeds)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """Return the batch-averaged score of the provided tag sequences.

        Args:
            feats: emission scores of shape (batch, seq_len, tagset_size).
            tags: long tensor of tag ids of shape (batch, seq_len).

        Returns:
            A scalar tensor: emission plus transition scores (including the
            transitions from START_TAG and to STOP_TAG), summed over the batch
            and divided by the batch size.
        """
        batch_size = feats.shape[0]
        start_ix = self.tag_to_ix[START_TAG]
        stop_ix = self.tag_to_ix[STOP_TAG]

        # Emission score of the given tag at every position.
        emit = feats.gather(2, tags.unsqueeze(2)).sum()
        # START -> first tag, tag[i] -> tag[i+1], last tag -> STOP.
        trans = self.transitions[tags[:, 0], start_ix].sum()
        trans = trans + self.transitions[tags[:, 1:], tags[:, :-1]].sum()
        trans = trans + self.transitions[stop_ix, tags[:, -1]].sum()
        return (trans + emit) / batch_size

    def _viterbi_decode(self, feats):
        """Find the highest-scoring tag path of every sentence in the batch.

        Args:
            feats: emission scores of shape (batch, seq_len, tagset_size).

        Returns:
            A ``(path_score, best_paths)`` tuple: a tensor of shape (batch,)
            with the score of each best path, and a list holding one list of
            tag ids per sentence.
        """
        batch_size, seq, _ = feats.shape
        start_ix = self.tag_to_ix[START_TAG]
        stop_ix = self.tag_to_ix[STOP_TAG]

        # Initialize the viterbi variables in log space
        forward_var = feats.new_full((batch_size, self.tagset_size), -10000.)
        forward_var[:, start_ix] = 0.

        backpointers = []
        for i in range(seq):
            # next_tag_var[b, next, prev]: best score ending in `prev` at the
            # previous step plus the transition prev -> next. The emission is
            # added afterwards because the max over `prev` does not depend on it.
            next_tag_var = forward_var.unsqueeze(1) + self.transitions
            best_scores, best_prev = next_tag_var.max(dim=2)
            backpointers.append(best_prev)
            forward_var = best_scores + feats[:, i, :]

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[stop_ix]
        path_score, best_tag = terminal_var.max(dim=1)

        # Follow the back pointers to decode the best path.
        best_path = [best_tag]
        for bptrs_t in reversed(backpointers):
            best_tag = bptrs_t.gather(1, best_tag.unsqueeze(1)).squeeze(1)
            best_path.append(best_tag)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert bool((start == start_ix).all())  # Sanity check
        best_path.reverse()
        return path_score, torch.stack(best_path, dim=1).tolist()

    def neg_log_likelihood(self, sentence, tags):
        """Return the batch-averaged negative log-likelihood of ``tags``."""
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        """Decode the best tag paths for a batch of equally long sentences.

        Args:
            sentence: long tensor of word indices of shape (batch, seq_len).

        Returns:
            The ``(path_score, best_paths)`` tuple of ``_viterbi_decode``.
        """
        # Get the emission scores from the LSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq