In [1]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import re

data_train_dir='Data/train.txt'
data_dev_dir='Data/valid.txt'
data_test_dir='Data/test.txt'
embedding_dir='Data/glove.6B.100d.txt'
worddict_dir='Data/worddict.txt'

In [2]:
def read_data(data_dir):
    with open(data_dir,'r',encoding='utf-8')as f:
        sentences=[]
        sentence=[]
        labels=[]
        label=[]
        for line in f:
            if len(line) == 0 or line.startswith('-DOCSTART') or line[0] == "\n":
                if len(sentence) > 0:
                    sentences.append(sentence)
                    labels.append(label)
                    label = []
                    sentence = []
                continue
            line = re.sub('\n','',line)
            splits = line.split(' ')
            if len(splits[0])<20:
                sentence.append(splits[0])
                label.append(splits[-1])
        
    return {"sentences":sentences,
            "labels":labels
           }


def build_embedding_map(embedding_dir):
    with open(embedding_dir,'r',encoding='utf-8')as f:
        embed_map={}
        word_embeddings=[]
        for lines in f:
            lines=lines.strip().split(" ")
            word=lines[0]
            if(word) not in embed_map:
                embed_map[word]=lines[1:]
    return embed_map

def build_worddict(data_str,embedding_map):
    words=[]
    words.extend(["_PAD_","_OOV_","_BOS_","_EOS_"])
    for sentence in data_str["sentences"]:
        words.extend([word.lower() for word in sentence])
    for i,embed_word in enumerate(embedding_map):
        words.append(embed_word)
    word_id={}
    for index,word in enumerate(words):
        if word not in word_id:
            word_id[word]=index
    with open(worddict_dir, "w",encoding='utf-8') as f:
        for word in word_id:
            f.write("%s\t%d\n"%(word, word_id[word]))
            
    return word_id

def sentence2id(sentence,word_id):
    sentence_id=[]
    sentence_id.append(word_id["_BOS_"])
    for word in sentence:
        word=word.lower()
        if word in word_id:
            sentence_id.append(word_id[word])
        else:
            sentence_id.append(word_id["_OOV_"])
    sentence_id.append(word_id["_EOS_"])
    return sentence_id

def data2id(data_str,wordid,labelid):
    sentences_id=[]
    labels_id=[]
    char_level_id=[]
    for i,seq in enumerate(data_str["sentences"]):
        sentence_id=[]
        label_id=[]
        chars_id=[[95,66,79,83,95]]
        #sentence_id
        sentence_id.extend(sentence2id(seq,word_id))
        #char_level_id
        for word in data_str["sentences"][i]:
            char_id=[]
            for char in word:
                char_id.append(ord(char))
            chars_id.append(char_id)
        chars_id.append([95,69,79,83,95])
         #label_id   
        label_id.append(labelid["_BOS_"])
        for label in data_str["labels"][i]:
            label_id.append(labelid[label])
        label_id.append(labelid["_EOS_"])
        
        sentences_id.append(sentence_id)
        char_level_id.append(chars_id)
        labels_id.append(label_id)
    return {"sentences_id":sentences_id,
            "char_level_id":char_level_id,
            "labels_id":labels_id
           }

def build_embed_matrix(embed_map,word_id):
    vocab_size=len(word_id)
    embed_dim=len(embed_map["a"])
    matrix=np.zeros((vocab_size,embed_dim))
    missed_cnt=0
    for index,word in enumerate(word_id):
        if word in embed_map:
            matrix[index]=embed_map[word]
        else:
            if word=="_PAD_":
                continue
            else:
                missed_cnt+=1
                matrix[index]=np.random.normal(size=embed_dim)
    print("missed word count: %d"%(missed_cnt)) 
    return matrix

In [3]:
data_train_str=read_data(data_train_dir)
data_dev_str=read_data(data_dev_dir)
embedding_map=build_embedding_map(embedding_dir)
word_id=build_worddict(data_train_str,embedding_map)

In [4]:
label_idx= {"_BOS_": 0,"B-PER": 1, "B-LOC": 2, "B-ORG": 3, "B-MISC" : 4,
"I-PER": 5, "I-LOC": 6, "I-ORG": 7, "I-MISC": 8, "O":9 , "_EOS_": 10}
data_train_id=data2id(data_train_str,word_id,label_idx)
data_dev_id=data2id(data_dev_str,word_id,label_idx)
embedding_matrix=build_embed_matrix(embedding_map,word_id)
print("embedding_matrix size: %d"%len(embedding_matrix))

missed word count: 2579
embedding_matrix size: 402580


In [5]:
class CONLLDataset(Dataset):
    def __init__(self,data,max_word_len,max_sentence_len=None):
        self.seq_number=len(data["sentences_id"])
        self.sentence_lens=[len(seq) for seq in data["sentences_id"]]
        self.max_sentence_len=max_sentence_len
        self.max_word_len=max_word_len
        if self.max_sentence_len==None:
            self.max_sentence_len=max(self.sentence_lens)
        print(self.seq_number,self.max_sentence_len,self.max_word_len)
        self.data={"sentence":torch.zeros((self.seq_number,self.max_sentence_len),
                                          dtype=torch.long),
                   "label":torch.zeros((self.seq_number,self.max_sentence_len),
                                          dtype=torch.long),
                   "char_level":torch.zeros((self.seq_number,self.max_sentence_len,
                                            self.max_word_len),dtype=torch.long)
                  }
        for i,seq in enumerate(data["sentences_id"]):
            l=len(seq)
            self.data["sentence"][i][:l]=torch.tensor(data["sentences_id"][i][:l])
            self.data["label"][i][:l]=torch.tensor(data["labels_id"][i][:l])
            for j,word in enumerate(data["char_level_id"][i]):
                w=len(word)
                self.data["char_level"][i][j][:w]=\
                     torch.tensor(data["char_level_id"][i][j][:w])
            
    def __len__(self):
        return self.seq_number
    
    def __getitem__(self,index):
        return {"sentence":self.data["sentence"][index],
                "label":self.data["label"][index],
                "char_level":self.data["char_level"][index],
                "length":min(self.sentence_lens[index], self.max_sentence_len)
        }

In [6]:
batch_size=128
hidden_size=128
dropout=0.5
lr=0.001
epochs=10
max_grad_norm=10.0
use_gpu=False
if torch.cuda.is_available():
    use_gpu=True
device=torch.device("cuda:0" if use_gpu else "cpu")


In [7]:
print(device)

cuda:0


In [8]:
train_dataset=CONLLDataset(data_train_id,20,max_sentence_len=None)
train_loader=DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
dev_dataset=CONLLDataset(data_dev_id,20,max_sentence_len=None)
dev_loader=DataLoader(dev_dataset,batch_size=batch_size,shuffle=True)

14026 115 20
3250 111 20


In [50]:
class char_cnn(nn.Module):
    def __init__(self,embed_size,num_filters=None,filter_size=None,
                device="cpu"):
        super(char_cnn,self).__init__()
        self.char_embed_size=embed_size
        self.num_filters=num_filters
        self.filter_size=filter_size
        if self.num_filters==None:
            self.num_filters=10
        if self.filter_size==None:
            self.filter_size=2
        print(self.char_embed_size,self.num_filters,self.filter_size)
        self.char_embedding=nn.Embedding(256,self.char_embed_size,padding_idx=0)
        self.char_embedding.weight.requires_grad = False
        self.conv_block=nn.Conv2d(1,self.num_filters,
                                  (self.filter_size,103),padding=1)
        self.maxpool=nn.MaxPool2d((21,num_filters))
        self.dropout=nn.Dropout(p=0.5)
        
        
    def forward(self,word_batch):
        embeds=self.char_embedding(word_batch)
        embeds=embeds.unsqueeze(1)
        conv_out=self.conv_block(embeds)
        conv_out=conv_out.transpose(1,2)
        conv_out=conv_out.reshape(conv_out.size()[0],conv_out.size()[1],-1)
        maxpool_out=self.maxpool(conv_out)
        #char_out=self.dropout(max_poolout)
        return maxpool_out

In [54]:
#  Code widely inspired from:
#  https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html
class CRF(nn.Module):
    def __init__(self,label_idx,device="cpu"):
        super(CRF,self).__init__()
        self.label_idx=label_idx
        self.label_size=len(label_idx)
        self.device=device 
        self.transitions = nn.Parameter(
             torch.randn(self.label_size, self.label_size)).to(self.device)
        self.transitions.data[label_idx["_BOS_"], :] = -10000.0
        self.transitions.data[:, label_idx["_EOS_"]] = -10000.0
        
            
    def _forward_alg(self,feats):# 本质上计算信息传递矩阵
        init_alphas=torch.full((1,self.label_size),-10000.).to(self.device)
        init_alphas[0][label_idx["_BOS_"]]=0.0
        forward_var = init_alphas
        for feat in feats:
            alphas_t=[]
            for next_label in range(self.label_size):
                emit_score=feat.view(1,-1).to(self.device)
                trans_score=self.transitions[next_label].view(1,-1).to(self.device)
                next_tag_var = forward_var + trans_score + emit_score
                alphas_t.append(torch.logsumexp(next_tag_var,dim=1))
            forward_var=torch.tensor(alphas_t).view(1,-1).to(self.device)
        #print(forward_var.shape)    
        terminal_var = forward_var + self.transitions[self.label_idx["_EOS_"]]
        alpha = torch.logsumexp(terminal_var,dim=1)
        return alpha
    
    def _sentence_score(self,feats,labels):
        score = torch.zeros(1).to(self.device)
        tags= torch.tensor(self.label_idx["_BOS_"]).view(1,-1).to(self.device)
        tags = torch.cat((tags,labels),dim=1).view(-1)
        for i, feat in enumerate(feats):
            score = score+self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.label_idx["_EOS_"], tags[-1]]
        return score
    
    def neg_log_likelihood(self,feats,labels):
        forward_score=self._forward_alg(feats).to(device)
        gold_score=self._sentence_score(feats,labels)
        return forward_score-gold_score
    
    def _viterbi_decode(self, feats):
        """维特比解码，给定输入x和相关参数(发射矩阵和转移矩阵)，或者概率最大的标签序列
        """
        backpointers = []
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.label_idx["_BOS_"]] = 0
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  
            viterbivars_t = []  
            for next_tag in range(self.tagset_size):
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = torch.argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)
        terminal_var = forward_var + self.transitions[self.label_idx["_EOS_"]]
        best_tag_id = torch.argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.label_idx["_BOS_"]  # Sanity check
        best_path.reverse()
        return path_score, best_path

In [58]:
class BiLSTM_CNN_CRF(nn.Module):
    def __init__(self,vocab_size,embedding_dim,label_idx,hidden_size,
                 embed_matrix=None,dropout=0.5,device="cpu"):
        super(BiLSTM_CNN_CRF,self).__init__()
        self.vocab_size=vocab_size
        self.embedding_dim=embedding_dim
        self.label_idx=label_idx
        self.label_size=len(label_idx)
        self.hidden_size=hidden_size
        #self.dropout=dropout
        self.device=device
        self.word_embedding=nn.Embedding(self.vocab_size,self.embedding_dim,
                                        padding_idx=0)
        self.word_embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.word_embedding.weight.requires_grad = False
        self.char_embedding=char_cnn(embed_size=200,num_filters=10,
                                     filter_size=2)
        self.encoder=nn.LSTM(input_size=2*self.embedding_dim,
                             hidden_size=self.hidden_size//2,
                             bidirectional=True
                            )
        self.hidden=(torch.randn(2, 1, self.hidden_size // 2).to(self.device),
                     torch.randn(2, 1, self.hidden_size // 2).to(self.device))
        self.hidden2label=nn.Linear(self.hidden_size,self.label_size)
        self.crf=CRF(label_idx,self.device)
        
        
    def _get_features(self,sentence,word_batch,sentence_len):
        word_batch=word_batch.squeeze(0)[0:sentence_len,:]
        word_batch=word_batch.to(self.device)
        char_embed=self.char_embedding(word_batch) #sentence_len*1*100
        sentence=sentence[:,0:sentence_len]
        sentence=sentence.to(self.device)
        word_embed=self.word_embedding(sentence) #1*sentence_len*100
        word_embed=word_embed.transpose(0,1)    #sentence_len * 1 *100
        embed=torch.cat((word_embed,char_embed),dim=2)
        bilstm_output,hidden_n=self.encoder(embed,self.hidden)
        # bilstm_output= seq_len* 1 * hidden_size
        bilstm_output=bilstm_output.squeeze(1)
        bilstm_feats=self.hidden2label(bilstm_output)
        return bilstm_feats
    
    def loss_function(self,label,sentence,word_batch,sentence_len):
        label=label.to(self.device)
        word_batch=word_batch.to(self.device)
        sentence_=sentence.to(self.device)
        feats=self._get_features(sentence,word_batch,sentence_len)
        feats=feats.to(self.device)
        loss=self.crf.neg_log_likelihood(feats,label)
        return loss
    
    def forward(self,label_idx,sentence,word_batch,sentence_len):
        feats=self._get_features(sentence,word_batch,sentence_len)
        score,labels=self.crf._viterbi_decode(feats)
        return score,labels

In [59]:
model=BiLSTM_CNN_CRF(embedding_matrix.shape[0],
                     embedding_matrix.shape[1],
                    label_idx,hidden_size,embed_matrix=embedding_matrix,
                    dropout=dropout,device=device
                    ).to(device)
optimizer=torch.optim.Adam(model.parameters(),lr=lr)

200 10 2


In [None]:
for epoch in range(epochs):
    model.train()
    for i,batch in enumerate(train_loader):
        sentence=batch["sentence"]
        label=batch["label"]
        word_batch=batch["char_level"]
        sentence_len=batch["length"]
        optimizer.zero_grad()
        loss=model.loss_function(label,sentence,word_batch,sentence_len)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        if i % 100==0:
            print("i:{:d}, loss:{:.4f}".format(i,loss.item()))
    print("epoch is:",epoch)


i:0, loss:10009.0293
i:100, loss:10018.4570
i:200, loss:10011.3672
i:300, loss:10029.8320
i:400, loss:10089.9365
i:500, loss:10091.7617
i:600, loss:10149.0400
i:700, loss:10078.2080
i:800, loss:10327.1816
i:900, loss:10108.0059
i:1000, loss:10126.9355
i:1100, loss:10148.5762
i:1200, loss:10696.4688
i:1300, loss:10080.7803
i:1400, loss:10088.2148
