# 用RNN做POS tagging

姓名: 蒋一泽

在这份作业中，你会用一个bidirectional recurrent neural network来做POS tagging。

In [1]:
# import necessary libraries and set the random seeds

import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import lr_scheduler
from torchtext import data
import numpy as np
import random
from torch.utils.data import Dataset
import time
import shutil

EMBEDDING_DIM = 300
HIDDEN_DIM = 200


USE_CUDA = torch.cuda.is_available()
random.seed(53113)
np.random.seed(53113)
torch.manual_seed(53113)
if USE_CUDA:
    torch.cuda.manual_seed(53113)
    
BATCH_SIZE = 128


载入POS tagging训练和dev数据集。这些文件都是tab分隔的text和POS tag数据，

In [14]:
def load_datasets():
    text = data.Field(include_lengths=True, batch_first=True)
    tags = data.Field(batch_first=True)
#     train_data, val_data, test_data = data.TabularDataset.splits(path='Pytorch-POS-Tagger/RNN_Data_files/', train='train_data.tsv', validation='val_data.tsv', test='val_data.tsv', fields=[('text', text), ('tags', tags)], format='tsv')
    train_data, val_data, test_data = data.TabularDataset.splits(path='/media/bnu/data/nlp-practice/part-of-speech-tagging/', train='train.txt', validation='dev.txt', test='dev.txt', fields=[('text', text), ('tags', tags)], format='tsv')

    
    batch_sizes = (BATCH_SIZE, BATCH_SIZE, BATCH_SIZE)
    train_loader, dev_loader, test_loader = data.BucketIterator.splits((train_data, val_data, test_data), batch_sizes=batch_sizes, sort_key=lambda x: len(x.text))

    text.build_vocab(train_data)
    tags.build_vocab(train_data)
    dataloaders = {'train': train_loader,
                   'validation': dev_loader,
                   'test': dev_loader}
    return text, tags, dataloaders

text, tags, dataloaders = load_datasets()
text_vocab_size = len(text.vocab.stoi)
tag_vocab_size = len(tags.vocab.stoi)   # = 42 (not including the <pad> token
print(text_vocab_size)
print(tag_vocab_size)

print(tags.vocab.stoi)
for batch in dataloaders['train']:
    print(batch.tags[1])
    print(batch.text[0][1])
    print(batch.text[1][1])
    break

32352
44
defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7fcf7c2f6a10>>, {'<unk>': 0, '<pad>': 1, 'NN': 2, 'NNP': 3, 'IN': 4, 'DT': 5, 'JJ': 6, 'NNS': 7, '.': 8, 'VBD': 9, ',': 10, 'VBN': 11, 'VBZ': 12, 'CD': 13, 'VB': 14, 'CC': 15, 'TO': 16, 'RB': 17, 'VBG': 18, 'VBP': 19, 'PRP': 20, 'POS': 21, 'PRP$': 22, 'MD': 23, '``': 24, 'WDT': 25, 'JJS': 26, 'JJR': 27, 'WP': 28, 'RP': 29, 'NNPS': 30, 'WRB': 31, '$': 32, 'RBR': 33, ':': 34, 'LRB': 35, 'RRB': 36, 'EX': 37, 'RBS': 38, ';': 39, 'PDT': 40, 'WP$': 41, 'UH': 42, 'FW': 43})
tensor([20,  9,  5,  2, 23, 14,  5, 24,  6,  2,  8, 24,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])
tensor([  63,   19,    8, 5377,   91,   46,   29,   28, 5574, 2780,    3,   28,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    

In [3]:
class POSDataset(Dataset):
    def __init__(self, path, sen_vocab, tag_vocab):
        super(POSDataset, self).__init__()
        self.sen_vocab = sen_vocab
        self.tag_vocab = tag_vocab
        self.num_classes = tag_vocab.size()
        sen_file = os.path.join(path, 'sentences.txt')
        tag_file = os.path.join(path, 'tags.txt')
        self.sentences = []
        with open(sen_file, 'r') as f:
            for line in f:
                idxs = self.sen_vocab.toIdx(line.rstrip('\n').split(' '))
                tensor = torch.LongTensor(idxs)
                self.sentences.append(tensor)

        self.tags = []
        with open(tag_file, 'r') as f:
            for line in f:
                idxs = self.tag_vocab.toIdx(line.rstrip('\n').split(' '))
                tensor = torch.LongTensor(idxs)
                self.tags.append(tensor)

        # making sure there are same number of sentences as tags.
        assert(len(self.sentences) == len(self.tags))

    def __getitem__(self, index):
        sentence = self.sentences[index]
        tags = self.tags[index]
        return sentence, tags

    def __len__(self):
        return len(self.sentences)

In [106]:
def sequence_mask(sequence_length, max_len=None):
    ''' Given a tensor of a sequence of lengths, create a mask of each length. 
    '''
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.range(0, max_len - 1).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


# run one epoch of training
def train(model, train_loader, loss_fn, optimizer, use_gpu=False):
    model.train()  # Set model to training mode
    running_loss = 0.0
    running_corrects = 0
    example_count = 0
    step = 0
    # Iterate over data.
    for batch in train_loader:
        sentences = batch.text[0]
        lengths = batch.text[1]
        tags = batch.tags
        ''' Implement the code to train the model. 
            - Prepare the input data (text, tags, mask) to the correct format and shape
            - Run the forward method of the model
            - Compute the loss
            - Run backward on loss for back propagation
            - Run the optimizer to update the model parameters. 
            - Compute the number of correct predictions
        '''
        # TODO
        if USE_CUDA:
            sentences = sentences.to('cuda')
            lengths = lengths.to('cuda')
            tags = tags.to('cuda')
        mask = (tags != 1)
        
        outputs = model(sentences, lengths)
        
        running_corrects += ((outputs.argmax(-1) == tags) * mask).sum().item()
        example_count += lengths.sum().item()
        
        outputs = outputs.view(-1, outputs.size(2))
        tags = tags.view(-1)
        loss = loss_fn(outputs, tags)
        
        running_loss += loss.item() * lengths.sum().item()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        step += 1
        if step % 100 == 0:
            print('loss: {}, running_corrects: {}, example_count: {}, acc: {}'.format(loss.item(), 
                            running_corrects, example_count, (running_corrects / example_count) * 100))
        if step * BATCH_SIZE >= 40000:
            break
    loss = running_loss / example_count
    acc = (running_corrects / example_count) * 100
    print(loss)
    print(acc)
    print('Train Loss: {:.4f} Acc: {:2.3f} ({}/{})'.format(loss, acc, running_corrects, example_count))
    return loss, acc


def validate(model, val_loader, loss_fn, use_gpu=False):
    model.eval()  # Set model to evaluate mode
    running_loss = 0.0
    running_corrects = 0
    example_count = 0
    # Iterate over data.
    with torch.no_grad():
        for batch in val_loader:
            sentences = batch.text[0]
            lengths = batch.text[1]
            tags = batch.tags
            ''' Similar to training, do the following to evaluate the model.  
            - Prepare the input data (text, tags, mask) to the correct format and shape
            - Run the forward method of the model
            - Compute the loss
            - Compute the number of correct predictions
            '''
            # TODO
            if USE_CUDA:
                sentences = sentences.to('cuda')
                lengths = lengths.to('cuda')
                tags = tags.to('cuda')
            mask = (tags != 1)
            
            outputs = model(sentences, lengths)
            
            running_corrects += ((outputs.argmax(-1) == tags) * mask).sum().item()
            example_count += lengths.sum().item()
            
            outputs = outputs.view(-1, outputs.size(2))
            tags = tags.view(-1)
            loss = loss_fn(outputs, tags)

            running_loss += loss.item() * lengths.sum().item()

    loss = running_loss / example_count
    acc = (running_corrects / example_count) * 100
    print(loss)
    print(acc)
    print('Validation Loss: {:.4f} Acc: {:2.3f} ({}/{})'.format(loss, acc, running_corrects, example_count))
    return loss, acc


def train_model(model, data_loaders, criterion, optimizer, scheduler, save_dir, num_epochs=25, use_gpu=False):
    print('Training Model with use_gpu={}...'.format(use_gpu))
    since = time.time()

    best_model_wts = model.state_dict()
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        train_begin = time.time()
        train_loss, train_acc = train(model, data_loaders['train'], criterion, optimizer, use_gpu)
        train_time = time.time() - train_begin
        print('Epoch Train Time: {:.0f}m {:.0f}s'.format(train_time // 60, train_time % 60))
        
        validation_begin = time.time()
        val_loss, val_acc = validate(model, data_loaders['validation'], criterion, use_gpu)
        validation_time = time.time() - validation_begin
        print('Epoch Validation Time: {:.0f}m {:.0f}s'.format(validation_time // 60, validation_time % 60))
        
        # deep copy the model
        is_best = val_acc > best_acc
        if is_best:
            best_acc = val_acc
            best_model_wts = model.state_dict()

        save_checkpoint(save_dir, {
            'epoch': epoch,
            'best_acc': best_acc,
            'state_dict': model.state_dict(),
            # 'optimizer': optimizer.state_dict(),
        }, is_best)

        scheduler.step()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)

    return model


def save_checkpoint(save_dir, state, is_best):
    savepath = save_dir + '/' + 'checkpoint.pth.tar'
    torch.save(state, savepath)
    if is_best:
        shutil.copyfile(savepath, save_dir + '/' + 'model_best.pth.tar')


def test_model(model, test_loader, use_gpu=False):
    model.eval()  # Set model to evaluate mode
    running_corrects = 0
    example_count = 0
    test_begin = time.time()
    # Iterate over data.
    
    incorrect_records = []
    with torch.no_grad():
        for batch in test_loader:
            sentences = batch.text[0]
            lengths = batch.text[1]
            tags = batch.tags
            ''' Similar to dev, except do we not need to compute the loss here
            '''
            if USE_CUDA:
                sentences = sentences.to('cuda')
                lengths = lengths.to('cuda')
                tags = tags.to('cuda')
            mask = (tags != 1)
            
            outputs = model(sentences, lengths)
            
            for i in range(outputs.size(0)):
                y_preds = outputs[i].argmax(-1)
                y_trues = tags[i]
                
                if ((y_preds == y_trues) * mask[i]).sum().item() != lengths[i].item():
                    incorrect_records.append((sentences[i], y_preds, y_trues, lengths[i].item()))
            
            running_corrects += ((outputs.argmax(-1) == tags) * mask).sum().item()
            example_count += lengths.sum().item()
            

    acc = (running_corrects / example_count) * 100
    print('Test Acc: {:2.3f} ({}/{})'.format(acc, running_corrects, example_count))
    test_time = time.time() - test_begin
    print('Test Time: {:.0f}m {:.0f}s'.format(test_time // 60, test_time % 60))
    return acc, incorrect_records

# Define the model

In [80]:
class POSTagger(nn.Module):
    def __init__(self, rnn_class, embedding_dim, hidden_dim, vocab_size, target_size, num_layers):
        super(POSTagger, self).__init__()
        ''' Define your model here
            Basically, your model only need three components:
            - an embedding layer
            - a bidirectional RNN (LSTM, GRU) that takes the embeddings and outputs hidden states
            - a final linear prediction layer to convert hidden states to tag scores
            Optionally, define extra layers such as dropout to prevent overfitting. 
        ''' 
        # TODO
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=1)
        if rnn_class == 'lstm':
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, 
                               bidirectional=True, batch_first=True)
        elif rnn_class == 'gru':
            self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers=num_layers, 
                              bidirectional=True, batch_first=True)
            
        self.linear = nn.Linear(2 * hidden_dim, target_size)
        self.dropout = nn.Dropout(0.2)
        
        
    def forward(self, sentences, lengths):
        ''' Define your forward method
        ''' 
        # TODO
        x_embed = self.dropout(self.embed(sentences))
        
        packed_emb = nn.utils.rnn.pack_padded_sequence(
            x_embed, lengths, batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.rnn(packed_emb)
        x_out, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True
        )
        
        tag_scores = self.linear(self.dropout(x_out))
        return tag_scores
    
EMBEDDING_DIM = 300
HIDDEN_DIM = 300

model = POSTagger("lstm", EMBEDDING_DIM, HIDDEN_DIM, text_vocab_size, tag_vocab_size, 3)
if USE_CUDA:
    model = model.cuda()

In [81]:
LR = 0.001
GAMMA = 1.
STEP_SIZE = 10
NUM_EPOCHS = 20
SAVE_DIR = "./"
loss_fn = nn.CrossEntropyLoss(ignore_index=1)
optimizer = optim.Adam(model.parameters(), lr=LR)
exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)
model = train_model(model, dataloaders, loss_fn, optimizer, exp_lr_scheduler, SAVE_DIR, NUM_EPOCHS, use_gpu=USE_CUDA)


Training Model with use_gpu=True...
Epoch 0/19
----------
loss: 0.5309128165245056, running_corrects: 165862, example_count: 280570, acc: 59.11608511244965
loss: 0.21247901022434235, running_corrects: 416669, example_count: 560065, acc: 74.39654325837179
loss: 0.18891668319702148, running_corrects: 679922, example_count: 840901, acc: 80.85636715855968
0.6493956264656602
81.38954443375773
Train Loss: 0.6494 Acc: 81.390 (712515/875438)
Epoch Train Time: 0m 13s
0.1482745125660661
95.55020080321285
Validation Loss: 0.1483 Acc: 95.550 (83272/87150)
Epoch Validation Time: 0m 0s
Epoch 1/19
----------
loss: 0.12056713551282883, running_corrects: 268164, example_count: 279731, acc: 95.86495597556224
loss: 0.10973041504621506, running_corrects: 538481, example_count: 560712, acc: 96.03521950662729
loss: 0.11439646780490875, running_corrects: 808516, example_count: 840354, acc: 96.2113585465173
0.12458189218607638
96.22725995444567
Train Loss: 0.1246 Acc: 96.227 (842410/875438)
Epoch Train Time: 

Epoch 15/19
----------
loss: 0.008000845089554787, running_corrects: 279785, example_count: 280713, acc: 99.66941324413192
loss: 0.019886760041117668, running_corrects: 558687, example_count: 560614, acc: 99.65626973282865
loss: 0.014605679549276829, running_corrects: 837613, example_count: 840602, acc: 99.64442149792649
0.010621668408602361
99.64269314331797
Train Loss: 0.0106 Acc: 99.643 (872310/875438)
Epoch Train Time: 0m 14s
0.10684419761315134
97.87951807228914
Validation Loss: 0.1068 Acc: 97.880 (85302/87150)
Epoch Validation Time: 0m 0s
Epoch 16/19
----------
loss: 0.0095213046297431, running_corrects: 278829, example_count: 279705, acc: 99.68681289215424
loss: 0.010095213539898396, running_corrects: 557917, example_count: 559706, acc: 99.68036790743712
loss: 0.0090538514778018, running_corrects: 837318, example_count: 840203, acc: 99.65663059998595
0.009825241345491703
99.65502982507041
Train Loss: 0.0098 Acc: 99.655 (872418/875438)
Epoch Train Time: 0m 14s
0.1126542175423121


In [107]:
acc, records = test_model(model, dataloaders['test'], use_gpu=USE_CUDA)
print(acc)

Test Acc: 97.822 (85252/87150)
Test Time: 0m 1s
97.82214572576018


In [111]:
for k in range(len(records)):
    sent = records[k][0].cpu().numpy()
    pred = records[k][1].cpu().numpy()
    target = records[k][2].cpu().numpy()
    length = records[k][3]
    sent = ' '.join([text.vocab.itos[sent[i]] for i in range(length)])
    pred = ' '.join([tags.vocab.itos[pred[i]] for i in range(length)])
    target = ' '.join([tags.vocab.itos[target[i]] for i in range(length)])
    print('S:', sent)
    print('P:', pred)
    print('T:', target)
    print('-' * 60)

S: Those who seek to please <unk> please nobody .
P: DT WP VBP TO VB NNP VBP NNP .
T: DT WP VBP TO VB DT VB DT .
------------------------------------------------------------
S: Christians <unk> Bethlehem as the birthplace of Jesus .
P: NNPS VBP NNP IN DT NN IN NNP .
T: NNS VBP NNP IN DT NN IN NNP .
------------------------------------------------------------
S: The final vote was 270 - 3 .
P: DT JJ NN VBD VBN IN CD .
T: DT JJ NN VBD CD IN CD .
------------------------------------------------------------
S: The Palestinian officials provided no more details .
P: DT JJ NNS VBD DT JJR NNS .
T: DT NN NNS VBD DT JJR NNS .
------------------------------------------------------------
S: The Senate approved the extension late Thursday .
P: DT NNP VBD DT NN RB NNP .
T: DT NNP VBD DT NN JJ NNP .
------------------------------------------------------------
S: Oil refining and storage ended in 2009 .
P: NN NN CC NN VBD IN CD .
T: NN NN CC NN VBN IN CD .
--------------------------------------------

S: The Republican National Committee responded to Senator Kerry 's speech by saying the efforts to politicize the tragedy are ' <unk> ' .
P: DT NNP NNP NNP VBD TO NNP NNP POS NN IN VBG DT NNS TO VB DT NN VBP `` VBG `` .
T: DT NNP NNP NNP VBD TO NNP NNP POS NN IN VBG DT NNS TO VB DT NN VBP `` JJ `` .
------------------------------------------------------------
S: The United Nations humanitarian coordinator for Iraq , David <unk> , says U.N. officials must respond rapidly to people who need support .
P: DT NNP NNPS JJ NN IN NNP , NNP NNP , VBZ NNP NNS MD VB RB TO NNS WP VBP NN .
T: DT NNP NNP JJ NN IN NNP , NNP NNP , VBZ NNP NNS MD VB RB TO NNS WP VBP NN .
------------------------------------------------------------
S: <unk> incidents in the northern West Bank have increased in the past two weeks after Israeli troops began raids in the area .
P: NN NNS IN DT JJ NNP NNP VBP VBN IN DT JJ CD NNS IN JJ NNS VBD NNS IN DT NN .
T: VBG NNS IN DT JJ NNP NNP VBP VBN IN DT JJ CD NNS IN JJ NNS VBD N

S: Officials said Sunday that operators were testing the arm , which will be used to <unk> up samples of Martian soil and ice for testing in the lander 's onboard laboratory .
P: NNS VBD NNP IN NNS VBD VBG DT NN , WDT MD VB VBN TO VB RP NNS IN JJ NN CC NN IN NN IN DT NN POS JJ NN .
T: NNS VBD NNP IN NNS VBD VBG DT NN , WDT MD VB VBN TO VB RP NNS IN JJ NN CC NN IN NN IN DT NN POS NN NN .
------------------------------------------------------------
S: Russian President Vladimir Putin has sent new year 's greetings to <unk> old <unk> <unk> , a Soviet partisan in World War II who was convicted in Latvia of war crimes .
P: JJ NNP NNP NNP VBZ VBN JJ NN POS NNS TO VB JJ NNP NNP , DT JJ NN IN NNP NNP NNP WP VBD VBN IN NNP IN NN NNS .
T: JJ NNP NNP NNP VBZ VBN JJ NN POS NNS TO JJ JJ NNP NNP , DT JJ NN IN NNP NNP NNP WP VBD VBN IN NNP IN NN NNS .
------------------------------------------------------------
S: The major economic challenge for Cambodia over the next decade will be <unk> an economi