# LSTM

PoS tagger del castellano basado en LSTMs. Entrenado con el corpus Ancora.

Basado en:

- https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#example-an-lstm-for-part-of-speech-tagging

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f52a1ca3c50>

## Load Data

In [2]:
from tagging.ancora import SimpleAncoraCorpusReader

# load training data
# The file pattern is written as a raw string: '\.' inside a normal string
# literal is an invalid escape sequence and triggers a warning on current
# Python versions. The resulting string value is unchanged.
files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
corpus = SimpleAncoraCorpusReader('ancora-3.0.1es/', files)
# Keep only non-empty sentences.
train_sents = [s for s in corpus.tagged_sents() if s]

In [3]:
# load test data
# The file pattern is written as a raw string: '\.' inside a normal string
# literal is an invalid escape sequence and triggers a warning on current
# Python versions. The resulting string value is unchanged.
test_files = r'3LB-CAST/.*\.tbf\.xml'
test_corpus = SimpleAncoraCorpusReader('ancora-3.0.1es/', test_files)
# Drop empty sentences, as is done for the training data: the
# `zip(*sentence)` unpacking used when tagging fails on an empty sentence.
test_sents = [s for s in test_corpus.tagged_sents() if s]

## Map to Numbers

In [5]:
from collections import Counter

# Frequency of every word in the training sentences; each sentence is a
# sequence of (word, tag) pairs.
wcount = Counter(word for sent in train_sents for word, _ in sent)
# Every distinct tag that occurs in the training sentences.
tagset = {tag for sent in train_sents for _, tag in sent}

In [6]:
len(wcount), len(tagset)

(39393, 83)

In [7]:
# Total number of vocabulary entries, counting the slot for the unknown word.
vocab_size = 10000
tagset_size = len(tagset)
# The (vocab_size - 1) most frequent words, in sorted order. The words
# returned by Counter.most_common are distinct keys, so no de-duplication
# step is needed.
vocab = sorted(word for word, _ in wcount.most_common(vocab_size - 1))

In [8]:
# Word -> index. Real words start at 1; index 0 is kept for unknown words.
v_to_i = {word: idx for idx, word in enumerate(vocab, start=1)}
v_to_i['xxxunk'] = 0  # unknown element is mapped to 0
# Index -> tag (a sorted list) and its inverse mapping, tag -> index.
i_to_t = sorted(tagset)
t_to_i = {tag: idx for idx, tag in enumerate(i_to_t)}

In [17]:
v_to_i.get('pelota', 0)

7540

In [18]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of symbols into a 1-D tensor of integer indices.

    Each element of ``seq`` is looked up in the mapping ``to_ix``; elements
    that are missing from the mapping get index 0.

    Returns a ``torch.long`` tensor with one entry per element of ``seq``.
    """
    unknown_ix = 0
    indices = [to_ix.get(item, unknown_ix) for item in seq]
    return torch.tensor(indices, dtype=torch.long)

In [110]:
sent = prepare_sequence('el gato come pescado'.split(), v_to_i)
sent

tensor([4714, 5664,    0, 7648])

## Neural Network

In [20]:
class LSTMTagger(nn.Module):
    """Tagger made of an embedding layer, one LSTM and a linear output layer.

    ``forward`` takes a tensor of word indices for one sentence and returns,
    for every position, log-probabilities over the tag set.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Word index -> dense vector of size embedding_dim.
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Embeddings in, hidden states of size hidden_dim out.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # Hidden state -> one score per tag.
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        n_tokens = len(sentence)
        embedded = self.word_embeddings(sentence)
        # Insert a batch axis of size 1: nn.LSTM (batch_first=False) takes
        # input laid out as (seq_len, batch, features).
        lstm_input = embedded.view(n_tokens, 1, -1)
        lstm_out, _ = self.lstm(lstm_input)
        # Collapse back to one row per token before scoring the tags.
        per_token = lstm_out.view(n_tokens, -1)
        return F.log_softmax(self.hidden2tag(per_token), dim=1)

In [21]:
embedding_dim = 10
hidden_dim = 10
model = LSTMTagger(embedding_dim, hidden_dim, vocab_size, tagset_size)

## Tag a Single Sentence

In [112]:
sent, tags = zip(*test_sents[0])
seq = prepare_sequence(sent, v_to_i)

In [113]:
' '.join(sent), seq

('El presidente del órgano regulador de las Telecomunicaciones se mostró partidario de completar esta liberalización de las telecomunicaciones con otras medidas que incentiven la competencia como_puede_ser abrir el acceso a la información de los clientes de Telefónica a otros operadores .',
 tensor([ 763, 7939, 4161, 9987,    0, 4039, 6443,    0, 8854, 6948, 7467, 4039,
         3442, 5102, 6500, 4039, 6443, 9342, 3492, 7364, 6774, 8248,    0, 6412,
         3426,    0, 1965, 4714, 1989, 1885, 6412, 6110, 4039, 6613, 3273, 4039,
         1727, 1885, 7366, 7297,    8]))

In [114]:
with torch.no_grad():
    tag_scores = model(seq)

In [71]:
tags = tag_scores.argmax(dim=1)
tags.shape

torch.Size([41])

In [72]:
pred_tags = [i_to_t[tag] for tag in tags]

In [73]:
print(list(zip(sent, pred_tags)))

[('El', 'fs'), ('presidente', 'fe'), ('del', 'fe'), ('órgano', 'fe'), ('regulador', 'fs'), ('de', 'fe'), ('las', 'fe'), ('Telecomunicaciones', 'fs'), ('se', 'fe'), ('mostró', 'fs'), ('partidario', 'cs'), ('de', 'fe'), ('completar', 'p0000000'), ('esta', 'vag0000'), ('liberalización', 'dd0000'), ('de', 'fe'), ('las', 'fe'), ('telecomunicaciones', 'fs'), ('con', 'rn'), ('otras', 'fe'), ('medidas', 'fe'), ('que', 'fe'), ('incentiven', 'fs'), ('la', 'fe'), ('competencia', 'fs'), ('como_puede_ser', 'fs'), ('abrir', 'dd0000'), ('el', 'fs'), ('acceso', 'vag0000'), ('a', 'vag0000'), ('la', 'fs'), ('información', 'dd0000'), ('de', 'fe'), ('los', 'dd0000'), ('clientes', 'dd0000'), ('de', 'vaii000'), ('Telefónica', 'fe'), ('a', 'vag0000'), ('otros', 'fs'), ('operadores', 'fs'), ('.', 'fs')]


## Train

In [74]:
# Negative Log-Likelihood Loss
# https://pytorch.org/docs/stable/nn.html#nllloss
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

### One Step

In [77]:
# Run a single forward pass on the first training sentence.
tagged_sent = train_sents[0]
sent, tags = zip(*tagged_sent)  # split (word, tag) pairs into parallel tuples
isent = prepare_sequence(sent, v_to_i)  # word indices; words missing from v_to_i map to 0
itags = prepare_sequence(tags, t_to_i)  # gold tag indices

tag_scores = model(isent)  # forward step

In [84]:
loss = loss_function(tag_scores, itags)

In [85]:
loss

tensor(4.3652, grad_fn=<NllLossBackward>)

In [86]:
loss.backward()  # calls backwards on tag_scores and then inside all the model

In [87]:
optimizer.step()

In [88]:
# check new loss
tag_scores = model(isent)
loss = loss_function(tag_scores, itags)
loss

tensor(4.3399, grad_fn=<NllLossBackward>)

### One Epoch

In [89]:
len(train_sents)

13857

In [90]:
# One pass over the whole training set, updating the model after every
# sentence. Loop structure partly taken from
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
running_loss = 0.0
for i, tagged_sent in enumerate(train_sents):
    # Clear the gradients accumulated by the previous backward pass.
    # (optimizer.zero_grad() clears the same gradients as long as the
    # optimizer was built from exactly model.parameters().)
    model.zero_grad()

    # Encode the words and the gold tags as index tensors.
    sent, tags = zip(*tagged_sent)
    isent, itags = prepare_sequence(sent, v_to_i), prepare_sequence(tags, t_to_i)

    # Forward pass, loss, backward pass, parameter update.
    tag_scores = model(isent)
    loss = loss_function(tag_scores, itags)
    loss.backward()
    optimizer.step()

    # Print the mean loss of each consecutive window of 1000 sentences.
    running_loss += loss.item()
    if (i + 1) % 1000 == 0:
        print(f'loss at {i}: {running_loss / 1000}')
        running_loss = 0.0

loss at 999: 2.8925788106918335
loss at 1999: 2.210839986205101
loss at 2999: 1.9667545999288558
loss at 3999: 1.7658093155026435
loss at 4999: 1.704954649090767
loss at 5999: 1.6245890661478042
loss at 6999: 1.671179015338421
loss at 7999: 1.6694490669369697
loss at 8999: 1.5900325691998005
loss at 9999: 1.5400521922707557
loss at 10999: 1.463077837049961
loss at 11999: 1.3678464286103844
loss at 12999: 1.3661414576172828


### N Epochs

In [105]:
# Four passes over the first 5000 training sentences, updating the model
# after every sentence. Loop structure partly taken from
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html

for epoch in range(4):
    running_loss = 0.0
    for i, tagged_sent in enumerate(train_sents[:5000]):
        # Clear the gradients accumulated by the previous backward pass.
        # (optimizer.zero_grad() clears the same gradients as long as the
        # optimizer was built from exactly model.parameters().)
        model.zero_grad()

        # Encode the words and the gold tags as index tensors.
        sent, tags = zip(*tagged_sent)
        isent, itags = prepare_sequence(sent, v_to_i), prepare_sequence(tags, t_to_i)

        # Forward pass, loss, backward pass, parameter update.
        tag_scores = model(isent)
        loss = loss_function(tag_scores, itags)
        loss.backward()
        optimizer.step()

        # Print the mean loss of each consecutive window of 2000 sentences.
        running_loss += loss.item()
        if (i + 1) % 2000 == 0:
            print(f'loss at {i}: {running_loss / 2000}')
            running_loss = 0.0

loss at 1999: 1.2726990660130977
loss at 3999: 1.2318624093532562


In [171]:
# more epochs, now measuring time

# with a bit from https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() is wall-clock time and can jump if the system
# clock is adjusted. Only the difference `end - start` is meaningful.
start = time.perf_counter()

for epoch in range(4):
    running_loss = 0.0
    for i, tagged_sent in enumerate(train_sents):
        # Clear the gradients accumulated by the previous backward pass.
        # (optimizer.zero_grad() clears the same gradients as long as the
        # optimizer was built from exactly model.parameters().)
        model.zero_grad()

        # Encode the words and the gold tags as index tensors.
        sent, tags = zip(*tagged_sent)
        isent = prepare_sequence(sent, v_to_i)
        itags = prepare_sequence(tags, t_to_i)

        tag_scores = model(isent)  # forward step
        loss = loss_function(tag_scores, itags)
        loss.backward()  # backpropagate from the loss through the model

        optimizer.step()

        # Print the mean loss of each consecutive window of 2000 sentences.
        running_loss += loss.item()
        if i % 2000 == 1999:
            print('loss at {}: {}'.format(i, running_loss / 2000))
            running_loss = 0.0

end = time.perf_counter()

loss at 1999: 0.8682691221386194
loss at 3999: 0.8556401647403836
loss at 5999: 0.885220473356545
loss at 7999: 0.9375973992832005
loss at 9999: 0.9077403522077948
loss at 11999: 0.8568295622328296
loss at 1999: 0.8273824489563704
loss at 3999: 0.8153675747662783
loss at 5999: 0.8469382223002613
loss at 7999: 0.895628368742764
loss at 9999: 0.8662434305520729
loss at 11999: 0.819216605653055
loss at 1999: 0.7926845996007323
loss at 3999: 0.7811656927168369
loss at 5999: 0.8131427071280778
loss at 7999: 0.8583107233792543
loss at 9999: 0.8298938591573387
loss at 11999: 0.7853196448897943
loss at 1999: 0.7610678947642445
loss at 3999: 0.7498976232036948
loss at 5999: 0.7825025436617434
loss at 7999: 0.8260377780143171
loss at 9999: 0.7987665629573166
loss at 11999: 0.7552165974611417


In [172]:
end - start

1006.1496112346649

## Evaluate

In [107]:
# Tag every test sentence and collect gold and predicted tags token by token.
y_true, y_pred = [], []
with torch.no_grad():  # inference only, so gradient tracking is switched off
    for tagged_sent in test_sents:
        sent, tags = zip(*tagged_sent)
        y_true += tags

        seq = prepare_sequence(sent, v_to_i)
        tag_scores = model(seq)
        # Index of the highest-scoring tag at each position.
        tags = tag_scores.argmax(dim=1)
        pred_tags = [i_to_t[ix] for ix in tags.tolist()]
        y_pred += pred_tags

In [165]:
# on ~5 epochs
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)

0.7301969226872665

In [174]:
def eval(sents=None, tagger=None):
    """Tag sentences with the model and return gold and predicted tags.

    NOTE: this name shadows Python's builtin ``eval``. It is kept because
    existing cells call ``eval()``.

    Args:
        sents: iterable of tagged sentences, each a sequence of (word, tag)
            pairs. Defaults to the module-level ``test_sents``.
        tagger: callable mapping a tensor of word indices to per-token tag
            scores. Defaults to the module-level ``model``.

    Returns:
        ``(y_true, y_pred)``: two flat lists of tags, aligned token by token.
    """
    if sents is None:
        sents = test_sents
    if tagger is None:
        tagger = model

    y_true, y_pred = [], []
    with torch.no_grad():  # inference only, so gradient tracking is off
        for tagged_sent in sents:
            if not tagged_sent:
                # An empty sentence has nothing to score, and unpacking
                # zip(*[]) into two names would raise ValueError.
                continue
            words, gold_tags = zip(*tagged_sent)
            y_true.extend(gold_tags)

            scores = tagger(prepare_sequence(words, v_to_i))
            # Index of the highest-scoring tag at each position -> tag name.
            y_pred.extend(i_to_t[ix] for ix in scores.argmax(dim=1).tolist())

    return y_true, y_pred

In [175]:
y_true, y_pred = eval()

In [176]:
accuracy_score(y_true, y_pred)

0.7618776251081703

# Move to CUDA

In [93]:
# torch.cuda.is_available()
tag_scores.is_cuda

False

In [94]:
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [97]:
# move things to cuda
model.to(device)  # out of memory :(

LSTMTagger(
  (word_embeddings): Embedding(10000, 10)
  (lstm): LSTM(10, 10)
  (hidden2tag): Linear(in_features=10, out_features=83, bias=True)
)

# TODOs

- Save on disk!
- Gradient clipping?
- Regularization?

Advanced:

- Fixed pre-computed embeddings
- Beam search
- Bidirectional LSTM
- Multi-layer LSTM