http://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html#example-an-lstm-for-part-of-speech-tagging

In [30]:
import numpy as np
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from keras.preprocessing.sequence import pad_sequences
from __future__ import print_function

Using TensorFlow backend.


In [2]:
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]

In [6]:
word_to_ix = {}
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)

print(word_to_ix)

{'Everybody': 5, 'ate': 2, 'apple': 4, 'that': 7, 'read': 6, 'dog': 1, 'book': 8, 'the': 3, 'The': 0}


In [7]:
tag_to_ix = {"DET": 0, "NN": 1, "V": 2}

In [8]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

In [37]:
inputs = [[word_to_ix[w] for w in s] for s, _ in training_data]
inputs = pad_sequences(inputs, maxlen=max([len(s) for s in inputs]), padding='post')
inputs

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 0]], dtype=int32)

In [64]:
targets = [[tag_to_ix[w] for w in t] for _, t in training_data]
targets = pad_sequences(targets, maxlen=max([len(s) for s in outputs]), padding='post')
targets

array([[0, 1, 2, 0, 1],
       [1, 2, 0, 1, 0]], dtype=int32)

In [66]:
inputs = Variable(torch.from_numpy(inputs).long())
targets = Variable(torch.from_numpy(targets).long())

In [39]:
EMBEDDING_SIZE = 6
HIDDEN_SIZE = 6

In [58]:
class LSTMTagger(nn.Module):
    
    def __init__(self, embedding_size, hidden_size, vocab_size, tagset_size, batch_size):
        super(LSTMTagger, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        
        self.hidden_size = hidden_size
        self.states = self.init_hidden(batch_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        
        self.out = nn.Linear(hidden_size, target_size)
    
    def forward(self, input):
        output, states = self.lstm(self.embedding(input), self.states)
        output = self.out(output)
        output = F.log_softmax(output)
        
        return output
    
    def init_hidden(self, batch_size):
        init_hidden_state = Variable(torch.zeros(1, batch_size, self.hidden_size))
        init_cell_state = Variable(torch.zeros(1, batch_size, self.hidden_size))
        
        return (init_hidden_state, init_cell_state)

In [41]:
vocab_size = len(word_to_ix)
embedding_size = EMBEDDING_SIZE
embedding = nn.Embedding(vocab_size, embedding_size)

hidden_size = HIDDEN_SIZE
lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)

target_size = len(tag_to_ix)
out = nn.Linear(hidden_size, target_size)

batch_size = len(inputs)
init_hidden_state = Variable(torch.zeros(1, batch_size, hidden_size))
init_cell_state = Variable(torch.zeros(1, batch_size, hidden_size))
states = (init_hidden_state, init_cell_state)

In [46]:
output, states = lstm(embedding(inputs), states)
print(output)
print(states)

Variable containing:
(0 ,.,.) = 
  0.0125  0.0638 -0.1566  0.0795 -0.0155  0.0252
  0.0374  0.0070 -0.1613  0.0325 -0.0805 -0.1018
 -0.0083 -0.0942 -0.2753  0.0079  0.0361 -0.0752
  0.0728 -0.0845 -0.2299  0.1048  0.0292 -0.0769
  0.0805  0.0428 -0.1351  0.0395 -0.1368 -0.1728

(1 ,.,.) = 
 -0.3708  0.2416 -0.1105  0.2070 -0.0066 -0.0789
 -0.1872  0.2158 -0.2571  0.0726 -0.0335 -0.0738
 -0.1694  0.2340 -0.2225  0.1150 -0.0497  0.0140
 -0.4145  0.0822 -0.1314  0.2731 -0.0454 -0.1007
 -0.1635  0.0525 -0.2342  0.0998 -0.0228 -0.0476
[torch.FloatTensor of size 2x5x6]

(Variable containing:
(0 ,.,.) = 
  0.0805  0.0428 -0.1351  0.0395 -0.1368 -0.1728
 -0.1635  0.0525 -0.2342  0.0998 -0.0228 -0.0476
[torch.FloatTensor of size 1x2x6]
, Variable containing:
(0 ,.,.) = 
  0.1139  0.0810 -0.3410  0.0620 -0.2803 -0.3426
 -0.3066  0.1363 -0.4942  0.3002 -0.0416 -0.0923
[torch.FloatTensor of size 1x2x6]
)


In [50]:
output = out(output)
print(output)

Variable containing:
(0 ,.,.) = 
 -0.0856 -0.3301 -0.1152
 -0.0453 -0.3505 -0.1871
  0.0551 -0.2725 -0.2111
  0.0238 -0.3073 -0.1981
 -0.0626 -0.3845 -0.2050

(1 ,.,.) = 
 -0.1152 -0.5399 -0.0718
 -0.0628 -0.3807 -0.1236
 -0.1161 -0.3860 -0.0832
 -0.0696 -0.5799 -0.1282
 -0.0303 -0.3791 -0.1528
[torch.FloatTensor of size 2x5x3]



In [51]:
output = F.log_softmax(output)
print(output)

Variable containing:
(0 ,.,.) = 
 -0.6785 -0.5937 -0.7151
 -0.6844 -0.6782 -0.7254
 -0.6112 -0.6380 -0.7592
 -0.6475 -0.5661 -0.7287
 -0.7094 -0.6958 -0.7196

(1 ,.,.) = 
 -0.7080 -0.8035 -0.6717
 -0.7019 -0.7084 -0.6619
 -0.7824 -0.7515 -0.6312
 -0.7409 -0.8387 -0.6588
 -0.6771 -0.6905 -0.6674
[torch.FloatTensor of size 2x5x3]



In [59]:
model = LSTMTagger(EMBEDDING_SIZE, HIDDEN_SIZE, len(word_to_ix), len(tag_to_ix), len(inputs))
model

LSTMTagger (
  (embedding): Embedding(9, 6)
  (lstm): LSTM(6, 6, batch_first=True)
  (out): Linear (6 -> 3)
)

In [62]:
loss_function = nn.NLLLoss()
optimizer = optim.RMSprop(model.parameters(), lr=0.1)

In [67]:
outputs = model(inputs)
outputs

Variable containing:
(0 ,.,.) = 
 -0.7125 -0.6804 -0.6590
 -0.6862 -0.6986 -0.6383
 -0.6607 -0.7290 -0.6925
 -0.7017 -0.6689 -0.6946
 -0.6940 -0.6867 -0.7415

(1 ,.,.) = 
 -0.6741 -0.7061 -0.7286
 -0.7001 -0.6877 -0.7512
 -0.7267 -0.6585 -0.6938
 -0.6847 -0.7180 -0.6917
 -0.6923 -0.6997 -0.6471
[torch.FloatTensor of size 2x5x3]

In [84]:
outputs.view(-1, len(tag_to_ix))

Variable containing:
-0.7125 -0.6804 -0.6590
-0.6862 -0.6986 -0.6383
-0.6607 -0.7290 -0.6925
-0.7017 -0.6689 -0.6946
-0.6940 -0.6867 -0.7415
-0.6741 -0.7061 -0.7286
-0.7001 -0.6877 -0.7512
-0.7267 -0.6585 -0.6938
-0.6847 -0.7180 -0.6917
-0.6923 -0.6997 -0.6471
[torch.FloatTensor of size 10x3]

In [89]:
targets

Variable containing:
 0  1  2  0  1
 1  2  0  1  0
[torch.LongTensor of size 2x5]

In [90]:
targets.view(-1)

Variable containing:
 0
 1
 2
 0
 1
 1
 2
 0
 1
 0
[torch.LongTensor of size 10]

In [88]:
loss_function(outputs.view(-1, len(tag_to_ix)), targets.view(-1))

Variable containing:
 0.7086
[torch.FloatTensor of size 1]

In [103]:
loss = 0

for i in range(targets.size(1)):
    output = outputs[:, i, :]
    target = targets[:, i]
    loss += loss_function(output, target) * targets.size(0)

loss / (targets.size(0) * targets.size(1))

Variable containing:
 0.7086
[torch.FloatTensor of size 1]

In [111]:
for i in range(1000):
    model.zero_grad()
    model.states = model.init_hidden(len(inputs))
    
    outputs = model(inputs)
    
    loss = loss_function(outputs.view(-1, len(tag_to_ix)), targets.view(-1))
    
    loss.backward()
    optimizer.step()

In [112]:
model(inputs).data.topk(1)[1]


(0 ,.,.) = 
  0
  1
  1
  0
  1

(1 ,.,.) = 
  1
  2
  0
  1
  0
[torch.LongTensor of size 2x5x1]