In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time

In [35]:
# Hyperparameters
DIM_EMBEDDING = 100
LSTM_HIDDEN = 50
BATCH_SIZE = 32 
LEARNING_RATE = 0.01
EPOCHS = 10
PAD = '<PAD>'
torch.manual_seed(8446)

<torch._C.Generator at 0x116af6850>

In [36]:
def read_iob2_file(path):
    """
    read in conll file
    
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding='utf-8'):
        line = line.strip()
        if line:
            if line[0] == '#':
                continue # skip comments
            tok = line.split('\t')

            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words:  # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

train_data= read_iob2_file('./en_ewt-ud-train.iob2')
dev_data = read_iob2_file('./en_ewt-ud-dev.iob2')

print(train_data[0])

(['Where', 'in', 'the', 'world', 'is', 'Iguazu', '?'], ['O', 'O', 'O', 'O', 'O', 'B-LOC', 'O'])


### Prepare data

In [37]:
class Vocab():
    def __init__(self, pad_unk):
        """
        A convenience class that can help store a vocabulary
        and retrieve indices for inputs.
        """
        self.pad_unk = pad_unk
        self.word2idx = {self.pad_unk: 0}
        self.idx2word = [self.pad_unk]

    def getIdx(self, word, add=False):
        if word not in self.word2idx:
            if add:
                self.word2idx[word] = len(self.idx2word)
                self.idx2word.append(word)
            else:
                return self.word2idx[self.pad_unk]
        return self.word2idx[word]

    def getWord(self, idx):
        return self.idx2word(idx)


max_len = max([len(x[0]) for x in train_data ])

# Create vocabularies for both the tokens
# and the tags
token_vocab = Vocab(PAD)
label_vocab = Vocab(PAD)
id_to_token = [PAD]

for tokens, tags in train_data:
    for token in tokens:
        token_vocab.getIdx(token, True)
    for tag in tags:
        label_vocab.getIdx(tag, True)

NWORDS = len(token_vocab.idx2word)
NTAGS = len(label_vocab.idx2word)

# convert text data with labels to indices
def data2feats(inputData, word_vocab, label_vocab):
    feats = torch.zeros((len(inputData), max_len), dtype=torch.long)
    labels = torch.zeros((len(inputData), max_len), dtype=torch.long)
    for sentPos, sent in enumerate(inputData):
        for wordPos, word in enumerate(sent[0][:max_len]):
            wordIdx = token_vocab.getIdx(word)
            feats[sentPos][wordPos] = wordIdx
        for labelPos, label in enumerate(sent[1][:max_len]):
            labelIdx = label_vocab.getIdx(label)
            labels[sentPos][labelPos] = labelIdx
    return feats, labels

train_features, train_labels = data2feats(train_data, token_vocab, label_vocab)

### Batches

In [52]:
# convert to batches
num_batches = int(len(train_features)/BATCH_SIZE)
train_feats_batches = train_features[:BATCH_SIZE*num_batches].view(num_batches, BATCH_SIZE, max_len)
train_labels_batches = train_labels[:BATCH_SIZE*num_batches].view(num_batches, BATCH_SIZE, max_len)

## Define BiLSTM

In [56]:
# Our model consisting of word embeddings, a single bilstm layer, and an output labels
class LangID(nn.Module):
    def __init__(self, embed_dim, lstm_dim, vocab_dim):
        super(LangID, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_dim, embed_dim)
        self.bilstm = nn.LSTM(embed_dim, lstm_dim, bidirectional=True, batch_first=True)
        self.hidden_to_tag = nn.Linear(lstm_dim * 2, NTAGS)
        self.lstm_dim = lstm_dim
    
    def forward(self, inputs):
        # First encode the input into word representations and run the bilstm
        word_vectors = self.word_embeddings(inputs)
        bilstm_out, _ = self.bilstm(word_vectors)
        #  Now combine (concatenate) the last state of each layer
        backward_out = bilstm_out[:,0,-self.lstm_dim:].squeeze(1)
        forward_out = bilstm_out[:,-1,:self.lstm_dim].squeeze(1)
        bilstm_out = torch.cat((forward_out, backward_out),1)
        # And get the prediction
        y = self.hidden_to_tag(bilstm_out)
        log_probs = F.softmax(y, dim=1)
        print(log_probs.shape)
        return log_probs
    
    def predict(self, inputs):
        # Disable updating the weights
        with torch.no_grad():
            y = self.forward(inputs)
            outputs = torch.argmax(y)
        return outputs


# define the model
langid_model = LangID(DIM_EMBEDDING, LSTM_HIDDEN, NWORDS)
loss_function = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
optimizer = optim.Adam(langid_model.parameters(), lr=LEARNING_RATE)
print('model overview: ')
print(langid_model)
print()

model overview: 
LangID(
  (word_embeddings): Embedding(19674, 100)
  (bilstm): LSTM(100, 50, batch_first=True, bidirectional=True)
  (hidden_to_tag): Linear(in_features=100, out_features=8, bias=True)
)



### Train the model

In [73]:
print('epoch   loss     total time')
langid_model.train()
start = time.time()
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    for feats, label in zip(train_feats_batches, train_labels_batches):
        optimizer.zero_grad()
        y = langid_model.forward(feats)
        print(label.shape)
        loss = loss_function(y,label)
    #     loss.backward()
    #     optimizer.step()
    #     epoch_loss += loss.item()
    # print(str(epoch) +  '       {:.4f}'.format(epoch_loss/len(train_feats_batches)) + '   {:.2f}'.format(time.time() - start))

epoch   loss     total time
torch.Size([32, 8])
torch.Size([32, 159])


RuntimeError: 0D or 1D target tensor expected, multi-target not supported