In [45]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import codecs

torch.manual_seed(1)
np.random.seed(1)

### Reading connlu dataset file

In [2]:
def read_connlu(file):
    words, tags, sent_words, sent_tags = [],[],[],[]
    with codecs.open(file, 'r', 'utf-8') as f:
        lines = f.readlines()
    for line in lines:
        line = line.strip()
        if len(line) == 0 or line.startswith('-DOCSTART-'):
            if len(sent_words) > 0:
                words.append(sent_words)
                tags.append(sent_tags)
                sent_words,sent_tags = [],[]
            continue
        line_splitted = line.split(' ')
        sent_words.append(line_splitted[0])
        sent_tags.append(line_splitted[-1])
    if len(sent_words) > 0:
        words.append(sent_words)
        tags.append(sent_tags)     
    return np.array(words), np.array(tags)

In [31]:
def read_embeddings():
    embeddings = {}
    with codecs.open('glove.6B.100d.txt', 'r', 'utf-8') as f:
        lines = f.readlines()
    for line in lines:
        line_splitted = line.strip().split(" ")
        word = line_splitted[0]
        embeddings[word] = np.array(line_splitted[1:]).astype(np.float)
    return embeddings

In [32]:
embeddings = read_embeddings()

In [58]:
train_words, train_tags = read_connlu("train.txt")
dev_words, dev_tags = read_connlu("dev.txt")
test_words, test_tags = read_connlu("test.txt")

In [115]:
tag_to_ix = {'B-LOC':0, 'B-MISC':1, 'B-ORG':2, 'B-PER':3, 'I-LOC':4, 'I-MISC':5, 'I-ORG':6, 'I-PER':7, 'O':8}

In [99]:
PAD_WORD = "<pad>"
word_to_ix = {}
for sentences in [train_words, dev_words, test_words]:
    for sentence in sentences:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
word_to_ix[PAD_WORD] = len(word_to_ix)

In [100]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

EMBEDDING_DIM = len(embeddings['.'])
BATCH_SIZE = 128
TARGET_SIZE = len(tag_to_ix)
VOCAB_SIZE = len(word_to_ix)
HIDDEN_DIM = 120

### Embeddings loading

In [101]:
def get_lowercase_embedding_matrix():
    embeddings_matrix = np.empty((VOCAB_SIZE, EMBEDDING_DIM))
    for word, index in word_to_ix.items():
        word_lower_embedding = embeddings.get(word.lower())
        embeddings_matrix[index] = word_lower_embedding if word_lower_embedding  is not None else embeddings['unknown']
    return embeddings_matrix
        
def get_original_embedding_matrix():
    embeddings_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    for word, index in word_to_ix.items():
        original_word_embedding = embeddings.get(word)
        embeddings_matrix[index] = original_word_embedding if original_word_embedding is not None else embeddings['unknown']
    return embeddings_matrix

def get_full_embedding_matrix():
    embeddings_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    for word, index in word_to_ix.items():
        original_word_embedding = embeddings.get(word)
        word_lower_embedding = embeddings.get(word.lower())
        if original_word_embedding is not None:
            embeddings_matrix[index] = original_word_embedding
        elif word_lower_embedding  is not None:
            embeddings_matrix[index] = word_lower_embedding
        else:
            embeddings_matrix[index] = embeddings['unknown']
    return embeddings_matrix

### Creating model

#### Sequential models

In [263]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim).from_pretrained(
            torch.tensor(get_original_embedding_matrix(), dtype=torch.float))
        self.word_embeddings.weight.requires_grad = False
        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
    
class BiLSTM_TaggerIter(nn.Module):
    def __init__(self, embedding_dim, vocab_size, tagset_size,
                 hidden_dim=128, embedding_type='original'):
        super(BiLSTM_Tagger, self).__init__()
        self.hidden_dim = hidden_dim
        if embedding_type == "original":
            self.embedding = nn.Embedding(vocab_size, embedding_dim)\
                .from_pretrained(torch.tensor(get_original_embedding_matrix(), dtype=torch.float))
        elif embedding_type == "lower":
            self.embedding = nn.Embedding(vocab_size, embedding_dim)\
                .from_pretrained(torch.tensor(get_lowercase_embedding_matrix(), dtype=torch.float))
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)\
                .from_pretrained(torch.tensor(get_full_embedding_matrix(), dtype=torch.float))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
    
    def forward(self, sents):
        x = self.embedding(sents)
        x = x.view(len(sents), 1, -1)
        lstm_out, (h_n, c_n) = self.lstm(x)
        tags_output = self.hidden2tag(lstm_out)
        tags_prob = F.log_softmax(tags_output, dim=1)
        return tags_prob

#### Models for batching

In [264]:
class BiLSTM_Tagger(nn.Module):
    def __init__(self, embedding_dim, vocab_size, tagset_size,
                 hidden_dim=128, embedding_type='original'):
        super(BiLSTM_Tagger, self).__init__()
        self.target_size = tagset_size
        self.hidden_dim = hidden_dim
        if embedding_type == "original":
            self.embedding = nn.Embedding(vocab_size, embedding_dim)\
                .from_pretrained(torch.tensor(get_original_embedding_matrix(), dtype=torch.float))
        elif embedding_type == "lower":
            self.embedding = nn.Embedding(vocab_size, embedding_dim)\
                .from_pretrained(torch.tensor(get_lowercase_embedding_matrix(), dtype=torch.float))
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)\
                .from_pretrained(torch.tensor(get_full_embedding_matrix(), dtype=torch.float))
        self.embedding.weight.requires_grad = False
        self.lstm = nn.LSTM(input_size=self.embedding.embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=1,
                            bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
    
    def forward(self, sents):
        x = self.embedding(sents)
        lstm_out, (h_n, c_n) = self.lstm(x)
        flatten = lstm_out.view(-1, lstm_out.shape[2]) 
        tags_output = self.hidden2tag(flatten)
        tags_prob = F.log_softmax(tags_output, dim=1)
        return tags_prob

    def loss_function(self, outputs, labels):
        # reshape labels to give a flat vector of length batch_size*seq_len
        labels = labels.view(-1)
        mask = (labels >= 0).float()
        labels = labels % outputs.shape[1]
        num_tokens = int(torch.sum(mask).item())
        return -torch.sum(outputs[range(outputs.shape[0]), labels]*mask)/num_tokens

### Token-level score function

In [304]:
# with idea to use such approach with matrix my groupmate helped me
from sklearn.metrics import f1_score
metric_types={'TP':0, 'TN':1, 'FP':2, 'FN':3}

def micro_f1_score(model, eval_words, eval_tags):
    metrics_matrix=np.zeros((len(tag_to_ix), len(metric_types)))
    model.eval()
    with torch.no_grad():
        true_tags = []
        preds = []
        for batch_data, batch_labels in get_batches(train_words, train_tags):
            batch_scores = model(batch_data).view(BATCH_SIZE,-1,9)
            batch_labels = batch_labels.numpy()
            for batch_sent_index in range(BATCH_SIZE):
                scores = batch_scores[batch_sent_index]
                predictions = scores.numpy().argmax(axis=1)
                for tag_index in range(len(predictions)):
                    if batch_data[batch_sent_index, tag_index] == word_to_ix[PAD_WORD]:
                        break
                    prediction=predictions[tag_index]
                    tag = batch_labels[batch_sent_index, tag_index]
                    true_tags.append(tag)
                    preds.append(prediction)
                    if prediction==tag:
                        metrics_matrix[:,1]+=1
                        metrics_matrix[tag,0]+=1
                        metrics_matrix[tag,1]-=1
                    else:
                        metrics_matrix[:,1]+=1
                        metrics_matrix[tag,3]+=1
                        metrics_matrix[tag,1]-=1
                        metrics_matrix[prediction,2]+=1
                        metrics_matrix[prediction,1]-=1
    print("sklearn f1 micro : ", f1_score(true_tags, preds, average='micro'))
    average_precision = sum(metrics_matrix[:,0])/(sum(metrics_matrix[:,0])+sum(metrics_matrix[:,2]))
    average_recall = sum(metrics_matrix[:,0])/(sum(metrics_matrix[:,0])+sum(metrics_matrix[:,3]))
    f1=2*average_precision*average_recall/(average_precision+average_recall)
    f05=1.25*average_precision*average_recall/((0.25*average_precision)+average_recall)
    return f1, f05

### Training logic

#### Batches

In [158]:
def unison_shuffled(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [270]:
def get_batches(words, tags, shuffle=False):
    batches = []
    order = list(range(len(train_words)))
    if shuffle:
        words, tags = unison_shuffled(words, tags)
    for i in range((len(words)+1) // BATCH_SIZE):
        batch_sentences = [prepare_sequence(words[idx], word_to_ix) 
                           for idx in order[i*BATCH_SIZE:(i+1)*BATCH_SIZE]]
        batch_tags = [prepare_sequence(tags[idx], tag_to_ix) 
                      for idx in order[i*BATCH_SIZE:(i+1)*BATCH_SIZE]]
        batch_max_len = max([len(s) for s in batch_sentences])
        batch_data = word_to_ix[PAD_WORD]*np.ones((len(batch_sentences), batch_max_len))
        batch_labels = -1*np.ones((len(batch_sentences), batch_max_len))
        for j in range(len(batch_sentences)):
            cur_len = len(batch_sentences[j])
            batch_data[j][:cur_len] = batch_sentences[j]
            batch_labels[j][:cur_len] = batch_tags[j]
        batch_data, batch_labels = torch.LongTensor(batch_data), torch.LongTensor(batch_labels)
        batch_data, batch_labels = Variable(batch_data), Variable(batch_labels)
        batches.append((batch_data, batch_labels))
    return batches

In [271]:
def train_batch(epochs, model, loss_function, optimizer, train_words, train_tags):
    model.train()
    model.zero_grad()
    for epoch in range(epochs): 
        print("Epoch %d started." % (epoch))
        model.train()
        for batch_data, batch_labels in get_batches(train_words, train_tags, shuffle=True):
            model.zero_grad()
            tag_scores = model(batch_data)
            loss = loss_function(tag_scores, batch_labels)
            loss.backward()
            optimizer.step()  
        f1,f05 = micro_f1_score(model, dev_words, dev_tags)
        print(f'Epoch {epoch} : f1 loss {f1}')
        print(f'Epoch {epoch} : f0.5 loss {f05}')

#### Iterable approach

In [239]:
def train_iter(epochs, model, loss_function, optimizer, train_words, train_tags):
    model.train()
    model.zero_grad()
    for epoch in range(epochs): 
        print("Epoch %d started." % (epoch))
        model.train()
        order = list(range(len(train_words)))
        train_words, train_tags = unison_shuffled(train_words, train_tags)
        for i in range((len(train_words))):
            model.zero_grad()
            sentence = prepare_sequence(train_words[i], word_to_ix)
            tags = prepare_sequence(train_tags[i], tag_to_ix)
            tag_scores = model(sentence)
            loss = loss_function(tag_scores, tags)
            loss.backward()
            optimizer.step()    
        f1,f05 = micro_f1_score(model, dev_words, dev_tags)
        print(f'Epoch {epoch} : f1 loss {f1}')
        print(f'Epoch {epoch} : f0.5 loss {f05}')

###  Model training

#### Original embeddings

In [305]:
model = BiLSTM_Tagger(EMBEDDING_DIM, VOCAB_SIZE, TARGET_SIZE)
train_batch(5, model, model.loss_function, optim.SGD(model.parameters(), lr=0.1), train_words, train_tags)

Epoch 0 started.
sklearn f1 micro :  0.8329869116797582
Epoch 0 : f1 loss 0.8329869116797582
Epoch 0 : f0.5 loss 0.8329869116797582
Epoch 1 started.
sklearn f1 micro :  0.8328239865315211
Epoch 1 : f1 loss 0.8328239865315211
Epoch 1 : f0.5 loss 0.8328239865315211
Epoch 2 started.
sklearn f1 micro :  0.8327153697660297
Epoch 2 : f1 loss 0.8327153697660297
Epoch 2 : f0.5 loss 0.8327153697660297
Epoch 3 started.
sklearn f1 micro :  0.8327351182688463
Epoch 3 : f1 loss 0.8327351182688463
Epoch 3 : f0.5 loss 0.8327351182688463
Epoch 4 started.
sklearn f1 micro :  0.8326561242575798
Epoch 4 : f1 loss 0.8326561242575798
Epoch 4 : f0.5 loss 0.8326561242575797


In [306]:
f1,f05 = micro_f1_score(model, test_words, test_tags)
print(f'f1 loss: {f1}')
print(f'f0.5 loss: {f1}')

sklearn f1 micro :  0.8326561242575798
f1 loss: 0.8326561242575798
f0.5 loss: 0.8326561242575798


#### Lowercase embeddings

In [309]:
model = BiLSTM_Tagger(EMBEDDING_DIM, VOCAB_SIZE, TARGET_SIZE, embedding_type='lower')
train_batch(5, model, model.loss_function, optim.SGD(model.parameters(), lr=0.1), train_words, train_tags)

Epoch 0 started.
sklearn f1 micro :  0.8329869116797582
Epoch 0 : f1 loss 0.8329869116797582
Epoch 0 : f0.5 loss 0.8329869116797582
Epoch 1 started.
sklearn f1 micro :  0.8331399625765872
Epoch 1 : f1 loss 0.8331399625765872
Epoch 1 : f0.5 loss 0.8331399625765872
Epoch 2 started.
sklearn f1 micro :  0.8356332110571867
Epoch 2 : f1 loss 0.8356332110571867
Epoch 2 : f0.5 loss 0.8356332110571867
Epoch 3 started.
sklearn f1 micro :  0.8432906930243351
Epoch 3 : f1 loss 0.8432906930243351
Epoch 3 : f0.5 loss 0.8432906930243351
Epoch 4 started.
sklearn f1 micro :  0.8525873007252638
Epoch 4 : f1 loss 0.8525873007252638
Epoch 4 : f0.5 loss 0.8525873007252638


In [310]:
f1,f05 = micro_f1_score(model, test_words, test_tags)
print(f'f1 loss: {f1}')
print(f'f0.5 loss: {f1}')

sklearn f1 micro :  0.8525873007252638
f1 loss: 0.8525873007252638
f0.5 loss: 0.8525873007252638


#### Full embeddings

In [312]:
model = BiLSTM_Tagger(EMBEDDING_DIM, VOCAB_SIZE, TARGET_SIZE, embedding_type='full')
train_batch(5, model, model.loss_function, optim.SGD(model.parameters(), lr=0.1), train_words, train_tags)

Epoch 0 started.
sklearn f1 micro :  0.8329869116797582
Epoch 0 : f1 loss 0.8329869116797582
Epoch 0 : f0.5 loss 0.8329869116797582
Epoch 1 started.
sklearn f1 micro :  0.833199208085037
Epoch 1 : f1 loss 0.833199208085037
Epoch 1 : f0.5 loss 0.833199208085037
Epoch 2 started.
sklearn f1 micro :  0.8366206361980182
Epoch 2 : f1 loss 0.8366206361980182
Epoch 2 : f0.5 loss 0.8366206361980182
Epoch 3 started.
sklearn f1 micro :  0.8449298187581153
Epoch 3 : f1 loss 0.8449298187581153
Epoch 3 : f0.5 loss 0.8449298187581153
Epoch 4 started.
sklearn f1 micro :  0.8543992258586895
Epoch 4 : f1 loss 0.8543992258586895
Epoch 4 : f0.5 loss 0.8543992258586895


In [313]:
f1,f05 = micro_f1_score(model, test_words, test_tags)
print(f'f1 loss: {f1}')
print(f'f0.5 loss: {f1}')

sklearn f1 micro :  0.8543992258586895
f1 loss: 0.8543992258586895
f0.5 loss: 0.8543992258586895


### Summary

As we can see, the best performance in our case gave BiLSTM with full embeddings(search by original word and if not exists, then by lower case). So we received f1 score 0.854 on dev data and test data.

Next was the BiLSTM with searching embeddings by lowercase word with dev f1 0.852.

And the worst in our case was the BiLSTM with embedding searching strategy only by original word, that gave f1 score 0.832.