In [None]:
!wget http://gmb.let.rug.nl/releases/gmb-1.0.0.zip
!unzip gmb-1.0.0.zip

In [54]:
def prepare(f):
    """Flatten an iterable of text lines into one normalised line.

    Every newline and each of the characters ``. , : ; ! " - = + /`` is
    turned into a space, and a space is inserted in front of every
    apostrophe. The resulting whitespace-separated tokens of all lines are
    joined with single spaces.

    Parameters
    ----------
    f : iterable of str
        The lines to process (for example an open text file).

    Returns
    -------
    str
        All tokens separated by single spaces, terminated by one newline.
    """
    replacements = {ch: ' ' for ch in '\n.,:;!"-=+/'}
    replacements["'"] = " '"
    table = str.maketrans(replacements)

    tokens = []
    for line in f:
        # split() without arguments also swallows repeated/leading blanks
        tokens.extend(line.translate(table).split())
    return ' '.join(tokens) + '\n'

# `os` is needed here but is only imported by a later cell of this notebook.
import os

dataset = []
limit = 10000  # maximum number of documents written to the corpus file

# NOTE: `prepare` must be the text-normalising function defined just above;
# a later cell redefines `prepare` with a different signature.
for dirpath, _dirnames, filenames in os.walk("gmb-1.0.0/"):
    if 'en.raw' not in filenames:
        continue
    if len(dataset) == limit:
        break
    # Context manager closes the file even when prepare() raises.
    with open(os.path.join(dirpath, 'en.raw'), 'r') as f:
        dataset.append(prepare(f))

# One normalised document per line (prepare() terminates each with '\n').
with open('dataset.txt', 'w') as out:
    out.writelines(dataset)

In [55]:
from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath

# Path of the training corpus: 'dataset.txt' is the file written by the
# preprocessing cell above (one normalised document per line).
corpus_file = 'dataset.txt'

# FastText model producing 100-dimensional vectors; no other arguments are passed.
model = FastText(vector_size=100)

# build the vocabulary from the corpus file
model.build_vocab(corpus_file=corpus_file)

# train the model, passing the epoch count and the corpus statistics
# read back from the model object itself
model.train(
    corpus_file=corpus_file, epochs=model.epochs,
    total_examples=model.corpus_count, total_words=model.corpus_total_words,
)

# NOTE: `print` is pprint in this notebook (rebound by the import at the top of this cell).
print(model)


<gensim.models.fasttext.FastText object at 0x7f0ea55b3450>


In [65]:
# from pprint import pprint as print
# from gensim.models.fasttext import FastText
# from gensim.test.utils import datapath

# # Set file names for train and test data
# corpus_file = datapath('lee_background.cor')

# model = FastText(vector_size=100)

# # build the vocabulary
# model.build_vocab(corpus_file=corpus_file)

# # train the model
# model.train(
#     corpus_file=corpus_file, epochs=model.epochs,
#     total_examples=model.corpus_count, total_words=model.corpus_total_words,
# )

# print(model)

<gensim.models.fasttext.FastText object at 0x7f7901c90210>


In [56]:
# Persist the trained FastText model; a later cell loads it back from "model.model".
model.save("model.model")

In [59]:


from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm



def tsne_plot(labels, tokens, classes, clusters):
    """Project vectors to 2-D with t-SNE and draw an annotated scatter plot.

    Parameters
    ----------
    labels : sequence
        Text written next to each point; indexed by point position.
    tokens : array-like
        Input passed unchanged to ``TSNE.fit_transform``.
    classes : sequence of int
        Colour index of each point; indexed by point position.
    clusters : int
        Number of distinct colours sampled from the rainbow colour map.

    The figure is saved to ``embedding.png`` (300 dpi) and then shown.
    """
    # NOTE(review): recent scikit-learn releases renamed `n_iter` to
    # `max_iter` - confirm against the installed version.
    projector = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=33)
    embedded = projector.fit_transform(tokens)

    palette = cm.rainbow(np.linspace(0, 1, clusters))

    plt.figure(figsize=(16, 9))
    for idx, (px, py) in enumerate(embedded):
        # one scatter call per point so that each point gets its class colour
        plt.scatter(px, py, c=palette[classes[idx]].reshape(1, -1), alpha=0.75)
        plt.annotate(
            labels[idx],
            alpha=0.75,
            xy=(px, py),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom',
            size=10,
        )

    plt.grid(True)
    plt.savefig('embedding.png', dpi=300)
    plt.show()



In [2]:
# `xml.etree.cElementTree` was a deprecated alias that no longer exists in
# Python 3.9+; `xml.etree.ElementTree` provides the same API under the same name.
import xml.etree.ElementTree as ET
import os

def prepare(root):
    """Convert one parsed document into a list of labelled sentences.

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of the parsed document. All ``word`` and ``tag``
        elements are read from this element (not from any global).

    Returns
    -------
    list
        One list per sentence; every entry is a two-item list
        ``[word_text, label]``. Labels start as ``'O'`` and are replaced by
        the text of each ``tag`` element whose ``type`` attribute is ``'ne'``.

    Raises
    ------
    KeyError
        If a ``tag`` element has no ``type`` attribute, or a ``word`` element
        has no ``xml:id`` attribute.
    """
    id_attr = "{http://www.w3.org/XML/1998/namespace}id"

    sentenses = []
    sentence = []
    i = '1'
    for word in root.iter('word'):
        # The character at position 1 of the id is treated as the sentence
        # marker; a change of that character starts a new sentence.
        # NOTE(review): a single character cannot distinguish sentence
        # numbers >= 10 - confirm the id format for long documents.
        sentence_mark = word.attrib[id_attr][1]
        if sentence_mark != i:
            i = sentence_mark
            sentenses.append(sentence)
            sentence = []
        sentence.append([word.text, 'O'])
    sentenses.append(sentence)

    for tag in root.iter('tag'):
        if tag.attrib['type'] != 'ne':
            continue
        index = tag.attrib['index']
        # Same layout as the word ids: sentence marker at position 1,
        # 1-based token position in the remaining characters.
        sentense_id = int(index[1]) - 1
        pos = int(index[2:]) - 1
        sentenses[sentense_id][pos][1] = tag.text

    return sentenses

    
# Gather the labelled sentences of every directory that contains a DRS file.
data_for_training = []
for dirpath, _, filenames in os.walk("gmb-1.0.0/"):
    if 'en.drs.xml' in filenames:
        tree = ET.parse(f'{dirpath}/en.drs.xml')
        data_for_training.extend(prepare(tree.getroot()))


In [2]:
# Length (number of [word, label] entries) of the longest sentence.
max_text_width = max(len(sample) for sample in data_for_training)
print(f'Maximal word number in sentence is {max_text_width}.')

Maximal word number in sentence is 55.


In [61]:
from gensim.models.fasttext import FastText

# File the trained FastText model was saved to.
fasttext_model_name = "model.model"

# Load through the variable so the path is defined in exactly one place.
fasttext_model = FastText.load(fasttext_model_name)

In [62]:
# Label inventory: 'O' first, followed by every other label that occurs in
# the data, in sorted order.
set_of_labels = ['O'] + sorted(
    {pair[1] for sample in data_for_training for pair in sample} - {'O'}
)
set_of_labels

['O',
 'I-ART',
 'I-DAT',
 'I-LOC',
 'I-MON',
 'I-ORG',
 'I-PCT',
 'I-PER',
 'I-TIM',
 'I-TTL']

In [63]:
import os
import pandas as pd
from torch.utils.data import Dataset
import random
from tqdm.notebook import tqdm
import numpy as np

class TrainsetGenerator(Dataset):
    """Serve labelled sentences as FastText-based feature/label tensors.

    Parameters
    ----------
    training_set : list
        Sentences; each sentence is a sequence of ``(word, label)`` pairs.
        A copy sorted by sentence length is stored.
    classes_list : list
        All admissible labels; a label's position in this list is its
        class index. Must contain ``'O'``.
    used_fasttext_model : FastText
        Model whose ``wv`` lookup provides the word vectors.
    batch_size : int
        Number of sentences grouped per index.

    Attributes
    ----------
    vocabulary_ : dict
        Maps ``'<BOS>'`` -> 0, ``'<EOS>'`` -> 1 and every word of the data
        to a row of ``matrix_``.
    vector_size_ : int
        FastText vector length plus two.
    matrix_ : numpy.ndarray
        ``float32`` array with one row per vocabulary entry. The leading
        ``vector_size_ - 2`` columns hold the L2-normalised FastText vector
        (also for the ``'<BOS>'`` / ``'<EOS>'`` strings); the last two
        columns are flags set only on the ``'<BOS>'`` and ``'<EOS>'`` rows.

    Raises
    ------
    ValueError
        If a sentence contains a label missing from ``classes_list``.
    """

    def __init__(self, training_set,
                 classes_list,
                 used_fasttext_model: FastText, batch_size: int):
        self.training_set = sorted(training_set, key=len)
        self.classes_list = classes_list
        self.batch_size = batch_size
        self.vocabulary_ = {'<BOS>': 0, '<EOS>': 1}
        known_labels = set(self.classes_list)
        print(f'Dictionary of unique words is created...')
        for cur_sent in tqdm(self.training_set):
            for cur_word, cur_label in cur_sent:
                if cur_word not in self.vocabulary_:
                    # next free row index == current vocabulary size
                    self.vocabulary_[cur_word] = len(self.vocabulary_)
                if cur_label not in known_labels:
                    err_msg_ = f'Label "{cur_label}" is unknown!'
                    raise ValueError(err_msg_)
        n_words = len(self.vocabulary_)
        print(f'There are {n_words - 2} unique words in the training data!')

        keyed_vectors = used_fasttext_model.wv
        # Probe one vector to learn the embedding length.
        embedding_size = self._unit_vector(keyed_vectors, '<BOS>').shape[0]
        self.vector_size_ = embedding_size + 2
        self.matrix_ = np.zeros((n_words, self.vector_size_), dtype=np.float32)
        self.matrix_[0, embedding_size] = 1.0      # <BOS> flag
        self.matrix_[1, embedding_size + 1] = 1.0  # <EOS> flag
        print(f'Vectors of unique words are calculated...')
        for cur_word, word_idx in tqdm(sorted(self.vocabulary_.items())):
            self.matrix_[word_idx, :embedding_size] = self._unit_vector(
                keyed_vectors, cur_word
            )

    @staticmethod
    def _unit_vector(keyed_vectors, word):
        """Return the float32 L2-normalised vector of ``word``.

        The normalisation is done in float64. An all-zero vector is returned
        unchanged instead of being divided by zero (which would yield NaN).
        """
        vector = keyed_vectors[word].astype(np.float64)
        norm = np.linalg.norm(vector)
        if norm > 0.0:
            vector /= norm
        return vector.astype(np.float32)

    def __len__(self):
        # NOTE(review): the last 10 batches are excluded on purpose by the
        # original code; the reason is not visible in this notebook.
        # Clamped at zero because len() must not return a negative number.
        return max(0, (len(self.training_set) // self.batch_size) - 10)

    def __getitem__(self, batch_idx):
        """Build batch ``batch_idx`` and return its FIRST sentence only.

        Returns
        -------
        tuple of torch.Tensor
            ``(features, labels)``: ``features`` is float32 with one row per
            time step (``<BOS>``, the words, ``<EOS>``, then zero padding up
            to the longest sentence of the batch); ``labels`` is int64 with
            the class index per time step (``'O'`` for ``<BOS>``/``<EOS>``,
            0 for padding).

        Raises
        ------
        IndexError
            If the batch starts beyond the end of the data.
        """
        batch_start = batch_idx * self.batch_size
        if batch_start >= len(self.training_set):
            raise IndexError(f'Batch index {batch_idx} is out of range!')
        batch_end = min(len(self.training_set), batch_start + self.batch_size)
        batch_sents = self.training_set[batch_start:batch_end]
        # +2 leaves room for the <BOS> and <EOS> steps
        max_text_len = max(map(len, batch_sents)) + 2
        batch_x = np.zeros(
            (len(batch_sents), max_text_len, self.vector_size_),
            dtype=np.float32
        )
        batch_y = np.zeros(
            (len(batch_sents), max_text_len),
            dtype=np.int64
        )
        bos_row = self.matrix_[self.vocabulary_['<BOS>']]
        eos_row = self.matrix_[self.vocabulary_['<EOS>']]
        outside_idx = self.classes_list.index('O')
        for sent_idx, cur_sent in enumerate(batch_sents):
            batch_x[sent_idx, 0] = bos_row
            batch_y[sent_idx, 0] = outside_idx
            for time_idx, (cur_word, cur_label) in enumerate(cur_sent, start=1):
                batch_x[sent_idx, time_idx] = self.matrix_[self.vocabulary_[cur_word]]
                batch_y[sent_idx, time_idx] = self.classes_list.index(cur_label)
            eos_pos = len(cur_sent) + 1
            batch_x[sent_idx, eos_pos] = eos_row
            batch_y[sent_idx, eos_pos] = outside_idx
        # Only the first sentence of the batch is handed out, so callers
        # receive tensors without a batch dimension.
        return torch.tensor(batch_x[0]), torch.tensor(batch_y[0])

# Shuffle with a fixed seed so that a fresh run of the notebook always yields
# the same train/validation split (the shuffle is still done in place).
SPLIT_SEED = 42
random.Random(SPLIT_SEED).shuffle(data_for_training)

# 30 % of the sentences are held out for validation.
n_valid = int(round(0.3 * len(data_for_training)))
n_train = len(data_for_training) - n_valid
train_dataloader = TrainsetGenerator(data_for_training[:n_train], set_of_labels, fasttext_model, batch_size=1)
valid_dataloader = TrainsetGenerator(data_for_training[n_train:], set_of_labels, fasttext_model, batch_size=1)

'Dictionary of unique words is created...'


  0%|          | 0/2965 [00:00<?, ?it/s]

'There are 8565 unique words in the training data!'
'Vectors of unique words are calculated...'


  0%|          | 0/8566 [00:00<?, ?it/s]

'Dictionary of unique words is created...'


  0%|          | 0/1271 [00:00<?, ?it/s]

'There are 5437 unique words in the training data!'
'Vectors of unique words are calculated...'


  0%|          | 0/5438 [00:00<?, ?it/s]

In [64]:
import torch

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """Bidirectional LSTM followed by a linear layer and log-softmax.

    Parameters
    ----------
    input_size : int, default 102
        Length of each input feature vector.
    hidden_size : int, default 512
        Hidden size of each LSTM direction; the linear layer therefore
        receives ``2 * hidden_size`` features.
    n_labels : int, optional
        Number of output classes. When omitted, ``len(set_of_labels)`` of the
        notebook-level label list is used.
    """

    def __init__(self, input_size=102, hidden_size=512, n_labels=None):
        super().__init__()
        if n_labels is None:
            n_labels = len(set_of_labels)
        self.rnn = nn.LSTM(input_size, hidden_size, bidirectional=True)
        self.cls = nn.Linear(2 * hidden_size, n_labels)

    def forward(self, x):
        """Return log-probabilities over the labels for every time step.

        The softmax is taken over the last axis (the label axis), so the
        result is normalised correctly both for inputs without a batch axis
        and for inputs that carry one.
        """
        rnn_out, _ = self.rnn(x)
        return F.log_softmax(self.cls(rnn_out), dim=-1)

# Instantiate the network with its default sizes.
# NOTE: this rebinds the name `model`, which an earlier cell used for the FastText model.
model = Model()

In [74]:
n_epochs = 100
lr=0.05

# Define Loss, Optimizer

# NLLLoss expects log-probabilities, which is what Model.forward returns (log_softmax).
# NOTE(review): no device is defined or selected anywhere in this notebook, so nothing is moved here.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [75]:
n_train_batches = len(train_dataloader)

for epoch in range(1, n_epochs + 1):
    epoch_loss = 0
    for batch_idx in range(n_train_batches):
        features, targets = train_dataloader[batch_idx]

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Forward pass: per-token log-probabilities.
        log_probs = model(features)

        # Loss, back-propagation and parameter update.
        batch_loss = criterion(log_probs, targets)
        epoch_loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

    # Report the mean loss of the epoch.
    print('Epoch: {}/{}.............'.format(epoch, n_epochs))
    print("Loss: {}".format(epoch_loss / n_train_batches))

'Epoch: 1/100.............'
'Loss: 0.5554383510890749'
'Epoch: 2/100.............'
'Loss: 0.5400072243203317'
'Epoch: 3/100.............'
'Loss: 0.5333417163975852'
'Epoch: 4/100.............'
'Loss: 0.5258689763199171'
'Epoch: 5/100.............'
'Loss: 0.5231945255665895'
'Epoch: 6/100.............'
'Loss: 0.5148628778598081'
'Epoch: 7/100.............'
'Loss: 0.5099543539885992'
'Epoch: 8/100.............'
'Loss: 0.5139175660891676'
'Epoch: 9/100.............'
'Loss: 0.505844530184499'
'Epoch: 10/100.............'
'Loss: 0.5135514655023815'
'Epoch: 11/100.............'
'Loss: 0.5006406383236098'
'Epoch: 12/100.............'
'Loss: 0.5027646959902672'
'Epoch: 13/100.............'
'Loss: 0.49196123771121186'
'Epoch: 14/100.............'
'Loss: 0.4900650358409424'
'Epoch: 15/100.............'
'Loss: 0.4953741187954884'
'Epoch: 16/100.............'
'Loss: 0.48827745053367144'
'Epoch: 17/100.............'
'Loss: 0.4838273095771998'
'Epoch: 18/100.............'
'Loss: 0.4821291541596608'


KeyboardInterrupt: 

In [79]:
# Store only the parameters (state_dict) of the network in "model.pth".
torch.save(model.state_dict(), "model.pth")

In [82]:
# Switch to inference mode and collect gold / predicted label strings for
# every time step of the validation sentences.
# NOTE(review): the <BOS>/<EOS> steps added by the dataset (labelled 'O')
# are included in these lists and therefore in any metric computed from them.
model.eval()
predicted_labels = []
true_labels = []
with torch.no_grad():
    for i in range(len(valid_dataloader)):
        sentence, tags = valid_dataloader[i]
        # argmax over the label axis; exp() is monotonic and thus unnecessary.
        predicted = torch.argmax(model(sentence), dim=-1)
        # .tolist() yields plain ints, which are valid list indices.
        true_labels.extend(set_of_labels[t] for t in tags.tolist())
        predicted_labels.extend(set_of_labels[p] for p in predicted.tolist())

In [83]:
from sklearn.metrics import classification_report

# `print` was rebound to pprint earlier in this notebook, which renders the
# multi-line report as a tuple of quoted string fragments. The builtin print
# shows it as a plain-text table.
import builtins

builtins.print(classification_report(true_labels, predicted_labels))

('              precision    recall  f1-score   support\n'
 '\n'
 '       I-DAT       0.04      0.00      0.00       880\n'
 '       I-LOC       0.52      0.08      0.14      1193\n'
 '       I-MON       0.70      0.68      0.69       100\n'
 '       I-ORG       0.43      0.07      0.13       910\n'
 '       I-PCT       0.50      0.11      0.18        37\n'
 '       I-PER       0.65      0.14      0.24       571\n'
 '       I-TIM       0.00      0.00      0.00        29\n'
 '       I-TTL       0.00      0.00      0.00         1\n'
 '           O       0.88      0.99      0.93     23365\n'
 '\n'
 '    accuracy                           0.87     27086\n'
 '   macro avg       0.41      0.23      0.26     27086\n'
 'weighted avg       0.81      0.87      0.82     27086\n')


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
