In [1]:
import pandas as pd
dataset = pd.read_csv('NER dataset.csv', encoding='unicode_escape')

In [2]:
# prepare train fastmodel
sentences = [""]
last_id = 1
for id, word in dataset[['Sentence #', 'Word']].iloc:
    if not pd.isnull(id):
        id = int(id[9:])
        if last_id != id:
            sentences[last_id - 1] += '\n'
            sentences.append("")
            last_id = id
    sentences[last_id - 1] += ' ' + str(word)

In [3]:
out = open('dataset.txt', 'w')
out.writelines(sentences)
out.close()

In [4]:
from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.test.utils import datapath

# Set file names for train and test data
corpus_file = 'dataset.txt'

model = FastText(vector_size=100)

# build the vocabulary
model.build_vocab(corpus_file=corpus_file)

# train the model
model.train(
    corpus_file=corpus_file, epochs=model.epochs,
    total_examples=model.corpus_count, total_words=model.corpus_total_words,
)

print(model)


<gensim.models.fasttext.FastText object at 0x7f6539bb6210>


In [5]:
model.save("model.model")

In [6]:
data_for_training = [[]]
last_id = 1
for id, word, tag in dataset[['Sentence #', 'Word', 'Tag']].iloc:
    if not pd.isnull(id):
        id = int(id[9:])
        if last_id != id:
            data_for_training.append([])
            last_id = id
    data_for_training[last_id - 1].append([str(word), tag])


In [7]:
max_text_width = max(map(lambda it: len(it), data_for_training))
print(f'Maximal word number in sentence is {max_text_width}.')

'Maximal word number in sentence is 104.'


In [8]:
from gensim.models.fasttext import FastText

fasttext_model_name = "model.model"

fasttext_model = FastText.load('model.model')

In [9]:
set_of_labels = set()
for cur_sample in data_for_training:
    set_of_labels |= set(map(lambda it: it[1], cur_sample))
set_of_labels -= {'O'}
set_of_labels = ['O'] +  sorted(list(set_of_labels))
set_of_labels

['O',
 'B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim']

In [10]:
import os
import pandas as pd
from torch.utils.data import Dataset
import random
from tqdm.notebook import tqdm
import numpy as np

class TrainsetGenerator(Dataset):
    def __init__(self, training_set,
                 classes_list,
                 used_fasttext_model: FastText, batch_size: int):
        self.training_set = sorted(training_set, key=lambda it: len(it))
        self.classes_list = classes_list
        self.batch_size = batch_size
        self.vocabulary_ = {'<BOS>': 0, '<EOS>': 1}
        n_words = 2
        classes_set = set(self.classes_list)
        print(f'Dictionary of unique words is created...')
        for cur_sent in tqdm(self.training_set):
            for cur_word, cur_label in cur_sent:
                if cur_word not in self.vocabulary_:
                    self.vocabulary_[cur_word] = n_words
                    n_words += 1
                if cur_label not in classes_set:
                    err_msg_ = f'Label "{cur_label}" is unknown!'
                    raise ValueError(err_msg_)
        del classes_set
        print(f'There are {n_words - 2} unique words in the training data!')
        all_words = sorted(list(self.vocabulary_.keys()))
        word_vector = used_fasttext_model.wv[all_words[0]].astype(np.float64)
        word_vector /= np.linalg.norm(word_vector)
        word_vector = word_vector.astype(np.float32)
        word_idx = self.vocabulary_[all_words[0]]
        self.vector_size_ = word_vector.shape[0] + 2
        self.matrix_ = np.zeros((n_words, self.vector_size_), dtype=np.float32)
        self.matrix_[0, self.vector_size_ - 2] = 1.0
        self.matrix_[1, self.vector_size_ - 1] = 1.0
        self.matrix_[word_idx, 0:(self.vector_size_ - 2)] = word_vector
        del word_vector
        print(f'Vectors of unique words are calculated...')
        for cur_word in tqdm(all_words[1:]):
            word_vector = used_fasttext_model.wv[cur_word].astype(np.float64)
            word_vector /= np.linalg.norm(word_vector)
            word_vector = word_vector.astype(np.float32)
            word_idx = self.vocabulary_[cur_word]
            self.matrix_[word_idx, 0:(self.vector_size_ - 2)] = word_vector
            del word_vector
        del all_words

    def __len__(self):
        return (len(self.training_set) // self.batch_size)

    def __getitem__(self, batch_idx):
        batch_start = batch_idx * self.batch_size
        batch_end = min(len(self.training_set), batch_start + self.batch_size)
        max_text_len = max(map(
            lambda it: len(it),
            self.training_set[batch_start:batch_end]
        ))
        max_text_len += 2
        batch_x = np.zeros(
            (batch_end - batch_start, max_text_len, self.vector_size_),
            dtype=np.float32
        )
        batch_y = np.zeros(
            (batch_end - batch_start, max_text_len),
            dtype=np.int64
        )
        for sent_idx, cur_sent in enumerate(self.training_set[batch_start:batch_end]):
            time_idx = 0
            word_idx = self.vocabulary_['<BOS>']
            batch_x[sent_idx, time_idx] = self.matrix_[word_idx]
            class_idx = self.classes_list.index('O')
            batch_y[sent_idx, time_idx] = class_idx
            for time_idx, (cur_word, cur_label) in enumerate(cur_sent):
                word_idx = self.vocabulary_[cur_word]
                batch_x[sent_idx, time_idx + 1] = self.matrix_[word_idx]
                class_idx = self.classes_list.index(cur_label)
                batch_y[sent_idx, time_idx + 1] = class_idx
            time_idx = len(cur_sent) + 1
            word_idx = self.vocabulary_['<EOS>']
            batch_x[sent_idx, time_idx] = self.matrix_[word_idx]
            class_idx = self.classes_list.index('O')
            batch_y[sent_idx, time_idx] = class_idx
        return torch.tensor(batch_x), torch.tensor(batch_y)

random.shuffle(data_for_training)
n_valid = int(round(0.2 * len(data_for_training)))
n_train = len(data_for_training) - n_valid
train_dataloader = TrainsetGenerator(data_for_training[:n_train], set_of_labels, fasttext_model, batch_size=512)
valid_dataloader = TrainsetGenerator(data_for_training[n_train:], set_of_labels, fasttext_model, batch_size=16)

'Dictionary of unique words is created...'


  0%|          | 0/38367 [00:00<?, ?it/s]

'There are 31886 unique words in the training data!'
'Vectors of unique words are calculated...'


  0%|          | 0/31887 [00:00<?, ?it/s]

'Dictionary of unique words is created...'


  0%|          | 0/9592 [00:00<?, ?it/s]

'There are 17207 unique words in the training data!'
'Vectors of unique words are calculated...'


  0%|          | 0/17208 [00:00<?, ?it/s]

In [16]:
import torch

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.rnn = nn.LSTM(102, 128, batch_first=True, bidirectional=True)
        self.cls = nn.Linear(256, len(set_of_labels))

    def forward(self, x):
        rnn_out, _ = self.rnn(x)
        return self.cls(rnn_out)

model = Model()
model.to('cuda')

Model(
  (rnn): LSTM(102, 128, batch_first=True, bidirectional=True)
  (cls): Linear(in_features=256, out_features=17, bias=True)
)

In [17]:
n_epochs = 200
lr=0.005

# Define Loss, Optimizer

# We'll also set the model to the device that we defined earlier (default is CPU)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [18]:
best_loss = 10
state_dict = {}
for epoch in range(1, n_epochs + 1):
    loss_number = 0
    for i in range(len(train_dataloader)):
        sentence, tags = train_dataloader[i]
        sentence, tags = sentence.to('cuda'), tags.to('cuda')
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.

        # Step 3. Run our forward pass.
        tag_scores = model(sentence)
        

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = criterion(tag_scores.view(-1,len(set_of_labels)), tags.view(-1))
        loss_number += loss.item()
        loss.backward()
        optimizer.step()
    val_loss = 0
    with torch.no_grad():
        for i in range(len(valid_dataloader)):
            x, y = valid_dataloader[i]
            x, y = x.to('cuda'), y.to('cuda')
            output = model(x)
            loss = criterion(output.view(-1,len(set_of_labels)), y.view(-1))
            val_loss += loss.item()
        val_loss /= len(valid_dataloader)
    if best_loss > val_loss:
        state_dict = model.state_dict()
        best_loss = val_loss
        
    if epoch%1 == 0:
        print('Epoch: {}/{}.............'.format(epoch, n_epochs))
        print("Loss: {} Valid: {}".format(loss_number/len(train_dataloader), val_loss))
    loss_number = 0

'Epoch: 1/200.............'
'Loss: 0.7390093396644335 Valid: 0.42496255211818196'
'Epoch: 2/200.............'
'Loss: 0.2787059611968092 Valid: 0.21513677754117572'
'Epoch: 3/200.............'
'Loss: 0.19281328429241437 Valid: 0.1861499468166263'
'Epoch: 4/200.............'
'Loss: 0.17183759828677048 Valid: 0.1691195695087289'
'Epoch: 5/200.............'
'Loss: 0.1592262248332436 Valid: 0.15893687764944536'
'Epoch: 6/200.............'
'Loss: 0.15001546067965998 Valid: 0.15146216019391018'
'Epoch: 7/200.............'
'Loss: 0.14256515303576314 Valid: 0.14543563564726228'
'Epoch: 8/200.............'
'Loss: 0.13629456116138278 Valid: 0.14028495802965407'
'Epoch: 9/200.............'
'Loss: 0.13088562702004974 Valid: 0.1359817224223546'
'Epoch: 10/200.............'
'Loss: 0.12611031471877485 Valid: 0.13253614204766256'
'Epoch: 11/200.............'
'Loss: 0.12176967935787665 Valid: 0.12952024194230022'
'Epoch: 12/200.............'
'Loss: 0.11778036857376227 Valid: 0.12690466368113815'
'Epoch:

KeyboardInterrupt: 

In [None]:
torch.save(model.state_dict(), "model.pth")

In [21]:
model.load_state_dict(state_dict)
model.eval()
predicted_labels = []
true_labels = []
with torch.no_grad():
    for i in range(len(valid_dataloader)):
        sentence, tags = valid_dataloader[i]
        sentence, tags = sentence.to('cuda'), tags.to('cuda')
        predicted = np.argmax(model(sentence).cpu(), axis=2)
        for p, t in zip(predicted, tags):
            for p1, t1 in zip(p, t):
                true_labels.append(set_of_labels[t1])
                predicted_labels.append(set_of_labels[p1])

In [22]:
from sklearn.metrics import classification_report

print(classification_report(true_labels, predicted_labels))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


('              precision    recall  f1-score   support\n'
 '\n'
 '       B-art       0.00      0.00      0.00        84\n'
 '       B-eve       0.76      0.22      0.34        60\n'
 '       B-geo       0.82      0.88      0.85      7416\n'
 '       B-gpe       0.92      0.88      0.90      3220\n'
 '       B-nat       0.80      0.08      0.15        50\n'
 '       B-org       0.71      0.63      0.66      4025\n'
 '       B-per       0.81      0.73      0.77      3480\n'
 '       B-tim       0.93      0.86      0.89      4036\n'
 '       I-art       0.00      0.00      0.00        58\n'
 '       I-eve       0.50      0.08      0.14        49\n'
 '       I-geo       0.79      0.68      0.73      1497\n'
 '       I-gpe       0.89      0.39      0.54        41\n'
 '       I-nat       0.00      0.00      0.00        11\n'
 '       I-org       0.73      0.69      0.71      3298\n'
 '       I-per       0.79      0.86      0.82      3483\n'
 '       I-tim       0.81      0.72      0.76     

  _warn_prf(average, modifier, msg_start, len(result))
