In [1]:
import numpy as np
import os
from copy import deepcopy

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torchcrf import CRF

from gensim.models.fasttext import FastText
from elmoformanylangs import Embedder

In [2]:
device = torch.device("cuda:0")
torch.tensor([1.], device = device)

tensor([1.], device='cuda:0')

In [3]:
NAMED_ENTITIES = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']

## Functions to load data from document

In [4]:
def compute_label(spans, objects, tokens_number):
    type_ent = {'Org' : 'ORG', 'Person' : 'PER', 'Location' : 'LOC', 'LocOrg' : 'LOC'}
    labels_named = []
    labels = []
    spans_list = []
    wait = []  
    
    spans_number = []

    for s in spans:
        num_of_tokens = int(s[5])
        r_tks = s[7:7+num_of_tokens]
        spans_number.append(r_tks)
    
    for tk in tokens_number:
        tk_labels = 'O'
        tk_spans = []
        for i in range(len(spans_number)):
            r_tks = spans_number[i]
            
            if(tk in r_tks and spans[i][1] != 'job'):
                tk_spans.append(deepcopy(spans[i]))
        
        if(len(tk_spans) != 0):
            spans_list.append(tk_spans)
        else:
            spans_list.append(['O'])

    for i in range(len(spans_list)):
        if(len(spans_list[i]) == 1):
            labels_named.append(spans_list[i][0])
        else:
            rm = []
            l2 = deepcopy(spans_list[i])
            
            for j in l2:
                rm.append((j.pop(0), j.pop(0)))
                
            unique_data = [list(x) for x in set(tuple(x) for x in l2)]
            if (len(unique_data) == 1):
                i_unique = l2.index(unique_data[0])
                labels_named.append(spans_list[i][i_unique])
            else:
                i_near = np.arange(i-4, i+4)
                i_near = np.delete(i_near, 4)
                
                for j in range(len(i_near)):
                    if (i_near[j] >= 0 and i_near[j] < len(spans_list)):
                        if (j < 4):
                            to_look = [labels_named[i_near[j]]]
                        else:
                            to_look = spans_list[i_near[j]]

                        corr = []
                        for sp in spans_list[i]:
                            if (sp in to_look):
                                corr.append(deepcopy(sp))

                        if (len(corr) == 1):
                            ind = spans_list[i].index(corr[0])
                            labels_named.append(spans_list[i][ind])
                            break
                            
                        rm = []
                        for c in corr:
                            rm.append(c.pop(1))
                        unique_data = [list(x) for x in set(tuple(x) for x in corr)]                        
                        if(len(unique_data) == 1):
                            i_unique = corr.index(unique_data[0])
                            labels_named.append(spans_list[i][i_unique])
                            break

    ignore = ['job', 'prj_descr', 'prj_name', 'facility_descr', 'geo_adj']                        
    for l in labels_named:
        if (l == 'O' or l[1] in ignore):
            labels.append(0)
        else:
            if (l[1] not in ignore):
                id_obj = l[0]
                l_obj = []

                for o in objects:
                    if(id_obj in o):
                        index_i = o.index('#')
                        obj = o[1:index_i]
                        l_obj.append(obj)

                if (len(l_obj) == 1):
                    if (l_obj[0] in wait):
                        tk_labels = "I-" + type_ent.get(l_obj[0][0])
                    else:
                        tk_labels = "B-" + type_ent.get(l_obj[0][0])
                        wait.append(l_obj[0])

                else:
                    for p in l_obj:
                        if (p in wait):
                            tk_labels = "I-" + type_ent.get(p[0])
                            break
                        else:
                            wait.append(p)
                labels.append(NAMED_ENTITIES.index(tk_labels))
    return labels

In [5]:
def load_data(path, test_data = False):
    list_texts = []
    list_labels = []
    list_books = []
    
    list_tokens_file = list(filter(lambda f : f.endswith('.tokens'), os.listdir(path)))
    list_spans_file = list(filter(lambda f : f.endswith('.spans'), os.listdir(path)))
    list_objects_file = list(filter(lambda f : f.endswith('.objects'), os.listdir(path)))
    
    for i in range(len(list_tokens_file)):
        text_i = []
        tokens_number = []
        token_file = open(path + list_tokens_file[i], 'r', encoding = 'utf-8')
        spans_file = open(path + list_spans_file[i], 'r', encoding = 'utf-8')
        objects_file = open(path + list_objects_file[i], 'r', encoding = 'utf-8')
        
        tokens = list(map(lambda l: l.split(), token_file.readlines()))
        spans = list(map(lambda l : l.split(), spans_file.readlines()))
        objects = list(map(lambda l : l.split(), objects_file.readlines()))
        
        length = 0
        for j in range(len(tokens)):
            if(len(tokens[j]) == 0):
                if(test_data == False):                    
                    list_labels.append(compute_label(spans, objects, tokens_number))
                    
                list_texts.append(text_i)
                text_i = []
                tokens_number = []
            else:
                text_i.append(tokens[j][-1])
                tokens_number.append(tokens[j][0])
                length += 1
        
        if(test_data == True):
            list_books.append((list_tokens_file[i].replace('.tokens', ''), length))
            list_labels.append(compute_label(spans, objects, tokens_number))
        
        token_file.close()
        spans_file.close()
        objects_file.close()
        
    if (test_data == True):
        return list_books, list_texts, list_labels
    else:
        return list_texts, list_labels

In [33]:
entite = {'B-ORG' : 'org', 'I-ORG' : 'org', 'B-PER' : 'per', 'I-PER' : 'per', 'B-LOC' : 'loc', 'I-LOC' : 'loc'}

## Function to create file for evaluation

In [46]:
def create_file_eval(path_dir_test, path_dir_eval, filename, predictions):
    entite_to_file = {'B-ORG' : 'org', 'I-ORG' : 'org', 'B-PER' : 'per', 'I-PER' : 'per', 'B-LOC' : 'loc', 'I-LOC' : 'loc'}
    m = 0
    for i in range(len(filename)):
        name = filename[i][0]
        length = filename[i][1]
        
        text_predict = []
        som = 0
        for k in range(m, len(predictions)):
            som += len(predictions[m])
            text_predict += predictions[m]
            m += 1
            if(som == length):
                break       
        
        token_file = open(path_dir_test + name + '.tokens', 'r', encoding = 'utf-8')
        tokens_b = list(map(lambda l: l.split(), token_file.readlines()))
        tokens = []        
        for t in tokens_b:
            if (t != []):
                tokens.append(t)
        
        eval_f = open(path_dir_eval + name + '.task1', 'w+', encoding = 'utf-8')
        
        to_write = []
        size = 0
        for j in range(len(tokens)):
            label_j = NAMED_ENTITIES[text_predict[j]]

            if (label_j != 'O'):
                if (label_j[0] == 'B'):
                    if(to_write != []):
                        to_write.append(str(size))
                        eval_f.write(to_write[0] + ' ' + to_write[1] + ' '  + to_write[2] + '\n')

                        to_write = []
                        
                    start = tokens[j][1]
                    size = int(tokens[j][2])
                    ent = entite[label_j]
                    
                    to_write.append(ent)
                    to_write.append(start)
                else:
                    ent = entite[label_j]
                    
                    if(to_write != []):
                        if(ent != to_write[0]):
                            to_write.append(str(size))
                            eval_f.write(to_write[0] + ' ' + to_write[1] + ' '  + to_write[2] + '\n')
                            to_write = []
                        else:
                            size += 1
                            size += int(tokens[j][2])
                        
                    if(to_write == []):
                        start = tokens[j][1]
                        size = int(tokens[j][2])
                        to_write.append(ent)
                        to_write.append(start)                
                    
        eval_f.close()

## Implement Dataset

In [7]:
class NERDataset(Dataset):
    def __init__ (self, data, label, model_vect_size):
        self.data = data
        self.label = label
        self.model_vect_size = model_vect_size
        
    def __len__(self):
        return len(self.data)
        
    def __getitem__(self, index):       
        text = self.data[index] 
        seq_len = len(text)
        
        miss_data = 90 - seq_len
        data_to_add = np.zeros((miss_data, self.model_vect_size))        
        text = np.concatenate([text, data_to_add])
        
        mask = np.ones(seq_len)
        mask = np.concatenate([mask, np.zeros(miss_data)])
        
        
        if (self.label != None):
            t_label = self.label[index]
            t_label = np.concatenate([t_label, np.zeros(miss_data)])            
            return text, t_label, mask
        
        else:
            return text, mask

## Bi-LSTM CRF network

In [93]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, embed_size, hidden_dim, num_tag, batch_first = False):
        super(BiLSTM_CRF, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.lstm = nn.LSTM(embed_size, hidden_dim // 2, num_layers = 1, bidirectional = True, 
                            batch_first = batch_first)
        self.hidden2tag = nn.Linear(hidden_dim, num_tag)

        self.crf = CRF(num_tag, batch_first = batch_first)
        
        self.hidden = self.init_hidden(batch_size)
    
    def init_hidden(self, size):
        return (torch.zeros(2, size, self.hidden_dim // 2, device = device),
                torch.zeros(2, size, self.hidden_dim // 2, device = device))

    def forward(self, x, y, mask):
        self.hidden = self.init_hidden(x.shape[0])
        x, _ = self.lstm(x, self.hidden)
        x = self.hidden2tag(x)
        x = self.crf(x, y, mask = mask)
        return -x
    
    def predict(self, x, mask):
        self.hidden = self.init_hidden(x.shape[0])
        x, _ = self.lstm(x, self.hidden)
        x = self.hidden2tag(x)
        return self.crf.decode(x, mask = mask)

## Functions to train and predict

In [9]:
def train_model(model, train_loader, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        for i_step, (x, y, mask) in enumerate(train_loader):
            model.zero_grad()        
            x_gpu = x.to(device, dtype=torch.float)
            y_gpu = y.to(device, dtype=torch.long)
            m_gpu = mask.to(device, dtype=torch.uint8)

            loss = model(x_gpu, y_gpu, m_gpu)

            optimizer.zero_grad()
            loss.backward()        
            optimizer.step() 

def predict(model, test_loader):
    model.eval()
    predictions = []
    with torch.no_grad():
        for i_step, (x, mask) in enumerate(test_loader):
            x_gpu = x.to(device, dtype=torch.float)
            m_gpu = mask.to(device, dtype=torch.uint8)
            
            predict = model.predict(x_gpu, m_gpu)
            predictions += predict
    return predictions

## Loading data

In [10]:
texts_train, labels_train = load_data("data/devset/")
books_name, texts_test, labels_test = load_data("data/testset/", test_data=True)

# Part 1: Using FastText model

### Loading FastText model

In [11]:
fasttext_model = FastText.load_fasttext_format('cc.ru.300.bin')

2019-05-18 22:11:28,424 INFO: loading 2000000 words for fastText model from cc.ru.300.bin
2019-05-18 22:12:28,413 INFO: resetting layer weights
2019-05-18 22:12:28,756 INFO: Total number of ngrams is 0
2019-05-18 22:12:29,022 INFO: Updating model with new vocabulary
2019-05-18 22:12:41,927 INFO: New added 2000000 unique words (50% of original 4000000) and increased the count of 2000000 pre-existing words (50% of original 4000000)
2019-05-18 22:12:53,666 INFO: deleting the raw counts dictionary of 2000000 items
2019-05-18 22:12:53,666 INFO: sample=1e-05 downsamples 6340 most-common words
2019-05-18 22:12:53,675 INFO: downsampling leaves estimated 102264634653 word corpus (100.7% of prior 101572190356)
2019-05-18 22:22:53,789 INFO: loaded (4000000, 300) weight matrix for fastText model from cc.ru.300.bin


### Word to vector function

In [12]:
def gensim_convert_to_vectors(model, list_text):
    list_vect = []
    for sentence in list_text:
        sentence_vect = []
        for word in sentence:
            if word.lower() in model.wv.vocab:
                sentence_vect.append(model.wv[word])
            else:
                sentence_vect.append(np.zeros(model.vector_size))
        list_vect.append(np.array(sentence_vect))
        
    return list_vect

## Preparing data


In [13]:
texts_train1 = deepcopy(texts_train)
labels_train1 = deepcopy(labels_train)

texts_test1 = deepcopy(texts_test)

#### Batch size

In [14]:
batch_size = 100

#### Vectorization

In [15]:
texts_train1 = gensim_convert_to_vectors(fasttext_model, texts_train1)
texts_test1 = gensim_convert_to_vectors(fasttext_model, texts_test1)

#### Creating dataset

In [16]:
texts_dataset_train1 = NERDataset(texts_train1, labels_train1, fasttext_model.vector_size)
texts_train_loader1 = torch.utils.data.DataLoader(texts_dataset_train1, batch_size = batch_size)

texts_dataset_test1 = NERDataset(texts_test1, None, fasttext_model.vector_size)
texts_test_loader1 = torch.utils.data.DataLoader(texts_dataset_test1, batch_size = batch_size)

#### Initializing model

In [102]:
bilstm_crf1 = BiLSTM_CRF(embed_size = fasttext_model.vector_size, 
                         hidden_dim = 10, 
                         num_tag = 7,
                         batch_first = True)

bilstm_crf1.type(torch.cuda.FloatTensor)
bilstm_crf1.to(device)

optimizer1 = optim.RMSprop(bilstm_crf1.parameters(), lr = 1e-2, weight_decay=1e-3)

#### Training and predicting

In [103]:
%%time

train_model(bilstm_crf1, texts_train_loader1, optimizer1, num_epochs = 20)

predictions1 = predict(bilstm_crf1, texts_test_loader1)

create_file_eval('data/testset/', 'data/eval/', books_name, predictions1)

!python scripts/t1_eval.py -s data/testset -t data/eval -l

Failed to load the standard of book_3954:
Unknown mention tag: Facility
Type    P        R        F1       TP1      TP2      In Std.  In Test.
per        0.8721   0.7612   0.8129  1017.75  1017.75     1337     1167
loc        0.7865   0.8487   0.8164  1042.17  1042.17     1228     1325
org        0.7128   0.5543   0.6236   872.43   872.43     1574     1224
overall    0.7891   0.7085   0.7466  2932.34  2932.34     4139     3716
Wall time: 4min 33s


# Part 2: Using ELMo model

Using model from : https://github.com/HIT-SCIR/ELMoForManyLangs

### Loading ELMo model

In [17]:
elmo_model = Embedder('russian.model/')

2019-05-18 22:23:37,848 INFO: char embedding size: 3896
2019-05-18 22:23:39,892 INFO: word embedding size: 329681
2019-05-18 22:23:47,451 INFO: Model(
  (token_embedder): ConvTokenEmbedder(
    (word_emb_layer): EmbeddingLayer(
      (embedding): Embedding(329681, 100, padding_idx=3)
    )
    (char_emb_layer): EmbeddingLayer(
      (embedding): Embedding(3896, 50, padding_idx=3893)
    )
    (convolutions): ModuleList(
      (0): Conv1d(50, 32, kernel_size=(1,), stride=(1,))
      (1): Conv1d(50, 32, kernel_size=(2,), stride=(1,))
      (2): Conv1d(50, 64, kernel_size=(3,), stride=(1,))
      (3): Conv1d(50, 128, kernel_size=(4,), stride=(1,))
      (4): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
      (5): Conv1d(50, 512, kernel_size=(6,), stride=(1,))
      (6): Conv1d(50, 1024, kernel_size=(7,), stride=(1,))
    )
    (highways): Highway(
      (_layers): ModuleList(
        (0): Linear(in_features=2048, out_features=4096, bias=True)
        (1): Linear(in_features=2048, out_fe

### Sentence to vector function
We could give the list of text to the ELMo model but my graphic card is not enough powerful so I am splitting the list into parts and giving each part to the model.

In [18]:
def elmo_convert_to_vectors(model, list_text):
    list_vect = []
    list_sentence = [list_text[15*i:15*(i+1)] for i in range(int(len(list_text)/15) + 1)]
    if (list_sentence[-1] == []):
        list_sentence.pop(-1)

    for sentences in list_sentence:
        sentences_vect = model.sents2elmo(sentences)
        list_vect += sentences_vect
    
    return list_vect

## Preparing data

In [19]:
texts_train2 = deepcopy(texts_train)
labels_train2 = deepcopy(labels_train)

texts_test2 = deepcopy(texts_test)

#### Batch size, Embedding size

In [20]:
batch_size = 100
elmo_embedding_size = elmo_model.config.get('token_embedder').get('filters')[-1][-1]

In [21]:
texts_train2 = elmo_convert_to_vectors(elmo_model, texts_train2)
texts_test2 = elmo_convert_to_vectors(elmo_model, texts_test2)

2019-05-18 22:23:58,046 INFO: 1 batches, avg len: 32.2
2019-05-18 22:24:03,410 INFO: 1 batches, avg len: 19.1
2019-05-18 22:24:05,123 INFO: 1 batches, avg len: 22.3
2019-05-18 22:24:06,390 INFO: 1 batches, avg len: 19.1
2019-05-18 22:24:07,730 INFO: 1 batches, avg len: 22.4
2019-05-18 22:24:08,988 INFO: 1 batches, avg len: 19.1
2019-05-18 22:24:10,340 INFO: 1 batches, avg len: 26.1
2019-05-18 22:24:12,336 INFO: 1 batches, avg len: 17.7
2019-05-18 22:24:13,853 INFO: 1 batches, avg len: 16.3
2019-05-18 22:24:14,829 INFO: 1 batches, avg len: 17.5
2019-05-18 22:24:16,294 INFO: 1 batches, avg len: 22.0
2019-05-18 22:24:18,004 INFO: 1 batches, avg len: 18.1
2019-05-18 22:24:19,407 INFO: 1 batches, avg len: 25.0
2019-05-18 22:24:21,291 INFO: 1 batches, avg len: 31.0
2019-05-18 22:24:23,645 INFO: 1 batches, avg len: 26.5
2019-05-18 22:24:25,471 INFO: 1 batches, avg len: 19.9
2019-05-18 22:24:27,314 INFO: 1 batches, avg len: 16.2
2019-05-18 22:24:28,536 INFO: 1 batches, avg len: 18.3
2019-05-18

2019-05-18 22:27:40,559 INFO: 1 batches, avg len: 23.0
2019-05-18 22:27:42,378 INFO: 1 batches, avg len: 19.7
2019-05-18 22:27:43,819 INFO: 1 batches, avg len: 13.7
2019-05-18 22:27:45,202 INFO: 1 batches, avg len: 15.1
2019-05-18 22:27:46,378 INFO: 1 batches, avg len: 18.0
2019-05-18 22:27:47,724 INFO: 1 batches, avg len: 23.1
2019-05-18 22:27:49,213 INFO: 1 batches, avg len: 23.2
2019-05-18 22:27:51,442 INFO: 1 batches, avg len: 14.6
2019-05-18 22:27:52,882 INFO: 1 batches, avg len: 14.0
2019-05-18 22:27:53,860 INFO: 1 batches, avg len: 22.5
2019-05-18 22:27:55,359 INFO: 1 batches, avg len: 18.5
2019-05-18 22:27:57,385 INFO: 1 batches, avg len: 15.2
2019-05-18 22:27:58,905 INFO: 1 batches, avg len: 17.6
2019-05-18 22:28:00,145 INFO: 1 batches, avg len: 19.7
2019-05-18 22:28:01,757 INFO: 1 batches, avg len: 16.8
2019-05-18 22:28:03,467 INFO: 1 batches, avg len: 27.6
2019-05-18 22:28:05,188 INFO: 1 batches, avg len: 21.8
2019-05-18 22:28:06,918 INFO: 1 batches, avg len: 25.8
2019-05-18

2019-05-18 22:31:33,599 INFO: 1 batches, avg len: 17.5
2019-05-18 22:31:34,665 INFO: 1 batches, avg len: 23.3
2019-05-18 22:31:36,373 INFO: 1 batches, avg len: 22.3
2019-05-18 22:31:38,005 INFO: 1 batches, avg len: 19.9
2019-05-18 22:31:39,557 INFO: 1 batches, avg len: 23.1
2019-05-18 22:31:41,359 INFO: 1 batches, avg len: 28.7
2019-05-18 22:31:43,399 INFO: 1 batches, avg len: 24.9
2019-05-18 22:31:44,872 INFO: 1 batches, avg len: 19.7
2019-05-18 22:31:46,125 INFO: 1 batches, avg len: 19.2
2019-05-18 22:31:47,915 INFO: 1 batches, avg len: 29.9
2019-05-18 22:31:50,078 INFO: 1 batches, avg len: 21.0
2019-05-18 22:31:51,630 INFO: 1 batches, avg len: 18.4
2019-05-18 22:31:53,010 INFO: 1 batches, avg len: 27.7
2019-05-18 22:31:54,923 INFO: 1 batches, avg len: 31.5
2019-05-18 22:31:57,449 INFO: 1 batches, avg len: 23.4
2019-05-18 22:31:59,141 INFO: 1 batches, avg len: 20.0
2019-05-18 22:32:00,882 INFO: 1 batches, avg len: 20.5
2019-05-18 22:32:02,096 INFO: 1 batches, avg len: 21.4
2019-05-18

#### Creating dataset

In [22]:
texts_dataset_train2 = NERDataset(texts_train2, labels_train2, elmo_embedding_size)
texts_train_loader2 = torch.utils.data.DataLoader(texts_dataset_train2, batch_size = batch_size)

texts_dataset_test2 = NERDataset(texts_test2, None, elmo_embedding_size)
texts_test_loader2 = torch.utils.data.DataLoader(texts_dataset_test2, batch_size = batch_size)

#### Initializing model

In [98]:
bilstm_crf2 = BiLSTM_CRF(embed_size = elmo_embedding_size, 
                         hidden_dim = 10, 
                         num_tag = 7,
                         batch_first = True)

bilstm_crf2.type(torch.cuda.FloatTensor)
bilstm_crf2.to(device)

optimizer2 = optim.RMSprop(bilstm_crf2.parameters(), lr = 1e-2, weight_decay=1e-3)

#### Training and predicting

In [99]:
%%time

train_model(bilstm_crf2, texts_train_loader2, optimizer2, num_epochs = 20)

prediction2 = predict(bilstm_crf2, texts_test_loader2)

create_file_eval('data/testset/', 'data/eval/', books_name, prediction2)

!python scripts/t1_eval.py -s data/testset -t data/eval -l

Failed to load the standard of book_3954:
Unknown mention tag: Facility
Type    P        R        F1       TP1      TP2      In Std.  In Test.
per        0.9118   0.8621   0.8863  1155.25  1155.25     1340     1267
loc        0.9039   0.8627   0.8828  1058.50  1058.50     1227     1171
org        0.7350   0.6486   0.6891  1020.88  1020.88     1574     1389
overall    0.8452   0.7811   0.8119  3234.63  3234.63     4141     3827
Wall time: 5min 28s
