In [44]:
import os
import xml.etree.ElementTree as ET
from ipymarkup import show_box_markup
from ipymarkup.palette import palette, BLUE, RED, GREEN, ORANGE, BROWN, GREY, PURPLE
import sys
from brat_format import read_file
import os
import torch
import torch.nn as nn
import torch.optim as optim
import time
import numpy as np

def read_text_with_markup(directory):
    # В функции read_file заменила порядок элементов в entity-tuple для применения show_box_markup
    from brat_format import read_file
    file = read_file(directory+'.ann')
    text = file.txt_data
    spans = file.ners
    return text, spans

# Расчет F1
def calc_ner_f1(docs, predicted_labels):
    total_tp = 0
    total_fp = 0
    total_fn = 0

    for i in range(len(docs)):
        true_ners = docs[i].spans
        spans_predicted = get_spans(predicted_labels[i], docs[i].tokens)
        pred_ners = sorted(spans_predicted, key=lambda x: x[1])
        tp, fp, fn = cacl_ner_tp_fp_fn(true_ners, pred_ners)

        total_tp += tp
        total_fp += fp
        total_fn += fn

    precision, recall = compute_precision_and_recall(total_tp, total_fp, total_fn)

    f_measure = 2 * precision * recall / (precision + recall)
    
    return f_measure


#os.chdir(os.getcwd()+'/RuREBus-master/eval_scripts')
def compute_precision_and_recall(true_positive, false_positive, false_negative):
    """
    Вычисляем точность и полноту по TP, FP и FN
    """
    if false_positive + true_positive > 0:
        precision = float(true_positive) / (true_positive + false_positive)
    else:
        precision = 0
    if false_negative + true_positive > 0:
        recall = float(true_positive) / (true_positive + false_negative)
    else:
        recall = 0
    return recall, precision


def cacl_ner_tp_fp_fn(true_ners, pred_ners):
    # A - концы true, B - концы red, С - совпадение
    # C C  = TP + 1
    # A A B B | A B C | A B A B = FN + 1, advance A
    # B B A A | B A C | B A B A = FP + 1, advance B
    # C A B | C B A = FP + 1, FN + 1, advance A and B

    true_positive = 0
    false_positive = 0
    false_negative = 0

    i = 0
    j = 0
    while i != len(true_ners) and j != len(pred_ners):
        if true_ners[i] == pred_ners[j]:
            true_positive += 1
            i += 1
            j += 1
            continue
        if true_ners[i][0] >= pred_ners[j][1]:
            false_positive += 1
            j += 1
            continue
        if true_ners[i][1] <= pred_ners[j][0]:
            false_negative += 1
            i += 1
            continue
        if true_ners[i][0] < pred_ners[j][0]:
            false_negative += 1
            i += 1
            continue
        if true_ners[i][0] > pred_ners[j][0]:
            false_positive += 1
            j += 1
            continue

        false_positive += 1
        false_negative += 1
        j += 1
        i += 1

    false_negative += len(true_ners) - i
    false_positive += len(pred_ners) - j

    return true_positive, false_positive, false_negative


def get_next_gen_batch(samples, max_seq_len=1500, max_char_seq_len=40, batch_size=32):
    indices = np.arange(len(samples))
    np.random.shuffle(indices)
    batch_begin = 0
    while batch_begin < len(samples):
        batch_indices = indices[batch_begin: batch_begin + batch_size]
        batch = []
        batch_labels = []
        batch_max_len = 0
        for data_ind in batch_indices:
            sample = samples[data_ind]
            inputs = []
            for token in sample.tokens[:max_seq_len]:
                chars = [char_set.index(ch) if ch in char_set else char_set.index("<unk>") for ch in token.text][:max_char_seq_len]
                chars += [0] * (max_char_seq_len - len(chars))
                inputs.append(chars)
            batch_max_len = max(batch_max_len, len(inputs))
            inputs += [[0]*max_char_seq_len] * (max_seq_len - len(inputs))
            batch.append(inputs)
            labels = sample.labels[:max_seq_len]
            labels += [0] * (max_seq_len - len(labels))
            batch_labels.append(labels)
        batch_begin += batch_size
        batch = torch.cuda.LongTensor(batch)[:, :batch_max_len]
        labels = torch.cuda.LongTensor(batch_labels)[:, :batch_max_len]
        yield batch_indices, batch, labels


def train_gen_model(model, train_samples, val_samples, epochs_count=10, 
                    loss_every_nsteps=1000, lr=0.01, batch_size=32, save_path="model.pt", device_name="cuda",
                    early_stopping=True, patience = 5):
    params_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("Trainable params: {}".format(params_count))
    device = torch.device(device_name)
    model = model.to(device)
    total_loss = 0
    train_batch_count = 0
    skipped = 0
    start_time = time.time()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_function = nn.CrossEntropyLoss().cuda()
    prev_avg_val_loss = None
    for epoch in range(epochs_count):
        model.train()
        for step, (_, batch, batch_labels) in enumerate(get_next_gen_batch(train, batch_size = batch_size)):
            logits = model(batch) # Прямой проход
            logits = logits.transpose(1, 2)
            loss = loss_function(logits, batch_labels) # Подсчёт ошибки
            loss_batch = loss.to("cpu").item()
            loss.backward() # Подсчёт градиентов dL/dw
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
            optimizer.step() # Градиентный спуск или его модификации (в данном случае Adam)
            optimizer.zero_grad() # Зануление градиентов, чтобы их спокойно менять на следующей итерации
            total_loss += loss.detach().item()
            train_batch_count += 1
        val_total_loss = 0
        val_batch_count = 0
        model.eval()
        for _, (_, batch, batch_labels) in enumerate(get_next_gen_batch(val)):
            logits = model(batch) # Прямой проход
            logits = logits.transpose(1, 2)
            val_total_loss += loss_function(logits, batch_labels).detach().item() # Подсчёт ошибки
            val_batch_count += 1
        avg_val_loss = val_total_loss/val_batch_count
        print("Epoch = {}, Avg Train Loss = {:.4f}, Avg val loss = {:.4f}, Time = {:.2f}s".format(epoch, total_loss / train_batch_count, avg_val_loss, time.time() - start_time))
        total_loss = 0
        start_time = time.time()
        
        if prev_avg_val_loss is not None and avg_val_loss > prev_avg_val_loss:
            skipped +=1
        else:
            skipped = 0

        if early_stopping and skipped > patience:
            model.load_state_dict(torch.load(save_path))
            model.eval()
            break
        prev_avg_val_loss = avg_val_loss       
        torch.save(model.state_dict(), save_path)
        
def predict(model, samples):
    device = torch.device("cuda")
    model = model.to(device)
    model.eval()
    true_labels = []
    predicted_labels = []
    all_indices = []
    for _, (indices, batch, batch_labels) in enumerate(get_next_gen_batch(samples)):
        logits = model(batch)
        plabels = logits.max(dim=2)[1]
        true_labels.extend(list(batch_labels.cpu().detach().numpy()))
        predicted_labels.extend(list(plabels.cpu().detach().numpy()))
        all_indices.extend(indices)
    samples = [samples[index] for index in all_indices]
    return samples, true_labels, predicted_labels

In [2]:
# кодировка Label по BIO схеме
def make_sample(data):
    from razdel import tokenize
    from collections import namedtuple
    Sample = namedtuple("Sample", "text,tokens,spans,labels")
    
    samples = []
    for text, spans in data:
        labels = []
        tokens = list(tokenize(text))
        for token in tokens: # для каждого токета проходим по всем спанам
            label = 0
            for span in spans:
                if (span[2]=='BIN'):
                    if token.start == span[0]:
                        label = 1
                    elif token.start > span[0] and token.stop <= span[1]:
                        label = 2
                elif (span[2]=='ECO'):
                    if token.start == span[0]:
                        label = 3
                    elif token.start > span[0] and token.stop <= span[1]:
                        label = 4  
                elif (span[2]=='INST'):
                    if token.start == span[0]:
                        label = 5
                    elif token.start > span[0] and token.stop <= span[1]:
                        label = 6  
                elif (span[2]=='CMP'):
                    if token.start == span[0]:
                        label = 7
                    elif token.start > span[0] and token.stop <= span[1]:
                        label = 8  
                elif (span[2]=='ACT'):
                    if token.start == span[0]:
                        label = 9
                    elif token.start > span[0] and token.stop <= span[1]:
                        label = 10  
                elif (span[2]=='MET'):
                    if token.start == span[0]:
                        label = 11
                    elif token.start > span[0] and token.stop <= span[1]:
                        label = 12  
                elif (span[2]=='QUA'):
                    if token.start == span[0]:
                        label = 13
                    elif token.start > span[0] and token.stop <= span[1]:
                        label = 14  
                elif (span[2]=='SOC'):
                    if token.start == span[0]:
                        label = 15
                    elif token.start > span[0] and token.stop <= span[1]:
                        label = 16  
                    
            labels.append(label)
        sample = Sample(text, tokens, spans, labels)
        samples.append(sample)
    return(samples)

# Расшифровка label
def get_spans(labels, tokens):

    # Исправление неконсистентности
    labels_new = []
    subs = 0
    for i, x in enumerate(labels[:-1]):
        if i > 0: #если значение не нулевое, четное, а до этого не шло нечетное, то
            if (x%2!=0) & ((labels[i+1]==x+1) | (labels[i+1]==x)): #(x%2==0) & (x>0) & (labels[i-1]!=x-1):
                labels_new.append(x) #добавляем недостающее нечетное
                subs = 1
            elif (x%2!=0) & (subs == 1): 
                labels_new.append(x+1)
            elif (x%2!=0) | (x==0):
                labels_new.append(x)
                subs = 0
            elif (x%2==0) & (subs==1) & (labels[i+1]==x):
                labels_new.append(x)
            elif (x%2==0) & (subs==1) & (labels[i+1]!=x):
                labels_new.append(x)
                subs = 0
            elif (x%2==0) & (subs==0):
                labels_new.append(x-1)
                subs = 1
        else:
            if (x%2==0) & (x>0):
                labels_new.append(x-1) #если первый элемент четный, заменяем его на нечетный
            else:
                labels_new.append(x)
    labels_new.append(labels[-1])

    spans = []
    for i, label in enumerate(labels_new):
        if label == 1:
            spans.append((tokens[i].start, tokens[i].stop, "BIN"))
        elif label == 2:
            spans[-1] = (spans[-1][0], tokens[i].stop, spans[-1][-1])
        elif label == 3:
            spans.append((tokens[i].start, tokens[i].stop, "ECO"))
        elif label == 4:
            spans[-1] = (spans[-1][0], tokens[i].stop, spans[-1][-1])
        elif label == 5:
            spans.append((tokens[i].start, tokens[i].stop, "INST"))
        elif label == 6:
            spans[-1] = (spans[-1][0], tokens[i].stop, spans[-1][-1])
        elif label == 7:
            spans.append((tokens[i].start, tokens[i].stop, "CMP"))
        elif label == 8:
            spans[-1] = (spans[-1][0], tokens[i].stop, spans[-1][-1])
        elif label == 9:
            spans.append((tokens[i].start, tokens[i].stop, "ACT"))
        elif label == 10:
            spans[-1] = (spans[-1][0], tokens[i].stop, spans[-1][-1])
        elif label == 11:
            spans.append((tokens[i].start, tokens[i].stop, "MET"))
        elif label == 12:
            spans[-1] = (spans[-1][0], tokens[i].stop,  spans[-1][-1])
        elif label == 13:
            spans.append((tokens[i].start, tokens[i].stop, "QUA"))
        elif label == 14:
            spans[-1] = (spans[-1][0], tokens[i].stop, spans[-1][-1])
        elif label == 15:
            spans.append((tokens[i].start, tokens[i].stop, "SOC"))
        elif label == 16:
            spans[-1] = (spans[-1][0], tokens[i].stop, spans[-1][-1])            
    return spans           

### Подготовка данных

#### Чтение train

In [4]:
paths = []

for i in range(1,4):
#    directory = 'C:/Users/s-ir/Documents/DS/NLP_advanced/week4/part2/RuREBus-master/train_data/train_part_'+str(i)+'/'
    directory = 'D:/NLP_advanced/week4/part2/RuREBus-master/train_data/train_part_'+str(i)+'/'

    for sample_name in os.listdir(directory):
        sample_path = os.path.join(directory, sample_name[:-4])
        paths.append(sample_path)

paths = list(set(paths))

data_train = []
for sample_name in paths:
    data_train.append(read_text_with_markup(sample_name))
    
len(data_train)

188

#### Чтение test

In [5]:
paths = []

for i in range(1,4):
#    directory = 'C:/Users/s-ir/Documents/DS/NLP_advanced/week4/part2/RuREBus-master/test_data/'
    directory = 'D:/NLP_advanced/week4/part2/RuREBus-master/test_data/'

    for sample_name in os.listdir(directory):
        sample_path = os.path.join(directory, sample_name[:-4])
        paths.append(sample_path)

paths = list(set(paths))

data_test = []
for sample_name in paths:
    data_test.append(read_text_with_markup(sample_name))
    
len(data_test)

544

In [7]:
show_box_markup(data_train[0][0], data_train[0][1], palette=palette(BIN=BLUE, SOC=RED,ECO=GREEN,INST=PURPLE,CMP=BROWN, ACT=ORANGE,MET=RED,QUA=BLUE))

#### Train, val, test

In [6]:
import random
import pandas as pd
random.seed(42)

samples = make_sample(data_train)
random.shuffle(samples)
train = samples.copy()[:120]
val = samples.copy()[120:]
test = make_sample(data_test)

#### Словарь символов

In [7]:
#char_set = ["<pad>", "<unk>"] + list({ch for sample in samples for token in sample.tokens for ch in token.text})
char_set = ['<pad>', '<unk>', '8', 'д', 'ы', '»', ';', 'Н', 'М', '\uf0b7', '7', 'x', 't', 'у', 'O', 'M', 'б', 'S', 'i', '─', 'Д', 'Ö', 'ч', 'B', '‰', 'Я', '│', 'a', 'м', '(', ')', '"', '*', 'z', 'H', 'A', 'ъ', ':', 'п', '3', 'Ч', 'f', 'с', 'Z', 'ш', '├', 'ф', 'Ь', 'ю', 'Й', 'Ш', 'W', '└', '1', 'k', 'җ', 'л', 'Ж', 'K', '“', 'И', 'І', '┌', 'r', '┬', 'Ы', 'g', '0', '┘', 'р', 'E', '9', '×', 'Ф', '…', 'я', '/', 'ә', 'N', 'х', 'Ъ', 'p', 'з', 'ь', '2', 'о', 'ё', 'Б', '+', '@', 'Э', '£', '∑', '┼', '■', '┤', 'Σ', '4', 'F', 'О', 'T', '=', 'Г', 'ц', 'n', 'j', 'h', 'b', 'l', 'Ю', '{', 'X', 'В', 'd', '>', '6', 's', 'c', 'P', 'е', 'o', 'А', 'й', 'Т', 'V', '•', 'v', '5', '№', 'm', 'щ', 'L', 'I', 'u', 'R', 'Л', 'и', 'Х', 'К', '}', 'в', 'Ә', 'У', 'т', 'e', '.', 'ж', 'y', 'к', 'э', '_', 'З', 'Ё', 'Ц', 'Р', 'Е', ']', '«', 'а', '-', 'П', 'w', 'Щ', 'D', ',', '[', '<', '!', 'г', '–', 'н', 'С', '”', '┐', 'U', '%', '—', 'G', 'C']
print(char_set)

['<pad>', '<unk>', '8', 'д', 'ы', '»', ';', 'Н', 'М', '\uf0b7', '7', 'x', 't', 'у', 'O', 'M', 'б', 'S', 'i', '─', 'Д', 'Ö', 'ч', 'B', '‰', 'Я', '│', 'a', 'м', '(', ')', '"', '*', 'z', 'H', 'A', 'ъ', ':', 'п', '3', 'Ч', 'f', 'с', 'Z', 'ш', '├', 'ф', 'Ь', 'ю', 'Й', 'Ш', 'W', '└', '1', 'k', 'җ', 'л', 'Ж', 'K', '“', 'И', 'І', '┌', 'r', '┬', 'Ы', 'g', '0', '┘', 'р', 'E', '9', '×', 'Ф', '…', 'я', '/', 'ә', 'N', 'х', 'Ъ', 'p', 'з', 'ь', '2', 'о', 'ё', 'Б', '+', '@', 'Э', '£', '∑', '┼', '■', '┤', 'Σ', '4', 'F', 'О', 'T', '=', 'Г', 'ц', 'n', 'j', 'h', 'b', 'l', 'Ю', '{', 'X', 'В', 'd', '>', '6', 's', 'c', 'P', 'е', 'o', 'А', 'й', 'Т', 'V', '•', 'v', '5', '№', 'm', 'щ', 'L', 'I', 'u', 'R', 'Л', 'и', 'Х', 'К', '}', 'в', 'Ә', 'У', 'т', 'e', '.', 'ж', 'y', 'к', 'э', '_', 'З', 'Ё', 'Ц', 'Р', 'Е', ']', '«', 'а', '-', 'П', 'w', 'Щ', 'D', ',', '[', '<', '!', 'г', '–', 'н', 'С', '”', '┐', 'U', '%', '—', 'G', 'C']


#### MLP

In [None]:
class SuperSimpleModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=16, classes_count=17, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.out_layer = nn.Linear(char_max_seq_len * char_embedding_dim, classes_count)

    def forward(self, inputs):
        projections = self.embeddings_layer.forward(inputs)
        projections = projections.reshape(projections.size(0), projections.size(1), -1)
        output = self.out_layer.forward(projections)
        return output

In [9]:
model = SuperSimpleModel(len(char_set))
train_gen_model(model, train, val, epochs_count=22, early_stopping=True, lr=0.01, patience = 5, save_path="model_mlp1.pt")

Trainable params: 13761
Epoch = 0, Avg Train Loss = 2.5638, Avg val loss = 1.6414, Time = 3.30s
Epoch = 1, Avg Train Loss = 0.8046, Avg val loss = 1.3461, Time = 3.07s
Epoch = 2, Avg Train Loss = 0.4632, Avg val loss = 1.3075, Time = 3.08s
Epoch = 3, Avg Train Loss = 0.3205, Avg val loss = 1.1655, Time = 3.00s
Epoch = 4, Avg Train Loss = 0.2396, Avg val loss = 1.1451, Time = 3.08s
Epoch = 5, Avg Train Loss = 0.1923, Avg val loss = 1.0755, Time = 3.07s
Epoch = 6, Avg Train Loss = 0.1603, Avg val loss = 0.9920, Time = 3.07s
Epoch = 7, Avg Train Loss = 0.1365, Avg val loss = 1.2542, Time = 3.00s
Epoch = 8, Avg Train Loss = 0.1221, Avg val loss = 1.0363, Time = 3.07s
Epoch = 9, Avg Train Loss = 0.1076, Avg val loss = 1.0739, Time = 3.07s
Epoch = 10, Avg Train Loss = 0.0963, Avg val loss = 1.0239, Time = 3.00s
Epoch = 11, Avg Train Loss = 0.0876, Avg val loss = 1.0704, Time = 3.08s
Epoch = 12, Avg Train Loss = 0.0817, Avg val loss = 1.0614, Time = 3.07s
Epoch = 13, Avg Train Loss = 0.0753, 

In [13]:
model = SuperSimpleModel(len(char_set))
model.load_state_dict(torch.load("model_mlp1.pt"))

docs, true_labels, predicted_labels = predict(model, test)
calc_ner_f1(docs, true_labels, predicted_labels)

0.179334643693543

In [20]:
show_box_markup(docs[100].text, get_spans(true_labels[100], docs[100].tokens),
                palette=palette(BIN=BLUE,SOC=RED,ECO=GREEN,INST=PURPLE,CMP=BROWN,ACT=ORANGE,MET=RED,QUA=BLUE))

#### RNN

In [51]:
class LstmModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=8, classes_count=17, lstm_embedding_dim=32, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.dropout = nn.Dropout(0.5)
        self.lstm_layer = nn.LSTM(char_embedding_dim * char_max_seq_len, lstm_embedding_dim // 2, batch_first=True, bidirectional=True)
        self.out_layer = nn.Linear(lstm_embedding_dim, classes_count)

    
    def forward(self, x_in, apply_softmax=False):
        from torch.nn import functional as F 
        """ Прямой проход модели
         Аргументы:
         x_in (torch.Tensor): тензор входных данных
         Значение x_in.shape должно быть (batch, input_dim)
         apply_softmax (bool): флаг для многомерной логистической функции
         активации во время обучения должен равняться 0
         Возвращает:
         итоговый тензор. Значение tensor.shape должно быть (batch, output_dim)
         """
        projections = self.embeddings_layer(x_in)
        projections = projections.reshape(projections.size(0), projections.size(1), -1)
        y_out, _ = self.lstm_layer(projections)
        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)
        y_out = self.out_layer(F.tanh(F.dropout(y_out, p=0.5)))
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)
        return y_out


In [54]:
model = LstmModel(len(char_set))
train_gen_model(model, train, val, epochs_count=100, early_stopping=True, lr=0.01, batch_size=32, patience = 5, save_path="model_lstm.pt")

Trainable params: 45257
Epoch = 0, Avg Train Loss = 2.4263, Avg val loss = 2.0602, Time = 4.51s
Epoch = 1, Avg Train Loss = 0.9861, Avg val loss = 1.8088, Time = 4.59s
Epoch = 2, Avg Train Loss = 0.5705, Avg val loss = 1.6403, Time = 4.60s
Epoch = 3, Avg Train Loss = 0.3911, Avg val loss = 1.4765, Time = 4.50s
Epoch = 4, Avg Train Loss = 0.3035, Avg val loss = 1.3413, Time = 4.59s
Epoch = 5, Avg Train Loss = 0.2492, Avg val loss = 1.3858, Time = 4.81s
Epoch = 6, Avg Train Loss = 0.2127, Avg val loss = 1.4266, Time = 4.42s
Epoch = 7, Avg Train Loss = 0.1840, Avg val loss = 1.4343, Time = 4.52s
Epoch = 8, Avg Train Loss = 0.1628, Avg val loss = 1.4702, Time = 4.50s
Epoch = 9, Avg Train Loss = 0.1456, Avg val loss = 1.3337, Time = 4.44s
Epoch = 10, Avg Train Loss = 0.1305, Avg val loss = 1.4790, Time = 4.50s
Epoch = 11, Avg Train Loss = 0.1182, Avg val loss = 1.3416, Time = 4.51s
Epoch = 12, Avg Train Loss = 0.1076, Avg val loss = 1.3703, Time = 4.45s
Epoch = 13, Avg Train Loss = 0.0993, 

In [55]:
char_set = ['<pad>', '<unk>', '8', 'д', 'ы', '»', ';', 'Н', 'М', '\uf0b7', '7', 'x', 't', 'у', 'O', 'M', 'б', 'S', 'i', '─', 'Д', 'Ö', 'ч', 'B', '‰', 'Я', '│', 'a', 'м', '(', ')', '"', '*', 'z', 'H', 'A', 'ъ', ':', 'п', '3', 'Ч', 'f', 'с', 'Z', 'ш', '├', 'ф', 'Ь', 'ю', 'Й', 'Ш', 'W', '└', '1', 'k', 'җ', 'л', 'Ж', 'K', '“', 'И', 'І', '┌', 'r', '┬', 'Ы', 'g', '0', '┘', 'р', 'E', '9', '×', 'Ф', '…', 'я', '/', 'ә', 'N', 'х', 'Ъ', 'p', 'з', 'ь', '2', 'о', 'ё', 'Б', '+', '@', 'Э', '£', '∑', '┼', '■', '┤', 'Σ', '4', 'F', 'О', 'T', '=', 'Г', 'ц', 'n', 'j', 'h', 'b', 'l', 'Ю', '{', 'X', 'В', 'd', '>', '6', 's', 'c', 'P', 'е', 'o', 'А', 'й', 'Т', 'V', '•', 'v', '5', '№', 'm', 'щ', 'L', 'I', 'u', 'R', 'Л', 'и', 'Х', 'К', '}', 'в', 'Ә', 'У', 'т', 'e', '.', 'ж', 'y', 'к', 'э', '_', 'З', 'Ё', 'Ц', 'Р', 'Е', ']', '«', 'а', '-', 'П', 'w', 'Щ', 'D', ',', '[', '<', '!', 'г', '–', 'н', 'С', '”', '┐', 'U', '%', '—', 'G', 'C']
model_lstm = LstmModel(len(char_set))
model_lstm.load_state_dict(torch.load("model_lstm.pt"))

docs, true_labels, predicted_labels = predict(model_lstm, test)
calc_ner_f1(docs, predicted_labels)

#### CharFFLstm

In [46]:
class CharFFLstmModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=4, classes_count=17, word_embedding_dim=64, lstm_embedding_dim=32, char_max_seq_len=40):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(char_embedding_dim * char_max_seq_len, word_embedding_dim)
        self.relu = nn.ReLU()
        self.lstm_layer = nn.LSTM(word_embedding_dim, lstm_embedding_dim // 2, batch_first=True, bidirectional=True)
        self.out_layer = nn.Linear(lstm_embedding_dim, classes_count)

    def forward(self, inputs):
        projections = self.embeddings_layer.forward(inputs)
        projections = projections.reshape(projections.size(0), projections.size(1), -1)
        projections = self.relu(self.linear(projections))
        projections = self.dropout(projections)
        output, _= self.lstm_layer(projections)
        output = self.dropout(output)
        output = self.out_layer.forward(output)
        return output

In [None]:
model = CharFFLstmModel(len(char_set))
train_gen_model(model, train, val, epochs_count=500, early_stopping=True, lr=0.01, save_path="model_charff_v1.pt", patience = 5)

Trainable params: 22077
Epoch = 0, Avg Train Loss = 2.2922, Avg val loss = 1.5811, Time = 5.13s
Epoch = 1, Avg Train Loss = 0.7752, Avg val loss = 1.4932, Time = 4.69s
Epoch = 2, Avg Train Loss = 0.4822, Avg val loss = 1.4512, Time = 4.54s
Epoch = 3, Avg Train Loss = 0.3557, Avg val loss = 1.3404, Time = 4.47s
Epoch = 4, Avg Train Loss = 0.2728, Avg val loss = 1.3193, Time = 4.58s
Epoch = 5, Avg Train Loss = 0.2167, Avg val loss = 1.2514, Time = 4.57s
Epoch = 6, Avg Train Loss = 0.1798, Avg val loss = 1.2741, Time = 4.58s
Epoch = 7, Avg Train Loss = 0.1561, Avg val loss = 1.2980, Time = 4.52s
Epoch = 8, Avg Train Loss = 0.1380, Avg val loss = 1.3308, Time = 4.60s
Epoch = 9, Avg Train Loss = 0.1225, Avg val loss = 1.2711, Time = 4.63s
Epoch = 10, Avg Train Loss = 0.1114, Avg val loss = 1.1864, Time = 4.54s
Epoch = 11, Avg Train Loss = 0.1005, Avg val loss = 1.2245, Time = 4.64s
Epoch = 12, Avg Train Loss = 0.0919, Avg val loss = 1.3031, Time = 4.65s
Epoch = 13, Avg Train Loss = 0.0846, 

In [18]:
char_set = ['<pad>', '<unk>', '8', 'д', 'ы', '»', ';', 'Н', 'М', '\uf0b7', '7', 'x', 't', 'у', 'O', 'M', 'б', 'S', 'i', '─', 'Д', 'Ö', 'ч', 'B', '‰', 'Я', '│', 'a', 'м', '(', ')', '"', '*', 'z', 'H', 'A', 'ъ', ':', 'п', '3', 'Ч', 'f', 'с', 'Z', 'ш', '├', 'ф', 'Ь', 'ю', 'Й', 'Ш', 'W', '└', '1', 'k', 'җ', 'л', 'Ж', 'K', '“', 'И', 'І', '┌', 'r', '┬', 'Ы', 'g', '0', '┘', 'р', 'E', '9', '×', 'Ф', '…', 'я', '/', 'ә', 'N', 'х', 'Ъ', 'p', 'з', 'ь', '2', 'о', 'ё', 'Б', '+', '@', 'Э', '£', '∑', '┼', '■', '┤', 'Σ', '4', 'F', 'О', 'T', '=', 'Г', 'ц', 'n', 'j', 'h', 'b', 'l', 'Ю', '{', 'X', 'В', 'd', '>', '6', 's', 'c', 'P', 'е', 'o', 'А', 'й', 'Т', 'V', '•', 'v', '5', '№', 'm', 'щ', 'L', 'I', 'u', 'R', 'Л', 'и', 'Х', 'К', '}', 'в', 'Ә', 'У', 'т', 'e', '.', 'ж', 'y', 'к', 'э', '_', 'З', 'Ё', 'Ц', 'Р', 'Е', ']', '«', 'а', '-', 'П', 'w', 'Щ', 'D', ',', '[', '<', '!', 'г', '–', 'н', 'С', '”', '┐', 'U', '%', '—', 'G', 'C']
model_ff = CharFFLstmModel(len(char_set))
model_ff.load_state_dict(torch.load("model_charff_v1.pt"))

docs, true_labels, predicted_labels = predict(model_ff, test)
calc_ner_f1(docs, predicted_labels)

0.24890888063648148

#### Лучший результат  по CharFF+BiLSTM: F1 = 0.24890888063648148

#### CharCNNLstm

In [21]:
import torch
class CharCNNLstmModel(nn.Module):
    def __init__(self, char_set_size, char_embedding_dim=4, classes_count=17, word_embedding_dim=64, lstm_embedding_dim=32, max_char_seq_len=40, filters=((2, 16), (3, 24))):
        super().__init__()
        
        self.embeddings_layer = nn.Embedding(char_set_size, char_embedding_dim)
        self.filters = []
        all_filters_count = 0
        for kernel_size, filters_count in filters:
            all_filters_count += filters_count
            self.filters.append(nn.Conv2d(1, filters_count, (kernel_size, char_embedding_dim), padding=(1, 0))) 
        self.filters = nn.ModuleList(self.filters)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(all_filters_count, word_embedding_dim)
        self.relu = nn.ReLU()
        self.lstm_layer = nn.LSTM(word_embedding_dim, lstm_embedding_dim // 2, batch_first=True, bidirectional=True)
        self.out_layer = nn.Linear(lstm_embedding_dim, classes_count)

    def forward(self, inputs):
        projections = self.embeddings_layer.forward(inputs)
        batch_size = inputs.shape[0]
        max_seq_len = inputs.shape[1]
        max_char_seq_len = inputs.shape[2]
        char_embedding_dim = projections.shape[3]
        projections = projections.unsqueeze(2).view(-1, 1, max_char_seq_len, char_embedding_dim)
        results = []
        for f in self.filters:
            convolved = self.dropout(self.relu(f(projections))).squeeze(3)
            pooling = torch.max(convolved, 2)[0]
            results.append(pooling)
        conv = torch.cat(results, 1)
        conv = self.relu(self.linear(conv))
        conv = self.dropout(conv)
        word_embedding_dim = conv.shape[1]
        output, _= self.lstm_layer(conv.view(-1, max_seq_len, word_embedding_dim))
        output = self.dropout(output)
        output = self.out_layer.forward(output)
        return output

In [26]:
model = CharCNNLstmModel(len(char_set))
train_gen_model(model, train, val, epochs_count=149, early_stopping=True, lr=0.01, save_path="model_charcnn_v3.pt", patience = 5, batch_size = 32)

Trainable params: 14853
Epoch = 0, Avg Train Loss = 2.4991, Avg val loss = 1.7924, Time = 4.86s
Epoch = 1, Avg Train Loss = 0.8161, Avg val loss = 1.4324, Time = 4.64s
Epoch = 2, Avg Train Loss = 0.4855, Avg val loss = 1.2781, Time = 4.73s
Epoch = 3, Avg Train Loss = 0.3500, Avg val loss = 1.2548, Time = 4.75s
Epoch = 4, Avg Train Loss = 0.2664, Avg val loss = 1.3057, Time = 4.73s
Epoch = 5, Avg Train Loss = 0.2140, Avg val loss = 1.2242, Time = 4.74s
Epoch = 6, Avg Train Loss = 0.1830, Avg val loss = 1.1743, Time = 4.73s
Epoch = 7, Avg Train Loss = 0.1597, Avg val loss = 0.9614, Time = 4.74s
Epoch = 8, Avg Train Loss = 0.1402, Avg val loss = 1.2156, Time = 4.70s
Epoch = 9, Avg Train Loss = 0.1252, Avg val loss = 1.0543, Time = 4.74s
Epoch = 10, Avg Train Loss = 0.1138, Avg val loss = 1.0761, Time = 4.71s
Epoch = 11, Avg Train Loss = 0.1035, Avg val loss = 1.1771, Time = 4.71s
Epoch = 12, Avg Train Loss = 0.0953, Avg val loss = 1.2367, Time = 4.74s
Epoch = 13, Avg Train Loss = 0.0885, 

Epoch = 111, Avg Train Loss = 0.0090, Avg val loss = 1.0118, Time = 4.69s
Epoch = 112, Avg Train Loss = 0.0091, Avg val loss = 1.0074, Time = 4.69s
Epoch = 113, Avg Train Loss = 0.0090, Avg val loss = 0.9001, Time = 4.60s
Epoch = 114, Avg Train Loss = 0.0088, Avg val loss = 1.0211, Time = 4.77s
Epoch = 115, Avg Train Loss = 0.0087, Avg val loss = 0.9535, Time = 4.69s
Epoch = 116, Avg Train Loss = 0.0087, Avg val loss = 0.8907, Time = 4.61s
Epoch = 117, Avg Train Loss = 0.0086, Avg val loss = 0.9177, Time = 4.69s
Epoch = 118, Avg Train Loss = 0.0085, Avg val loss = 0.8729, Time = 4.67s
Epoch = 119, Avg Train Loss = 0.0084, Avg val loss = 0.9025, Time = 4.59s
Epoch = 120, Avg Train Loss = 0.0083, Avg val loss = 0.9230, Time = 4.67s
Epoch = 121, Avg Train Loss = 0.0083, Avg val loss = 0.8466, Time = 4.73s
Epoch = 122, Avg Train Loss = 0.0081, Avg val loss = 0.9043, Time = 4.60s
Epoch = 123, Avg Train Loss = 0.0080, Avg val loss = 0.8309, Time = 4.74s
Epoch = 124, Avg Train Loss = 0.0080, 

In [27]:
model_cnn = CharCNNLstmModel(len(char_set))
model_cnn.load_state_dict(torch.load("model_charcnn_v3.pt"))

docs, true_labels, predicted_labels = predict(model_cnn, test)
calc_ner_f1(docs, predicted_labels)

0.15480357837417347