In [15]:
class Sentence:
    """One sentence of the corpus, stored as three parallel per-token lists."""

    def __init__(self):
        # tokens[i], pos_tags[i] and grams[i] are filled together by the reader,
        # so position i in each list refers to the same token.
        self.tokens, self.pos_tags, self.grams = [], [], []
        
class Indexer:
    """Placeholder class: it currently holds no state and defines no behaviour."""

    def __init__(self):
        """Create an empty indexer; no attributes are set."""

In [16]:
def read_dataset(dataset, mode):
    """Read a tab-separated, blank-line-delimited corpus into ``Sentence`` objects.

    The first line of the file is treated as a header and skipped. Every
    non-empty line describes one token; an empty line closes the current
    sentence.

    Args:
        dataset: Path to the UTF-8 encoded file.
        mode: ``'train'`` means each row has four tab-separated fields, the
            last one being ``POS#GRAM``. Any other value means each row has
            three fields and both tags are set to the ``'<UNK>'`` placeholder.

    Returns:
        A list of ``Sentence`` objects in file order. Sentences without tokens
        are never emitted.

    Raises:
        ValueError: If a row does not have the expected number of fields, or
            (in train mode) the tag field does not split into exactly two
            parts on ``'#'``.
    """
    sentences = []

    with open(dataset, mode='r', encoding='utf-8') as data:
        # Skip the header. The default value keeps an empty file from raising
        # StopIteration; such a file simply yields no sentences.
        next(data, None)

        sentence = Sentence()  # the sentence currently being filled

        for row in data:
            row = row.strip()
            if row:
                fields = row.split('\t')

                if mode == 'train':
                    _, _, token, pos_gram = fields
                    pos, gram = pos_gram.split('#')
                else:
                    _, _, token = fields
                    pos, gram = '<UNK>', '<UNK>'

                sentence.tokens.append(token)
                sentence.pos_tags.append(pos)
                sentence.grams.append(gram)
            elif sentence.tokens:
                # A blank line ends the sentence; consecutive blank lines are ignored.
                sentences.append(sentence)
                sentence = Sentence()

        # Flush the last sentence when the file does not end with a blank line.
        if sentence.tokens:
            sentences.append(sentence)

    return sentences

In [17]:
# Parse both corpora; the second argument selects the row layout used by read_dataset.
train = read_dataset(dataset='train.csv', mode='train')
test = read_dataset(dataset='test.csv', mode='test')

In [18]:
# Sanity check of the parsed dataset: the three per-token lists of a sentence
# must have the same length. (The previous assert compared `tokens` with
# itself and therefore could never fail.)
index = 0
assert len(train[index].tokens) == len(train[index].pos_tags) == len(train[index].grams)

# Print the chosen sentence as tab-separated columns: token, POS tag, grammemes.
for idx in range(len(train[index].tokens)):
    print(train[index].tokens[idx], '\t', train[index].pos_tags[idx], '\t', train[index].grams[idx], '\t')

А 	 CONJ 	 _ 	
ведь 	 PART 	 _ 	
для 	 ADP 	 _ 	
конкретных 	 ADJ 	 Case=Gen|Degree=Pos|Number=Plur 	
изделий 	 NOUN 	 Animacy=Inan|Case=Gen|Gender=Neut|Number=Plur 	
зачастую 	 ADV 	 Degree=Pos 	
нужен 	 ADJ 	 Degree=Pos|Gender=Masc|Number=Sing|Variant=Brev 	
монокристалл 	 NOUN 	 Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 	
не 	 PART 	 _ 	
только 	 PART 	 _ 	
крупный 	 ADJ 	 Case=Nom|Degree=Pos|Gender=Masc|Number=Sing 	
, 	 PUNCT 	 _ 	
но 	 CONJ 	 _ 	
и 	 PART 	 _ 	
заданной 	 VERB 	 Aspect=Perf|Case=Gen|Gender=Fem|Number=Sing|Tense=Past|VerbForm=Part|Voice=Pass 	
формы 	 NOUN 	 Animacy=Inan|Case=Gen|Gender=Fem|Number=Sing 	
, 	 PUNCT 	 _ 	
например 	 ADV 	 Degree=Pos 	
" 	 PUNCT 	 _ 	
стакан 	 NOUN 	 Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 	
" 	 PUNCT 	 _ 	
, 	 PUNCT 	 _ 	
" 	 PUNCT 	 _ 	
тройник 	 NOUN 	 Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 	
" 	 PUNCT 	 _ 	
( 	 PUNCT 	 _ 	
элемент 	 NOUN 	 Animacy=Inan|Case=Nom|Gender=Masc|Number=Sing 	
трубопровода 	 NOUN 	 Anim

In [21]:
# Show the tokens and the POS tags of the first training sentence, one list per line.
print(train[0].tokens, train[0].pos_tags, sep='\n')

['А', 'ведь', 'для', 'конкретных', 'изделий', 'зачастую', 'нужен', 'монокристалл', 'не', 'только', 'крупный', ',', 'но', 'и', 'заданной', 'формы', ',', 'например', '"', 'стакан', '"', ',', '"', 'тройник', '"', '(', 'элемент', 'трубопровода', ')', 'или', 'еще', 'сложнее', '.']
['CONJ', 'PART', 'ADP', 'ADJ', 'NOUN', 'ADV', 'ADJ', 'NOUN', 'PART', 'PART', 'ADJ', 'PUNCT', 'CONJ', 'PART', 'VERB', 'NOUN', 'PUNCT', 'ADV', 'PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'PUNCT', 'NOUN', 'PUNCT', 'PUNCT', 'NOUN', 'NOUN', 'PUNCT', 'CONJ', 'ADV', 'ADJ', 'PUNCT']


## Подготовка эмбеддингов

In [31]:
# Collect every unique word form and every unique POS tag of the training corpus.
vocab = set()
tags = set()

for sentence in train:
    vocab.update(sentence.tokens)
    tags.update(sentence.pos_tags)

In [36]:
#Загрузите эмбеддинги c https://nlp.stanford.edu/projects/glove/ или другие, которые вам нравятся и пропишите путь к ним

from navec import Navec
import numpy as np
import io
from tqdm import tqdm

# Relative path to the pretrained Navec embedding archive; the file has to be
# present in the working directory.
word_embeddings_path = 'navec_hudlit_v1_12B_500K_300d_100q.tar'
# Load the embedding model. Later cells use it for `word in embeddings_model`
# membership checks and `embeddings_model[word]` vector lookups.
embeddings_model = Navec.load(word_embeddings_path)

In [42]:
# Build the embedding matrix: row `idx` belongs to the idx-th word in the
# iteration order of `vocab`. Words unknown to the model keep an all-zero row.
embeddings = np.zeros((len(vocab), 300))
unk_tags = []          # lower-cased words that the embedding model does not contain
known_words_count = 0  # number of vocabulary entries found in the model

for idx, word in tqdm(enumerate(vocab), total=len(vocab)):
    word = word.lower()  # the lookup is done on the lower-cased form
    if word not in embeddings_model:
        unk_tags.append(word)
        continue
    embeddings[idx] = embeddings_model[word]
    known_words_count += 1

print()
print(f'navec знает {known_words_count} слов')

100%|██████████████████████████████████████████████████████████████████████| 107487/107487 [00:00<00:00, 115757.99it/s]


navec знает 94190 слов





In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTM_Tagger(nn.Module):
    """LSTM sequence tagger on top of pretrained word embeddings.

    Pipeline: token indices -> embedding lookup -> LSTM -> linear projection
    -> per-token log-probabilities over the tag set.

    The class derives from ``nn.Module`` so that its layers are registered as
    parameters (visible to optimizers) and instances are callable.
    """

    def __init__(self, embeddings, embedding_dim, hidden_dim, vocab_size, target_size):
        """Create the layers.

        Args:
            embeddings: 2-D array-like of pretrained vectors, one row per
                vocabulary entry; converted with ``torch.FloatTensor``.
            embedding_dim: Width of one embedding row; used as LSTM input size.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Not used by the model (the number of rows comes from
                ``embeddings``); kept so existing call sites stay valid.
            target_size: Number of output tags.
        """
        super(LSTM_Tagger, self).__init__()

        self.hidden_dim = hidden_dim
        # freeze=False: the pretrained vectors are fine-tuned during training.
        self.embeddings = nn.Embedding.from_pretrained(torch.FloatTensor(embeddings), freeze=False)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden_to_tag = nn.Linear(hidden_dim, target_size)

    def forward(self, sentence):
        """Score every token of one sentence.

        Args:
            sentence: 1-D ``torch.long`` tensor of token indices.

        Returns:
            Tensor of shape ``(len(sentence), target_size)`` holding
            log-probabilities (``log_softmax`` over the tag dimension).
        """
        word_embeddings = self.embeddings(sentence)
        # nn.LSTM expects (seq_len, batch, input_size); the batch size is 1 here.
        lstm_out, _ = self.lstm(word_embeddings.view(len(sentence), 1, -1))
        tag_space = self.hidden_to_tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [44]:
def prepare_tensor(sentence, to_ix=None):
    """Convert a list of words into a tensor of embedding-row indices.

    The previous implementation looked the words up in ``embeddings_model``,
    which yields embedding vectors rather than indices; casting those floats to
    ``long`` produced a meaningless 2-D tensor.

    Args:
        sentence: Iterable of word strings.
        to_ix: Optional mapping ``word -> index``. When omitted, the mapping is
            derived from the enumeration order of the global ``vocab`` set,
            which is the order used to fill the rows of the ``embeddings``
            matrix. Building it costs a full pass over ``vocab``, so callers
            that convert many sentences should build the mapping once and
            pass it in.

    Returns:
        1-D ``torch.long`` tensor with one index per word.

    Raises:
        KeyError: If a word is missing from the mapping.
    """
    if to_ix is None:
        to_ix = {word: idx for idx, word in enumerate(vocab)}
    idxs = [to_ix[token] for token in sentence]
    return torch.tensor(idxs, dtype=torch.long)