In [1]:
import pandas as pd
import numpy as np

In [4]:
#df = pd.read_csv('rupos2018/train.csv', sep='\t', engine='python', error_bad_lines=False)
#df.drop(columns=['Id'], inplace=True)
#df.columns

In [5]:
#def only_tag(s):
#    return s.split('#')[0]
#df['Prediction'] = df['Prediction'].apply(only_tag)
#df.head(30)

In [2]:
from collections import namedtuple
WordPos = namedtuple("WordPos", "word pos")

def get_sentences(filename, is_train=True):
    """Read a tab-separated token file and group its rows into sentences.

    The first line of the file is treated as a header and skipped (an empty
    file simply yields no sentences). Every following non-blank line
    describes one token; one or more blank lines terminate the current
    sentence. The word is read from the third tab-separated column.

    Args:
        filename: Path of the UTF-8 encoded file to read.
        is_train: When True, the fourth column is expected to look like
            ``POS#features`` and the part before the first ``#`` is kept as
            the tag. When False, no tag column is read and ``pos`` is None.

    Returns:
        A list of sentences, each a non-empty list of ``WordPos`` tuples.

    Raises:
        IndexError: If a token line has fewer columns than required.
    """
    sentences = []
    with open(filename, "r", encoding='utf-8') as r:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(r, None)
        sentence = []
        for line in r:
            stripped = line.strip()
            if not stripped:
                # Blank line: close the current sentence, ignoring runs of
                # consecutive blank lines.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            fields = stripped.split("\t")
            word = fields[2]
            if is_train:
                # Keep only the coarse tag, dropping the '#'-separated rest.
                pos = fields[3].split("#")[0]
            else:
                # NOTE(review): assumes untagged files keep the word in the
                # third column, like the training file -- confirm against
                # the actual test file layout.
                pos = None
            sentence.append(WordPos(word, pos))
        # The file may end without a trailing blank line.
        if sentence:
            sentences.append(sentence)
    return sentences

# Parse the training corpus into a list of sentences of WordPos(word, pos) tokens.
all_sentences = get_sentences('rupos2018/train.csv')

In [3]:
# Build the (words, tags) pairs per sentence and the word/tag index maps.
# Indices are handed out in order of first appearance; words are lower-cased.
data = []
word2idx = {}
tag2idx = {}

for tokens in all_sentences:
    sent_word = [token[0].lower() for token in tokens]
    sent_tag = [token[1] for token in tokens]

    for word in sent_word:
        # setdefault only assigns when the key is new, so len() is the next free index.
        word2idx.setdefault(word, len(word2idx))
    for tag in sent_tag:
        tag2idx.setdefault(tag, len(tag2idx))

    data.append((sent_word, sent_tag))

# Running counters of the original loop, now derived from the finished maps.
num_word = len(word2idx)
num_tag = len(tag2idx)

In [5]:
# First 90% of the sentences (in corpus order, no shuffling) for training,
# the remainder for testing.
split_at = int(len(data) * 0.9)
train_data, test_data = data[:split_at], data[split_at:]
print([len(train_data), len(test_data)])

[43353, 4818]


In [16]:
# Show the tag -> index mapping built from the corpus.
print(tag2idx)

{'CONJ': 0, 'PART': 1, 'ADP': 2, 'ADJ': 3, 'NOUN': 4, 'ADV': 5, 'PUNCT': 6, 'VERB': 7, 'NUM': 8, 'PROPN': 9, 'PRON': 10, 'SCONJ': 11, 'DET': 12, 'AUX': 13, 'INTJ': 14, 'X': 15, 'SYM': 16}


In [17]:
# Vocabulary size: number of distinct lower-cased tokens.
len(word2idx)

98880

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(1)

<torch._C.Generator at 0x7f94a51af5d0>

In [19]:
def prepare_sequence(seq, toidx):
    """Map each item of ``seq`` to its index and return a 1-D long tensor.

    Args:
        seq: Iterable of items (words or tags).
        toidx: Mapping from item to integer index.

    Returns:
        ``torch.long`` tensor with one index per item of ``seq``.

    Raises:
        KeyError: If an item of ``seq`` is missing from ``toidx``.
    """
    indices = []
    for item in seq:
        indices.append(toidx[item])
    return torch.tensor(indices, dtype=torch.long)

In [45]:
class LSTMTagger(nn.Module):
    """Sequence tagger: embedding -> dropout -> LSTM -> linear -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, dropout):
        """Create the layers.

        Args:
            embedding_dim: Size of each word embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output classes.
            dropout: Dropout probability applied to the embeddings.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.tag = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, sent):
        """Score every position of one sentence.

        Args:
            sent: Tensor of word indices for a single sentence.

        Returns:
            Tensor with one row per position holding log-probabilities over
            the tag set (log-softmax along dim 1).
        """
        seq_len = len(sent)
        vectors = self.dropout(self.embeddings(sent))
        # The LSTM is fed the sentence as a batch of size one.
        hidden_states, _ = self.rnn(vectors.view(seq_len, 1, -1))
        logits = self.tag(hidden_states.view(seq_len, -1))
        return F.log_softmax(logits, dim=1)

In [85]:
from tqdm import tqdm_notebook, tqdm

# Hyper-parameters.
EMBEDDING_DIM = 32         # size of each word embedding
HIDDEN_DIM = 32            # size of the LSTM hidden state
INPUT_DIM = len(word2idx)  # vocabulary size
OUTPUT_DIM = len(tag2idx)  # number of distinct tags
DROPOUT = 0.5              # dropout probability on the embeddings
BATCH_SIZE = 32            # NOTE(review): not referenced by the cells visible here

# Model, loss and optimizer. NLLLoss matches the log-softmax output of the tagger.
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=INPUT_DIM,
    tagset_size=OUTPUT_DIM,
    dropout=DROPOUT,
)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [86]:
# Smoke test: push the first training sentence through the untrained model
# and check the output shape (one row per token, one column per tag).
sample_words = train_data[0][0]
print(sample_words)
with torch.no_grad():
    inputs = prepare_sequence(sample_words, word2idx)
    tag_scores = model(inputs)
print(tag_scores.shape)

['а', 'ведь', 'для', 'конкретных', 'изделий', 'зачастую', 'нужен', 'монокристалл', 'не', 'только', 'крупный', ',', 'но', 'и', 'заданной', 'формы', ',', 'например', '"', 'стакан', '"', ',', '"', 'тройник', '"', '(', 'элемент', 'трубопровода', ')', 'или', 'еще', 'сложнее', '.']
torch.Size([33, 17])


In [87]:
NUM_EPOCHS = 5
num_train = len(train_data)

# Dropout must be active while fitting. Set train mode explicitly, because
# notebook cells can be re-run in any order and an earlier cell may have left
# the model in eval mode.
model.train()

for epoch in tqdm(range(NUM_EPOCHS)):
    epoch_loss = 0

    # One optimisation step per sentence (i.e. an effective batch size of 1).
    for sentence, tags in tqdm_notebook(train_data):
        # Gradients accumulate across backward() calls, so clear them first.
        model.zero_grad()

        # Turn words and gold tags into index tensors.
        sentence_in = prepare_sequence(sentence, word2idx)
        targets = prepare_sequence(tags, tag2idx)

        # Forward pass: per-token log-probabilities over the tag set.
        tag_scores = model(sentence_in)

        # Negative log-likelihood loss, back-propagation and parameter update.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean per-sentence loss for this epoch.
    print(f'Epoch={epoch}, Loss={round(epoch_loss / num_train, 3)}')
        

  0%|          | 0/5 [00:00<?, ?it/s]

HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

 20%|██        | 1/5 [13:40<54:41, 820.38s/it]

Epoch=0, Loss=1.639


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

KeyboardInterrupt: 

In [None]:
# Token-level accuracy on the held-out sentences.
true_pred = 0
all_pred = 0

# Evaluate with dropout disabled; otherwise predictions are noisy and the
# reported accuracy is understated. The previous mode is restored afterwards
# so that re-running the training cell still trains with dropout.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        for words, tags in test_data:
            inputs = prepare_sequence(words, word2idx)
            gold_tags = prepare_sequence(tags, tag2idx)
            tag_scores = model(inputs)
            # Most likely tag per token.
            predict_tags = tag_scores.argmax(dim=1)

            true_pred += int((predict_tags == gold_tags).sum())
            all_pred += len(words)
finally:
    model.train(was_training)

# Guard against an empty test set instead of dividing by zero.
accuracy = true_pred / all_pred * 100 if all_pred else float("nan")
print("Accuracy:", accuracy, '%')