In [0]:
import pandas as pd
import numpy as np

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
#df = pd.read_csv('rupos2018/train.csv', sep='\t', engine='python', error_bad_lines=False)
#df.drop(columns=['Id'], inplace=True)
#df.columns

In [0]:
#def only_tag(s):
#    return s.split('#')[0]
#df['Prediction'] = df['Prediction'].apply(only_tag)
#df.head(30)

In [0]:
from collections import namedtuple
WordPos = namedtuple("WordPos", "word pos")

def get_sentences(filename, is_train=True):
    sentences = []
    with open(filename, "r", encoding='utf-8') as r:
        next(r)
        sentence = []
        for line in r:
            if len(line.strip()) == 0:
                if len(sentence) == 0:
                    continue
                sentences.append(sentence)
                sentence = []
                continue
            if is_train:
                word = line.strip().split("\t")[2]
                pos = line.strip().split("\t")[3].split("#")[0]
                #gram = line.strip().split("\t")[3].split("#")[1]
                sentence.append(WordPos(word, pos))
        if len(sentence) != 0:
            sentences.append(sentence)
    return sentences

all_sentences = get_sentences('train.csv')

In [0]:
data = []

num_tag = 0
num_word = 0
word2idx = {}
tag2idx = {}

for i in range(len(all_sentences)):
    sent_tag = []
    sent_word = []
    for j in range(len(all_sentences[i])):
        word = all_sentences[i][j][0].lower()
        tag = all_sentences[i][j][1]
        sent_word.append(word)
        sent_tag.append(tag)
        if word not in word2idx:
            word2idx[word] = num_word
            num_word += 1
        if tag not in tag2idx:
            tag2idx[tag] = num_tag
            num_tag += 1
    data.append((sent_word, sent_tag))

In [4]:
train_data = data[:int(len(data) * 0.9)]
test_data = data[int(len(data) * 0.9):]
print(list(map(len, [train_data, test_data])))

[43353, 4818]


In [5]:
print(tag2idx)

{'CONJ': 0, 'PART': 1, 'ADP': 2, 'ADJ': 3, 'NOUN': 4, 'ADV': 5, 'PUNCT': 6, 'VERB': 7, 'NUM': 8, 'PROPN': 9, 'PRON': 10, 'SCONJ': 11, 'DET': 12, 'AUX': 13, 'INTJ': 14, 'X': 15, 'SYM': 16}


In [6]:
len(word2idx)

98880

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7fcd31bd5690>

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [0]:
def prepare_sequence(seq, toidx):
    idxs = [toidx[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long, device=device)

In [0]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, dropout):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(p=dropout)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sent):
        embedded = self.dropout(self.embeddings(sent))
        output, _ = self.rnn(embedded.view(len(sent), 1, -1))
        tag = self.tag(output.view(len(sent), -1))
        #print(tag.shape)
        tag_scores = F.log_softmax(tag, dim=1)
        
        return tag_scores

In [0]:
from tqdm import tqdm_notebook, tqdm

EMBEDDING_DIM = 64
HIDDEN_DIM = 64
INPUT_DIM = len(word2idx)
OUTPUT_DIM = len(tag2idx)
DROPOUT = 0.5
BATCH_SIZE = 32

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, INPUT_DIM, OUTPUT_DIM, DROPOUT).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [18]:
with torch.no_grad():
    print(train_data[0][0])
    inputs = prepare_sequence(train_data[0][0], word2idx)
    tag_scores = model(inputs)
    print(tag_scores.shape)

['а', 'ведь', 'для', 'конкретных', 'изделий', 'зачастую', 'нужен', 'монокристалл', 'не', 'только', 'крупный', ',', 'но', 'и', 'заданной', 'формы', ',', 'например', '"', 'стакан', '"', ',', '"', 'тройник', '"', '(', 'элемент', 'трубопровода', ')', 'или', 'еще', 'сложнее', '.']
torch.Size([33, 17])


In [19]:
num_train = len(train_data)

for epoch in tqdm(range(15)):
    epoch_loss = 0
    
    for i in tqdm_notebook(range(len(train_data[:num_train]))):
    #for sentence, tags in train_data[:num_train]:
        sentence, tags = train_data[i]
        model.zero_grad()

        sentence_in = prepare_sequence(sentence, word2idx)
        targets = prepare_sequence(tags, tag2idx)

        # forward
        tag_scores = model(sentence_in)

        # optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f'Epoch={epoch}, Loss={round(epoch_loss / num_train, 3)}')


  0%|          | 0/15 [00:00<?, ?it/s][A

HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


  7%|▋         | 1/15 [01:59<27:47, 119.07s/it][A

Epoch=0, Loss=1.475


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 13%|█▎        | 2/15 [03:58<25:48, 119.15s/it][A

Epoch=1, Loss=1.148


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 20%|██        | 3/15 [05:57<23:49, 119.13s/it][A

Epoch=2, Loss=1.067


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 27%|██▋       | 4/15 [07:56<21:48, 119.00s/it][A

Epoch=3, Loss=1.023


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 33%|███▎      | 5/15 [09:55<19:51, 119.13s/it][A

Epoch=4, Loss=0.995


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 40%|████      | 6/15 [11:55<17:53, 119.28s/it][A

Epoch=5, Loss=0.972


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 47%|████▋     | 7/15 [13:54<15:54, 119.31s/it][A

Epoch=6, Loss=0.955


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 53%|█████▎    | 8/15 [15:53<13:54, 119.26s/it][A

Epoch=7, Loss=0.939


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 60%|██████    | 9/15 [17:53<11:56, 119.43s/it][A

Epoch=8, Loss=0.926


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 67%|██████▋   | 10/15 [19:53<09:57, 119.45s/it][A

Epoch=9, Loss=0.915


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 73%|███████▎  | 11/15 [21:52<07:57, 119.47s/it][A

Epoch=10, Loss=0.904


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 80%|████████  | 12/15 [23:52<05:58, 119.66s/it][A

Epoch=11, Loss=0.896


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 87%|████████▋ | 13/15 [25:51<03:59, 119.51s/it][A

Epoch=12, Loss=0.887


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 93%|█████████▎| 14/15 [27:52<01:59, 119.95s/it][A

Epoch=13, Loss=0.88


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


100%|██████████| 15/15 [29:53<00:00, 120.24s/it][A
[A

Epoch=14, Loss=0.873


In [0]:
def accuracy_score(test_data, model):
  true_pred = 0.0
  all_pred = 0.0

  for sent in test_data:
      words, tags = sent
      with torch.no_grad():
          inputs = prepare_sequence(words, word2idx)
          outputs = prepare_sequence(tags, tag2idx).cpu().numpy()
          tag_scores = model(inputs)
      predict_tags = torch.max(tag_scores, dim=1)[1].cpu().numpy()

      true_pred += np.sum(outputs == predict_tags)
      all_pred += len(words)
      
  print("Accuracy:", round(true_pred / all_pred * 100, 3), '%')
  return 


### Качество после 15 эпох


In [25]:
accuracy_score(test_data, model)

Accuracy: 68.183 %


In [26]:
num_train = len(train_data)

for epoch in tqdm(range(15, 30)):
    epoch_loss = 0
    
    for i in tqdm_notebook(range(len(train_data[:num_train]))):
    #for sentence, tags in train_data[:num_train]:
        sentence, tags = train_data[i]
        model.zero_grad()

        sentence_in = prepare_sequence(sentence, word2idx)
        targets = prepare_sequence(tags, tag2idx)

        # forward
        tag_scores = model(sentence_in)

        # optimizer.step()
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
    
    print(f'Epoch={epoch}, Loss={round(epoch_loss / num_train, 3)}')
    if epoch % 3 == 2:
      accuracy_score(test_data, model)


  0%|          | 0/15 [00:00<?, ?it/s][A

HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


  7%|▋         | 1/15 [02:02<28:41, 122.95s/it][A

Epoch=15, Loss=0.868


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 13%|█▎        | 2/15 [04:03<26:27, 122.12s/it][A

Epoch=16, Loss=0.861


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

Epoch=17, Loss=0.855



 20%|██        | 3/15 [06:06<24:31, 122.61s/it][A

Accuracy: 68.661 %


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 27%|██▋       | 4/15 [08:06<22:20, 121.86s/it][A

Epoch=18, Loss=0.849


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 33%|███▎      | 5/15 [10:06<20:11, 121.20s/it][A

Epoch=19, Loss=0.843


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

Epoch=20, Loss=0.839



 40%|████      | 6/15 [12:09<18:16, 121.83s/it][A

Accuracy: 69.109 %


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 47%|████▋     | 7/15 [14:09<16:10, 121.29s/it][A

Epoch=21, Loss=0.834


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 53%|█████▎    | 8/15 [16:09<14:06, 120.89s/it][A

Epoch=22, Loss=0.83


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

Epoch=23, Loss=0.825



 60%|██████    | 9/15 [18:13<12:10, 121.80s/it][A

Accuracy: 69.585 %


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 67%|██████▋   | 10/15 [20:13<10:05, 121.11s/it][A

Epoch=24, Loss=0.821


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 73%|███████▎  | 11/15 [22:13<08:02, 120.68s/it][A

Epoch=25, Loss=0.819


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

Epoch=26, Loss=0.813



 80%|████████  | 12/15 [24:16<06:04, 121.51s/it][A

Accuracy: 69.932 %


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 87%|████████▋ | 13/15 [26:15<04:01, 120.83s/it][A

Epoch=27, Loss=0.81


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))


 93%|█████████▎| 14/15 [28:15<02:00, 120.44s/it][A

Epoch=28, Loss=0.808


HBox(children=(IntProgress(value=0, max=43353), HTML(value='')))

Epoch=29, Loss=0.803



100%|██████████| 15/15 [30:19<00:00, 121.65s/it][A
[A

Accuracy: 70.311 %


### Качество после 30 эпох

In [28]:
accuracy_score(test_data, model)

Accuracy: 70.322 %
