In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class Sentence:
    """Plain container holding the per-token data of one sentence."""

    def __init__(self):
        # Three separate, initially empty lists: surface tokens, POS tags
        # and grammatical descriptions.
        self.tokens, self.pos_tags, self.grams = [], [], []

In [3]:
def read_dataset(dataset, mode):
    """Read a tab-separated token file into a list of ``Sentence`` objects.

    The first line of the file is skipped as a header. Every following
    non-empty line describes one token; an empty line closes the current
    sentence.

    Args:
        dataset: Path of the UTF-8 encoded file to read.
        mode: ``'train'`` means each row has four tab-separated fields, the
            last one being ``POS#gram``. Any other value means each row has
            three fields and both labels are set to ``'<UNK>'``.

    Returns:
        A list of ``Sentence`` objects in file order. Sentences without
        tokens are never added.

    Raises:
        ValueError: If a row does not have the number of fields expected for
            ``mode``, or the label field does not split into exactly two
            parts on ``'#'``.
    """
    sentences = []
    has_labels = mode == 'train'

    with open(dataset, mode='r', encoding='utf-8') as data:
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(data, None)

        sentence = Sentence()  # sentence currently being filled

        for row in data:
            row = row.strip()

            if not row:
                # Blank line: close the sentence if it has any tokens.
                if sentence.tokens:
                    sentences.append(sentence)
                    sentence = Sentence()
                continue

            fields = row.split('\t')
            if has_labels:
                _, _, token, pos_gram = fields
                pos, gram = pos_gram.split('#')
            else:
                _, _, token = fields
                pos, gram = '<UNK>', '<UNK>'

            sentence.tokens.append(token)
            sentence.pos_tags.append(pos)
            sentence.grams.append(gram)

        # The file may end without a trailing blank line; keep the last
        # sentence. (The original called ``sentence.append`` here, which
        # raised AttributeError.)
        if sentence.tokens:
            sentences.append(sentence)

    return sentences

In [4]:
# Mode 'train' makes read_dataset parse a fourth 'POS#gram' column; any other
# mode reads three columns and labels every token '<UNK>'.
train = read_dataset(dataset='data/train.csv', mode='train')
test = read_dataset(dataset='data/test.csv', mode='test')

In [5]:
def get_vocabulary(data):
    """Return the set of distinct tokens found in ``data``.

    Args:
        data: Iterable of objects exposing a ``tokens`` sequence.

    Returns:
        A ``set`` with every token that occurs at least once.
    """
    return {word for item in data for word in item.tokens}

def get_tags(data):
    """Return the set of distinct POS tags found in ``data``.

    Args:
        data: Iterable of objects exposing a ``pos_tags`` sequence.

    Returns:
        A ``set`` with every tag that occurs at least once.
    """
    return {label for item in data for label in item.pos_tags}

def get_chars(data):
    """Return the set of distinct characters used by the tokens in ``data``.

    Args:
        data: Iterable of objects exposing a ``tokens`` sequence of strings.

    Returns:
        A ``set`` with every character that occurs in at least one token.
    """
    return {symbol for item in data for word in item.tokens for symbol in word}

In [6]:
# Collect every unique word, POS tag and character in train.
train_vocab = get_vocabulary(train)
train_tags = get_tags(train)
train_chars = get_chars(train)

# Ids are assigned in sorted order. Enumerating the sets directly would depend
# on string hashing, which changes between interpreter runs, so the same word
# could get a different id after every kernel restart.
train_token_index = {word: i for i, word in enumerate(sorted(train_vocab))}
train_tag_index = {tag: i for i, tag in enumerate(sorted(train_tags))}
train_char_index = {char: i for i, char in enumerate(sorted(train_chars))}

# Collect every unique word, POS tag and character in test.
test_vocab = get_vocabulary(test)
test_tags = get_tags(test)
test_chars = get_chars(test)

# NOTE(review): these ids are assigned independently of the train ones, so the
# same word generally gets a different id here. Confirm this is intended
# before feeding test data to a model trained with the train indices.
test_token_index = {word: i for i, word in enumerate(sorted(test_vocab))}
test_tag_index = {tag: i for i, tag in enumerate(sorted(test_tags))}
test_char_index = {char: i for i, char in enumerate(sorted(test_chars))}

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

class CustomDataset(Dataset):
    """Dataset that turns sentences into index tensors.

    Each item is a triple ``(token_ids, tag_ids, char_ids)`` built from one
    sentence:

    * ``token_ids`` - 1-D long tensor, one id per token;
    * ``tag_ids`` - 1-D long tensor, one id per POS tag;
    * ``char_ids`` - list with one 1-D long tensor per token holding the ids
      of that token's characters, placed on ``device``.

    The token and tag tensors are not moved to ``device`` here.

    Args:
        sentences: Indexable collection of objects with ``tokens`` and
            ``pos_tags`` sequences.
        device: Device on which the per-token character tensors are created.
        token_indexer: Mapping token -> id.
        tag_indexer: Mapping POS tag -> id.
        char_indexer: Mapping character -> id.
        unk_index: Optional id used for keys missing from an indexer. When
            ``None`` (default) a missing key raises ``KeyError``.
    """

    def __init__(self, sentences, device, token_indexer, tag_indexer, char_indexer, unk_index=None):
        self.sentences = sentences
        self.device = device
        self.token_indexer = token_indexer
        self.tag_indexer = tag_indexer
        self.char_indexer = char_indexer
        self.unk_index = unk_index

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, item):
        """Return ``(token_ids, tag_ids, char_ids)`` for sentence ``item``."""
        sentence = self.sentences[item]
        return (
            self.get_tensor_from_token(sentence),
            self.get_tensor_from_tag(sentence),
            self.get_tensor_from_char(sentence),
        )

    def _lookup(self, indexer, key, kind):
        """Return the id of ``key``, falling back to ``unk_index`` if set.

        A plain ``indexer.get(key)`` returns ``None`` for a missing key, which
        only fails later inside the tensor constructor with an unrelated
        TypeError; fail here with the offending key instead.
        """
        index = indexer.get(key, self.unk_index)
        if index is None:
            raise KeyError(
                '%s %r is missing from the index and no unk_index was given' % (kind, key)
            )
        return index

    def get_tensor_from_token(self, sentence):
        """Return a 1-D long tensor with the id of every token."""
        indices = [self._lookup(self.token_indexer, token, 'token') for token in sentence.tokens]
        return torch.tensor(indices, dtype=torch.long)

    def get_tensor_from_tag(self, sentence):
        """Return a 1-D long tensor with the id of every POS tag."""
        indices = [self._lookup(self.tag_indexer, tag, 'tag') for tag in sentence.pos_tags]
        return torch.tensor(indices, dtype=torch.long)

    def get_tensor_from_char(self, sentence):
        """Return one 1-D long tensor of character ids per token."""
        per_token = []
        for token in sentence.tokens:
            char_indices = [self._lookup(self.char_indexer, char, 'char') for char in token]
            # Use the device given to the constructor, not the notebook-level
            # global of the same name.
            per_token.append(torch.tensor(char_indices, dtype=torch.long).to(self.device))
        return per_token

In [8]:
device

'cuda'

In [9]:
# Keep the first 30000 sentences for training and hold out the next 10000
# (positions 30000..39999) as the dev set. Both slices are taken from the
# original list before either name is rebound.
# Note: this cell rebinds `train`, so running it a second time yields an empty
# dev set.
train, dev = train[:30000], train[30000:40000]

In [10]:
# Train and dev share the indices built from the training file.
TrainDataset = CustomDataset(
    sentences=train,
    device=device,
    token_indexer=train_token_index,
    tag_indexer=train_tag_index,
    char_indexer=train_char_index,
)
DevDataset = CustomDataset(
    sentences=dev,
    device=device,
    token_indexer=train_token_index,
    tag_indexer=train_tag_index,
    char_indexer=train_char_index,
)
# NOTE(review): the test set uses its own indices, so its ids do not line up
# with the ones a model trained on TrainDataset has seen - confirm intent.
TestDataset = CustomDataset(
    sentences=test,
    device=device,
    token_indexer=test_token_index,
    tag_indexer=test_tag_index,
    char_indexer=test_char_index,
)

In [11]:
class DualLSTMTagger(nn.Module):
    """POS tagger combining word embeddings with a character-level LSTM.

    For every word, its characters are embedded and run through ``char_lstm``;
    the final hidden state is concatenated with the word embedding and the
    resulting sequence is fed to a word-level LSTM followed by a linear layer.

    Args:
        word_embedding_dim: Size of the word embedding vectors.
        word_hidden_dim: Hidden size of the word-level LSTM.
        char_embedding_dim: Size of the character embedding vectors.
        char_hidden_dim: Hidden size of the character-level LSTM.
        word_vocab_size: Number of rows in the word embedding table.
        char_vocab_size: Number of rows in the character embedding table.
        tag_vocab_size: Number of output classes.
    """

    def __init__(self, word_embedding_dim, word_hidden_dim, char_embedding_dim, char_hidden_dim, word_vocab_size, char_vocab_size, tag_vocab_size):
        super().__init__()

        # Word branch: embedding table.
        self.word_embedding = nn.Embedding(word_vocab_size, word_embedding_dim)

        # Character branch: embedding table followed by an LSTM.
        self.char_embedding = nn.Embedding(char_vocab_size, char_embedding_dim)
        self.char_lstm = nn.LSTM(char_embedding_dim, char_hidden_dim)

        # The sentence-level LSTM consumes [word embedding ; char state].
        self.lstm = nn.LSTM(word_embedding_dim + char_hidden_dim, word_hidden_dim)
        self.hidden2tag = nn.Linear(word_hidden_dim, tag_vocab_size)

    def _encode_word(self, word):
        """Return the flattened final hidden state of ``char_lstm`` for one word."""
        n_chars = len(word)
        char_inputs = self.char_embedding(word).view(n_chars, 1, -1)
        _, (final_hidden, _) = self.char_lstm(char_inputs)
        return final_hidden.view(-1)

    def forward(self, sentence, words):
        """Compute per-token log-probabilities over tags.

        Args:
            sentence: 1-D tensor of word ids.
            words: Iterable with one 1-D tensor of character ids per word.

        Returns:
            Tensor of shape ``(len(sentence), tag_vocab_size)`` holding
            log-softmax scores.
        """
        n_tokens = len(sentence)
        word_vectors = self.word_embedding(sentence)
        char_vectors = torch.stack(tuple([self._encode_word(word) for word in words]))

        # Join both representations along the feature axis.
        features = torch.cat((word_vectors, char_vectors), 1)

        # The LSTM is fed the sentence as a sequence with batch size 1.
        hidden_states, _ = self.lstm(features.view(n_tokens, 1, -1))
        scores = self.hidden2tag(hidden_states.view(n_tokens, -1))
        return F.log_softmax(scores, dim=1)

In [12]:
from tqdm import tqdm
import numpy as np

In [13]:
class Trainer:
    """Runs training and evaluation epochs for a tagging model.

    The model is called as ``model(tokens, chars)`` and must return one row
    of class scores per token. Every item yielded by the iterators must
    unpack into ``(tokens, pos_tags, chars)``, where ``chars`` is an iterable
    of tensors.

    Args:
        model: The network to train; it is moved to ``device``.
        train_iterator: Sized iterable of training items.
        dev_iterator: Sized iterable of evaluation items.
        lr: Learning rate for the Adam optimizer.
        device: Device for the model, the loss and every batch. ``None``
            (default) selects ``'cuda'`` when available and ``'cpu'``
            otherwise.
        pad_index: Token id to exclude from the accuracy count. ``None``
            (default) counts every token. Earlier versions always excluded
            id 0, but that id belongs to a real vocabulary entry when ids
            come from ``enumerate`` and nothing is padded.
    """

    def __init__(self, model: nn.Module, train_iterator, dev_iterator, lr=2e-5, device=None, pad_index=None):
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Stored so the epoch methods use the same device as the model and
        # do not depend on a notebook-level global.
        self.device = device
        self.pad_index = pad_index
        # CrossEntropyLoss also accepts log-probabilities: applying
        # log-softmax to already normalised log-probabilities leaves them
        # unchanged.
        self.criterion = nn.CrossEntropyLoss().to(device)
        self.optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        self.train_iterator = train_iterator
        self.dev_iterator = dev_iterator
        self.model = model.to(device)

    def train_epoch(self):
        """Run one optimisation pass over ``train_iterator``.

        Returns:
            ``(mean_loss, accuracy)`` for the epoch.
        """
        return self._run_epoch(self.train_iterator, training=True)

    def test_epoch(self):
        """Evaluate on ``dev_iterator`` without updating the model.

        Returns:
            ``(mean_loss, accuracy)`` for the epoch.
        """
        return self._run_epoch(self.dev_iterator, training=False)

    def _run_epoch(self, iterator, training):
        """Shared loop for training and evaluation; prints and returns the metrics."""
        self.model.train(training)
        total_loss = 0.0
        total = 0
        correct = 0
        n_batches = 0

        batches = iterator
        if training:
            batches = tqdm(iterator, total=len(iterator))

        with torch.set_grad_enabled(training):
            for tokens, pos_tags, chars in batches:
                # Move the batch to the model's device in both modes; the
                # evaluation loop used to skip this.
                tokens = tokens.to(self.device)
                pos_tags = pos_tags.to(self.device)
                chars = [char.to(self.device) for char in chars]

                if training:
                    self.optimizer.zero_grad()

                logits = self.model(tokens, chars)
                loss = self.criterion(logits.view(-1, logits.size(-1)), pos_tags.view(-1))

                if training:
                    loss.backward()
                    self.optimizer.step()

                total_loss += loss.item()
                n_batches += 1

                hits = torch.argmax(logits, dim=-1) == pos_tags
                if self.pad_index is None:
                    total += tokens.numel()
                else:
                    counted = tokens != self.pad_index
                    hits = hits & counted
                    total += counted.sum().item()
                correct += hits.sum().item()

        # Guard the averages so an empty iterator or a fully masked epoch
        # reports zeros instead of raising.
        mean_loss = total_loss / n_batches if n_batches else 0.0
        accuracy = correct / total if total else 0.0

        print('\rLoss: %4f, Accuracy: %4f, Batch: %d of %d' % (
            mean_loss, accuracy, n_batches, len(iterator)
        ), end='')
        print()
        return mean_loss, accuracy

In [14]:
# Layer sizes for the word and character branches, and the epoch count.
WORD_EMBEDDING_DIM = 1024
WORD_HIDDEN_DIM = 1024
CHAR_EMBEDDING_DIM = 128
CHAR_HIDDEN_DIM = 1024
EPOCHS = 10

# Table sizes are taken from the sets collected over the training data.
model = DualLSTMTagger(
    word_embedding_dim=WORD_EMBEDDING_DIM,
    word_hidden_dim=WORD_HIDDEN_DIM,
    char_embedding_dim=CHAR_EMBEDDING_DIM,
    char_hidden_dim=CHAR_HIDDEN_DIM,
    word_vocab_size=len(train_vocab),
    char_vocab_size=len(train_chars),
    tag_vocab_size=len(train_tags),
)
# The datasets are passed directly, so each step sees a single sentence.
trainer = Trainer(model=model, train_iterator=TrainDataset, dev_iterator=DevDataset)

In [15]:
# One training pass followed by one dev evaluation per epoch. The epoch count
# comes from the EPOCHS constant rather than a second hard-coded literal.
for epoch in tqdm(range(EPOCHS)):
    print()
    trainer.train_epoch()
    trainer.test_epoch()

  0%|          | 0/10 [00:00<?, ?it/s]
  0%|          | 0/30000 [00:00<?, ?it/s][A
  0%|          | 0/10 [00:00<?, ?it/s]





RuntimeError: error in LoadLibraryA