In [1]:
import string
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter
import pytorch_lightning as pl

from gensim.models import KeyedVectors
from gensim.models import FastText

In [3]:
def parse_conllu(filename):
    lines = open(filename, 'r')
    parsed_data = []
    for line in lines: 
        line = line.split("\n")[0]
        if line != "":
            splitted = line.split("\t")
            if splitted[1] in string.punctuation:
                continue
            parsed_data.append(splitted[1:])
    return parsed_data
        


In [12]:
# train_data = parse_conllu("dataset/train.conllu")
# sentences = train_data[:, 0]
# fasttext_model = FastText(sentences=sentences, 
#                           epochs=5,
#                           vector_size=100, 
#                           window=5, 
#                           min_count=1, 
#                           sg=1)
fasttext_wv = KeyedVectors.load("geowac_lemmas_none_fasttextskipgram_300_5_2020/model.model")

In [15]:
print(fasttext_wv.vector_size)

300


In [5]:
class CustomCONLLDataset(Dataset):
    def __init__(self, raw_data):
        self.data = raw_data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample, label = self.data[idx]
        if label in ['B-LOC', 'I-LOC']:
            label = 0
        elif label in ['B-PER', 'I-PER']:
            label = 1
        elif label in ['B-LOC', 'I-LOC']:
            label = 2
        elif label in ['B-ORG', 'I-ORG']:
            label = 3
        elif label in ['B-MISC', 'I-MISC']:
            label = 4
        else:
            label = 5
        return sample, label


In [6]:
params = {
    "batch_size": 32
}

In [37]:
train_dataset = CustomCONLLDataset(parse_conllu("dataset/train.conllu"))
val_dataset = CustomCONLLDataset(parse_conllu("dataset/val.conllu"))
test_dataset = CustomCONLLDataset(parse_conllu("dataset/test.conllu"))

dataloader = {
    "train": DataLoader(train_dataset, batch_size=params["batch_size"], shuffle=True),
    "val": DataLoader(val_dataset, batch_size=params["batch_size"], shuffle=True),
    "test": DataLoader(test_dataset, batch_size=1, shuffle=True)
}

In [32]:
class NERModel(pl.LightningModule):
    def __init__(self, word_vector, hidden_dim, output_dim):
        super(NERModel, self).__init__()
        self.word_vector = word_vector
        self.rnn = nn.RNN(word_vector.vector_size, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, word):
        embedded = torch.tensor(self.word_vector[word])
        output, _ = self.rnn(embedded)
        predictions = self.fc(output)
        return predictions.softmax(dim=1)

    def training_step(self, batch, batch_idx):
        words, targets = batch
        predictions = self(words)
        loss = nn.CrossEntropyLoss()(predictions, targets)
        self.log('train_loss', loss)
        return loss        

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=0.001)
        return optimizer


ner_model = NERModel(fasttext_wv, 10, 6)

In [None]:
log_dir = 'logs'
tensorboard_writer = SummaryWriter(log_dir=log_dir)

In [33]:
trainer = pl.Trainer(max_epochs=2,
                     logger=tensorboard_writer) 
trainer.fit(ner_model, dataloader["train"], dataloader["val"])

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name | Type   | Params
--------------------------------
0 | rnn  | RNN    | 3.1 K 
1 | fc   | Linear | 66    
--------------------------------
3.2 K     Trainable params
0         Non-trainable params
3.2 K     Total params
0.013     Total estimated model params size (MB)


Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [50]:
actual_labels = ["Location", "Person", "Location", "Organization", "Miscellaneous", "Other"]
for i, (word, label) in enumerate(dataloader["test"]):
    predicted = ner_model(word)
    predicted_label = torch.argmax(predicted) 
    print(f"Word: {word[0]}: predicted label: \"{actual_labels[predicted_label]}\" with probability: {predicted[0][predicted_label]}. Expected label: \"{actual_labels[label]}\"")
    if i == 10:
        break

Word: ещё: predicted label: "Other" with probability: 1.0. Expected label: "Other"
Word: Находился: predicted label: "Other" with probability: 1.0. Expected label: "Other"
Word: её: predicted label: "Other" with probability: 0.999992847442627. Expected label: "Other"
Word: национальная: predicted label: "Other" with probability: 1.0. Expected label: "Other"
Word: вступает: predicted label: "Other" with probability: 1.0. Expected label: "Other"
Word: говора: predicted label: "Other" with probability: 1.0. Expected label: "Other"
Word: войсках: predicted label: "Other" with probability: 0.9999998807907104. Expected label: "Other"
Word: на: predicted label: "Other" with probability: 1.0. Expected label: "Other"
Word: Color: predicted label: "Miscellaneous" with probability: 1.0. Expected label: "Miscellaneous"
Word: его: predicted label: "Other" with probability: 1.0. Expected label: "Other"
Word: Премьера: predicted label: "Other" with probability: 0.99998939037323. Expected label: "Othe

In [None]:
tensorboard_writer.close()