In [1]:
from collections import Counter
from types import SimpleNamespace

import pandas as pd
import torch
import torchtext
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from kg.ner.model import LSTM

In [2]:
train_df = pd.read_csv('/Users/tmorrill002/Documents/datasets/conll/transformed/train.csv')

In [3]:
vocab = torchtext.vocab.Vocab(Counter(train_df['Token'].value_counts().to_dict()))
label_dict = {}
i = 0
for k in train_df['NER_Tag_Normalized'].unique():
    label_dict[k] = i
    i += 1

In [4]:
class CoNLL2003Dataset(torch.utils.data.Dataset):
    def __init__(self, df, vocab, label_dict, transform=None):
        self.df = df
        self.vocab = vocab
        self.label_dict = label_dict
        self.transform = transform
        self.sentences, self.labels = self._prepare_data()
    
    def _prepare_data(self):
        temp_df = self.df.groupby(['Article_ID', 'Sentence_ID'], as_index=False).agg(Sentence=('Token', list), Labels=('NER_Tag_Normalized', list))
        sentences = temp_df['Sentence'].values.tolist()
        labels = temp_df['Labels'].values.tolist()
        return sentences, labels
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, idx):
        if self.transform:
            raise NotImplementedError
        
        indices = []
        for token in self.sentences[idx]:
            indices.append(self.vocab[token])
        labels = []
        for label in self.labels[idx]:
            labels.append(self.label_dict[label])
        
        return indices, labels

In [5]:
train_dataset = CoNLL2003Dataset(train_df, vocab, label_dict)

In [6]:
assert vocab[train_df.iloc[0]['Token']] == train_dataset[0][0][0]

In [7]:
assert label_dict[train_df.iloc[0]['NER_Tag_Normalized']] == train_dataset[0][1][0]

In [8]:
train_dataset[0]

([964, 22406, 236, 771, 7, 4586, 210, 7683, 2], [0, 1, 2, 1, 1, 1, 2, 1, 1])

In [9]:
small_batch = []
small_batch.append(torch.tensor(train_dataset[0][0]))
small_batch.append(torch.tensor(train_dataset[1][0]))
small_batch_lens = [len(x) for x in small_batch]

small_labels_batch = []
small_labels_batch.append(torch.tensor(train_dataset[0][1]))
small_labels_batch.append(torch.tensor(train_dataset[1][1]))

In [10]:
small_batch_padded = pad_sequence(small_batch, batch_first=True, padding_value=vocab['<pad>'])
small_labels_batch_padded = pad_sequence(small_labels_batch, batch_first=True, padding_value=-1)

In [11]:
small_batch_padded

tensor([[  964, 22406,   236,   771,     7,  4586,   210,  7683,     2],
        [  737,  2088,     1,     1,     1,     1,     1,     1,     1]])

In [12]:
small_labels_batch_padded

tensor([[ 0,  1,  2,  1,  1,  1,  2,  1,  1],
        [ 3,  3, -1, -1, -1, -1, -1, -1, -1]])

In [13]:
small_batch_lens

[9, 2]

In [14]:
packed = pack_padded_sequence(small_batch_padded, small_batch_lens, batch_first=True)

In [15]:
packed

PackedSequence(data=tensor([  964,   737, 22406,  2088,   236,   771,     7,  4586,   210,  7683,
            2]), batch_sizes=tensor([2, 2, 1, 1, 1, 1, 1, 1, 1]), sorted_indices=None, unsorted_indices=None)

In [16]:
batch, sequence_lengths = pad_packed_sequence(packed, batch_first=True, padding_value=vocab['<pad>'])

In [17]:
batch

tensor([[  964, 22406,   236,   771,     7,  4586,   210,  7683,     2],
        [  737,  2088,     1,     1,     1,     1,     1,     1,     1]])

In [18]:
config = {
    'vocab_size': len(vocab),
    'embedding_dim': 128,
    'hidden_size': 128,
    'num_classes': len(label_dict),
    'batch_size': 16
}
config = SimpleNamespace(**config)

In [19]:
model = LSTM(config)

In [20]:
output = model((small_batch_padded, small_batch_lens))

In [21]:
output.shape

torch.Size([18, 5])

In [22]:
labels = small_labels_batch_padded.reshape(-1)

In [23]:
mask = (labels >= 0)

In [24]:
labels % output.shape[1]

tensor([0, 1, 2, 1, 1, 1, 2, 1, 1, 3, 3, 4, 4, 4, 4, 4, 4, 4])

In [25]:
output.shape[1]

5

In [26]:
labels

tensor([ 0,  1,  2,  1,  1,  1,  2,  1,  1,  3,  3, -1, -1, -1, -1, -1, -1, -1])

In [27]:
output[:, 0]

tensor([-1.5012, -1.5821, -1.6462, -1.5795, -1.5299, -1.4470, -1.5916, -1.5329,
        -1.4857, -1.5305, -1.4813, -1.3069, -1.3069, -1.3069, -1.3069, -1.3069,
        -1.3069, -1.3069], grad_fn=<SelectBackward>)

In [28]:
def loss_fn(outputs, labels):
    labels = labels.reshape(-1)
    mask = (labels >= 0).float()
    labels = labels % outputs.shape[1]
    num_tokens = mask.sum()
    return -torch.sum(outputs[:, labels] * mask) / num_tokens

In [29]:
loss_fn(output, small_labels_batch_padded)

tensor(27.7837, grad_fn=<DivBackward0>)