In [1]:
import os
import pandas as pd
import numpy as np
import torch


import gensim.downloader as dl

pretrained_weights_name = "word2vec-google-news-300"
model_dl_path = os.path.join(
    dl.BASE_DIR, pretrained_weights_name, f"{pretrained_weights_name}.gz")


if os.path.exists(model_dl_path):
    # load model
    print(f"Loading model from {model_dl_path}")
    gnews_embeddings = dl.load(pretrained_weights_name)
else:
    # download
    print(f"Model will be downloaded at {model_dl_path}")
    gnews_embeddings = dl.load("word2vec-google-news-300")


Loading model from /home/shawon/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [2]:
# add PAD to embeddings

# 0 padding, 300 embedding dims
gnews_embeddings.add_vector("<PAD>", np.zeros(300))

# need it later for loading the embeddings in pytorch model
padding_idx = len(gnews_embeddings.index_to_key) - 1



In [3]:
from typing import List

class ConlluRowInfo:
    word: str
    lemma: str
    pos: str

    def __init__(self, word: str, lemma: str, pos: str) -> None:
        self.word = word
        self.lemma = lemma
        self.pos = pos

    def __str__(self) -> str:
        rep = {
            "word": self.word,
            "lemma": self.lemma,
            "pos": self.pos
        }
        return str(rep)


class ConlluRow:
    info: List[ConlluRowInfo]
    # text: str

    def __init__(self, infos: List[ConlluRowInfo]) -> None:
        self.info = infos

    def __str__(self) -> str:
        return f"info : {self.info}"


In [4]:
# time to define the torch dataset

from torch.utils.data import Dataset
from tqdm.auto import trange, tqdm
from typing import Dict, List


class TweebankDataset(Dataset):
    def __init__(self, filename, w2v_weights=gnews_embeddings) -> None:
        self.filename = filename

        self.w2v = w2v_weights
        self.data = list()
        self.__read_data()

        self.MAX_SEQ_LEN = 50  # default value
        # self.__find_max_seq_len()

        self.UNIQUE_TAGS = ['PRON', 'NUM', 'NOUN', 'CCONJ', 'ADV', 'SCONJ',
                            'ADP', 'AUX', 'PROPN', 'SYM', 'DET',
                            'INTJ', 'PUNCT', 'X', 'ADJ', 'VERB', 'PART', "<PAD>"]
        self.tag_dict = dict()
        self.__encode_tags()

        self.number_tags = len(self.UNIQUE_TAGS)

        self.vocabulary = self.w2v.index_to_key  # type: ignore

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        # ============== collect ===================
        words = [i.word for i in self.data[idx].info]
        # lemmas = [i.lemma for i in self.data[idx].info]
        tags = [i.pos for i in self.data[idx].info]

        # =================== convert using word2vec weights ==========
        for idx in range(len(words)):
            try:
                w2v_idx = self.w2v.key_to_index[words[idx]]  # type: ignore
            except KeyError:
                w2v_idx = 0  # </s>
            words[idx] = w2v_idx
            tags[idx] = self.tag_dict[tags[idx]]

        # ============== pad words ===============
        # left pad
        padded_words = np.zeros(self.MAX_SEQ_LEN, dtype=np.int32)
        padded_words[-len(words):] = words

        # ============== pad tags =================
        padded_tags = np.ones(self.MAX_SEQ_LEN, dtype=np.int32) * \
            self.tag_dict.get("<PAD>")  # type: ignore
        padded_tags[-len(tags):] = tags

        return {
            "words": torch.tensor(padded_words),
            "tags": torch.tensor(padded_tags),
        }

    def __find_max_seq_len(self) -> None:
        seq_lens = []

        for idx in range(len(self.data)):
            words = [i.word for i in self.data[idx].info]
            seq_lens.append(len(words))

        self.MAX_SEQ_LEN = max(seq_lens)

    def __encode_tags(self) -> None:
        for idx, tag in enumerate(self.UNIQUE_TAGS):
            self.tag_dict[tag] = idx

    def __read_data(self) -> None:
        with open(self.filename, "r") as f:
            data = f.readlines()

            # ============ read the text file =============
            lines = list()
            buffer = list()
            for _, line in tqdm(enumerate(data)):
                if line == "\n":
                    lines.append(buffer)
                    buffer = []
                else:
                    buffer.append(line)

            # ============== organize in objects ==============
            for idx, line in tqdm(enumerate(lines)):
                # from line index 2 and onwards
                line_info = list()
                for info in line[2:]:
                    buffer = info.split("\t")

                    try:
                        word = buffer[1]
                        lemma = buffer[2]
                        tag = buffer[3]

                        line_info.append(ConlluRowInfo(word, lemma, tag))

                    except IndexError:
                        print(buffer)

                lines[idx] = ConlluRow(line_info)

            self.data = lines


In [5]:
from torch.utils.data import DataLoader

bs = 128
dl_args = {
    "pin_memory": True,
    "batch_size": bs
}


test_set = TweebankDataset(
    "/mnt/Others/experiments/datasets/Tweebank-dev/converted/en-ud-tweet-test.fixed.conllu")

test_loader = DataLoader(test_set, shuffle=False, **dl_args)


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [6]:
# model
# https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMTagger(nn.Module):
    def __init__(self, 
                 embedding_dim: int, 
                 hidden_dim: int,  
                 tagset_size: int,
                 padding_idx=padding_idx, 
                 freeze_embeddings=True, 
                 w2v_weights=gnews_embeddings) -> None:
        
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.taget_size = tagset_size
        
        
        embedding_tensors = torch.from_numpy(w2v_weights.vectors) # type: ignore        
        self.word_embeddings = nn.Embedding.from_pretrained(
            embedding_tensors, freeze=freeze_embeddings, padding_idx=padding_idx)
        
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_dim, 
            batch_first=True,
            bidirectional=True)
        
        self.attention =  nn.MultiheadAttention(hidden_dim * 2, num_heads=4, dropout=0.1, batch_first=True)
        self.relu = nn.ReLU()
        
        self.linear = nn.Linear(hidden_dim * 2, tagset_size)

        
    def forward(self, words):
        embeds = self.word_embeddings(words)
        
        lstm_out, _ = self.lstm(embeds)
        
        attn_out, _ = self.attention(lstm_out, lstm_out, lstm_out)
        relu_out = self.relu(attn_out)
        
        linear_out = self.linear(relu_out)

        logits = F.log_softmax(linear_out, dim=-1)
        return logits

In [7]:
tagset_size = len(test_set.UNIQUE_TAGS)
model = LSTMTagger(embedding_dim=300, hidden_dim=100,  tagset_size=tagset_size)
model.load_state_dict(torch.load("saved.pt"))
model = model.to("cpu")
model.eval()


LSTMTagger(
  (word_embeddings): Embedding(3000001, 300, padding_idx=3000000)
  (lstm): LSTM(300, 100, batch_first=True, bidirectional=True)
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=200, out_features=200, bias=True)
  )
  (relu): ReLU()
  (linear): Linear(in_features=200, out_features=18, bias=True)
)

In [27]:
# on a single batch
def categorical_accuracy(pred: torch.Tensor, true: torch.Tensor) -> torch.Tensor:
    # unvectorized, runs on a single data instance
    def ca(pred: torch.Tensor, true: torch.Tensor) -> torch.Tensor:
        non_pad_idx = (true != 17).nonzero()
        acc = pred[non_pad_idx].squeeze(0).eq(true[non_pad_idx])
        
        return acc.sum() / true[non_pad_idx].size()[0]
    
    bs = true.size()[0]
    acc = torch.zeros(size=(bs,), dtype=torch.float32)
    for i in range(bs):
        p = pred[i]
        t = true[i]
        
        acc[i] = ca(p, t)
        
    return acc.mean()

def evaluate(model: LSTMTagger, data_loader: DataLoader) -> None:
    TAG_PAD_IDX = 17
    
    all_accs = torch.zeros(size=(data_loader.batch_size, ), dtype=torch.float32) # type: ignore
    
    for idx, batch in enumerate(data_loader):
        words = batch["words"]
        tags = batch["tags"].long()
        
        
        logits = model(words)
        preds = logits.argmax(dim=-1)
        
        # categorical acc
        all_accs[idx] = categorical_accuracy(preds, tags)
    
    print(all_accs)
    print(all_accs.size())
    print(all_accs.mean())
        
        
evaluate(model, test_loader)

tensor([0.7656, 0.7339, 0.7796, 0.7399, 0.7522, 0.7324, 0.7642, 0.7308, 0.7316,
        0.7638, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 

In [None]:
# this is where thomas realized that OOV needs to be handled. Anyways. Les go then. 