In [1]:
import os
import pandas as pd
import numpy as np
import gensim.downloader as dl

In [2]:
pretrained_weights_name = "word2vec-google-news-300"
model_dl_path = os.path.join(
    dl.BASE_DIR, pretrained_weights_name, f"{pretrained_weights_name}.gz")


if os.path.exists(model_dl_path):
    # load model
    print(f"Loading model from {model_dl_path}")
    gnews_embeddings = dl.load(pretrained_weights_name)
else:
    # download
    print(f"Model will be downloaded at {model_dl_path}")
    gnews_embeddings = dl.load("word2vec-google-news-300")

Loading model from /home/shawon/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [3]:
# add vectors for padding and oov
padding = np.zeros(300)
oov = np.ones(300) * -1

gnews_embeddings.add_vector("</PAD>", padding)  # type: ignore
gnews_embeddings.add_vector("</OOV>", oov)  # type: ignore



3000001

In [4]:
pad_idx = gnews_embeddings.key_to_index["</PAD>"]
oov_idx = gnews_embeddings.key_to_index["</OOV>"]

In [5]:
# https://github.com/Oneplus/Tweebank
from typing import List

train_file = os.path.join(
    "/mnt/Others/experiments/datasets/Tweebank-dev/converted/"
    "en-ud-tweet-train.fixed.conllu")



# need idx 1, 2,3 : word, lemma and pos

class ConlluRowInfo:
    word: str
    lemma: str
    pos: str
    
    def __init__(self, word: str, lemma: str, pos: str) -> None:
        self.word = word
        self.lemma = lemma
        self.pos = pos
        
    def __str__(self) -> str:
        rep = {
            "word": self.word,
            "lemma": self.lemma,
            "pos": self.pos
        }
        return str(rep)
    


class ConlluRow:
    info: List[ConlluRowInfo]
    # text: str
    
    def __init__(self, infos: List[ConlluRowInfo]) -> None:
        self.info = infos
        
    def __str__(self) -> str:
        return f"info : {self.info}"

In [6]:
import torch
from torch.utils.data import Dataset
from tqdm.auto import trange, tqdm
from typing import Dict, List

class TweebankDataset(Dataset):
    def __init__(self, filename, pad_idx=pad_idx, w2v_weights=gnews_embeddings) -> None:
        self.filename = filename
        self.data = list()
        self.__read_data()
        
        self.w2v = w2v_weights
        
        self.PAD_TOKEN = "</PAD>"
        self.OOV_TOKEN = "</OOV>"
        self.pad_idx = pad_idx
        
        
        self.MAX_SEQ_LEN = 50 # default value
        # self.__find_max_seq_len()
        
        self.UNIQUE_TAGS = ['PRON', 'NUM', 'NOUN', 'CCONJ', 'ADV', 'SCONJ', 
                               'ADP', 'AUX', 'PROPN', 'SYM', 'DET', 
                               'INTJ', 'PUNCT', 'X', 'ADJ', 'VERB', 'PART']
        self.tag_dict = dict()
        self.__encode_tags()
        
        self.number_tags = len(self.UNIQUE_TAGS)
        
        self.vocabulary = self.w2v.index_to_key  # type: ignore
            
    
    def __len__(self) ->  int:
        return len(self.data)
    
    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        # ============== collect ===================
        words = [i.word for i in self.data[idx].info]
        tags = [i.pos for i in self.data[idx].info]
        
        
        # =================== convert using word2vec weights ==========
        for idx in range(len(words)):
            try:
                w2v_idx = self.w2v.key_to_index[words[idx]]  # type: ignore 
            except KeyError:
                w2v_idx = self.w2v.key_to_index[self.OOV_TOKEN] 
            words[idx] = w2v_idx
            tags[idx] = self.tag_dict[tags[idx]]
            
        
        # ============== pad words ===============
        # left pad
        padded_words = np.ones(self.MAX_SEQ_LEN, dtype=np.int32) * self.pad_idx
        padded_words[-len(words):] = words
        
        padded_tags = np.ones(self.MAX_SEQ_LEN, dtype=np.int32) * self.pad_idx
        padded_tags[-len(tags):] = tags
        
        return {
            "words": torch.tensor(padded_words),
            "tags": torch.tensor(padded_tags),
        }
        
    def __find_max_seq_len(self) -> None:
        seq_lens = []
        
        for idx in range(len(self.data)):
            words = [i.word for i in self.data[idx].info]
            seq_lens.append(len(words))
        
        
        self.MAX_SEQ_LEN = max(seq_lens)
        
    def __encode_tags(self) -> None:
        for idx, tag in enumerate(self.UNIQUE_TAGS):
            self.tag_dict[tag] = idx
        
    def __read_data(self) -> None:
        with open(self.filename, "r") as f:
            data = f.readlines()
            
            # ============ read the text file =============
            lines = list()
            buffer = list()
            for _, line in tqdm(enumerate(data)):
                if line == "\n":
                    lines.append(buffer)
                    buffer = []
                else:
                    buffer.append(line)
                    
            # ============== organize in objects ==============
            for idx, line in tqdm(enumerate(lines)):
                # from line index 2 and onwards
                line_info = list()
                for info in line[2:]:
                    buffer = info.split("\t")
                
                    try:
                        word = buffer[1]
                        lemma = buffer[2]
                        tag = buffer[3]
                        
                        line_info.append(ConlluRowInfo(word, lemma, tag))
                        
                    except IndexError:
                        print(buffer)
                        
                
                lines[idx] = ConlluRow(line_info)    

            self.data = lines

In [7]:
dataset = TweebankDataset(train_file)
sample = dataset[0]
sample

0it [00:00, ?it/s]

0it [00:00, ?it/s]

{'words': tensor([3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000,
         3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000,
         3000000, 3000000, 3000000,   31905, 3000001, 3000001,   12654,   14263,
         3000001,      20,     190,      42,   40105,       1,     234,   22860,
         3000001,      86,     951,     177,      48,      45,       4,    2604,
            2747, 3000001,      20,     248, 3000001, 3000001, 3000001,   10297,
         3000001, 3000001], dtype=torch.int32),
 'tags': tensor([3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000,
         3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000, 3000000,
         3000000, 3000000, 3000000,      13,      13,      12,      14,       2,
              12,       0,       7,       7,      15,       6,       1,       2,
               3,       4,       4,      15,      10,       1,       7,       4,
               0,      12,       0,       7,

In [8]:
# model
# https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMTagger(nn.Module):
    def __init__(self, 
                 embedding_dim: int, 
                 hidden_dim: int,  
                 out_size: int,
                 pad_idx=pad_idx, 
                 freeze_embeddings=True, 
                 w2v_weights=gnews_embeddings) -> None:
        
        super(LSTMTagger, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.taget_size = out_size
        
        
        embedding_tensors = torch.from_numpy(w2v_weights.vectors) # type: ignore        
        self.word_embeddings = nn.Embedding.from_pretrained(
            embedding_tensors, freeze=freeze_embeddings, padding_idx=pad_idx)
        
        self.lstm = nn.LSTM(
            embedding_dim, 
            hidden_dim, 
            batch_first=True,
            bidirectional=True)
        
        self.attention =  nn.MultiheadAttention(hidden_dim * 2, num_heads=4, dropout=0.1, batch_first=True)
        self.relu = nn.ReLU()
        
        self.linear = nn.Linear(hidden_dim * 2, out_size)
        self.log_softmax = nn.LogSoftmax(dim=-1)

        
    def forward(self, words):
        embeds = self.word_embeddings(words)
        lstm_out, _ = self.lstm(embeds)
        attn_out, _ = self.attention(lstm_out, lstm_out, lstm_out)
        linear_out = self.linear(attn_out)
        linear_out = self.relu(linear_out)
        logits = self.log_softmax(linear_out)
        return logits

In [9]:
model = LSTMTagger(embedding_dim=300, hidden_dim=100,  out_size=dataset.MAX_SEQ_LEN)

# run a sample forward pass
sample = dataset[42]
out = model(sample["words"])
out.size()


torch.Size([50, 50])

In [10]:
torch.max(out, dim=0)

torch.return_types.max(
values=tensor([-3.8585, -3.8886, -3.8952, -3.8832, -3.8592, -3.8913, -3.9326, -3.9326,
        -3.9326, -3.8963, -3.9277, -3.9326, -3.9071, -3.9326, -3.9326, -3.9326,
        -3.9175, -3.8422, -3.9305, -3.9048, -3.9326, -3.9326, -3.8916, -3.9326,
        -3.9022, -3.9326, -3.9080, -3.8582, -3.9269, -3.8869, -3.9326, -3.9310,
        -3.9231, -3.8810, -3.9326, -3.9235, -3.8713, -3.9326, -3.9326, -3.8674,
        -3.9326, -3.9320, -3.8715, -3.9233, -3.9326, -3.9233, -3.8729, -3.9326,
        -3.9326, -3.9326], grad_fn=<MaxBackward0>),
indices=tensor([ 6, 35, 16, 15,  6,  8, 18, 18, 18, 12, 41, 18, 11, 18, 18, 18, 43, 12,
         9, 11, 18, 18, 27, 18,  5, 18, 43, 34, 11, 32, 18, 21, 12, 28, 18, 11,
        18, 18, 18, 34, 18, 13, 12, 16, 18, 16, 44, 18, 18, 18]))

In [11]:
device = torch.device("cpu")
model = model.to(device)

In [12]:
from torch.utils.data import DataLoader

bs = 128
dl_args = {
    # "pin_memory": True,
    "batch_size": bs
}


training_set = dataset
validation_set = TweebankDataset("/mnt/Others/experiments/datasets/Tweebank-dev/converted/en-ud-tweet-dev.fixed.conllu")
test_set = TweebankDataset("/mnt/Others/experiments/datasets/Tweebank-dev/converted/en-ud-tweet-test.fixed.conllu")

train_loader = DataLoader(training_set, shuffle=True, **dl_args)
val_loader = DataLoader(validation_set, shuffle=False, **dl_args)
test_loader = DataLoader(test_set, shuffle=False, **dl_args)

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [13]:
optimizer = optim.AdamW(params=model.parameters())
criterion = nn.NLLLoss(ignore_index=0)
run_validation_every_n_step = 1

# fp16
scaler = torch.cuda.amp.GradScaler()

epochs = 300
for e in trange(epochs):

    steps = 0
    for batch in train_loader:
        # switch to train mode
        model.train()
        
        words = batch["words"]
        tags = batch["tags"].long()
        
        # send data to device
        words = words.to(device)
        tags = tags.to(device)
        
        # zero out optimizer to accumulate new grads
        optimizer.zero_grad()
        
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = model(words)
            # loss
            loss = criterion(logits, tags)
        
        
        # ======== validation ==============
        if steps % run_validation_every_n_step == 0:
            val_losses = []
            
            # switch context
            model.eval()
            with torch.no_grad():
                for val_batch in val_loader:
                    words = val_batch["words"]
                    tags = val_batch["tags"].long()
                    
                    words = words.to(device)
                    tags = tags.to(device)
                    
                    with torch.autocast(device_type="cuda", dtype=torch.float16):
                        logits = model(words)
                        val_loss = criterion(logits, tags)

                    val_losses.append(val_loss.item())


                # log
                print(f"Epoch:: {{e + 1}}/{{epochs}} Step:: {steps}")
                print(f"Train Loss:: {loss} __________ Val Loss:: {torch.mean(torch.tensor(val_losses))}")
        
        # switch context
        model.train()
        scaler.scale(loss).backward()  # type: ignore
        # loss.backward()
        scaler.step(optimizer)
        # optimizer.step()
        scaler.update()
        steps += 1

  0%|          | 0/300 [00:00<?, ?it/s]

IndexError: Target 3000000 is out of bounds.