In [1]:
import numpy as np
import pandas as pd

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F

import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

import re

from functools import reduce

import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

import main

In [2]:
train_sents, train_tags = main.load_data('data/twitter1_train.txt')
train_corpus = set()
for x in train_sents:
    for y in x:
        train_corpus.add(y)

# Utilities

In [3]:
def add_start_tag(sents):
    fixed_sents = list()
    for x in sents:
        fixed_sents.append(['<PAD>'] + x)
    return fixed_sents

def pad_sents(sents, pad_idx=0):
    padded_sents = []
    maxlen = max([len(sent) for sent in sents])
    for sent in sents:
        padded_sent = sent.copy()
        padded_sent.extend([pad_idx]*(maxlen-len(sent)))
        padded_sents.append(padded_sent)
    return padded_sents

def get_vocab_idx(train):
    tokens = set()
    for sent in train:
        tokens.update(sent)
    tokens = sorted(list(tokens))
    vocab2idx = dict(zip(tokens, range(1, len(tokens)+1)))
    vocab2idx["<PAD>"] = 0
    return vocab2idx

def convert_to_idx(sents, word2idx):
    for sent in sents:
        for i in range(len(sent)):
            sent[i] = word2idx[sent[i]]

tag2idx = {"<PAD>": 0, "O": 1, "T-NEG": 2, "T-NEU": 3, "T-POS": 4}
idx2tag = {v:k for k,v in tag2idx.items()}

def onehot(tag):
    mask = list(np.zeros(len(tag2idx), dtype=np.float32))
    mask[tag] = 1
    return mask

def split_data(data, labels):
    datalength = len(data)
    split_size = datalength // 5
    split_base = np.random.randint(0, datalength - split_size)
    return tuple(
        torch.tensor(x[:split_base] + x[split_base + split_size:]) for x in data
    ), torch.tensor(labels[:split_base] + labels[split_base + split_size:]), tuple(
        torch.tensor(x[split_base:split_base+split_size]) for x in data
    ), torch.tensor(labels[split_base:split_base+split_size])

def calculate_metrics(labels, best_path):
    TP = ((best_path == labels) & (labels != tag2idx['O']) & (labels != tag2idx['<PAD>'])).sum()
    FP = ((best_path != labels) & (labels == tag2idx['O']) & (labels != tag2idx['<PAD>'])).sum() + \
            ((best_path != labels) & (best_path != tag2idx['O']) & (labels != tag2idx['O']) & (labels != tag2idx['<PAD>'])).sum()
    FN = ((best_path == tag2idx['O']) & (labels != tag2idx['O']) & (labels != tag2idx['<PAD>'])).sum() + \
            ((best_path != labels) & (best_path != tag2idx['O']) & (labels != tag2idx['O']) & (labels != tag2idx['<PAD>'])).sum()
    precision = TP / (TP + FP) if TP > 0 else 0
    recall = TP / (TP + FN) if TP > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return precision, recall, f1


print(calculate_metrics(np.array([4,1,1,1,1,1,1]),np.array([4,4,1,1,1,1,1])))
print(calculate_metrics(np.array([2,1,1]),np.array([2,2,2])))
print(calculate_metrics(np.array([2,3,1]),np.array([2,2,2])))
print(calculate_metrics(np.array([2,3,1]),np.array([2,2,1])))
print(calculate_metrics(np.array([2,3,1]),np.array([2,2,3])))
print(calculate_metrics(np.array([2,3,1]),np.array([2,1,1])))

(0.5, 1.0, 0.6666666666666666)
(0.3333333333333333, 1.0, 0.5)
(0.3333333333333333, 0.5, 0.4)
(0.5, 0.5, 0.5)
(0.3333333333333333, 0.5, 0.4)
(1.0, 0.5, 0.6666666666666666)


# Learning embeddings

## Self-trained Word2Vec (wrong?)
For option 1, we need to train a Word2Vec model for the embeddings on the training data. We can do this like in HW1 where we manually learned a Word2Vec model before we realized that a pretrained model was far better in terms of performance.

In [4]:
self_trained_w2v = Word2Vec(sentences=train_sents, vector_size=200, window=5, min_count=1, workers=4, epochs=500)
self_trained_w2v.save('embeddings/self_trained_w2v.bin')
# self_trained_w2v.build_vocab(train_sents)
# self_trained_w2v.train(corpus_iterable=train_sents, total_examples=self_trained_w2v.corpus_count, epochs=self_trained_w2v.epochs)

In [5]:
self_trained_w2v.wv.get_vector('instagram')

array([ 0.391951  ,  0.6172369 , -0.5431648 ,  0.69275683, -0.42122728,
        0.29148424, -0.03663745,  1.358171  , -0.7680807 , -1.3340791 ,
        0.6039954 , -0.6339392 ,  0.3944765 ,  0.9352077 , -1.1123271 ,
        1.1028531 ,  1.3522481 , -0.07233609, -1.0521569 ,  0.23746867,
        0.4035885 ,  0.82767   , -0.1743096 , -1.4452302 , -0.04060737,
        1.431944  ,  0.8489689 ,  0.30153295,  0.10213476,  1.0587436 ,
       -1.53545   , -0.27331442, -0.92634547,  0.37000296, -0.03946065,
        0.76178294, -0.04086846,  1.0712398 ,  0.2625926 , -0.6698282 ,
        1.242623  , -0.02768378,  0.43941903, -1.3739824 ,  0.19388297,
        0.4372032 ,  0.5897702 ,  0.17377625,  0.09586184,  0.29164314,
       -1.2340132 ,  0.3278048 ,  1.3499397 , -0.37238437, -0.0932176 ,
        0.5546221 , -1.2831011 ,  0.1490129 , -1.1608125 , -0.04947713,
       -0.50279313, -0.53455454, -0.35660973,  0.44121858, -1.7945807 ,
        1.1220163 ,  0.5508288 ,  2.3649507 ,  0.5403302 ,  2.82

## Pre-trained Word2Vec

In [6]:
wv_from_bin = KeyedVectors.load_word2vec_format(datapath("/Users/reece/Documents/Purdue/CS577/HW2/embeddings/w2v.bin"), binary=True)
wv_from_bin.get_vector('like')

array([ 1.03515625e-01,  1.37695312e-01, -2.97546387e-03,  1.81640625e-01,
       -2.43186951e-04,  1.06933594e-01,  1.97265625e-01,  7.50732422e-03,
       -8.44726562e-02,  1.36718750e-01, -2.44140625e-03, -5.29785156e-02,
       -2.24609375e-02, -4.17480469e-02, -1.59179688e-01,  8.20312500e-02,
        7.95898438e-02,  2.50000000e-01,  4.97436523e-03, -6.00585938e-02,
       -1.22070312e-01,  9.81445312e-02, -9.09423828e-03,  1.26342773e-02,
        1.54296875e-01,  3.75976562e-02, -2.18505859e-02, -1.53808594e-02,
       -8.66699219e-03,  9.13085938e-02, -8.54492188e-02, -1.84326172e-02,
        4.07714844e-02, -7.22656250e-02, -3.17382812e-02, -3.61328125e-02,
        1.60156250e-01,  4.49218750e-02,  7.56835938e-03,  1.64062500e-01,
        1.77734375e-01, -7.61718750e-02,  2.15820312e-01, -6.17675781e-02,
        3.17382812e-02, -8.78906250e-02,  7.72094727e-03, -1.58691406e-02,
        1.17187500e-01, -1.57928467e-03, -1.24511719e-01,  1.28906250e-01,
       -1.43554688e-01, -

In [7]:
wv_from_bin.get_vector('miley')

array([-2.67578125e-01,  6.17675781e-02, -8.85009766e-03,  2.57812500e-01,
       -1.69921875e-01, -8.10546875e-02,  1.90429688e-01, -2.42187500e-01,
       -1.36718750e-02,  4.17480469e-02, -1.66992188e-01, -3.37890625e-01,
       -2.39257812e-01, -1.18652344e-01, -1.33789062e-01,  2.87109375e-01,
        5.41992188e-02,  2.53906250e-01, -2.87109375e-01,  1.29882812e-01,
       -8.78906250e-02, -9.17968750e-02,  3.26171875e-01, -2.69531250e-01,
       -2.77343750e-01, -3.02734375e-01, -8.05664062e-02, -1.25976562e-01,
        3.36914062e-02, -9.22851562e-02, -1.02050781e-01,  1.21459961e-02,
       -1.89453125e-01, -8.42285156e-03, -1.37695312e-01, -6.93359375e-02,
       -2.26020813e-04,  1.84570312e-01,  2.51953125e-01,  2.71484375e-01,
        6.83593750e-02, -3.28125000e-01,  5.03906250e-01,  1.10351562e-01,
        3.08593750e-01, -2.98828125e-01, -1.44042969e-02, -1.22558594e-01,
        2.47070312e-01,  7.12890625e-02, -1.25000000e-01,  1.76757812e-01,
        1.04003906e-01,  

## Contextualized, semi-self-trained embeddings
For option 2, we need to train a model that creates contextualized word embeddings. This can be done using biLSTM, GRU, RNN, etc. I think that it would be useful to look towards the ELMo paper for this as it contains some hints that might help us out here.

Things that will be useful here:
- dropout
- normalization (2d?)
- regularization
- lots of hidden units

In [8]:
uc_model = wv_from_bin

# Models

## Neural Net model for randomly-initialized Embeddings
I think that this is just what the tutorial is doing, except without the LSTM.

In [9]:
class RandomInitEmbeddings(nn.Module):
    def __init__(self, embedding_size, num_words, num_labels, dropout_p):
        super(RandomInitEmbeddings, self).__init__()
        self.embeddings = nn.Embedding(num_words, embedding_size, padding_idx=tag2idx['<PAD>'])
        self.hidden = nn.Linear(embedding_size + num_labels, 200)
        self.output = nn.Linear(200, num_labels)
        self.hidden_activation = nn.SiLU()
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, word, prev_label):
        x1 = self.embeddings(word)
        x2 = prev_label
        # print(x1.size(), x2.size())
        x = torch.concat([x1,x2], dim=1)
        # print(x.size())
        y = self.hidden_activation(self.hidden(x))
        y = self.softmax(self.output(y))
        return y

## Neural Net model for pre-trained Word2Vec Embeddings
We train a neural net model here that takes as input the embedding of the current word and the label of the previous word (+ and other hand-crafted features we think necessary). This model outputs a softmax normalized set of probabilities describing the propbabilities of each label respectively. We can then concatenate this probability vector to the probability table, then run viterbi.

Note that this model includes both the transition and emisison probabilities. This may be preferable, because we only need 1 model to describe the probability. Otherwise, we would need 2 models: 1 for transition, 1 for emission. Additionally, we would also need to optimize the 2 models by weighting the models using an additional hyperparameter. This means ultimately that the process is more complex.



In [42]:
class Word2VecEmbeddings(nn.Module):
    def __init__(self, embedding_size, num_labels, use_norm=True, dropout_p=0.2, hidden_neurons=200):
        super(Word2VecEmbeddings, self).__init__()
        self.hidden_1 = nn.Linear(embedding_size + num_labels, hidden_neurons)
        self.hidden_2 = nn.Linear(hidden_neurons, hidden_neurons)
        self.output = nn.Linear(hidden_neurons, num_labels)
        self.hidden_activation = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(p=dropout_p)
        if use_norm:
            self.hidden_norm = nn.BatchNorm1d(hidden_neurons)
            self.output_norm = nn.BatchNorm1d(num_labels)
        else:
            self.hidden_norm = nn.Identity(hidden_neurons)
            self.output_norm = nn.Identity(num_labels)

    def forward(self, embedding, prev_label):
        # print(embedding.size(), prev_label.size())
        x = torch.concat([embedding,prev_label], dim=1)
        # print(x.size())
        y = self.hidden_norm(self.dropout(self.hidden_activation(self.hidden_1(x))))
        # y = self.hidden)self.hidden_activation(self.hidden_2(y))
        y = self.softmax(self.output_norm(self.output(y)))
        return y

## Neural Net Model for Contextualized Embeddings
For this, we simply do what we did before, except we pass the entire sentence into the model. We do this to contextualize the word embeddings. The output becomes a sequence of predictions.

In [11]:
class ContextualizedEmbeddings(nn.Module):
    def __init__(self, embedding_size, num_words, num_labels):
        super(ContextualizedEmbeddings, self).__init__()
        self.embeddings = nn.Embedding(num_words, embedding_size, padding_idx=tag2idx['<PAD>'])
        self.hidden = nn.Linear(embedding_size + num_labels, embedding_size+num_labels)
        self.hidden_activation = nn.Tanh()
        self.output = nn.Linear(embedding_size+num_labels, num_labels)
        self.softmax = nn.Softmax(dim=2)
        self.dropout = nn.Dropout()
        self.contextualizer = nn.LSTM(embedding_size, embedding_size//2, batch_first=True, bidirectional=True, dropout=self.dropout.p)

    def forward(self, sent, prev_labels):
        x = self.embeddings(sent)
        # x2 = prev_labels
        # print(x1.size(), x2.size())
        # x = torch.concat([x1,x2], dim=2) # dim=1?
        y, _ = self.contextualizer(x)
        # print(x.size())
        y = torch.concat([y,prev_labels], dim=2)
        y = self.dropout(self.hidden_activation(self.hidden(y)))
        # y = self.hidden_activation(self.hidden_2(y))
        # y = self.hidden_activation(self.hidden(y))
        y = self.softmax(self.output(y))
        return y

# Hyperparameter Tuning

In [12]:
validation_results = pd.DataFrame(columns=['model', 'variable', 'epochs', 'batch_norm', 'dropout', 'hidden_neurons', 'hidden_layers', 'f1', 'type'])

In [13]:
def train(model: nn.Module, optimizer: optim.Optimizer, criterion: nn.CrossEntropyLoss, train_data: tuple, train_labels: torch.tensor):
    # train mode
    model.train()
    ###
    scores = model(*train_data)
    loss = criterion(scores, train_labels) # scores.view(...)?
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # print(loss.item())

def evaluate(model: nn.Module, eval_data: list, stats_fn):
    model.eval()
    with torch.no_grad():
        return stats_fn(model, eval_data)

def KFold(sents, sents_labels, preprocessor, model_factory, stats_fn, target_variable, debug=False, **kwargs):
    K=5
    n = len(sents) // K
    folds = [list(zip(sents[i:i+n], sents_labels[i:i+n])) for i in range(0, len(sents), n)]
    for k in range(K):
        tr = reduce(lambda x,y: x+y, folds[:k] + folds[k+1:])
        val_base = len(tr) // 10
        va = tr[:val_base]
        tr = tr[val_base:]
        te = folds[k]

        train_data, train_labels = preprocessor(tr)

        # model, optimizer, and loss function
        model = model_factory(**kwargs)
        optimizer = optim.Adam(list(model.parameters()))
        criterion = nn.CrossEntropyLoss()
        for i in range(1, kwargs['epochs']+1):
            train(model, optimizer, criterion, train_data, train_labels)
            if i == kwargs['epochs'] or i % 100 == 0:
                train_results = evaluate(model, tr, stats_fn)
                val_results = evaluate(model, va, stats_fn)
                test_results = evaluate(model, te, stats_fn)
                if debug:
                    print(train_results, val_results, test_results)
                # add to dataframe
                validation_results.loc[len(validation_results)] = pd.Series({'model': str(type(model)), 'variable': target_variable, 'type': 'train', 'epochs': i, 'batch_norm': kwargs['use_norm'], 'dropout': kwargs['dropout_p'], 'hidden_neurons': kwargs['hidden_neurons'], 'hidden_layers': kwargs['hidden_layers'], 'f1': train_results})
                validation_results.loc[len(validation_results)] = pd.Series({'model': str(type(model)), 'variable': target_variable, 'type': 'validation', 'epochs': i, 'batch_norm': kwargs['use_norm'], 'dropout': kwargs['dropout_p'], 'hidden_neurons': kwargs['hidden_neurons'], 'hidden_layers': kwargs['hidden_layers'], 'f1': val_results})
                validation_results.loc[len(validation_results)] = pd.Series({'model': str(type(model)), 'variable': target_variable, 'type': 'test', 'epochs': i, 'batch_norm': kwargs['use_norm'], 'dropout': kwargs['dropout_p'], 'hidden_neurons': kwargs['hidden_neurons'], 'hidden_layers': kwargs['hidden_layers'], 'f1': test_results})

In [14]:
train_sents, train_tags = main.load_data('data/twitter1_train.txt')
sents = add_start_tag(train_sents)
vocab2idx = get_vocab_idx(sents)
convert_to_idx(sents, vocab2idx)
train_data = pad_sents(sents)
labels = add_start_tag(train_tags)
convert_to_idx(labels, tag2idx)
train_labels = pad_sents(labels)

## Uncontextualized Model

### Functions

In [15]:
def uncontextualized_preprocessor(data):
    # have labeled, preprocessed sentences, now flatten out
    flattened_words = list()
    flattened_prev_label = list()
    flattened_labels = list()
    for i in range(len(data)):
        sent, sent_labels = data[i]
        for j in range(1, len(sent)):
            if sent_labels[j] == tag2idx['<PAD>']:
                continue
            flattened_words.append(sent[j])
            flattened_prev_label.append(onehot(sent_labels[j-1]))
            flattened_labels.append(onehot(sent_labels[j]))
    return (torch.tensor(flattened_words), torch.tensor(flattened_prev_label)), torch.tensor(flattened_labels)

def uncontextualized_model_factory(**kwargs):
    return RandomInitEmbeddings(kwargs['hidden_neurons'], len(vocab2idx)+1, len(tag2idx), kwargs['dropout_p'])

def uncontextualized_stats(model: nn.Module, eval_data: list):
    total_f1 = 0
    for i,tp in enumerate(eval_data):
        sent, sent_labels = tp
        probs = np.zeros((len(tag2idx), len(sent)))
        for j in range(1, len(sent)):
            prev_label = sent_labels[j-1]
            word = sent[j]
            preds = model(torch.tensor(word).view(1), torch.tensor(onehot(prev_label)).view(1,-1))
            probs[:,j] = preds.view(-1)
        solution, best_path = main.viterbi(probs[:,1:])
        precision, recall, f1 = calculate_metrics(np.array(sent_labels), best_path)
        total_f1 += f1
    return total_f1 / len(eval_data)
    

### Cross Validation

In [16]:
epochs = [1000]
for e in epochs:
    KFold(
        train_data, train_labels, 
        uncontextualized_preprocessor, uncontextualized_model_factory, uncontextualized_stats, 'epochs', debug=True,
        epochs=e, hidden_neurons=200, use_norm=False, dropout_p=0.0, hidden_layers=1,
    )

0.48846175903840605 0.2672549672549671 0.24991821942176565
0.679645832769885 0.3024236447313369 0.29072422192989583
0.7629985229391093 0.31308520539289764 0.2834997285351899
0.7980163093465008 0.32289909301743613 0.2946616459867676
0.8066116718157965 0.32520021070316923 0.2980814596902693
0.8122332152246019 0.3268438661929785 0.3003732411533833
0.816491973537858 0.3298306230258891 0.3022025780891032
0.819895480933604 0.3234344550912597 0.30666240772623776
0.8216359889118009 0.32549137253279253 0.3066323878380619
0.8237236510125946 0.3295616612184658 0.30883509713296975
0.5084647853997636 0.30990672706649025 0.27390640617591006
0.7036027490399899 0.31272051226802344 0.2824904081996285
0.766749181011498 0.29764331484979245 0.29719247061538995
0.7862638459628013 0.29630070783916923 0.2980432615371028
0.7942961179149685 0.3023869293100061 0.2950353028553852
0.8009507349110173 0.30580573272880945 0.3019541654480068
0.8052499266363413 0.3063974487051408 0.3039866227809494
0.8101736160642984 

In [17]:
neurons = [25, 50, 100, 200, 300, 400]
for n in neurons:
    KFold(
        train_data, train_labels, 
        uncontextualized_preprocessor, uncontextualized_model_factory, uncontextualized_stats, 'neurons', debug=True,
        epochs=1000, hidden_neurons=n, use_norm=False, dropout_p=0.0, hidden_layers=1,
    )

0.0 0.0 0.0
0.0 0.0 0.0
0.21231311549761975 0.10075608152531228 0.11801756163458284
0.44421165483651076 0.19929216083062232 0.2090017022641137
0.48477157861509995 0.20864690864690857 0.22336694038821703
0.53361677353798 0.2316909586140354 0.2452884126642993
0.6246507705470252 0.263690582921352 0.26840889606847074
0.6568669857901606 0.26941989634297314 0.2776185201717119
0.6694489164314552 0.2742765781227319 0.2847174102493254
0.6964348491877332 0.2809216424601039 0.29160385713577225
0.0 0.0 0.0
0.0 0.0 0.0
0.31013568761434884 0.17037843812991743 0.1899883672933318
0.48637224379180916 0.279299333145487 0.25767553959043327
0.5364719598632912 0.29520991828684134 0.27343785712580054
0.6146441642376494 0.29601296139757677 0.29822269002494306
0.6668873465216966 0.2880060110829341 0.30612751175955205
0.6822212858176304 0.3144876491030336 0.3149842447297602
0.7060351305352041 0.31873767258382624 0.3111483210773993
0.7169344034562827 0.3205135620115376 0.3138219673680669
0.0 0.0 0.0
0.092192727

In [18]:
dropouts = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
for d in dropouts:
    KFold(
        train_data, train_labels, 
        uncontextualized_preprocessor, uncontextualized_model_factory, uncontextualized_stats, 'neurons', debug=True,
        epochs=1000, hidden_neurons=300, use_norm=False, dropout_p=d, hidden_layers=1,
    )

0.5599169508153302 0.2616280826339996 0.27329453315268937
0.7456593404975632 0.28696129642875196 0.28672377392422216
0.7923768150608762 0.287446806973434 0.30553585231076863
0.8050348071016656 0.29681564326534715 0.3126452819110308
0.8112892078406366 0.29495885442630987 0.31920831976318015
0.8169883675122191 0.296017657556119 0.3174698953297203
0.8210670749724106 0.30211796750258274 0.3183470586182311
0.8274886755502817 0.3052703706549859 0.31993854567071095
0.8310922454184952 0.3000153692461383 0.32505881284771576
0.8344912575831018 0.30011868473406916 0.3269601292880393
0.5543667002062399 0.30698788391096066 0.2824460147155185
0.740033207386726 0.32318963088193847 0.26983163200309646
0.7888881898595689 0.3203343664882125 0.2832091404786442
0.8035284127594197 0.32171034094111 0.2898628755011736
0.8081678065635053 0.3298299990607681 0.29485249758508036
0.8153632430899025 0.32558467173851774 0.29485063137545436
0.8190721553700211 0.33007505315197605 0.2964008182933158
0.8221161768593309

## Word2Vec Model

### Functions

In [19]:
def word2vec_preprocessor(data):
    # have labeled, preprocessed sentences, now flatten out
    flattened_words = list()
    flattened_prev_label = list()
    flattened_labels = list()
    for i in range(len(data)):
        sent, sent_labels = data[i]
        for j in range(1, len(sent)):
            if sent_labels[j] == tag2idx['<PAD>']:
                continue
            fixed_word = re.sub('[^a-z]', '', str(sent[j]))
            flattened_words.append(wv_from_bin[fixed_word] if sent_labels[j] != tag2idx['<PAD>'] and fixed_word in wv_from_bin else np.zeros(wv_from_bin.vector_size, dtype=np.float32))
            flattened_prev_label.append(onehot(sent_labels[j-1]))
            flattened_labels.append(onehot(sent_labels[j]))
    return (torch.tensor(flattened_words), torch.tensor(flattened_prev_label)), torch.tensor(flattened_labels)

def word2vec_model_factory(**kwargs):
    return Word2VecEmbeddings(wv_from_bin.vector_size, len(tag2idx), use_norm=kwargs['use_norm'], dropout_p=kwargs['dropout_p'], hidden_neurons=kwargs['hidden_neurons'])

def word2vec_stats(model: nn.Module, eval_data: list):
    total_f1 = 0
    for i,tp in enumerate(eval_data):
        sent, sent_labels = tp
        probs = np.zeros((len(tag2idx), len(sent)))
        for j in range(1, len(sent)):
            prev_label = sent_labels[j-1]
            word = wv_from_bin[sent[j]] if sent_labels[j] != tag2idx['<PAD>'] and sent[j] in wv_from_bin else np.zeros(wv_from_bin.vector_size, dtype=np.float32)
            preds = model(torch.tensor(word).view(1,-1), torch.tensor(onehot(prev_label)).view(1,-1))
            probs[:,j] = preds.view(-1)
        solution, best_path = main.viterbi(probs[:,1:])
        precision, recall, f1 = calculate_metrics(np.array(sent_labels), best_path)
        total_f1 += f1
    return total_f1 / len(eval_data)
    

### Cross Validation

In [20]:
dropouts = [0.0, 0.2, 0.4, 0.6, 0.8]
for d in dropouts:
    KFold(
        train_data, train_labels, 
        word2vec_preprocessor, word2vec_model_factory, word2vec_stats, 'dropout', debug=True,
        epochs=250, hidden_neurons=200, use_norm=True, dropout_p=d, hidden_layers=1,
    )

  return (torch.tensor(flattened_words), torch.tensor(flattened_prev_label)), torch.tensor(flattened_labels)


0.0555408791078562 0.05558942655037997 0.06384860298821098
0.05125820528482197 0.05612617178425843 0.05269358834741267
0.052270287364919725 0.05692118068853517 0.05324090991261333
0.09364418725669414 0.09792291660955461 0.09841247991751839
0.10033815081194769 0.11088560088569571 0.10417063327693987
0.10072578188469783 0.11095573018659424 0.10463690468846455
0.10800099977101699 0.10215901094603777 0.10020053251163044
0.10218762138195162 0.10403105793341681 0.09821894263920047
0.10198767434478369 0.10452405575718536 0.09816234440954152
0.047072580563266846 0.052976415604279285 0.043775920886958586
0.03885719641905941 0.046160487700028625 0.0356965773409167
0.03871858111301141 0.04514799591830607 0.0356965773409167
0.04956305552367562 0.0518137436856239 0.042598501377777695
0.04142126529131868 0.041766896782646494 0.037184839295242716
0.0415152351173181 0.041766896782646494 0.036874107867490014
0.04047792772754618 0.044432651375796654 0.039749090922815554
0.037045381511196664 0.0366607060

In [21]:
norms = [True, False]
for n in norms:
    KFold(
        train_data, train_labels, 
        word2vec_preprocessor, word2vec_model_factory, word2vec_stats, 'batch_normalization', debug=True,
        epochs=250, hidden_neurons=200, use_norm=n, dropout_p=0.2, hidden_layers=1,
    )

0.040723860450927206 0.04338376867446872 0.04973288043976697
0.028896871218078945 0.023854957558012518 0.03519233181223002
0.027921882511422965 0.021944804597232897 0.03368323754489777
0.031893238953042705 0.03424729482933615 0.030151912345193505
0.0315568015443467 0.03676872618877868 0.03290938853998592
0.03173771692212779 0.036964824849890256 0.031243044719040283
0.05835387842215122 0.06113702041385716 0.06491441802171882
0.0495863937040576 0.04618112498126931 0.05028816608310517
0.04492481538000949 0.042982490542695595 0.04479772882615937
0.0609186748164504 0.06301944928193172 0.0564016226708883
0.057767708510317185 0.05882403172965593 0.0508850897623786
0.05408946194465976 0.05776370152731389 0.04734249258092995
0.05407478222178213 0.06256637511003195 0.051048030950193175
0.044987228109143435 0.0499974031396039 0.04007940975453004
0.04158706170352256 0.046299707642143084 0.038191280592452546
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0 0.0
0.0 0.0

In [44]:
epochs = [1200]
for e in epochs:
    KFold(
        train_data, train_labels, 
        word2vec_preprocessor, word2vec_model_factory, word2vec_stats, 'epochs', debug=True,
        epochs=e, hidden_neurons=200, use_norm=True, dropout_p=0.2, hidden_layers=6,
    )

0.06548844720836747 0.06118971493373407 0.06517614703183033
0.06553603725888099 0.05947090344872059 0.06136340174565695
0.05920438671402982 0.05471620552514824 0.056330764092806095
0.053548981123277765 0.043933148313724996 0.050411687506088695
0.05006846769185167 0.04261446699504368 0.04593528187205277
0.048394168309637775 0.04060075421210013 0.04396460414197655
0.04670373720821961 0.0378106978072068 0.044305901187467064
0.04508669480067466 0.03738185941244966 0.04393804368699698
0.04375376344631297 0.03548665316941733 0.04357728138261523
0.0418209074942821 0.033378373898562866 0.040629161037748954
0.04173705363671292 0.034501483511510396 0.041266649479764266
0.04097789514665518 0.03268232853816329 0.041474167967627464
0.046620655168214765 0.04497451003431869 0.048449571569203716
0.03835371615511263 0.04297076761952493 0.03901647467525388
0.04027285501056073 0.04162292560839257 0.04209899431881417
0.03876490802484768 0.04111170692071972 0.0403076789085208
0.03752821327011894 0.04180002

## Contextualized

### Word2Vec Contextualized Embeddings

In [23]:
class W2VContextualizedEmbeddings(nn.Module):
    def __init__(self, embedding_size, num_labels, dropout_p):
        super(W2VContextualizedEmbeddings, self).__init__()
        self.hidden = nn.Linear(embedding_size + num_labels, embedding_size+num_labels)
        self.hidden_activation = nn.ReLU()
        self.output = nn.Linear(embedding_size+num_labels, num_labels)
        self.softmax = nn.Softmax(dim=2)
        self.dropout = nn.Dropout(p=dropout_p)
        self.contextualizer = nn.LSTM(embedding_size, embedding_size//2, batch_first=True, bidirectional=True, dropout=self.dropout.p)

    def forward(self, sent, prev_labels):
        # x2 = prev_labels
        # print(x1.size(), x2.size())
        # x = torch.concat([x1,x2], dim=2) # dim=1?
        y, _ = self.contextualizer(sent)
        # print(x.size())
        y = torch.concat([y,prev_labels], dim=2)
        y = self.dropout(self.hidden_activation(self.hidden(y)))
        # y = self.hidden_activation(self.hidden_2(y))
        # y = self.hidden_activation(self.hidden(y))
        y = self.softmax(self.output(y))
        return y

### Functions

In [24]:
def contextualized_preprocessor(data):
    shifted_sents = list()
    shifted_labels = list()
    trimmed_labels = list()
    for i in range(len(data)):
        sent, sent_labels = data[i]
        # shifted_sent = [wv_from_bin[re.sub('[^a-z]', '', str(x))] if re.sub('[^a-z]', '', str(x)) in wv_from_bin else np.zeros(wv_from_bin.vector_size, dtype=np.float32) for x in sent[1:]]
        shifted_sent = sent[1:]
        shifted_label = [onehot(x) for x in sent_labels[1:]]
        trimmed_label = [onehot(x) for x in sent_labels[:len(sent_labels)-1]]
        shifted_sents.append(shifted_sent)
        shifted_labels.append(shifted_label)
        trimmed_labels.append(trimmed_label)

    return (torch.tensor(shifted_sents), torch.tensor(trimmed_labels)), torch.tensor(shifted_labels)

def contextualized_model_factory(**kwargs):
    return ContextualizedEmbeddings(kwargs['hidden_neurons'], len(vocab2idx)+1, len(tag2idx))

def contextualized_stats(model: nn.Module, eval_data: list):
    total_f1 = 0
    sent_data, sent_labels = contextualized_preprocessor(eval_data)
    preds = model(*sent_data)
    for i in range(len(eval_data)):
        # probs = model(torch.tensor(sent).view(1,-1), torch.tensor(labeled_sents[i-1]).view(1,len(sent),-1))[0,:,:].T.numpy()
        probs = preds[i,:,:].T.numpy()
        # print(probs)
        solution, best_path = main.viterbi(probs)
        labels = np.array(sent_labels[i]).argmax(axis=1)
        # print('====')
        # print([idx2tag[x] for x in labels])
        # print([idx2tag[x] for x in best_path])
        precision, recall, f1 = calculate_metrics(labels, best_path[1:])
        total_f1 += f1
    return total_f1 / len(eval_data)
    

### Cross Validation

In [25]:
epochs = [700]
for e in epochs:
    KFold(
        train_data, train_labels, 
        contextualized_preprocessor, contextualized_model_factory, contextualized_stats, 'epochs', debug=True,
        epochs=e, hidden_neurons=512, use_norm=True, dropout_p=0.2, hidden_layers=1,
    )



0.45886311449248357 0.22283827283827276 0.23880116088962688
0.7057582152161346 0.27101392825374715 0.27374497158931665
0.8535484614604766 0.2776414887274614 0.2697067970117618
0.8615631359229516 0.27454153341727294 0.26703017080572383
0.8682409022363058 0.25923205656933457 0.262312236561847
0.873491715627979 0.27072825268091527 0.2663272260378294
0.8752244069535793 0.27232494658530143 0.26864199821075757




0.560036858158985 0.27656466208502406 0.28558216975536893
0.773428634180439 0.2658308257855768 0.27213597408632895
0.8465154747492241 0.25710528787451853 0.26062667959831104
0.8570017772279178 0.2650551057790876 0.26757570370419786
0.8619542953452423 0.26241010121100605 0.26184054978381244
0.8630098276877085 0.2699363356603175 0.26507759765999256
0.8639290660330794 0.26942915053774774 0.26529853781626855




0.5584678673806366 0.2672989906021579 0.28209112925425006
0.7253257526820769 0.2772676041906809 0.2781108961960028
0.8432152717863048 0.28131868131868115 0.2795598424196674
0.854205011600733 0.2799232287223932 0.28461361333701785
0.860633189827992 0.280350494151399 0.2876040035614507
0.8657321047669043 0.2681327152176438 0.28462470022753744
0.8687293100129607 0.26782130543007665 0.2828125906140092




0.5503294012035507 0.28839356774579433 0.29854255935816243
0.7039575876759685 0.2693530160665191 0.2917907360335396
0.8370474556320984 0.2458610268565019 0.2802396308862728
0.8512289478831901 0.26637445008485716 0.29399399446333124
0.8529517381155012 0.2616022991588601 0.29141081457518686
0.8531706043882085 0.25913681592414606 0.292038162011045
0.8541555026153916 0.2571733947299557 0.2899658917389415




0.5703749510343541 0.27877986576211417 0.2717063209971013
0.7618431330705863 0.25151463471301516 0.2354284943876101
0.8473103651891645 0.2406581998920834 0.24140381551729076
0.868775800607325 0.2421046167231156 0.2320770717099463
0.8728685999069529 0.24650022111871997 0.2337319103276551
0.8761797339469115 0.2507361561238858 0.2413663431393929
0.8799408229345274 0.2487215041092338 0.24908346184941943


In [26]:
dropouts = [0.0, 0.2, 0.4, 0.6, 0.8]
for d in dropouts:
    KFold(
        train_data, train_labels, 
        contextualized_preprocessor, contextualized_model_factory, contextualized_stats, 'dropout', debug=True,
        epochs=600, hidden_neurons=512, use_norm=True, dropout_p=d, hidden_layers=2,
    )



0.5612278445949458 0.2776001776001775 0.3028267284268065
0.7540010930871696 0.2835161551137882 0.2636245012504813
0.8477001611325361 0.2495104172618965 0.2611740891256956
0.8609906958618863 0.26469129161436844 0.2656147627153054
0.8643857286397162 0.2724585736420055 0.2637214760702456
0.868381080336972 0.2892395244466249 0.26671274537357165




KeyboardInterrupt: 

## Graphs

In [27]:
validation_results

Unnamed: 0,model,variable,epochs,batch_norm,dropout,hidden_neurons,hidden_layers,f1,type
0,<class '__main__.RandomInitEmbeddings'>,epochs,100,False,0.0,200,1,0.488462,train
1,<class '__main__.RandomInitEmbeddings'>,epochs,100,False,0.0,200,1,0.267255,validation
2,<class '__main__.RandomInitEmbeddings'>,epochs,100,False,0.0,200,1,0.249918,test
3,<class '__main__.RandomInitEmbeddings'>,epochs,200,False,0.0,200,1,0.679646,train
4,<class '__main__.RandomInitEmbeddings'>,epochs,200,False,0.0,200,1,0.302424,validation
...,...,...,...,...,...,...,...,...,...
2683,<class '__main__.ContextualizedEmbeddings'>,dropout,500,True,0.0,512,2,0.272459,validation
2684,<class '__main__.ContextualizedEmbeddings'>,dropout,500,True,0.0,512,2,0.263721,test
2685,<class '__main__.ContextualizedEmbeddings'>,dropout,600,True,0.0,512,2,0.868381,train
2686,<class '__main__.ContextualizedEmbeddings'>,dropout,600,True,0.0,512,2,0.28924,validation


In [28]:
validation_results.to_csv('validation-results.csv')

In [48]:
validation_results.model = validation_results.model.replace({
    "<class '__main__.ContextualizedEmbeddings'>": 'Contextualized Embeddings',
    "<class '__main__.Word2VecEmbeddings'>": 'Word2Vec Embeddings',
    "<class '__main__.RandomInitEmbeddings'>": 'Randomly Initialized Embeddings'
})

validation_results

Unnamed: 0,model,variable,epochs,batch_norm,dropout,hidden_neurons,hidden_layers,f1,type
0,Randomly Initialized Embeddings,epochs,100,False,0.0,200,1,0.488462,train
1,Randomly Initialized Embeddings,epochs,100,False,0.0,200,1,0.267255,validation
2,Randomly Initialized Embeddings,epochs,100,False,0.0,200,1,0.249918,test
3,Randomly Initialized Embeddings,epochs,200,False,0.0,200,1,0.679646,train
4,Randomly Initialized Embeddings,epochs,200,False,0.0,200,1,0.302424,validation
...,...,...,...,...,...,...,...,...,...
2890,Word2Vec Embeddings,epochs,1100,True,0.2,200,6,0.035282,validation
2891,Word2Vec Embeddings,epochs,1100,True,0.2,200,6,0.037645,test
2892,Word2Vec Embeddings,epochs,1200,True,0.2,200,6,0.037526,train
2893,Word2Vec Embeddings,epochs,1200,True,0.2,200,6,0.036029,validation


In [30]:
import altair as alt

In [31]:
line = alt.Chart(validation_results[(validation_results.model == 'Randomly Initialized Embeddings') & (validation_results.hidden_neurons == 200) & (validation_results.dropout == 0.0)]).mark_line().encode(
    x=alt.X('epochs'),
    y=alt.Y('mean(f1)'),
    color=alt.Color('type')
)

band = alt.Chart(validation_results[(validation_results.model == 'Randomly Initialized Embeddings') & (validation_results.hidden_neurons == 200) & (validation_results.dropout == 0.0)]).mark_errorband(extent='ci').encode(
    x=alt.X('epochs', title='Epochs'),
    y=alt.Y('f1', title='F1 Score'),
    color=alt.Color('type')
)

(line + band).properties(title='Randomly Initialized Embeddings Loss Curve (K=5)')

In [49]:
line = alt.Chart(validation_results[(validation_results.model == 'Word2Vec Embeddings') & (validation_results.hidden_layers == 6)]).mark_line().encode(
    x=alt.X('epochs'),
    y=alt.Y('mean(f1)'),
    color=alt.Color('type')
)

band = alt.Chart(validation_results[(validation_results.model == 'Word2Vec Embeddings') & (validation_results.hidden_layers == 6)]).mark_errorband(extent='ci').encode(
    x=alt.X('epochs', title='Epochs'),
    y=alt.Y('f1', title='F1 Score'),
    color=alt.Color('type')
)

(line + band).properties(title='Word2Vec Embeddings Loss Curve (K=5)')

In [33]:
line = alt.Chart(validation_results[(validation_results.model == 'Contextualized Embeddings') & (validation_results.hidden_layers == 1)]).mark_line().encode(
    x=alt.X('epochs'),
    y=alt.Y('mean(f1)'),
    color=alt.Color('type')
)

band = alt.Chart(validation_results[(validation_results.model == 'Contextualized Embeddings') & (validation_results.hidden_layers == 1)]).mark_errorband(extent='ci').encode(
    x=alt.X('epochs', title='Epochs'),
    y=alt.Y('f1', title='F1 Score'),
    color=alt.Color('type')
)

(line + band).properties(title='Contextualized Embeddings Loss Curve (K=5)')

In [34]:
line = alt.Chart(validation_results[(validation_results.model == 'Randomly Initialized Embeddings') & (validation_results.epochs == 1000) & (validation_results.dropout == 0.0)]).mark_line().encode(
    x=alt.X('hidden_neurons'),
    y=alt.Y('mean(f1)'),
    color=alt.Color('type')
)

band = alt.Chart(validation_results[(validation_results.model == 'Randomly Initialized Embeddings') & (validation_results.epochs == 1000) & (validation_results.dropout == 0.0)]).mark_errorband(extent='ci').encode(
    x=alt.X('hidden_neurons', title='Layer Size'),
    y=alt.Y('f1', title='F1 Score'),
    color=alt.Color('type')
)

(line + band).properties(title='Randomly Initialized Embeddings Layer Size (K=5)')

In [37]:
line = alt.Chart(validation_results[(validation_results.model == 'Randomly Initialized Embeddings') & (validation_results.hidden_neurons == 300) & (validation_results.epochs == 1000)]).mark_line().encode(
    x=alt.X('dropout'),
    y=alt.Y('mean(f1)'),
    color=alt.Color('type')
)

band = alt.Chart(validation_results[(validation_results.model == 'Randomly Initialized Embeddings') & (validation_results.hidden_neurons == 300) & (validation_results.epochs == 1000)]).mark_errorband(extent='ci').encode(
    x=alt.X('dropout', title='Dropout'),
    y=alt.Y('f1', title='F1 Score'),
    color=alt.Color('type')
)

(line + band).properties(title='Randomly Initialized Embeddings Dropout (K=5)')

In [35]:
line = alt.Chart(validation_results[(validation_results.model == 'Word2Vec Embeddings') & (validation_results.epochs == 250) & (validation_results.batch_norm == True) & (validation_results.hidden_neurons == 200)]).mark_line().encode(
    x=alt.X('dropout'),
    y=alt.Y('mean(f1)'),
    color=alt.Color('type')
)

band = alt.Chart(validation_results[(validation_results.model == 'Word2Vec Embeddings') & (validation_results.epochs == 250) & (validation_results.batch_norm == True) & (validation_results.hidden_neurons == 200)]).mark_errorband(extent='ci').encode(
    x=alt.X('dropout', title='Dropout'),
    y=alt.Y('f1', title='F1 Score'),
    color=alt.Color('type')
)

(line + band).properties(title='Word2Vec Embeddings Loss Dropout (K=5)')