In [10]:
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F

In [68]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Load the raw training file for ad-hoc inspection. Decode explicitly as
# UTF-8: the corpus contains non-ASCII characters (e.g. curly quotes), and the
# platform default encoding is not guaranteed to handle them.
with open('data/twpos-train.tsv', encoding='utf-8') as f:
    text = f.read()

In [19]:
# Split the raw text on blank lines. Despite the name, each element of `lines`
# is a multi-line chunk holding one tweet's 'token\ttag' rows.
lines = text.split('\n\n')

In [42]:
# Peek at the second-to-last chunk to see the raw 'token\ttag' row format.
lines[-2]

'“\t,\n<@MENTION>\t@\n:\t,\nnow\tR\nfollowing\tV\n<@MENTION>\t@\n”\t,\nlmao\t!\nare\tV\nyou\tO\nhigh\tA\n?\t,'

In [20]:
# Break the first chunk into its individual 'token\ttag' rows.
words = lines[0].split('\n')

In [21]:
# First row of the first chunk: token and tag separated by a tab.
words[0]

'i\tO'

In [22]:
# Show every 'token\ttag' row of the first chunk.
words

['i\tO',
 'predict\tV',
 'i\tO',
 "won't\tV",
 'win\tV',
 'a\tD',
 'single\tA',
 'game\tN',
 'i\tO',
 'bet\tV',
 'on\tP',
 '.\t,',
 'got\tV',
 'cliff\t^',
 'lee\t^',
 'today\tN',
 ',\t,',
 'so\tP',
 'if\tP',
 'he\tO',
 'loses\tV',
 'its\tL',
 'on\tP',
 'me\tO',
 'rt\t~',
 '<@MENTION>\t@',
 ':\t~',
 'texas\t^',
 '(\t,',
 'cont\t~',
 ')\t,',
 '<URL-tl.gd>\tU']

In [61]:
def read_corpus(file):
    """Read a tagged corpus stored as tab-separated token/label rows.

    The file is expected to contain one ``token<TAB>label`` pair per line,
    with sentences separated from each other by an empty line.

    Args:
        file: Path of the corpus file to read.

    Returns:
        A 4-tuple ``(data, labels, vocab, label_set)``:
        ``data`` is a list of token lists (one per sentence), ``labels`` is
        the parallel list of label lists, ``vocab`` is the set of all tokens
        seen and ``label_set`` is the set of all labels seen.

    Raises:
        ValueError: If a non-empty row does not consist of exactly two
            tab-separated fields.
    """
    # Decode explicitly as UTF-8 so that non-ASCII tokens (e.g. curly quotes)
    # are read identically regardless of the platform's default encoding.
    with open(file, 'rt', encoding='utf-8') as f:
        text = f.read()

    data = []
    labels = []
    vocab = set()
    label_set = set()
    for line in text.split('\n\n'):
        curr_data = []
        curr_labels = []
        for token_label_str in line.split('\n'):
            if not token_label_str:
                continue
            token, label = token_label_str.split('\t')
            vocab.add(token)
            label_set.add(label)
            curr_data.append(token)
            curr_labels.append(label)
        # A chunk made only of newlines (e.g. from extra trailing blank lines)
        # carries no rows; do not record it as an empty sentence.
        if not curr_data:
            continue
        data.append(curr_data)
        labels.append(curr_labels)
    return data, labels, vocab, label_set

In [62]:
# Parse each split into (sentences, per-sentence labels, token set, label set).
train, train_labels, train_vocab, train_label_set = read_corpus('data/twpos-train.tsv')
dev, dev_labels, dev_vocab, dev_label_set = read_corpus('data/twpos-dev.tsv')
devtest, devtest_labels, devtest_vocab, devtest_label_set = read_corpus('data/twpos-devtest.tsv')

In [50]:
# Combined vocabulary over all three splits. The union builds a new set, so
# the per-split vocabularies are left untouched.
vocab = train_vocab | dev_vocab | devtest_vocab

In [63]:
# Every tag that occurs in any split, collected into a fresh set.
label_set = set().union(train_label_set, dev_label_set, devtest_label_set)

In [49]:
# Number of sentences in each split.
len(train), len(dev), len(devtest)

(1173, 327, 327)

In [64]:
# Number of distinct tags across all splits.
len(label_set)

25

In [51]:
# Vocabulary size per split, followed by the size of the combined vocabulary.
len(train_vocab), len(dev_vocab), len(devtest_vocab), len(vocab)

(4420, 1750, 1705, 5989)

In [65]:
# Sort the tags so that their order (and hence any index derived from it) is
# deterministic; iteration order of a set is not.
all_labels = sorted(label_set)

In [66]:
# Show the sorted tag inventory.
all_labels

['!',
 '#',
 '$',
 '&',
 ',',
 '@',
 'A',
 'D',
 'E',
 'G',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'U',
 'V',
 'X',
 'Y',
 'Z',
 '^',
 '~']

In [69]:
# Number of distinct tags in the training split alone, for comparison with
# len(label_set) above.
len(train_label_set)

25

In [52]:
# Index -> word lookup: a sorted list, so a word's position is its index.
idx2word_map = sorted(vocab)

In [54]:
# Spot-check a small slice of the index -> word list.
idx2word_map[100:110]

['#drake',
 '#dui',
 '#dumb',
 '#dumbniggathere',
 '#ebay',
 '#econdev',
 '#education',
 '#elearning',
 '#endtimes',
 '#englishmajors']

In [55]:
# Word -> index lookup: pair every word with its position in idx2word_map.
word2idx_map = dict(zip(idx2word_map, range(len(idx2word_map))))

In [56]:
# Sanity check: look up the index assigned to a known word.
word2idx_map['hello']

2811

In [60]:
# Total number of tokens in each split (train, dev, devtest).
tuple(
    sum(len(sentence) for sentence in split)
    for split in (train, dev, devtest)
)

(17130, 4821, 4639)

In [58]:
# Tokens of the first training sentence.
train[0]

['i',
 'predict',
 'i',
 "won't",
 'win',
 'a',
 'single',
 'game',
 'i',
 'bet',
 'on',
 '.',
 'got',
 'cliff',
 'lee',
 'today',
 ',',
 'so',
 'if',
 'he',
 'loses',
 'its',
 'on',
 'me',
 'rt',
 '<@MENTION>',
 ':',
 'texas',
 '(',
 'cont',
 ')',
 '<URL-tl.gd>']

In [67]:
# Tags of the first training sentence, parallel to train[0].
train_labels[0]

['O',
 'V',
 'O',
 'V',
 'V',
 'D',
 'A',
 'N',
 'O',
 'V',
 'P',
 ',',
 'V',
 '^',
 '^',
 'N',
 ',',
 'P',
 'P',
 'O',
 'V',
 'L',
 'P',
 'O',
 '~',
 '@',
 '~',
 '^',
 ',',
 '~',
 ',',
 'U']

In [70]:
# Wrap each tag of the first sentence in its own one-element list, giving the
# list-of-rows (one column per row) layout that is passed to the encoder below.
temp = [[elm] for elm in train_labels[0]]

In [84]:
# Pin the encoder's categories to the full sorted tag list so that tags which
# do not occur in this particular sentence still get a column.
enc = OneHotEncoder(categories=[all_labels])
# One row per tag of the first sentence; `.toarray()` is used below to view it
# as a dense array.
mat = enc.fit_transform(temp)

In [85]:
# Confirm that the fitted encoder uses the tag list in all_labels order.
enc.categories_

[array(['!', '#', '$', '&', ',', '@', 'A', 'D', 'E', 'G', 'L', 'M', 'N',
        'O', 'P', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z', '^', '~'],
       dtype=object)]

In [86]:
# Densify and print the one-hot matrix for the first sentence's tags.
print(mat.toarray())

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [87]:
class FeedForwardTagger(nn.Module):
    """Feed-forward tagger over a fixed-size window of word indices.

    The indices of the ``2 * window_size + 1`` words in a window are embedded,
    the embeddings are concatenated, passed through one tanh hidden layer of
    128 units, and projected to ``output_dim`` log-probabilities.

    Args:
        vocab_size: Number of rows of the embedding table. Only used when
            ``pretrained_emb`` is not given.
        window_size: Number of context words on each side of the centre word.
        output_dim: Number of output classes.
        emb_dim: Embedding width. Only used when ``pretrained_emb`` is not
            given; otherwise the width of ``pretrained_emb`` is used.
        pretrained_emb: Optional 2-D float tensor of shape
            ``(num_embeddings, embedding_dim)`` used to initialise the
            embedding table.
        freeze: If true, the embedding weights are excluded from gradient
            updates.
    """

    def __init__(self, vocab_size, window_size, output_dim,
                 emb_dim=50, pretrained_emb=None, freeze=False):
        super(FeedForwardTagger, self).__init__()
        # Compare against None explicitly: the truth value of a tensor with
        # more than one element is ambiguous and raises.
        if pretrained_emb is not None:
            self.emb = nn.Embedding.from_pretrained(pretrained_emb,
                                                    freeze=freeze)
        else:
            self.emb = nn.Embedding(vocab_size, emb_dim)
            self.emb.weight.requires_grad = not freeze
        # Take the width from the embedding actually built, so a pretrained
        # table whose width differs from `emb_dim` still matches fc1.
        input_dim = (2 * window_size + 1) * self.emb.embedding_dim
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, inputs):
        """Return log-probabilities over the output classes.

        Args:
            inputs: Integer tensor of word indices, either a single window of
                ``2 * window_size + 1`` indices or a batch of such windows
                with shape ``(batch, 2 * window_size + 1)``.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with log-probabilities;
            ``batch`` is 1 for a single window.
        """
        # Flatten each window's embeddings into one row. Using fc1's input
        # width (rather than a fixed leading 1) keeps single-window inputs
        # working and also accepts batches of windows.
        embeds = self.emb(inputs).view(-1, self.fc1.in_features)
        out = self.fc1(embeds).tanh()
        out = self.fc2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs