In [10]:
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F

In [68]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Read the raw training file in one go for interactive inspection.
# The encoding is pinned to UTF-8 because the corpus contains non-ASCII tokens
# (e.g. curly quotes); relying on the platform default would make decoding
# locale-dependent.
with open('data/twpos-train.tsv', encoding='utf-8') as f:
    text = f.read()

In [19]:
# Split the raw text on blank lines; each resulting chunk holds the
# newline-separated "token<TAB>tag" rows of one sentence (tweet).
lines = text.split('\n\n')

In [42]:
lines[-2]

'“\t,\n<@MENTION>\t@\n:\t,\nnow\tR\nfollowing\tV\n<@MENTION>\t@\n”\t,\nlmao\t!\nare\tV\nyou\tO\nhigh\tA\n?\t,'

In [20]:
# Break the first chunk into its individual "token<TAB>tag" rows.
words = lines[0].split('\n')

In [21]:
words[0]

'i\tO'

In [22]:
words

['i\tO',
 'predict\tV',
 'i\tO',
 "won't\tV",
 'win\tV',
 'a\tD',
 'single\tA',
 'game\tN',
 'i\tO',
 'bet\tV',
 'on\tP',
 '.\t,',
 'got\tV',
 'cliff\t^',
 'lee\t^',
 'today\tN',
 ',\t,',
 'so\tP',
 'if\tP',
 'he\tO',
 'loses\tV',
 'its\tL',
 'on\tP',
 'me\tO',
 'rt\t~',
 '<@MENTION>\t@',
 ':\t~',
 'texas\t^',
 '(\t,',
 'cont\t~',
 ')\t,',
 '<URL-tl.gd>\tU']

In [61]:
def read_corpus(file):
    """Read a tagged corpus in "token<TAB>label" format.

    Sentences are separated by blank lines; every non-blank row holds exactly
    one token and one label separated by a single tab.

    Args:
        file: Path of the TSV file to read (decoded as UTF-8).

    Returns:
        A 4-tuple ``(data, labels, vocab, label_set)`` where ``data`` is a list
        of token lists (one per sentence), ``labels`` is the parallel list of
        label lists, ``vocab`` is the set of all tokens seen and ``label_set``
        is the set of all labels seen.

    Raises:
        ValueError: If a non-blank row does not consist of exactly two
            tab-separated fields.
    """
    # Pin the encoding: the corpus contains non-ASCII tokens, so the platform
    # default must not decide how the file is decoded.
    with open(file, 'rt', encoding='utf-8') as f:
        text = f.read()

    data = []
    labels = []
    vocab = set()
    label_set = set()
    for chunk in text.split('\n\n'):
        curr_data = []
        curr_labels = []
        for row in chunk.split('\n'):
            if not row.strip():
                continue
            fields = row.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    "expected 'token<TAB>label', got %r" % (row,))
            token, label = fields
            vocab.add(token)
            label_set.add(label)
            curr_data.append(token)
            curr_labels.append(label)
        # A chunk made only of newlines (e.g. a file ending in three or more
        # newlines) yields no tokens; do not record it as an empty sentence.
        if curr_data:
            data.append(curr_data)
            labels.append(curr_labels)
    return data, labels, vocab, label_set

In [62]:
# Load the three corpus splits. Each call returns, in order: the tokenised
# sentences, the parallel tag sequences, the set of tokens and the set of tags
# found in that file.
train, train_labels, train_vocab, train_label_set = read_corpus('data/twpos-train.tsv')
dev, dev_labels, dev_vocab, dev_label_set = read_corpus('data/twpos-dev.tsv')
devtest, devtest_labels, devtest_vocab, devtest_label_set = read_corpus('data/twpos-devtest.tsv')

In [50]:
# Combined vocabulary: every token that occurs in any of the three splits.
# Built as a fresh set so the per-split vocabularies stay untouched.
vocab = set().union(train_vocab, dev_vocab, devtest_vocab)

In [63]:
# Combined tag inventory: every tag that occurs in any of the three splits.
# Built as a fresh set so the per-split tag sets stay untouched.
label_set = set().union(train_label_set, dev_label_set, devtest_label_set)

In [49]:
len(train), len(dev), len(devtest)

(1173, 327, 327)

In [64]:
len(label_set)

25

In [51]:
len(train_vocab), len(dev_vocab), len(devtest_vocab), len(vocab)

(4420, 1750, 1705, 5989)

In [65]:
# Sets have no defined order; sorting gives a stable, reproducible tag order.
all_labels = sorted(label_set)

In [66]:
all_labels

['!',
 '#',
 '$',
 '&',
 ',',
 '@',
 'A',
 'D',
 'E',
 'G',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'U',
 'V',
 'X',
 'Y',
 'Z',
 '^',
 '~']

In [69]:
len(train_label_set)

25

In [52]:
# Index -> token lookup: the position of a token in this sorted list is its id.
idx2word_map = sorted(vocab)

In [54]:
idx2word_map[100:110]

['#drake',
 '#dui',
 '#dumb',
 '#dumbniggathere',
 '#ebay',
 '#econdev',
 '#education',
 '#elearning',
 '#endtimes',
 '#englishmajors']

In [55]:
# Token -> index lookup, the inverse of idx2word_map.
word2idx_map = dict(zip(idx2word_map, range(len(idx2word_map))))

In [56]:
word2idx_map['hello']

2811

In [60]:
# Total number of tokens in each split, in the order (train, dev, devtest).
tuple(sum(map(len, split)) for split in (train, dev, devtest))

(17130, 4821, 4639)

In [58]:
train[0]

['i',
 'predict',
 'i',
 "won't",
 'win',
 'a',
 'single',
 'game',
 'i',
 'bet',
 'on',
 '.',
 'got',
 'cliff',
 'lee',
 'today',
 ',',
 'so',
 'if',
 'he',
 'loses',
 'its',
 'on',
 'me',
 'rt',
 '<@MENTION>',
 ':',
 'texas',
 '(',
 'cont',
 ')',
 '<URL-tl.gd>']

In [67]:
train_labels[0]

['O',
 'V',
 'O',
 'V',
 'V',
 'D',
 'A',
 'N',
 'O',
 'V',
 'P',
 ',',
 'V',
 '^',
 '^',
 'N',
 ',',
 'P',
 'P',
 'O',
 'V',
 'L',
 'P',
 'O',
 '~',
 '@',
 '~',
 '^',
 ',',
 '~',
 ',',
 'U']

In [70]:
# Wrap every tag of the first training sentence in its own single-element
# list, giving one row per token and one column (the tag) for the encoder.
temp = [[elm] for elm in train_labels[0]]

In [84]:
# Pin the encoder's categories to the full sorted tag list so the output has
# one column per entry of all_labels, in that order, regardless of which tags
# happen to occur in the data being fitted.
enc = OneHotEncoder(categories=[all_labels])
# One one-hot row per token of the sentence held in `temp`.
mat = enc.fit_transform(temp)

In [85]:
enc.categories_

[array(['!', '#', '$', '&', ',', '@', 'A', 'D', 'E', 'G', 'L', 'M', 'N',
        'O', 'P', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z', '^', '~'],
       dtype=object)]

In [86]:
print(mat.toarray())

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [87]:
class FeedForwardTagger(nn.Module):
    """Window-based feed-forward tagger.

    The token indices of a context window are embedded, the embeddings are
    concatenated, passed through one tanh hidden layer and projected to
    per-label log-probabilities.

    Args:
        vocab_size: Number of rows of the embedding table (ignored when
            ``pretrained_emb`` is given).
        window_size: Number of context tokens on each side of the centre
            token; the model consumes ``2 * window_size + 1`` indices.
        output_dim: Number of labels.
        emb_dim: Embedding size for a randomly initialised table. When
            ``pretrained_emb`` is given, its second dimension is used instead.
        pretrained_emb: Optional 2-D float tensor ``(vocab, dim)`` used to
            initialise the embedding table.
        freeze: If true, the embedding weights are excluded from training.
        hidden_dim: Width of the hidden layer.
    """

    def __init__(self, vocab_size, window_size, output_dim,
                 emb_dim=50, pretrained_emb=None, freeze=False,
                 hidden_dim=128):
        super().__init__()
        # Compare against None: truth-testing a multi-element tensor raises.
        if pretrained_emb is not None:
            # from_pretrained freezes by default; pass the flag explicitly so
            # the constructor argument decides.
            self.emb = nn.Embedding.from_pretrained(pretrained_emb,
                                                    freeze=freeze)
            # The pretrained matrix dictates the embedding width.
            emb_dim = self.emb.embedding_dim
        else:
            self.emb = nn.Embedding(vocab_size, emb_dim)
            self.emb.weight.requires_grad = not freeze
        input_dim = (2 * window_size + 1) * emb_dim
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, inputs):
        """Return log-probabilities over labels.

        Args:
            inputs: Integer tensor of token indices, either a single window of
                shape ``(2 * window_size + 1,)`` or a batch of windows of
                shape ``(batch, 2 * window_size + 1)``.

        Returns:
            Tensor of shape ``(batch, output_dim)``; ``batch`` is 1 for a
            single unbatched window.
        """
        embeds = self.emb(inputs)
        # Concatenate the window's embeddings per example; an unbatched window
        # is treated as a batch of one.
        rows = inputs.size(0) if inputs.dim() > 1 else 1
        flat = embeds.view(rows, -1)
        hidden = self.fc1(flat).tanh()
        return F.log_softmax(self.fc2(hidden), dim=1)

In [None]:
# Scratch cell retired: `np.read` is not a NumPy attribute and would raise
# AttributeError on Restart & Run All. The embeddings file is loaded with
# pandas.read_csv further down.

In [88]:
!head -n20 data/twitter-embeddings.txt 

</s> 0.008005 0.008839 -0.007661 -0.006556 0.002733 0.006042 0.001882 0.000423 -0.007207 0.004437 -0.008713 0.002499 -0.001503 -0.001914 -0.006631 -0.003764 0.005159 0.006051 0.005938 0.003195 0.003090 -0.007605 -0.008192 0.009939 0.007603 0.006180 -0.001208 0.008031 -0.000990 0.001469 -0.000298 -0.005966 0.002625 -0.002675 -0.007651 0.009508 0.008759 -0.002190 -0.000452 0.001018 -0.007275 -0.008014 0.009109 0.000126 -0.005165 -0.006084 -0.006153 0.003394 0.000403 0.002662
. 0.207125 -0.031345 0.091379 0.096409 -0.131985 0.144753 0.026806 -0.175994 0.048449 0.081937 0.080954 -0.041135 -0.304135 0.055592 0.147729 -0.079038 0.295870 0.253811 -0.181798 0.248076 0.053270 0.209448 -0.219176 0.178508 0.102624 0.087713 0.233860 0.101264 0.040922 -0.127173 -0.236495 0.106080 -0.306207 0.138877 0.152549 0.144254 -0.044860 0.104159 0.041576 -0.202435 0.077845 -0.081174 -0.220780 -0.122013 -0.297300 0.247084 0.324700 0.155697 -0.067304 -0.025445
<@MENTION> -0.321289 0.050717 0.217663 0.130636 0.2

In [93]:
# Load the space-separated embeddings: column 0 is the token, the remaining
# columns are the vector components.
# - quoting=3 (csv.QUOTE_NONE): tokens such as `"` are literal words, not
#   field quotes; default quoting would swallow or merge rows.
# - na_filter=False: keeps tokens like "nan", "null" or "n/a" as strings
#   instead of converting them to NaN.
stuff = pd.read_csv(
    'data/twitter-embeddings.txt',
    delimiter=' ',
    header=None,
    quoting=3,
    na_filter=False,
)

In [94]:
stuff

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,50
0,</s>,0.008005,0.008839,-0.007661,-0.006556,0.002733,0.006042,0.001882,0.000423,-0.007207,...,-0.007275,-0.008014,0.009109,0.000126,-0.005165,-0.006084,-0.006153,0.003394,0.000403,0.002662
1,.,0.207125,-0.031345,0.091379,0.096409,-0.131985,0.144753,0.026806,-0.175994,0.048449,...,0.077845,-0.081174,-0.220780,-0.122013,-0.297300,0.247084,0.324700,0.155697,-0.067304,-0.025445
2,<@MENTION>,-0.321289,0.050717,0.217663,0.130636,0.220566,-0.221929,-0.009311,0.241410,0.420372,...,-0.196051,-0.650321,0.229833,-0.056008,-0.589202,0.037581,0.197169,-0.160581,0.057263,0.384165
3,the,0.234545,0.005101,0.228226,0.180391,0.134255,0.097655,0.077292,0.102930,0.253679,...,0.238178,-0.367623,0.009977,-0.245449,-0.156343,0.175603,-0.200755,0.089917,-0.050858,0.064103
4,i,-0.065632,-0.022718,0.058576,0.560731,0.154797,0.307549,0.063307,-0.363784,-0.133096,...,-0.482019,-0.531158,-0.545102,0.219512,-0.231775,0.191794,0.101574,-0.262040,0.026996,-0.015891
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19439,nurture,0.204274,0.164465,-0.109088,0.008565,-0.017280,0.128900,0.306388,-0.110153,-0.409500,...,0.151075,-0.173320,0.169558,-0.057272,-0.667943,-0.516890,0.430046,0.418442,-0.074517,-0.217384
19440,quotation,-0.052569,-0.284374,0.339601,-0.435571,-0.337225,-0.354862,0.075978,-0.327963,0.094719,...,-0.077608,-0.066172,0.125516,-0.159479,0.430853,-0.173255,0.292680,0.732749,-0.097984,-0.392376
19441,a5,-0.861113,0.340245,0.160136,-0.285174,0.249593,-0.123985,0.155226,0.512830,-0.140564,...,0.215618,0.437744,0.396084,0.076783,-0.034082,0.142286,-0.148950,0.401117,-0.371847,-0.407417
19442,malu,-0.105215,-0.278582,0.499606,-0.240102,0.115366,-0.423914,-0.456027,0.434088,0.235833,...,-0.445639,-0.095685,0.306776,0.139477,-0.130772,-0.350481,0.445511,-0.255516,-0.221685,0.517086
