In [148]:
import logging
import random
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import string
import re
import gensim
from string import punctuation
import torch.autograd as autograd
import torch.utils.data as Data
from torchtext.vocab import Vectors
import gensim.downloader as api
import torch.nn.functional as F

In [2]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [91]:
def prepare_seq(x):
    X = [[w2v[i] for i in sent if i in w2v] for sent in x]
    # X = [[i for i in sent if i in w2v] for sent in x]
    return X

In [4]:
def prep_data(x, Y):
    x = prepare_seq(list(x))
    Y = list(Y)
    return [(x, y) for x,y in zip(x, Y)]

In [5]:
def get_vocab_size(sentences):
    l = set()
    for i in sentences:
        for x in i:
            l.add(x)
    
    return len(l)

In [6]:
def build_token_to_ix(sentences):
    token_to_ix = dict()
    print(len(sentences))
    for sent in sentences:
        for token in sent:# .split(' '):
            if token not in token_to_ix:
                token_to_ix[token] = len(token_to_ix)
    token_to_ix['<pad>'] = len(token_to_ix)
    return token_to_ix

In [7]:
def build_label_to_ix(labels):
    label_to_ix = dict()
    for label in labels:
        if label not in label_to_ix:
            label_to_ix[label] = len(label_to_ix)
    
    label_to_ix

In [8]:
def remove_infrequent_words(sents):
	word_counts = {}
	for s in sents:
		for w in s:
			if w in word_counts:
				word_counts[w] += 1
			else:
				word_counts[w] = 1

	threshold = 2
	filtered_sents = []
	for s in sents:
		new_s = []
		for w in s:
			if word_counts[w] < threshold:
				new_s.append('<UNKOWN>')
			else:
				new_s.append(w)
		filtered_sents.append(new_s)
	return filtered_sents

In [9]:
def restrict_w2v(w2v, restricted_word_set):
    new_vectors = []
    new_vocab = {}
    new_index2entity = []
    # new_vectors_norm = []

    for i in range(len(w2v.vocab)):
        word = w2v.index2entity[i]
        vec = w2v.vectors[i]
        vocab = w2v.vocab[word]
        # vec_norm = w2v.vectors_norm[i]
        if word in restricted_word_set:
            vocab.index = len(new_index2entity)
            new_index2entity.append(word)
            new_vocab[word] = vocab
            new_vectors.append(vec)
            # new_vectors_norm.append(vec_norm)

    w2v.vocab = new_vocab
    w2v.vectors = new_vectors
    w2v.index2entity = new_index2entity
    w2v.index2word = new_index2entity
    # w2v.vectors_norm = new_vectors_norm

# read data

In [286]:
df = pd.read_csv("processed_request_corpus.csv").drop("Unnamed: 0", axis=1)

In [287]:
df.head(1)

Unnamed: 0,r_id,r_text,date_submitted,submission_method,receiving_department,assigned_pro,documents_released_to_requester_details,documents_released_details,request_published,request_closed,documents_released_to_requester,documents_released,department_assignment_details,request_closed_hide,request_reopened,request_opened,department_assignment,close_date,doc_released
0,19-5614,This firm is performing a Phase I Environmenta...,"November 21, 2019",web,Fire-Rescue,Angela Laurita,,,,Duplicate request This request was already ent...,,,,,,Request received via web,Fire-Rescue,Request Closed Public November 21 2019,0


In [288]:
print(list(df))

['r_id', 'r_text', 'date_submitted', 'submission_method', 'receiving_department', 'assigned_pro', 'documents_released_to_requester_details', 'documents_released_details', 'request_published', 'request_closed', 'documents_released_to_requester', 'documents_released', 'department_assignment_details', 'request_closed_hide', 'request_reopened', 'request_opened', 'department_assignment', 'close_date', 'doc_released']


In [289]:
df["r_text"] = df["r_text"].fillna("")

In [290]:
r_text = [t.translate(str.maketrans('', '', string.punctuation)).lower().split() for t in df["r_text"]]

In [291]:
r_text = remove_infrequent_words(r_text)

In [292]:
word_to_ix = build_token_to_ix(r_text)

17006


In [293]:
label_to_ix = {0:0,1:1}

In [294]:
y = list(df["doc_released"])

In [295]:
# r_text.sort(key=len)
# r_text.reverse()

In [296]:
r_len = [len(r) for r in r_text]

In [297]:
len(r_len)

17006

# Split and Embed

In [80]:
w2v = api.load('glove-wiki-gigaword-50')

In [81]:
len(w2v.vocab)

400000

In [299]:
len_df = pd.DataFrame({"x":r_text, "y":y, "len": [len(r) for r in r_text]})

In [301]:
len_df = len_df[len_df["len"] < 51]

In [303]:
len(len_df)

9608

In [315]:
train, valid, test = train_validate_test_split(pd.DataFrame({"x":list(len_df["x"]), "y":list(len_df["y"])}))

In [316]:
train_data   = prep_data(train["x"], train["y"])
dev_data     = prep_data(valid["x"], valid["y"])
test_data    = prep_data(test["x"], test["y"])

In [133]:
w2v

array([-9.4531e-01,  3.9686e-01, -8.0605e-01, -3.0215e-01,  2.7736e-01,
       -1.0019e-01, -4.0500e-01, -1.0095e-01, -6.5934e-02, -4.7258e-02,
       -2.0828e-01, -2.5721e-01,  6.8750e-02,  9.3751e-01, -8.1483e-02,
        1.3460e-01,  2.7302e-02, -1.8096e-01, -3.5638e-01, -8.8104e-01,
        1.1951e+00,  5.5556e-02, -3.1741e-01,  1.0244e+00, -8.4768e-01,
       -1.5959e+00,  2.1657e-02,  4.3628e-01,  8.8388e-04, -4.1820e-01,
        2.1247e+00, -4.3332e-01, -1.0816e+00,  3.3616e-01,  3.3399e-01,
       -2.0064e-01,  5.8633e-01,  9.0186e-02,  7.5054e-01,  4.8500e-01,
        1.7370e-01,  6.8129e-01, -1.6810e-01,  6.1265e-01,  7.6875e-02,
       -1.9797e-01, -9.9555e-02, -1.0231e+00,  9.5394e-01, -6.3500e-02],
      dtype=float32)

# Setup Model

In [306]:
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 50
HIDDEN_DIM    = 50
VOCAB_SIZE    = 40000 # len(word_to_ix)
OUTPUT_SIZE   = len(label_to_ix)

In [206]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        # self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.weights         = torch.FloatTensor(w2v.vectors)
        self.word_embeddings = nn.Embedding.from_pretrained(self.weights)
        model.word_embeddings.weight.requires_grad=False

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        # sentence = torch.abs(sentence)
        # embeds = self.word_embeddings(sentence)
        embeds      = sentence
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        # print(len(lstm_out[-1][0]))
        # tag_space   = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_space   = self.hidden2tag(lstm_out[-1])
        tag_scores  = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [207]:
model         = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, OUTPUT_SIZE)
loss_function = nn.NLLLoss()
optimizer     = optim.SGD(model.parameters(), lr=0.1)

In [212]:
torch.argmax(tag_scores)

tensor(1)

In [None]:
for epoch in range(300):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, tag in train_data:
        ntag = [[0,0]]
        #print(sentence)
        #print(tag)
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        
        sentence = torch.tensor(sentence)
        ntag[0][tag] = 1
        tag      = torch.tensor(ntag)
        print(tag)
        # Step 3. Run our forward pass.
        tag_scores = model(sentence)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(tag_scores, tag)
        loss.backward()
        optimizer.step()

In [115]:
len(train_data)

10203

# Binary Model

In [307]:
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 50
HIDDEN_DIM    = 50
VOCAB_SIZE    = 40000 # len(word_to_ix)
OUTPUT_SIZE   = len(label_to_ix)

In [308]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        
        # self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.weights         = torch.FloatTensor(w2v.vectors)
        self.word_embeddings = nn.Embedding.from_pretrained(self.weights)
        model.word_embeddings.weight.requires_grad=False
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        # embeds = self.word_embeddings(sentence)
        # x = embeds.view(len(sentence), 1, -1)
        x = sentence
        lstm_out, self.hidden = self.lstm(x.view(len(sentence), 1, -1), self.hidden)
        y  = self.hidden2label(lstm_out[-1])
        log_probs = torch.sigmoid(y)
        return log_probs

In [309]:
# model         = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, OUTPUT_SIZE)
model = LSTMClassifier(embedding_dim=EMBEDDING_DIM,hidden_dim=HIDDEN_DIM,
                           vocab_size=VOCAB_SIZE,label_size=OUTPUT_SIZE)
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(),lr = 1e-3)

In [310]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    # inputs = prepare_sequence(train_data, wv) # my train data is already prepped
    tag_scores = model(torch.tensor(train_data[0][0], dtype=torch.float))
    print(tag_scores)

tensor([[0.5086]])


In [311]:
print(tag_scores.data.max(1)[1].numpy())

[0]


In [312]:
loss_l = []

In [326]:
train_data.sort(key=lambda x: len(x[0]))
train_data.reverse()

In [None]:
for epoch in range(20):  # again, normally you would NOT do 300 epochs, it is toy data
    for sentence, label in train_data:
        # print(len(sentence))
        #print(tag)
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        
        sentence = torch.tensor(sentence)
        # Step 3. Run our forward pass.
        label_score = model(sentence)
        label = torch.FloatTensor([[label]])

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        # print(pred_label)
        # print(label)
        loss = loss_function(label_score, label)
        # print(loss)
        loss_l.append(loss)
        loss.backward(retain_graph=True)
        optimizer.step()
    print (loss_l)

In [None]:
print(loss_l)