In [97]:
import logging
import random
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import string
import re
import gensim
from string import punctuation
import torch.autograd as autograd
import torch.utils.data as Data
from torchtext.vocab import Vectors
import gensim.downloader as api
import torch.nn.functional as F

In [98]:
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [99]:
def prepare_seq(x):
    X = [[w2v[i] for i in sent if i in w2v] for sent in x]
    # X = [[i for i in sent if i in w2v] for sent in x]
    return X

In [100]:
def prep_data(x, Y):
    x = prepare_seq(list(x))
    Y = list(Y)
    if(len(x) == 0):
        print("yep.")
    return [(x, y) for x,y in zip(x, Y)]

In [101]:
def get_vocab_size(sentences):
    l = set()
    for i in sentences:
        for x in i:
            l.add(x)
    
    return len(l)

In [102]:
def build_token_to_ix(sentences):
    token_to_ix = dict()
    print(len(sentences))
    for sent in sentences:
        for token in sent:# .split(' '):
            if token not in token_to_ix:
                token_to_ix[token] = len(token_to_ix)
    token_to_ix['<pad>'] = len(token_to_ix)
    return token_to_ix

In [103]:
def build_label_to_ix(labels):
    label_to_ix = dict()
    for label in labels:
        if label not in label_to_ix:
            label_to_ix[label] = len(label_to_ix)
    
    label_to_ix

In [104]:
def remove_infrequent_words(sents):
	word_counts = {}
	for s in sents:
		for w in s:
			if w in word_counts:
				word_counts[w] += 1
			else:
				word_counts[w] = 1

	threshold = 2
	filtered_sents = []
	for s in sents:
		new_s = []
		for w in s:
			if word_counts[w] < threshold:
				new_s.append('<UNKOWN>')
			else:
				new_s.append(w)
		filtered_sents.append(new_s)
	return filtered_sents

In [105]:
def restrict_w2v(w2v, restricted_word_set):
    new_vectors = []
    new_vocab = {}
    new_index2entity = []
    # new_vectors_norm = []

    for i in range(len(w2v.vocab)):
        word = w2v.index2entity[i]
        vec = w2v.vectors[i]
        vocab = w2v.vocab[word]
        # vec_norm = w2v.vectors_norm[i]
        if word in restricted_word_set:
            vocab.index = len(new_index2entity)
            new_index2entity.append(word)
            new_vocab[word] = vocab
            new_vectors.append(vec)
            # new_vectors_norm.append(vec_norm)

    w2v.vocab = new_vocab
    w2v.vectors = new_vectors
    w2v.index2entity = new_index2entity
    w2v.index2word = new_index2entity
    # w2v.vectors_norm = new_vectors_norm

In [106]:
def get_accuracy(truth, pred):
    assert len(truth)==len(pred)
    right = 0
    for i in range(len(truth)):
        if truth[i]==pred[i]:
            right += 1.0
    return right/len(truth)

In [107]:
torch.set_default_tensor_type('torch.cuda.FloatTensor')
torch.backends.cudnn.fastest = True

# read data

In [184]:
df = pd.read_csv("processed_request_corpus.csv").drop("Unnamed: 0", axis=1)

In [185]:
df.head(1)

Unnamed: 0,r_id,r_text,date_submitted,submission_method,receiving_department,assigned_pro,documents_released_to_requester_details,documents_released_details,request_published,request_closed,documents_released_to_requester,documents_released,department_assignment_details,request_closed_hide,request_reopened,request_opened,department_assignment,close_date,doc_released
0,19-5614,This firm is performing a Phase I Environmenta...,"November 21, 2019",web,Fire-Rescue,Angela Laurita,,,,Duplicate request This request was already ent...,,,,,,Request received via web,Fire-Rescue,Request Closed Public November 21 2019,0


In [186]:
print(list(df))

['r_id', 'r_text', 'date_submitted', 'submission_method', 'receiving_department', 'assigned_pro', 'documents_released_to_requester_details', 'documents_released_details', 'request_published', 'request_closed', 'documents_released_to_requester', 'documents_released', 'department_assignment_details', 'request_closed_hide', 'request_reopened', 'request_opened', 'department_assignment', 'close_date', 'doc_released']


In [187]:
df = df.dropna(subset=["r_text"])

In [188]:
r_text = [t.translate(str.maketrans('', '', string.punctuation)).lower().split() for t in df["r_text"]]

In [189]:
r_text = remove_infrequent_words(r_text)

In [190]:
word_to_ix = build_token_to_ix(r_text)

17003


In [191]:
label_to_ix = {0:0,1:1}

In [192]:
y = list(df["doc_released"])

In [193]:
# r_text.sort(key=len)
# r_text.reverse()

In [194]:
r_len = [len(r) for r in r_text]

In [195]:
len(r_len)

17003

# Split and Embed

In [196]:
w2v = api.load('glove-wiki-gigaword-50')

In [197]:
len(w2v.vocab)

400000

In [198]:
len_df = pd.DataFrame({"x":r_text, "y":y, "len": [len(r) for r in r_text]})

In [199]:
len_df = len_df[len_df["len"] < 100]

In [200]:
# len(len_df)

In [201]:
train, valid, test = train_validate_test_split(pd.DataFrame({"x":list(len_df["x"]), "y":list(len_df["y"])}), seed=2)

In [202]:
train_data   = prep_data(train["x"], train["y"])
dev_data     = prep_data(valid["x"], valid["y"])
test_data    = prep_data(test["x"], test["y"])

# Binary Model

In [203]:
# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 50
HIDDEN_DIM    = 50
VOCAB_SIZE    = 40000 # len(word_to_ix)
OUTPUT_SIZE   = len(label_to_ix)

In [204]:
class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        
        # self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.weights         = torch.FloatTensor(w2v.vectors)
        self.word_embeddings = nn.Embedding.from_pretrained(self.weights)
        # model.word_embeddings.weight.requires_grad=False
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, 1)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # the first is the hidden h
        # the second is the cell  c
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        # embeds = self.word_embeddings(sentence)
        # x = embeds.view(len(sentence), 1, -1)
        x = sentence
        lstm_out, self.hidden = self.lstm(x.view(len(sentence), 1, -1), self.hidden)
        y  = self.hidden2label(lstm_out[-1])
        log_probs = torch.sigmoid(y)
        return log_probs

In [205]:
# model         = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, OUTPUT_SIZE)
model = LSTMClassifier(embedding_dim=EMBEDDING_DIM,hidden_dim=HIDDEN_DIM,
                           vocab_size=VOCAB_SIZE,label_size=OUTPUT_SIZE)
model.cuda()
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(),lr = 1e-3)

In [206]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    # inputs = prepare_sequence(train_data, wv) # my train data is already prepped
    tag_scores = model(torch.tensor(train_data[0][0], dtype=torch.float))
    print(tag_scores)

tensor([[0.5062]])


In [207]:
train_data.sort(key=lambda x: len(x[0]))
train_data.reverse()

In [208]:
len(train_data[0][0])

99

In [209]:
len(train_data)

7783

In [210]:
train_data = [(x,y) for x,y in train_data if ~(len(x)==0)]
train_data.reverse()

In [211]:
len(train_data)

7783

In [None]:
i = 0
loss_l = 0.0
loss   = []
for epoch in range(80):  # again, normally you would NOT do 300 epochs, it is toy data
    model.train()
    for sentence, label in train_data:
        if(len(sentence) == 0): # this for zero len sentences, that we cannot seem to filter out???
            # print("empty sentence")
            continue
        #print(tag)
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        
        sentence = torch.tensor(sentence).type(torch.cuda.FloatTensor)
        # Step 3. Run our forward pass.
        label_score = model(sentence)
        label = torch.tensor([[label]]).type(torch.cuda.FloatTensor)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        # print(pred_label)
        # print(label)
        loss = loss_function(label_score, label)
        loss_l+=loss.item()
        # if (i < len(train_data)):
        #     loss.backward(retain_graph=True)
        # else:
        #     print("finish epoch")
        #     loss.backward()
        loss.backward()
        optimizer.step()
        i+=1
    # loss.append((loss_l / i))
    print("epoch " + str(epoch) + str(loss_l / i))
    i = 0
    loss_l = 0.0

epoch 00.6566970421439542
epoch 10.6407897275714947


In [81]:
# See what the scores are before training
# Note that element i,j of the output is the score for tag j for word i.
# Here we don't need to train, so the code is wrapped in torch.no_grad()
with torch.no_grad():
    # inputs = prepare_sequence(train_data, wv) # my train data is already prepped
    tag_scores = model(torch.tensor(train_data[0][0], dtype=torch.float))
    print(tag_scores)

tensor([[1.0000]])


In [80]:
# torch.save(model.state_dict(), "mymodel01.pt")

In [84]:
avg_loss = 0.0
truth_res = []
pred_res = []

for sentence, label in test_data:
    if(len(sentence) == 0): # this for zero len sentences, that we cannot seem to filter out???
        print("empty sentence")
        continue
    truth_res.append(label)
    # detaching it from its history on the last instance.
    model.hidden = model.init_hidden()
    
    sentence = torch.tensor(sentence).type(torch.cuda.FloatTensor)
    pred = model(sentence)
    label = torch.tensor([[label]]).type(torch.cuda.FloatTensor)
    if pred.item() > .5:
        pred_label = 1
    else:
        pred_label = 0
    pred_res.append(pred_label)
    # model.zero_grad() # should I keep this when I am evaluating the model?
    loss = loss_function(pred, label)
    avg_loss += loss.item()
avg_loss /= len(test_data)
acc = get_accuracy(truth_res, pred_res)
print(' avg_loss:%g train acc:%g' % (avg_loss, acc ))

empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
empty sentence
 avg_loss:2.01276 train acc:0.604577
