In [1]:
import torch
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data.dataset import Dataset, TensorDataset
from torch.utils.data.dataloader import DataLoader
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import json
import pickle
import string
from dataset import SpoilerDataset
from tqdm import tqdm
from utils import *
%load_ext autoreload
%autoreload 2

In [3]:
EPOCHS = 5
WORD_EMBEDDING_DIM = 100
HIDDEN_DIM = 50
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # torch.device("cpu")
LEARNING_RATE = 0.001
ACCUMULATE_GRAD_STEPS = 64

In [3]:
train_dataset = SpoilerDataset(filename="train.pickle") # using defaults

Loading existing train_processed.pickle .


In [4]:
train_dataloader = DataLoader(train_dataset)

In [6]:
class SpoilerNet(nn.Module):
    def __init__(self,word_emb_dim, hidden_dim, word_vocab_size, num_gru_layers, bidirectional):
        super(SpoilerNet, self).__init__()
        self.device = DEVICE
        self.word_embedding = nn.Embedding(word_vocab_size, word_emb_dim) # TODO: use pretrained embeddings
        self.word_gru = nn.GRU(input_size=word_emb_dim, hidden_size=hidden_dim, 
                               num_layers=num_gru_layers, bidirectional=bidirectional ,batch_first=True)
        self.linear = nn.Linear(in_features=2*hidden_dim,out_features=hidden_dim)
        self.attention = nn.Linear(in_features=hidden_dim,out_features=1,bias=False)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        self.sentence_gru = nn.GRU(input_size=2*hidden_dim,hidden_size=hidden_dim,num_layers=num_gru_layers,
                                   bidirectional=bidirectional,batch_first=True)
        self.output_layer = nn.Linear(in_features=2*hidden_dim,out_features=2)
        self.softmax_output = nn.LogSoftmax()
        
    def forward(self,review):
        vectorized_sentences = []
        for sentence in review:
            sentence = sentence.to(self.device) # TODO: I think it's more efficient to load everything at once
            embedded_sentence = self.word_embedding(sentence)
            word_hidden_state, _ = self.word_gru(embedded_sentence) # TODO: use sentence batching
            mu = self.tanh(self.linear(word_hidden_state))
            alpha_weights = self.softmax(self.attention(mu))
            attended_vector = (alpha_weights * word_hidden_state).sum(dim=1)
            vectorized_sentences.append(attended_vector)
        stacked_vectorized_sentences = torch.stack(vectorized_sentences,dim=1)
        sentence_hidden_state , _ = self.sentence_gru(stacked_vectorized_sentences)
        output = self.output_layer(sentence_hidden_state).view(len(review),-1)
        probs = self.softmax_output(output)
        return probs,output

In [7]:
model = SpoilerNet(WORD_EMBEDDING_DIM,HIDDEN_DIM,len(train_dataset.word_to_id), 2, True)
model.to(DEVICE)

SpoilerNet(
  (word_embedding): Embedding(895248, 100)
  (word_gru): GRU(100, 50, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=100, out_features=50, bias=True)
  (attention): Linear(in_features=50, out_features=1, bias=False)
  (tanh): Tanh()
  (softmax): Softmax(dim=1)
  (sentence_gru): GRU(100, 50, num_layers=2, batch_first=True, bidirectional=True)
  (output_layer): Linear(in_features=100, out_features=2, bias=True)
  (softmax_output): LogSoftmax()
)

In [8]:
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [9]:
loss_train_list = []
for epoch in range(EPOCHS):
    loss_train_total = 0
    # each batch is a single review
    for batch_idx, input_data in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
        review = input_data[0] # .to(model.device)
        labels = torch.tensor(input_data[1]).to(model.device)
        probs, output = model(review)
        loss = criterion(probs, labels)
        loss = loss/ACCUMULATE_GRAD_STEPS
        loss.backward()
        
        if batch_idx % ACCUMULATE_GRAD_STEPS == 0:
            optimizer.step()
            model.zero_grad()
            
        loss_train_total += loss.item()
        
    save_model(model, batch_idx+1)
    loss_train_total = loss_train_total / len(train_dataset)
    loss_train_list.append(float(loss_train_total))
    
    print("Epoch {} Completed,\tTrain Loss: {}".format(epoch + 1, np.mean(loss_train_list[-batch_idx:])))
    with open("output.txt", "a") as file:
        file.write("Epoch {} Completed,\tTrain Loss: {}".format(epoch + 1, np.mean(loss_train_list[-batch_idx:])))
        file.write("\n")

100%|██████████| 107554/107554 [7:02:58<00:00,  4.24it/s]  
  0%|          | 0/107554 [00:00<?, ?it/s]

Epoch 1 Completed,	Train Loss: 0.004758241493778046


 37%|███▋      | 39431/107554 [2:34:52<4:27:34,  4.24it/s] 


KeyboardInterrupt: 

In [49]:
def predict(dataloader):
    y_true = []
    y_pred = []
    for batch_idx, input_data in tqdm(enumerate(dataloader), total=len(dataloader)):
        review = input_data[0]
        labels = torch.tensor(input_data[1]).to(model.device)
        probs,ouput = model(review)
        _, predicted = torch.max(probs.data, 1)
        y_true += labels.tolist()
        y_pred += predicted.tolist()
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")    
    return {"precision": precision, "recall":recall, "f1":f1, "accuracy":accuracy_score(y_true, y_pred)}

In [11]:
valid_dataset = SpoilerDataset(filename="valid.pickle")

Generating ...


100%|██████████| 35850/35850 [00:29<00:00, 1203.29it/s]


In [13]:
valid_dataloader = DataLoader(valid_dataset)

In [50]:
predict(valid_dataloader)

100%|██████████| 35850/35850 [15:19<00:00, 38.97it/s]


{'precision': 0.7094590378735883,
 'recall': 0.3952756726691233,
 'f1': 0.5076909338512714,
 'accuracy': 0.8632353534006655}

In [51]:
predict(train_dataloader)

100%|██████████| 107554/107554 [44:34<00:00, 40.21it/s] 


{'precision': 0.7477072920671537,
 'recall': 0.4275033119570442,
 'f1': 0.5439830829275935,
 'accuracy': 0.8732016878442239}

In [52]:
loss_train_list

[0.004758241493778046]