In [1]:
import torch
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data.dataset import Dataset, TensorDataset
from torch.utils.data.dataloader import DataLoader
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import json
import pickle
import string
from dataset import SpoilerDataset
from tqdm import tqdm
from utils import *
import time
%load_ext autoreload
%autoreload 2

In [2]:
# --- Reproducibility -------------------------------------------------------
# Seed the RNGs up front so that weight initialisation (and anything else that
# draws random numbers) is repeatable across "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Model dimensions ------------------------------------------------------
WORD_EMBEDDING_DIM = 50  # size of the pretrained word vectors
HIDDEN_DIM = 50          # hidden size of each GRU direction
WORD_FEATURES_DIM = 3    # extra per-word features concatenated to each embedding

# --- Optimisation ----------------------------------------------------------
EPOCHS = 5
LEARNING_RATE = 0.001
ACCUMULATE_GRAD_STEPS = 64    # reviews whose gradients are accumulated per optimizer step
NEGATIVE_CLASS_WEIGHT = 0.25  # loss weight of class 0 (class 1 is weighted 1)

# --- Runtime / bookkeeping -------------------------------------------------
# Uses the first CUDA device when available; replace with torch.device("cpu")
# to force CPU execution.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
LOAD_PROCESSED_DATA = False  # forwarded as `load=` to SpoilerDataset
SAVE_MODEL = True            # call save_model(...) after every epoch
SAVE_LOG_TO_FILE = True      # append progress and metrics to output.txt

In [3]:
# Build the training split from "train.pickle". Only `filename` and `load` are
# passed; every other constructor argument keeps its default.
train_dataset = SpoilerDataset(
    filename="train.pickle",
    load=LOAD_PROCESSED_DATA,
)

Generating ...


100%|██████████| 77622/77622 [01:03<00:00, 1216.40it/s]


In [4]:
# DataLoader defaults spelled out: one review per batch, served in dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=False)

In [5]:
class SpoilerNet(nn.Module):
    """Hierarchical GRU + attention network that scores every sentence of a review.

    Each sentence is encoded by a word-level GRU followed by attention pooling.
    The pooled sentence vectors are fed to a sentence-level GRU and a linear
    layer that emits two logits per sentence, to which a learned per-book and a
    learned per-user scalar bias are added.

    Args:
        train_dataset: dataset object providing ``load_pretrained_embeddings``,
            ``get_tf_idf_features_tensor``, ``get_num_books`` and
            ``get_num_users``. It is kept on the module and queried for word
            features on every forward pass.
        word_emb_dim: dimensionality of the pretrained word embeddings.
        features_dim: number of extra per-word features concatenated to each
            word embedding before the word-level GRU.
        hidden_dim: hidden size of each GRU direction.
        word_vocab_size: unused; retained so existing call sites keep working.
        num_gru_layers: number of stacked layers in both GRUs.
        bidirectional: whether both GRUs are bidirectional. Downstream layer
            sizes follow this flag.
    """

    def __init__(self, train_dataset, word_emb_dim, features_dim, hidden_dim, word_vocab_size, num_gru_layers, bidirectional):
        super(SpoilerNet, self).__init__()
        self.device = DEVICE
        self.dataset = train_dataset
        pretrained_embeddings = train_dataset.load_pretrained_embeddings(word_emb_dim)
        self.word_embedding = nn.Embedding.from_pretrained(pretrained_embeddings.to(self.device), freeze=False)
        # adding word features
        word_emb_dim += features_dim
        # A GRU emits hidden_dim features per direction.
        gru_out_dim = hidden_dim * (2 if bidirectional else 1)
        self.word_gru = nn.GRU(input_size=word_emb_dim, hidden_size=hidden_dim,
                               num_layers=num_gru_layers, bidirectional=bidirectional, batch_first=True)
        self.linear = nn.Linear(in_features=gru_out_dim, out_features=hidden_dim)
        self.attention = nn.Linear(in_features=hidden_dim, out_features=1, bias=False)
        self.tanh = nn.Tanh()
        # Normalises the attention scores over the word axis (dim=1).
        self.softmax = nn.Softmax(dim=1)
        self.sentence_gru = nn.GRU(input_size=gru_out_dim, hidden_size=hidden_dim, num_layers=num_gru_layers,
                                   bidirectional=bidirectional, batch_first=True)
        self.output_layer = nn.Linear(in_features=gru_out_dim, out_features=2)
        # One learnable scalar per book / per user, initialised to 1.
        self.book_bias = nn.Parameter(torch.ones(train_dataset.get_num_books()), requires_grad=True)
        self.user_bias = nn.Parameter(torch.ones(train_dataset.get_num_users()), requires_grad=True)
        self.sigmoid = nn.Sigmoid()
        # forward() reshapes the logits to 2-D (sentences x classes), so the
        # class axis is dim=1; stated explicitly instead of relying on the
        # deprecated implicit-dim behaviour.
        self.softmax_output = nn.LogSoftmax(dim=1)

    def forward(self, review, book_ind, user_ind):
        """Return per-sentence log-probabilities and raw logits for one review.

        Args:
            review: sequence of word-id tensors, one per sentence. The
                concatenation on dim=2 below implies each is 2-D; presumably
                (1, num_words) as produced by a batch-size-1 DataLoader —
                TODO confirm.
            book_ind: index of the review's book; selects the book bias and is
                forwarded to the dataset's TF-IDF feature lookup.
            user_ind: index of the review's author; selects the user bias.

        Returns:
            ``(log_probs, logits)``, both reshaped to ``len(review)`` rows
            (one row per sentence when the batch holds a single review).
        """
        vectorized_sentences = []
        for sentence in review:
            sentence = sentence.to(self.device)  # TODO: I think it's more efficient to load everything at once
            embedded_sentence = self.word_embedding(sentence)
            # Use the dataset and book index owned by this call rather than
            # whatever globals happen to exist in the notebook.
            word_features = self.dataset.get_tf_idf_features_tensor(sentence, book_ind).to(self.device)
            embedded_sentence = torch.cat([embedded_sentence, word_features], dim=2)
            word_hidden_state, _ = self.word_gru(embedded_sentence)
            # Attention pooling: score each word, normalise over the sentence,
            # then take the weighted sum of the word states.
            mu = self.tanh(self.linear(word_hidden_state))
            alpha_weights = self.softmax(self.attention(mu))
            attended_vector = (alpha_weights * word_hidden_state).sum(dim=1)
            vectorized_sentences.append(attended_vector)
        stacked_vectorized_sentences = torch.stack(vectorized_sentences, dim=1)
        sentence_hidden_state, _ = self.sentence_gru(stacked_vectorized_sentences)
        output = self.output_layer(sentence_hidden_state).view(len(review), -1)
        # The user term must come from user_bias; indexing book_bias with a
        # user index left user_bias untrained and could read out of range.
        output = output + self.book_bias[book_ind] + self.user_bias[user_ind]
        probs = self.softmax_output(output)
        return probs, output

In [6]:
# Two-layer bidirectional network sized from the config constants; the trailing
# expression moves it to DEVICE and displays the module summary.
model = SpoilerNet(
    train_dataset=train_dataset,
    word_emb_dim=WORD_EMBEDDING_DIM,
    features_dim=WORD_FEATURES_DIM,
    hidden_dim=HIDDEN_DIM,
    word_vocab_size=len(train_dataset.word_to_id),
    num_gru_layers=2,
    bidirectional=True,
)
model.to(DEVICE)

100%|██████████| 20001/20001 [00:00<00:00, 176066.23it/s]


SpoilerNet(
  (word_embedding): Embedding(20001, 50)
  (word_gru): GRU(53, 50, num_layers=2, batch_first=True, bidirectional=True)
  (linear): Linear(in_features=100, out_features=50, bias=True)
  (attention): Linear(in_features=50, out_features=1, bias=False)
  (tanh): Tanh()
  (softmax): Softmax(dim=1)
  (sentence_gru): GRU(100, 50, num_layers=2, batch_first=True, bidirectional=True)
  (output_layer): Linear(in_features=100, out_features=2, bias=True)
  (sigmoid): Sigmoid()
  (softmax_output): LogSoftmax()
)

In [7]:
# Class-weighted negative log-likelihood on the model's log-probabilities.
# reduction='sum' because we're using 1 sample batch
class_weights = torch.tensor([NEGATIVE_CLASS_WEIGHT, 1])
criterion = nn.NLLLoss(weight=class_weights, reduction='sum')
criterion = criterion.to(DEVICE)
optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [10]:
# Train for EPOCHS passes over the training set. Every batch is a single
# review; gradients are accumulated over ACCUMULATE_GRAD_STEPS reviews before
# each optimizer step.
PROGRESS_LOG_INTERVAL_SEC = 600  # minimum seconds between throughput lines in output.txt

beg = time.time()
loss_train_list = []  # mean per-review training loss, one entry per epoch
num_batches = len(train_dataloader)
model.train()
for epoch in range(EPOCHS):
    loss_train_total = 0
    # Start every epoch with clean gradients.
    optimizer.zero_grad()
    # each batch is a single review
    for batch_idx, input_data in tqdm(enumerate(train_dataloader), total=num_batches):
        review = input_data[0]
        labels = torch.tensor(input_data[1]).to(model.device)
        book_id = input_data[2]
        user_id = input_data[3]
        probs, output = model(review, book_id, user_id)
        loss = criterion(probs, labels)
        # Track the unscaled loss so the reported value is a true per-review
        # mean rather than 1/ACCUMULATE_GRAD_STEPS of it.
        loss_train_total += loss.item()
        # Scale only the gradient contribution of this review.
        (loss / ACCUMULATE_GRAD_STEPS).backward()

        # Step once per full accumulation window (counting from 1, so the very
        # first review does not trigger a step on its own) and once more on the
        # last batch so a trailing partial window is not lost.
        window_complete = (batch_idx + 1) % ACCUMULATE_GRAD_STEPS == 0
        last_batch = batch_idx + 1 == num_batches
        if window_complete or last_batch:
            optimizer.step()
            optimizer.zero_grad()

        # Periodic throughput line: elapsed seconds and current batch index.
        elapsed = time.time() - beg
        if elapsed > PROGRESS_LOG_INTERVAL_SEC:
            if SAVE_LOG_TO_FILE:
                with open("output.txt", "a") as file:
                    file.write(str(round(elapsed, 2)) + "\t" + str(batch_idx))
                    file.write("\n")
            beg = time.time()

    if SAVE_MODEL:
        save_model(model, epoch+1)

    loss_train_total = loss_train_total / len(train_dataset)
    loss_train_list.append(float(loss_train_total))

    # Report this epoch's loss (the last entry), not an average over all epochs.
    epoch_summary = "Epoch {} Completed,\tTrain Loss: {}".format(epoch + 1, loss_train_list[-1])
    print(epoch_summary)
    if SAVE_LOG_TO_FILE:
        with open("output.txt", "a") as file:
            file.write(epoch_summary)
            file.write("\n")

  1%|▏         | 1019/77622 [01:17<1:37:18, 13.12it/s]


KeyboardInterrupt: 

In [None]:
def predict(dataloader):
    """Run the notebook-level ``model`` over ``dataloader`` and score its predictions.

    Inference runs in eval mode with gradient tracking disabled; the model's
    previous train/eval mode is restored afterwards.

    Args:
        dataloader: iterable with a length whose items are indexable as
            ``(review, labels, book_id, user_id)``, one review per item.

    Returns:
        ``(y_true, y_pred, metrics)``: flat lists of gold and predicted labels
        over all sentences, and a dict with ``precision``, ``recall``, ``f1``
        (binary averaging) and ``accuracy``. When ``SAVE_LOG_TO_FILE`` is set,
        the dict is also appended to ``output.txt``.
    """
    y_true = []
    y_pred = []
    was_training = model.training
    model.eval()
    try:
        # No gradients are needed for scoring; skipping them saves memory and time.
        with torch.no_grad():
            for input_data in tqdm(dataloader, total=len(dataloader)):
                review = input_data[0]
                labels = torch.tensor(input_data[1]).to(model.device)
                book_id = input_data[2]
                user_id = input_data[3]
                probs, _ = model(review, book_id, user_id)
                # Index of the larger log-probability is the predicted class.
                _, predicted = torch.max(probs, 1)
                y_true += labels.tolist()
                y_pred += predicted.tolist()
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    metrics = {"precision": precision, "recall":recall, "f1":f1, "accuracy":accuracy_score(y_true, y_pred)}
    if SAVE_LOG_TO_FILE:
        with open("output.txt", "a") as file:
            file.write(str(metrics))
            file.write("\n")
    return y_true, y_pred, metrics

In [None]:
# Build the validation split from "valid.pickle" with the same `load` setting
# as the training split.
valid_dataset = SpoilerDataset(
    filename="valid.pickle",
    load=LOAD_PROCESSED_DATA,
)

In [None]:
# DataLoader defaults spelled out: one review per batch, served in dataset order.
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=1, shuffle=False)

In [None]:
# Score the model on the validation split and display the metrics dict.
y_true, y_pred, metrics = predict(valid_dataloader)
metrics

In [None]:
# Share of label-1 entries among the gold labels vs. among the predictions
# from the most recent predict() call.
gold_positive_share = sum(y_true) / len(y_true)
predicted_positive_share = sum(y_pred) / len(y_pred)
gold_positive_share, predicted_positive_share

In [None]:
# Score the model on the training split and display the metrics dict.
# NOTE: this reuses the names y_true / y_pred / metrics, so the validation
# results assigned earlier are overwritten from here on.
y_true, y_pred, metrics = predict(train_dataloader)
metrics

In [None]:
# Share of label-1 entries among the gold labels vs. among the predictions
# from the most recent predict() call.
gold_positive_share = sum(y_true) / len(y_true)
predicted_positive_share = sum(y_pred) / len(y_pred)
gold_positive_share, predicted_positive_share

In [None]:
# Display the per-epoch training losses collected by the training loop.
loss_train_list