In [1]:
import torch
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data.dataset import Dataset, TensorDataset
from torch.utils.data.dataloader import DataLoader
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
import json
import pickle
import string
from dataset import SpoilerDataset
from tqdm import tqdm
from utils import *
import time
import warnings
warnings.filterwarnings('ignore')
# %load_ext autoreload
# %autoreload 2

In [2]:
# ---- Run configuration: every tunable used by the cells below ----
# Number of full passes over the training set made by train().
EPOCHS = 10
# Passed to SpoilerNet as word_emb_dim (also forwarded to the dataset's pretrained-embedding loader).
WORD_EMBEDDING_DIM = 50
# Hidden size of both the word-level and the sentence-level GRU.
HIDDEN_DIM = 50
# Number of extra per-word features concatenated to each word embedding before the word GRU.
WORD_FEATURES_DIM = 3
# Use the first CUDA device when available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # torch.device("cpu")
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001
# Each per-review loss is divided by this value and the optimizer steps once per this many reviews.
ACCUMULATE_GRAD_STEPS = 64
# Forwarded as `load=` to SpoilerDataset.
# NOTE(review): presumably True means "reuse cached preprocessing" - confirm in dataset.py.
LOAD_PROCESSED_DATA = False
# When True, save_model() is called at the end of every epoch.
SAVE_MODEL = True
# When True, metrics and epoch summaries are appended to OUT_FILE.
SAVE_LOG_TO_FILE = True
# NLLLoss weight of class 0; class 1 keeps weight 1.
NEGATIVE_CLASS_WEIGHT = 0.05
# Path handed to load_pickle() in the evaluation cells.
MODEL_PATH = "model_10"
# Seconds of training between two evaluations on the small validation set.
TRAIN_TIME_THRESHOLD = 900
# Log file that write_to_log() appends to.
OUT_FILE = "output.txt"

In [3]:
class SpoilerNet(nn.Module):
    """Hierarchical GRU model that scores every sentence of a review as class 0 or class 1.

    Pipeline per review:
      1. each sentence's word ids are embedded with frozen pretrained embeddings and
         concatenated with per-word features supplied by the dataset;
      2. a word-level GRU plus an attention layer pools the words into one vector per sentence;
      3. a sentence-level GRU runs over the sentence vectors;
      4. a linear layer produces two logits per sentence, to which a learned per-book and a
         learned per-user bias are added before a log-softmax.

    The model is written for one review per batch: ``forward`` reshapes the logits to
    ``(len(review), -1)``, which only yields two columns when the batch size is 1.
    """

    def __init__(self, train_dataset, word_emb_dim, features_dim, hidden_dim, word_vocab_size, num_gru_layers, bidirectional):
        """Build all layers.

        Args:
            train_dataset: object providing ``load_pretrained_embeddings(dim)``,
                ``get_num_books()``, ``get_num_users()`` and
                ``get_tf_idf_features_tensor(sentence, book_ind)``.
            word_emb_dim: dimension of the pretrained word embeddings.
            features_dim: number of extra per-word features appended to each embedding.
            hidden_dim: hidden size of both GRUs.
            word_vocab_size: unused; kept so existing callers keep working.
            num_gru_layers: number of stacked layers in each GRU.
            bidirectional: whether both GRUs are bidirectional.
        """
        super(SpoilerNet, self).__init__()
        self.device = DEVICE
        self.dataset = train_dataset
        pretrained_embeddings = self.dataset.load_pretrained_embeddings(word_emb_dim)
        self.word_embedding = nn.Embedding.from_pretrained(pretrained_embeddings.to(self.device), freeze=True)
        # The word GRU consumes the embedding concatenated with the per-word features.
        gru_input_dim = word_emb_dim + features_dim
        # A bidirectional GRU emits forward and backward states side by side.
        gru_output_dim = (2 if bidirectional else 1) * hidden_dim
        self.word_gru = nn.GRU(input_size=gru_input_dim, hidden_size=hidden_dim,
                               num_layers=num_gru_layers, bidirectional=bidirectional, batch_first=True)
        self.linear = nn.Linear(in_features=gru_output_dim, out_features=hidden_dim)
        self.attention = nn.Linear(in_features=hidden_dim, out_features=1, bias=False)
        self.tanh = nn.Tanh()
        # dim=1 is the word axis of the (batch, words, 1) attention scores.
        self.softmax = nn.Softmax(dim=1)
        self.sentence_gru = nn.GRU(input_size=gru_output_dim, hidden_size=hidden_dim, num_layers=num_gru_layers,
                                   bidirectional=bidirectional, batch_first=True)
        self.output_layer = nn.Linear(in_features=gru_output_dim, out_features=2)
        self.book_bias = nn.Parameter(torch.zeros(self.dataset.get_num_books()), requires_grad=True)
        self.user_bias = nn.Parameter(torch.zeros(self.dataset.get_num_users()), requires_grad=True)
        # Explicit class axis; the logits are reshaped to (num_sentences, 2) before this is applied.
        self.softmax_out = nn.LogSoftmax(dim=1)

    def forward(self, review, book_ind, user_ind):
        """Score every sentence of one review.

        Args:
            review: iterable of sentences, each a tensor of word ids.
                NOTE(review): assumed shape (1, num_words) as produced by a batch-size-1
                DataLoader - confirm against SpoilerDataset.
            book_ind: index into ``book_bias`` for the reviewed book.
            user_ind: index into ``user_bias`` for the review's author.

        Returns:
            ``(log_probs, logits)``, both with ``len(review)`` rows and one column per class.
        """
        vectorized_sentences = []
        for sentence in review:
            sentence = sentence.to(self.device)
            embedded_sentence = self.word_embedding(sentence)
            word_features = self.dataset.get_tf_idf_features_tensor(sentence, book_ind).to(self.device)
            embedded_sentence = torch.cat([embedded_sentence, word_features], dim=2)
            word_hidden_state, _ = self.word_gru(embedded_sentence)
            # Attention pooling: weight every word state, then sum over the word axis.
            mu = self.tanh(self.linear(word_hidden_state))
            alpha_weights = self.softmax(self.attention(mu))
            attended_vector = (alpha_weights * word_hidden_state).sum(dim=1)
            vectorized_sentences.append(attended_vector)
        stacked_vectorized_sentences = torch.stack(vectorized_sentences, dim=1)
        sentence_hidden_state, _ = self.sentence_gru(stacked_vectorized_sentences)
        output = self.output_layer(sentence_hidden_state).view(len(review), -1)
        # Book bias comes from book_bias and user bias from user_bias (the latter used to be
        # read from book_bias by mistake, leaving user_bias untrained).
        output = output + (self.book_bias[book_ind] + self.user_bias[user_ind])
        probs = self.softmax_out(output)
        return probs, output

    def predict(self, dataloader):
        """Run inference over ``dataloader`` and compute classification metrics.

        Each item of ``dataloader`` is indexed as ``[review, labels, book_id, user_id]``.
        The model is switched to eval mode for the pass and put back into train mode
        afterwards, even if inference raises.

        Returns:
            ``(y_true, y_pred, metrics)`` where ``metrics`` holds AUC, precision, recall, f1
            and accuracy. The metrics are also appended to the log when SAVE_LOG_TO_FILE is set.
        """
        y_true = []
        y_pred = []
        y_probs = []
        self.eval()
        try:
            # No gradients are needed for evaluation; skipping the graph saves memory and time.
            with torch.no_grad():
                for input_data in dataloader:
                    review = input_data[0]
                    labels = torch.tensor(input_data[1]).to(self.device)
                    book_id = input_data[2]
                    user_id = input_data[3]
                    log_probs, _ = self(review, book_id, user_id)
                    _, predicted = torch.max(log_probs, 1)
                    # AUC must rank by the positive-class score, not by the confidence of
                    # whichever class happened to win.
                    positive_probs = log_probs[:, 1].exp()
                    y_true += labels.tolist()
                    y_pred += predicted.tolist()
                    y_probs += positive_probs.tolist()
        finally:
            self.train()
        precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
        metrics = {"AUC": roc_auc_score(y_true, y_probs), "precision": precision, "recall": recall,
                   "f1": f1, "accuracy": accuracy_score(y_true, y_pred)}
        if SAVE_LOG_TO_FILE:
            write_to_log(metrics)
        return y_true, y_pred, metrics

In [4]:
def write_to_log(obj):
    """Append ``str(obj)`` followed by a newline to the file named by ``OUT_FILE``."""
    with open(OUT_FILE, "a") as log_file:
        log_file.writelines([str(obj), "\n"])

        
def train():
    """Train a SpoilerNet on ``train.pickle`` and return the list of per-epoch training losses.

    Each DataLoader batch is a single review. Gradients are accumulated over
    ACCUMULATE_GRAD_STEPS reviews before every optimizer step; a final step at the end of
    each epoch applies whatever is left over so no gradient leaks into the next epoch.

    Every TRAIN_TIME_THRESHOLD seconds the model is evaluated on ``valid_small.pickle`` and
    the metrics are printed (``SpoilerNet.predict`` already appends them to the log when
    SAVE_LOG_TO_FILE is set, so only the elapsed-time line is logged here).

    Returns:
        list[float]: mean unscaled loss per review, one entry per epoch.
    """
    train_dataset = SpoilerDataset(filename="train.pickle", load=LOAD_PROCESSED_DATA)  # using defaults
    train_dataloader = DataLoader(train_dataset)

    valid_small_dataset = SpoilerDataset(filename="valid_small.pickle", load=LOAD_PROCESSED_DATA)
    valid_small_dataloader = DataLoader(valid_small_dataset)

    model = SpoilerNet(train_dataset, WORD_EMBEDDING_DIM, WORD_FEATURES_DIM, HIDDEN_DIM,
                       len(train_dataset.word_to_id), 2, True)
    model.zero_grad()
    model.to(DEVICE)

    # reduction='sum' because each batch is a single review; class 0 is down-weighted.
    class_weights = torch.tensor([NEGATIVE_CLASS_WEIGHT, 1], dtype=torch.float)
    criterion = nn.NLLLoss(weight=class_weights, reduction='sum').to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    last_eval_time = time.time()
    loss_train_list = []
    for epoch in range(EPOCHS):
        loss_train_total = 0.0
        has_pending_grads = False
        # each batch is a single review
        for batch_idx, input_data in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            review = input_data[0]
            labels = torch.tensor(input_data[1]).to(model.device)
            book_id = input_data[2]
            user_id = input_data[3]

            probs, _ = model(review, book_id, user_id)
            loss = criterion(probs, labels)
            # Track the true loss; only the value sent to backward() is scaled for accumulation.
            loss_train_total += loss.item()
            (loss / ACCUMULATE_GRAD_STEPS).backward()
            has_pending_grads = True

            # Step after every full window of ACCUMULATE_GRAD_STEPS reviews
            # (batch_idx is 0-based, hence the +1).
            if (batch_idx + 1) % ACCUMULATE_GRAD_STEPS == 0:
                optimizer.step()
                model.zero_grad()
                has_pending_grads = False

            elapsed = time.time() - last_eval_time
            if elapsed > TRAIN_TIME_THRESHOLD:
                y_true, y_pred, metrics = model.predict(valid_small_dataloader)
                print(metrics)
                if SAVE_LOG_TO_FILE:
                    write_to_log(str(round(elapsed, 2)) + "\t" + str(batch_idx))
                last_eval_time = time.time()

        # Apply the gradients of a trailing, incomplete accumulation window.
        if has_pending_grads:
            optimizer.step()
            model.zero_grad()

        if SAVE_MODEL:
            save_model(model, epoch + 1)

        # max(..., 1) keeps an empty dataset from raising ZeroDivisionError.
        loss_train_total = loss_train_total / max(len(train_dataset), 1)
        loss_train_list.append(float(loss_train_total))

        epoch_summary = "Epoch {} Completed,\tTrain Loss: {}".format(epoch + 1, loss_train_list[-1])
        print(epoch_summary)

        if SAVE_LOG_TO_FILE:
            write_to_log(epoch_summary)

    return loss_train_list

In [5]:
# Keep the per-epoch losses: a later cell displays `loss_train_list`.
loss_train_list = train()

Generating ...


100%|██████████| 107554/107554 [01:31<00:00, 1179.92it/s]
 14%|█▎        | 270/2000 [00:00<00:01, 1371.86it/s]

Generating ...


100%|██████████| 2000/2000 [00:01<00:00, 1311.35it/s]
100%|██████████| 20001/20001 [00:00<00:00, 180635.67it/s]
  3%|▎         | 3110/107554 [03:45<2:06:10, 13.80it/s]


KeyboardInterrupt: 

In [None]:
# Load the trained model stored at MODEL_PATH.
# load_pickle is not defined in this notebook; presumably it comes from `from utils import *`.
# NOTE(review): unpickling can execute arbitrary code - only load files produced by this project.
model = load_pickle(MODEL_PATH)

In [None]:
# Evaluate the loaded model on the validation split, one review per batch (DataLoader default).
valid_dataset = SpoilerDataset(filename="valid.pickle", load=LOAD_PROCESSED_DATA)
valid_dataloader = DataLoader(valid_dataset)
# predict() returns the gold labels, the predicted classes and a metrics dict.
y_true, y_pred, metrics = model.predict(valid_dataloader)
metrics

In [None]:
# Mean of the gold labels vs. mean of the predictions from the previous cell
# (the share of class 1 in each, given 0/1 class ids).
sum(y_true)/len(y_true), sum(y_pred)/len(y_pred)

In [None]:
# Same evaluation on the training split.
# Note: this overwrites y_true / y_pred / metrics produced by the validation cell.
train_dataset = SpoilerDataset(filename="train.pickle", load=LOAD_PROCESSED_DATA)
train_dataloader = DataLoader(train_dataset)
y_true, y_pred, metrics = model.predict(train_dataloader)
metrics

In [None]:
# Mean of the gold labels vs. mean of the predictions for the most recent predict() call
# (the share of class 1 in each, given 0/1 class ids).
sum(y_true)/len(y_true), sum(y_pred)/len(y_pred)

In [None]:
# Display the per-epoch training losses.
# NOTE(review): this name exists only if the return value of train() was assigned to it.
loss_train_list

In [6]:
# samples = load_pickle("train.pickle")
# for data_dict in tqdm(samples):
#     review_sentences = data_dict['review_sentences']
#     for label, sentence in review_sentences:
#         print(label, sentence)
#     break