In [63]:
import torch
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from torch.utils.data.dataset import Dataset, TensorDataset
from torch.utils.data.dataloader import DataLoader
import json
import pickle
import string
from SpoilerDataset import SpoilerDataset

In [2]:
train_dataset = SpoilerDataset('data/train_reviews_balanced.json','data/word_to_id.pickle','data/id_to_word.pickle')

143403


In [3]:
train_dataloader = DataLoader(train_dataset)

In [58]:
class SpoilerNet(nn.Module):
    def __init__(self,word_emb_dim, hidden_dim, word_vocab_size):
        super(SpoilerNet, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.word_embedding = nn.Embedding(word_vocab_size, word_emb_dim)
        self.word_gru = nn.GRU(input_size=word_emb_dim,hidden_size=hidden_dim,num_layers=2,bidirectional=True,batch_first=True)
        self.linear = nn.Linear(in_features=2*hidden_dim,out_features=hidden_dim)
        self.attention = nn.Linear(in_features=hidden_dim,out_features=1,bias=False)
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        self.sentence_gru = nn.GRU(input_size=2*hidden_dim,hidden_size=hidden_dim,num_layers=2,bidirectional=True,batch_first=True)
        self.output_layer = nn.Linear(in_features=2*hidden_dim,out_features=2)
        self.softmax_output = nn.LogSoftmax()
        
    def forward(self,review):
        vectorized_sentences = []
        for sentence in review:
            sentence = sentence.to(self.device)
            embedded_sentence = self.word_embedding(sentence)
            word_hidden_state, _ = self.word_gru(embedded_sentence)
            mu = self.tanh(self.linear(word_hidden_state))
            alpha_weights = self.softmax(self.attention(mu))
            attended_vector = (alpha_weights * word_hidden_state).sum(dim=1)
            vectorized_sentences.append(attended_vector)
        
        stacked_vectorized_sentences = torch.stack(vectorized_sentences,dim=1)
        sentence_hidden_state , _ = self.sentence_gru(stacked_vectorized_sentences)
        output = self.output_layer(sentence_hidden_state).view(len(review),-1)
        probs = self.softmax_output(output)
        return probs,output
            
        

In [64]:
EPOCHS = 5
WORD_EMBEDDING_DIM = 100
HIDDEN_DIM = 50

word_vocab_size = len(train_dataset.word_to_id)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

model = SpoilerNet(WORD_EMBEDDING_DIM,HIDDEN_DIM,word_vocab_size)

if use_cuda:
    model.cuda()
    
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
acumulate_grad_steps = 64

In [None]:
loss_list_train = []
for epoch in range(EPOCHS):
    loss_train_total = 0
    i = 0
    for batch_idx, input_data in enumerate(train_dataloader):
        i += 1
        review = input_data[0]
        labels = torch.tensor(input_data[1]).to(model.device)
        probs,output = model(review)
        loss = criterion(probs,labels)
        loss = loss/ acumulate_grad_steps
        loss.backward()
        
        if i % acumulate_grad_steps == 0:
            optimizer.step()
            model.zero_grad()
        
        loss_train_total += loss.item()
            
    
    loss_train_total = loss_train_total / len(train_dataset)
    loss_list_train.append(float(loss_train_total))
    e_interval = i
    print("Epoch {} Completed,\tTrain Loss: {}".format(epoch + 1, np.mean(loss_list_train[-e_interval:])))
        



In [61]:
true_positive = 0
positive = 0
for batch_idx, input_data in enumerate(train_dataloader):

    review = input_data[0]
    labels = torch.tensor(input_data[1]).to(model.device)
    probs,ouput = model(review)
    _, predicted = torch.max(probs.data, 1)
    for index,label in enumerate(labels):
        if label == 1:
            if predicted[index]==1:
                true_positive +=1
            positive += 1
            

print("Precision: {}".format(true_positive/positive))



Precision: 0.921875
