In [50]:
import torch
import torch.nn as nn
import pandas as pd
from torchtext.legacy import data
import random
import torch.nn as nn
import torch.optim as optim

In [51]:
# check device type: gpu or cpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if __name__=='__main__':
    print('Using device:', device)

Using device: cuda


In [52]:
# mount to google drive folder
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
SEED = 1994

torch.manual_seed(SEED)

TEXT = data.Field(include_lengths = True)

LABEL = data.LabelField(dtype = torch.float)

fields = [('label', LABEL), ('text', TEXT)]

train_data, test_data = data.TabularDataset.splits(
                            path = '/',
                            train = '/content/drive/My Drive/CS410Project/data/full_train.csv',
                            test = '/content/drive/My Drive/CS410Project/data/full_test.csv',
                            format = 'csv',
                            fields = fields)





In [66]:
VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, 
                 max_size = VOCAB_SIZE, 
                 vectors = "glove.6B.100d", 
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)


In [67]:
train_data, valid_data = train_data.split(random_state = random.seed(SEED))

In [68]:
BATCH_SIZE = 500

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    sort_key = lambda x: x.text,
    device = device)



In [69]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, 
                 bidirectional, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.RNN(embedding_dim, 
                           hidden_dim, 
                           num_layers=num_layers, 
                           bidirectional=bidirectional)
        
        self.linear = nn.Linear(2 * hidden_dim, output_dim)

        
    def forward(self, text, text_lengths):
        
        text_embed = self.embedding(text)
        packed_embed = nn.utils.rnn.pack_padded_sequence(text_embed, text_lengths.to('cpu'),enforce_sorted=False)
        
        packed_output, hidden = self.rnn(packed_embed)
        
        final = torch.cat((hidden[-1,:,:],hidden[-2,:,:]), dim = 1)
                
            
        return self.linear(final)

In [70]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
LAYERS = 2
BIDIRECTIONAL=True

PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            LAYERS, 
            BIDIRECTIONAL, 
            PAD_IDX)
model = model.to(device)


In [71]:

model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)



In [72]:
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

optimizer = optim.Adam(model.parameters())



In [73]:
def binary_accuracy(preds, y_correct):
    correct = (torch.round(torch.sigmoid(preds)) == y_correct).float()
    raw_acc = correct.sum() / len(correct)
    return raw_acc

In [74]:
def train(model, iterator, optimizer, criterion):
    
    loss = 0
    acc = 0
    
    model.train()
    
    for batch in iterator:
        label = batch.label
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        pred = model(text, text_lengths)
        pred = pred.squeeze(1)
        
        loss = criterion(pred, label)
        
        acc = binary_accuracy(pred, label)
        
        loss.backward()
        optimizer.step()
        
        loss += loss.item()
        acc += acc.item()
        
    return loss / len(iterator), acc / len(iterator)

In [75]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            pred = model(text, text_lengths)
            pred = pred.squeeze(1)
            
            loss = criterion(pred, batch.label)
            epoch_loss += loss.item()

            acc = binary_accuracy(pred, batch.label)
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [76]:
EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    if best_valid_loss > valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'rnnmodel.pt')
    
    print(f'Epoch: {epoch}')
    print(f'\t Val. Loss: {valid_loss:.23f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 0
	 Val. Loss: 0.03213745279406959171054 |  Val. Acc: 99.15%
Epoch: 1
	 Val. Loss: 0.01909013928591527659018 |  Val. Acc: 99.49%
Epoch: 2
	 Val. Loss: 0.01334136268600228866776 |  Val. Acc: 99.65%
Epoch: 3
	 Val. Loss: 0.01171928939981047371188 |  Val. Acc: 99.74%
Epoch: 4
	 Val. Loss: 0.01269897632300853729248 |  Val. Acc: 99.68%


In [77]:
model.load_state_dict(torch.load('rnnmodel.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Val. Acc: {test_acc*100:.2f}%')

Val. Acc: 99.64%
