In [36]:
import torch
import torch.nn as nn
from torchtext import data

SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

data_root = '/home/pragna/Downloads/A1-Q3_Dataset-20190401T174811Z-001/A1-Q3_Dataset'

In [37]:
fields = {'Phrase': ('p', TEXT), 'Sentiment': ('l', LABEL)}

train_data = data.TabularDataset.splits(
                            path = data_root,
                            train = 'mrdata_2.tsv',
                            test = None,
                            format = 'tsv',
                            fields = fields
)

In [38]:
print(f'Number of training examples: {len(train_data[0])}')

Number of training examples: 156060


In [39]:
print(vars(train_data[0][0]))

{'p': ['A', 'series', 'of', 'escapades', 'demonstrating', 'the', 'adage', 'that', 'what', 'is', 'good', 'for', 'the', 'goose', 'is', 'also', 'good', 'for', 'the', 'gander', ',', 'some', 'of', 'which', 'occasionally', 'amuses', 'but', 'none', 'of', 'which', 'amounts', 'to', 'much', 'of', 'a', 'story', '.'], 'l': '1'}


In [40]:
import random

train_data, valid_data = train_data[0].split(random_state=random.seed(SEED))

In [41]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 109242
Number of validation examples: 46818


In [42]:
TEXT.build_vocab(train_data, max_size=10000)
LABEL.build_vocab(train_data)

In [43]:
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 10002
Unique tokens in LABEL vocabulary: 5


In [44]:
print(TEXT.vocab.freqs.most_common(20))

[('the', 33060), (',', 29482), ('a', 23677), ('of', 22806), ('and', 22461), ('-', 16178), ('to', 16029), ('.', 12493), ("'s", 11901), ('in', 9611), ('is', 9475), ('that', 8614), ('it', 7394), ('as', 5894), ('with', 5323), ('for', 5169), ('its', 4905), ('film', 4660), ('movie', 4271), ('an', 4145)]


In [45]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', 'the', ',', 'a', 'of', 'and', '-', 'to', '.']


In [46]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7efc18c338c8>, {'2': 0, '3': 1, '1': 2, '4': 3, '0': 4})


In [47]:
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=BATCH_SIZE,
    device=device)

In [48]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        embedded = self.embedding(x)
        _, (h_n, c_n) = self.rnn(embedded)
        return self.fc(self.dropout(h_n.squeeze(0)))

In [49]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 50
HIDDEN_DIM = 256
OUTPUT_DIM = 5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

In [50]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

In [51]:
criterion = nn.CrossEntropyLoss()

In [52]:
model = model.to(device)
criterion = criterion.to(device)

In [53]:
def accuracy(preds, y):
    correct = (torch.argmax(preds, dim=-1) == y).float() #convert into float for division 
    acc = correct.sum()/len(correct)
    return acc

In [54]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.p).squeeze(1)
        
        loss = criterion(predictions, batch.l.long())
        
        acc = accuracy(predictions, batch.l.long())
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [55]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.p).squeeze(1)
            
            loss = criterion(predictions, batch.l.long())
            
            acc = accuracy(predictions, batch.l.long())

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [56]:
N_EPOCHS = 9

for epoch in range(N_EPOCHS):

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

| Epoch: 01 | Train Loss: 1.176 | Train Acc: 53.34%
| Epoch: 02 | Train Loss: 0.982 | Train Acc: 60.85%
| Epoch: 03 | Train Loss: 0.863 | Train Acc: 65.42%
| Epoch: 04 | Train Loss: 0.789 | Train Acc: 68.19%
| Epoch: 05 | Train Loss: 0.730 | Train Acc: 70.56%
| Epoch: 06 | Train Loss: 0.680 | Train Acc: 72.58%
| Epoch: 07 | Train Loss: 0.632 | Train Acc: 74.60%
| Epoch: 08 | Train Loss: 0.587 | Train Acc: 76.61%
| Epoch: 09 | Train Loss: 0.542 | Train Acc: 78.47%
