In [1]:
import torch
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import time
import random

# Restrict cuDNN to deterministic algorithms so that repeated GPU runs of this
# notebook are comparable (may cost some speed).
torch.backends.cudnn.deterministic = True

## General Settings

In [2]:
# --- Reproducibility ---------------------------------------------------------
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

# --- Hardware ----------------------------------------------------------------
# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(DEVICE)

# --- Data --------------------------------------------------------------------
VOCABULARY_SIZE = 20_000
BATCH_SIZE = 64

# --- Optimisation ------------------------------------------------------------
LEARNING_RATE = 0.0001
NUM_EPOCHS = 25

# --- Model -------------------------------------------------------------------
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
OUTPUT_DIM = 1

# --- Checkpoint --------------------------------------------------------------
# File the best model is written to during training and read back from later.
PATH = 'best_model.pth'

cuda


## Dataset

In [3]:
# Reviews are tokenised with spaCy. include_lengths=True makes `batch.text`
# a pair whose first element is the token-id matrix (see the training loop,
# which indexes `batch.text[0]`).
TEXT = data.Field(tokenize='spacy',
                  include_lengths=True)
LABEL = data.LabelField(dtype=torch.float)
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# `random.seed()` returns None, so it must not be passed as `random_state`
# directly. Seed the generator first and hand over its actual state so the
# 80/20 train/validation split is explicitly reproducible.
random.seed(RANDOM_SEED)
train_data, valid_data = train_data.split(random_state=random.getstate(),
                                          split_ratio=0.8)

print(f'Num Train: {len(train_data)}')
print(f'Num Valid: {len(valid_data)}')
print(f'Num Test: {len(test_data)}')

Num Train: 20000
Num Valid: 5000
Num Test: 25000


In [4]:
# Build the vocabularies from the training split only, so no information from
# the validation/test reviews leaks into the token set.
#
# No pretrained vectors are attached: the model in this notebook creates its
# own randomly initialised nn.Embedding of size EMBEDDING_DIM and never copies
# vectors out of TEXT.vocab, so loading GloVe here would be a large download
# that is never used (and whose 100-d size would not match EMBEDDING_DIM).
TEXT.build_vocab(train_data,
                 max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data)

print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

Vocabulary size: 20002
Number of classes: 2


In [5]:
# One BucketIterator per split, all sharing the same batch size; tensors are
# created on DEVICE and examples are sorted inside each batch.
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    datasets=(train_data, valid_data, test_data),
    device=DEVICE,
    sort_within_batch=True,
    batch_size=BATCH_SIZE,
)

In [6]:
# Sanity check: print the tensor shapes of the first batch of every split.
for header, loader in (('Train', train_loader),
                       ('\nValid:', valid_loader),
                       ('\nTest:', test_loader)):
    print(header)
    for batch in loader:
        # batch.text is a (token_ids, lengths) pair; only the ids are shown.
        print(f'Text matrix size: {batch.text[0].size()}')
        print(f'Target vector size: {batch.label.size()}')
        break

Train
Text matrix size: torch.Size([132, 64])
Target vector size: torch.Size([64])

Valid:
Text matrix size: torch.Size([53, 64])
Target vector size: torch.Size([64])

Test:
Text matrix size: torch.Size([36, 64])
Target vector size: torch.Size([64])


## Model

In [7]:
import torch.nn as nn

class RNN(nn.Module):
    """Two-layer GRU classifier: embedding -> GRU -> linear layer.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of output values (logits) per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.encoder = nn.GRU(input_size=embedding_dim,
                              hidden_size=hidden_dim,
                              num_layers=2)
        # NOTE: the attribute name is misspelled ("pridictor") but is kept on
        # purpose: it is part of the state_dict keys, so renaming it would
        # make previously saved checkpoints fail to load.
        self.pridictor = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_lengths=None):
        """Return one row of logits per example.

        Args:
            text: Integer token ids laid out as (seq_len, batch), the layout
                the GRU expects because it is built without batch_first.
            text_lengths: Optional true (unpadded) length of every sequence
                in the batch. When given, the embedded batch is packed so the
                GRU stops at each sequence's last real token and the returned
                hidden state is not influenced by padding. When omitted, the
                whole padded sequence is processed (the original behaviour).

        Returns:
            Tensor of shape (batch, output_dim) holding raw logits.
        """
        embedded = self.embedding(text)
        if text_lengths is not None:
            # pack_padded_sequence needs the lengths on the CPU;
            # enforce_sorted=False removes the need for a length-sorted batch
            # while still returning hidden states in the original batch order.
            lengths = torch.as_tensor(text_lengths, dtype=torch.int64).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, enforce_sorted=False)
        _, hidden = self.encoder(embedded)
        # hidden[-1] is the final hidden state of the top GRU layer.
        return self.pridictor(hidden[-1])

In [8]:
# Re-seed right before construction so the initial weights do not depend on
# how much randomness earlier cells consumed.
torch.manual_seed(RANDOM_SEED)

# The embedding table needs one row per vocabulary entry.
INPUT_DIM = len(TEXT.vocab)

model = RNN(input_dim=INPUT_DIM,
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=OUTPUT_DIM).to(DEVICE)

# The model emits raw logits, hence the loss that applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(model)

RNN(
  (embedding): Embedding(20002, 128)
  (encoder): GRU(128, 256, num_layers=2)
  (pridictor): Linear(in_features=256, out_features=1, bias=True)
)


## Training

In [9]:
def train_val(model,optimizer,criterion,train_loader, valid_loader, epochs, path):
    """Train `model`, validate after every epoch and checkpoint the best epoch.

    Each batch yielded by the loaders must expose `.text` as a pair whose
    first element is the token-id tensor, and `.label` as the target tensor.

    Args:
        model: Module mapping a token-id tensor to logits of shape (batch, 1).
        optimizer: Optimizer already bound to `model`'s parameters.
        criterion: Loss taking (logits, labels), e.g. BCEWithLogitsLoss.
        train_loader: Iterable of training batches; must support len().
        valid_loader: Iterable of validation batches; must support len().
        epochs: Number of passes over `train_loader`.
        path: File the model's state_dict is saved to whenever the mean
            validation loss reaches a new minimum.

    Returns:
        None. Progress is reported with print().
    """
    # Follow the model's own device instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    since = time.time()
    min_val_loss = float('inf')
    for epoch in range(epochs):
        # ---- training ----
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            inputs = batch.text[0].to(device)
            labels = batch.label.to(device)
            # squeeze(-1) drops only the logit dimension; a bare squeeze()
            # would also drop the batch dimension of a size-1 batch and make
            # the loss fail on a shape mismatch.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # Mean of the per-batch losses.
        train_loss /= len(train_loader)
        print(f"epoch: {epoch+1}: train loss:{train_loss}")

        # ---- validation ----
        model.eval()
        val_loss = 0.0
        num_correct = 0
        num_seen = 0
        with torch.no_grad():
            for val_batch in valid_loader:
                val_inputs = val_batch.text[0].to(device)
                val_labels = val_batch.label.to(device)
                val_outputs = model(val_inputs).squeeze(-1)
                val_preds = torch.sigmoid(val_outputs) > 0.5
                # Plain Python counters keep the running totals off the GPU.
                num_correct += (val_preds == val_labels.bool()).sum().item()
                num_seen += val_labels.size(0)
                val_loss += criterion(val_outputs, val_labels).item()
        val_loss /= len(valid_loader)
        # Normalise by the examples actually evaluated rather than by the
        # length of a global dataset variable.
        val_acc = num_correct / num_seen
        print(f"epoch: {epoch+1}: val loss:{val_loss} val acc:{val_acc}")

        if val_loss<min_val_loss:
            min_val_loss = val_loss
            torch.save(model.state_dict(),path)
            print('model saved!')

    time_elapsed = time.time() - since
    print('\nTraining complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

In [10]:
# Run the full training schedule; the best epoch is checkpointed to PATH.
train_val(model=model,
          optimizer=optimizer,
          criterion=criterion,
          train_loader=train_loader,
          valid_loader=valid_loader,
          epochs=NUM_EPOCHS,
          path=PATH)

epoch: 1: train loss:0.6835421677976371
epoch: 1: val loss:0.6477129217944567 val acc:0.626800000667572
model saved!
epoch: 2: train loss:0.5965642642479735
epoch: 2: val loss:0.5897364616394043 val acc:0.7053999900817871
model saved!
epoch: 3: train loss:0.5738197668863181
epoch: 3: val loss:0.6299606252320206 val acc:0.6444000005722046
epoch: 4: train loss:0.5632631084599053
epoch: 4: val loss:0.513651861042916 val acc:0.7563999891281128
model saved!
epoch: 5: train loss:0.475137968413746
epoch: 5: val loss:0.4993498132953161 val acc:0.7689999938011169
model saved!
epoch: 6: train loss:0.4298770973286309
epoch: 6: val loss:0.43247136926349206 val acc:0.8050000071525574
model saved!
epoch: 7: train loss:0.38874159958035037
epoch: 7: val loss:0.45439645006686824 val acc:0.7863999605178833
epoch: 8: train loss:0.35756104275250966
epoch: 8: val loss:0.38473035074487516 val acc:0.8267999887466431
model saved!
epoch: 9: train loss:0.32218029333379705
epoch: 9: val loss:0.35757766758339316 

## Testing


In [11]:
def test(model, test_loader):
    acc = 0.0
    test_loss = 0.0
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(test_loader):
            inputs, labels=batch.text, batch.label
            inputs = inputs[0].to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(inputs).squeeze()
            loss = criterion(outputs,labels)
            test_loss += loss.data.item()
            outputs = torch.sigmoid(outputs)>0.5
            acc+=torch.sum(outputs==labels)
    acc /= len(test_data)
    test_loss /= len(test_loader)
    print(f"test loss: {test_loss}: test acc:{acc}")

In [12]:
# Rebuild the architecture and restore the checkpoint written by training
# (the epoch with the lowest validation loss) before scoring the test split.
model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(DEVICE)
# map_location remaps the stored tensors onto the current DEVICE, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load(PATH, map_location=DEVICE))
test(model, test_loader)

test loss: 0.3273980164962351: test acc:0.8684399724006653
