In [None]:
import sys
print(sys.executable)

In [1]:
import torch
from torchtext.legacy import data

TEXT = data.Field()
LABEL = data.LabelField(dtype=torch.float)

In [2]:
from torchtext.legacy import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
print(f"number of training examples: {len(train_data)}")
print(f"number of testing examples: {len(test_data)}")
print(vars(train_data.examples[0]))

In [None]:
train_data, valid_data = train_data.split()

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

In [5]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [6]:
print(len(TEXT.vocab), len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(10))
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)
# print(vars(train_data.examples[0])['text'])
# print(' '.join(TEXT.vocab.stoi[s for s in vars(train_data.examples[0])['text']))
# print(' '.join(TEXT.vocab.stoi[vars(train_data.examples[0])['text'][s] for s in range(len(vars(train_data.examples[0])['text']))]))
for s in range(len(vars(train_data.examples[0])['text'])):
    print(TEXT.vocab.stoi[vars(train_data.examples[0])['text'][s]], end=" ")

25002 2
[('the', 200153), ('a', 108207), ('and', 106215), ('of', 99734), ('to', 92420), ('is', 71838), ('in', 59919), ('I', 46364), ('that', 44990), ('this', 39813)]
['<unk>', '<pad>', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'I']
defaultdict(None, {'pos': 0, 'neg': 1})
18 21 567 36 3 0 17 0 0 881 1246 9 103 184 77 681 16 2 4300 322 5 2 665 2483 6 0 1012 0 4 1752 37 10 9760 42 0 35 7309 11 0 2562 3 216 0 0 8 11 21 22 240 11083 4124 2638 11 21 184 93 2520 9 209 3860 12 51 18369 184 72 4933 3 1673 580 19 104 21 0 162 1539 0 4058 2163 6 149 35 40 26 117 32 8 3 4167 

In [7]:
BATCH_SIZE = 20
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)
print(torch.cuda.is_available())

True


In [8]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths=[embedded.size(0) for i in range(embedded.size(1))])
        
        _, (hidden, _) = self.rnn(packed_embedded)
        
        hidden = torch.cat((hidden[0], hidden[1]), dim=1)
        
        return self.fc(hidden.squeeze())

In [9]:
INPUT_DIM = len(TEXT.vocab)
EMBDDING_DIM = 300
HIDDEN_DIM = 300
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)

In [10]:
def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_params(model):,} trainable parameters")

The model has 8,946,001 trainable parameters


In [11]:
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [12]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float()
    acc = 1 - correct.sum() / len(correct)

    return acc

In [13]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        preds = model(batch.text).squeeze()
        loss = criterion(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [14]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            preds = model(batch.text).squeeze()
            loss = criterion(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

    return elapsed_mins, elapsed_secs

In [16]:
N_EPOCH = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCH):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bestmodel1.pt')
    
    print(f'Epoch: {epoch+1:02}, Epoch time: {epoch_mins}m {epoch_secs}s')
    print(f'\ttrain loss: {train_loss:.3f}, train acc: {train_acc*100:.2f}%')
    print(f'\tvalid loss: {valid_loss:.3f}, valid acc: {valid_acc*100:.2f}%')

Epoch: 01, Epoch time: 1m 59s
	train loss: 0.664, train acc: 43.37%
	valid loss: 0.623, valid acc: 40.32%
Epoch: 02, Epoch time: 2m 0s
	train loss: 0.620, train acc: 40.92%
	valid loss: 0.516, valid acc: 34.53%
Epoch: 03, Epoch time: 1m 58s
	train loss: 0.384, train acc: 16.48%
	valid loss: 0.384, valid acc: 10.77%
Epoch: 04, Epoch time: 1m 59s
	train loss: 0.242, train acc: 7.82%
	valid loss: 0.385, valid acc: 8.60%
Epoch: 05, Epoch time: 1m 59s
	train loss: 0.153, train acc: 4.62%
	valid loss: 0.412, valid acc: 5.01%
Epoch: 06, Epoch time: 1m 57s
	train loss: 0.081, train acc: 2.24%
	valid loss: 0.498, valid acc: 4.23%
Epoch: 07, Epoch time: 1m 58s
	train loss: 0.048, train acc: 1.23%
	valid loss: 0.551, valid acc: 3.36%
Epoch: 08, Epoch time: 1m 57s
	train loss: 0.032, train acc: 0.73%
	valid loss: 0.610, valid acc: 2.52%


In [None]:
model = RNN(INPUT_DIM, EMBDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)

model.load_state_dict(torch.load('bestmodel1.pt'))

test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'\ttext loss: {test_loss:.3f}, test acc: {test_acc*100:.2f}%')