In [1]:
import torch
from torchtext.legacy import data

TEXT = data.Field(include_lengths=True)
LABEL = data.LabelField(dtype=torch.float)

In [2]:
from torchtext.legacy import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [None]:
print(f"number of training examples: {len(train_data)}")
print(f"number of testing examples: {len(test_data)}")
print(vars(train_data.examples[0]))

number of training examples: 25000
number of testing examples: 25000
{'text': ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"Teachers".', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', "High's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"Teachers".', 'The', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High.', 'A', 'classic

In [None]:
train_data, valid_data = train_data.split()

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [None]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [None]:
print(len(TEXT.vocab), len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(10))
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)
# print(vars(train_data.examples[0])['text'])
# print(' '.join(TEXT.vocab.stoi[s for s in vars(train_data.examples[0])['text']))
# print(' '.join(TEXT.vocab.stoi[vars(train_data.examples[0])['text'][s] for s in range(len(vars(train_data.examples[0])['text']))]))
for s in range(len(vars(train_data.examples[0])['text'])):
    print(TEXT.vocab.stoi[vars(train_data.examples[0])['text'][s]], end=" ")

25002 2
[('the', 199117), ('a', 108295), ('and', 106086), ('of', 99518), ('to', 92573), ('is', 72167), ('in', 59619), ('I', 46217), ('that', 45006), ('this', 39737)]
['<unk>', '<pad>', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'I']
defaultdict(None, {'neg': 0, 'pos': 1})
3440 3056 7 608 4 20443 1419 8 11 4157 0 34 772 3296 0 0 10631 0 5420 20 3 0 8305 9386 283 36 3 2474 195 6692 4 0 16 3056 1112 1937 15 2 514 850 32 0 11094 2311 300 17 3777 18 653 214 5 0 634 10585 0 2134 11735 4 3 1396 0 492 0 7 0 221 1066 81 5 2 21 101 2043 5 17070 4 2 329 5 12 633 17 55 2369 17069 272 226 7 33 5 2 130 260 129 10395 563 210 68 999 6 842 29 5822 9191 522 4 1501 16 257 4 0 56 7 210 3 0 1439 282 2 247 1090 2476 74 19845 29 4112 0 118 42 8 3 4983 16687 0 232 737 156 22 9459 2001 1212 1003 7 21080 15 3 0 11094 16698 2001 0 118 3 95 1448 5 1394 4539 34 0 15 3 0 9166 10447 4 2390 10580 0 3 10764 359 742 0 23 0 15 0 0 49 7 32 0 4 367 1440 2303 4 2208 17 296 35 429 3 366 2064 6 2 4323 5 9879 0 0 20824 1098 2

In [None]:
BATCH_SIZE = 20
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)
print(torch.cuda.is_available())

True


In [None]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths=text_lengths.to(torch.device('cpu')), enforce_sorted=False)
        
        _, hidden= self.rnn(packed_embedded)
        
        # hidden = torch.cat((hidden[0], hidden[1]), dim=1)
        
        return self.fc(hidden.squeeze())

In [None]:
INPUT_DIM = len(TEXT.vocab)
EMBDDING_DIM = 300
HIDDEN_DIM = 300
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)

In [None]:
def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_params(model):,} trainable parameters")

The model has 7,681,501 trainable parameters


In [None]:
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float()
    acc = 1 - correct.sum() / len(correct)

    return acc

In [None]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        text, text_lengths = batch.text
        preds = model(text, text_lengths).squeeze()
        loss = criterion(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            preds = model(text, text_lengths).squeeze()
            loss = criterion(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCH = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCH):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bestmodel2.pt')
    
    print(f'Epoch: {epoch+1:02}, Epoch time: {epoch_mins}m {epoch_secs}s')
    print(f'\ttrain loss: {train_loss:.3f}, train acc: {train_acc*100:.2f}%')
    print(f'\tvalid loss: {valid_loss:.3f}, valid acc: {valid_acc*100:.2f}%')

ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
model = RNN(INPUT_DIM, EMBDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)

model.load_state_dict(torch.load('bestmodel2.pt'))

test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'\ttext loss: {test_loss:.3f}, test acc: {test_acc*100:.2f}%')

	text loss: 0.509, test acc: 90.86%
