In [2]:
import torch
from torchtext.legacy import data

TEXT = data.Field()
LABEL = data.LabelField(dtype=torch.float)

In [3]:
from torchtext.legacy import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [4]:
print(f"number of training examples: {len(train_data)}")
print(f"number of testing examples: {len(test_data)}")
print(vars(train_data.examples[0]))

number of training examples: 25000
number of testing examples: 25000
{'text': ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"Teachers".', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', "High's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"Teachers".', 'The', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High.', 'A', 'classic

In [5]:
train_data, valid_data = train_data.split()

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [6]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [7]:
print(len(TEXT.vocab), len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(10))
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)
# print(vars(train_data.examples[0])['text'])
# print(' '.join(TEXT.vocab.stoi[s for s in vars(train_data.examples[0])['text']))
# print(' '.join(TEXT.vocab.stoi[vars(train_data.examples[0])['text'][s] for s in range(len(vars(train_data.examples[0])['text']))]))
for s in range(len(vars(train_data.examples[0])['text'])):
    print(TEXT.vocab.stoi[vars(train_data.examples[0])['text'][s]], end=" ")

25002 2
[('the', 201066), ('a', 108665), ('and', 106650), ('of', 100438), ('to', 93156), ('is', 71982), ('in', 59930), ('I', 46018), ('that', 45031), ('this', 40068)]
['<unk>', '<pad>', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'I']
defaultdict(None, {'neg': 0, 'pos': 1})
8197 22396 0 1421 102 0 1412 4 16614 57 7 26 55 33 5 2 129 7138 135 16 0 12 7 33 5 2 2128 417 1608 11838 10191 1448 0 2068 1097 2 1194 8 18099 17 2 0 19 52 7617 3105 3256 3 0 0 12 14 20388 17 2 11838 299 975 5 0 4 4834 20566 334 14 743 15 0 358 0 57 575 14 108 53 17 2 7138 6 1430 989 4 12 14 492 932 31 2 7138 30 2 179 5 0 13 0 1095 0 2170 12897 9794 0 0 4 8690 45 5 1581 10 29 77 27 14261 281 38 4457 30 23 2248 0 41 23160 4 23 10899 4 14715 17608 332 6 75 3838 5 2 0 1459 38 158 0 4 661 2 10899 4 17608 6 0 107 13 0 7 576 239 8 11 142 411 3966 110 101 331 3 0 31 8004 0 22 0 29 368 23 5492 503 0 4 1864 0 7 16769 245 26 6 0 1721 239 17875 2 9491 5 0 16753 290 144 2 429 0 4 2 0 567 16 37 210 1234 20064 13 0 2 0 2223 31 727

In [8]:
BATCH_SIZE = 20
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)
print(torch.cuda.is_available())

True


In [9]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)
        
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths=[embedded.size(0) for i in range(embedded.size(1))])
        
        _, (hidden, _) = self.rnn(packed_embedded)
        
        hidden = torch.cat((hidden[0], hidden[1]), dim=1)
        
        return self.fc(hidden.squeeze())

In [10]:
INPUT_DIM = len(TEXT.vocab)
EMBDDING_DIM = 300
HIDDEN_DIM = 300
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)

In [11]:
def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_params(model):,} trainable parameters")

The model has 8,946,001 trainable parameters


In [12]:
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [13]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float()
    acc = 1 - correct.sum() / len(correct)

    return acc

In [14]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        preds = model(batch.text).squeeze()
        loss = criterion(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            preds = model(batch.text).squeeze()
            loss = criterion(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [16]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

    return elapsed_mins, elapsed_secs

In [18]:
N_EPOCH = 10

best_valid_loss = float('inf')

for epoch in range(N_EPOCH):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bestmodel2.pt')
    
    print(f'Epoch: {epoch+1:02}, Epoch time: {epoch_mins}m {epoch_secs}s')
    print(f'\ttrain loss: {train_loss:.3f}, train acc: {train_acc*100:.2f}%')
    print(f'\tvalid loss: {valid_loss:.3f}, valid acc: {valid_acc*100:.2f}%')

Epoch: 01, Epoch time: 2m 1s
	train loss: 0.091, train acc: 97.18%
	valid loss: 0.511, valid acc: 95.05%
Epoch: 02, Epoch time: 2m 1s
	train loss: 0.044, train acc: 98.59%
	valid loss: 0.625, valid acc: 96.85%
Epoch: 03, Epoch time: 2m 0s
	train loss: 0.021, train acc: 99.42%
	valid loss: 0.686, valid acc: 96.15%
Epoch: 04, Epoch time: 1m 59s
	train loss: 0.017, train acc: 99.57%
	valid loss: 0.869, valid acc: 97.48%
Epoch: 05, Epoch time: 1m 59s
	train loss: 0.012, train acc: 99.63%
	valid loss: 0.821, valid acc: 98.03%
Epoch: 06, Epoch time: 1m 59s
	train loss: 0.011, train acc: 99.74%
	valid loss: 0.862, valid acc: 98.17%
Epoch: 07, Epoch time: 2m 0s
	train loss: 0.006, train acc: 99.86%
	valid loss: 0.862, valid acc: 97.08%
Epoch: 08, Epoch time: 2m 0s
	train loss: 0.007, train acc: 99.70%
	valid loss: 1.001, valid acc: 98.40%
Epoch: 09, Epoch time: 1m 58s
	train loss: 0.013, train acc: 99.67%
	valid loss: 0.862, valid acc: 97.56%
Epoch: 10, Epoch time: 1m 59s
	train loss: 0.007, t

In [19]:
model = RNN(INPUT_DIM, EMBDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)

model.load_state_dict(torch.load('bestmodel2.pt'))

test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'\ttext loss: {test_loss:.3f}, test acc: {test_acc*100:.2f}%')

	text loss: 0.525, test acc: 94.82%
