In [1]:
import sys
print(sys.executable)

C:\Users\Tyrone\anaconda3\envs\nlp\python.exe


In [2]:
import torch
from torchtext.legacy import data

TEXT = data.Field()
LABEL = data.LabelField(dtype=torch.float)

In [3]:
from torchtext.legacy import datasets

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

In [4]:
print(f"number of training examples: {len(train_data)}")
print(f"number of testing examples: {len(test_data)}")
print(vars(train_data.examples[0]))

number of training examples: 25000
number of testing examples: 25000
{'text': ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"Teachers".', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', "High's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"Teachers".', 'The', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'I', 'immediately', 'recalled', '.........', 'at', '..........', 'High.', 'A', 'classic

In [5]:
train_data, valid_data = train_data.split()

print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 17500
Number of validation examples: 7500
Number of testing examples: 25000


In [6]:
MAX_VOCAB_SIZE = 25000

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [7]:
print(len(TEXT.vocab), len(LABEL.vocab))
print(TEXT.vocab.freqs.most_common(10))
print(TEXT.vocab.itos[:10])
print(LABEL.vocab.stoi)
# print(vars(train_data.examples[0])['text'])
# print(' '.join(TEXT.vocab.stoi[s for s in vars(train_data.examples[0])['text']))
# print(' '.join(TEXT.vocab.stoi[vars(train_data.examples[0])['text'][s] for s in range(len(vars(train_data.examples[0])['text']))]))
for s in range(len(vars(train_data.examples[0])['text'])):
    print(TEXT.vocab.stoi[vars(train_data.examples[0])['text'][s]], end=" ")

25002 2
[('the', 201643), ('a', 108915), ('and', 106802), ('of', 100477), ('to', 92977), ('is', 72551), ('in', 59699), ('I', 45921), ('that', 45342), ('this', 40123)]
['<unk>', '<pad>', 'the', 'a', 'and', 'of', 'to', 'is', 'in', 'I']
defaultdict(None, {'neg': 0, 'pos': 1})
158 26 6 6283 20 19 0 61 26 108 702 18 41 47 766 30 2 1274 234 2 21 269 892 12 198 3 110 48 1778 18 187 4438 9 58 36 644 0 4 9 98 267 52 667 158 26 3 2633 6692 5899 2172 18 9 97 450 44 628 4 506 2296 667 19 0 7 26 3 578 2166 159 18 57 28 36 2 865 104 28 213 359 115 

In [8]:
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)
print(torch.cuda.is_available())

True


In [9]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden.squeeze())

In [10]:
INPUT_DIM = len(TEXT.vocab)
EMBDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)

In [11]:
def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"The model has {count_params(model):,} trainable parameters")

The model has 2,592,105 trainable parameters


In [12]:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [13]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(preds)
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)

    return acc

In [14]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        preds = model(batch.text).squeeze()
        loss = criterion(preds, batch.label)
        acc = binary_accuracy(preds, batch.label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            preds = model(batch.text).squeeze()
            loss = criterion(preds, batch.label)
            acc = binary_accuracy(preds, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [16]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))

    return elapsed_mins, elapsed_secs

In [19]:
N_EPOCH = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCH):
    start_time = time.time()

    train_loss, train_acc = train(model, train_iter, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bestmodel1.pt')
    
    print(f'Epoch: {epoch}, Epoch time: {epoch_mins}m {epoch_secs}s')
    print(f'\ttrain loss: {train_loss:.3f}, train acc: {train_acc*100:.2f}%')
    print(f'\tvalid loss: {valid_loss:.3f}, valid acc: {valid_acc*100:.2f}%')

Epoch: 0, Epoch time: 0m 19s
	train loss: 0.693, train acc: 50.53%
	valid loss: 0.695, valid acc: 48.50%
Epoch: 1, Epoch time: 0m 19s
	train loss: 0.693, train acc: 50.55%
	valid loss: 0.695, valid acc: 48.49%
Epoch: 2, Epoch time: 0m 20s
	train loss: 0.693, train acc: 50.54%
	valid loss: 0.695, valid acc: 48.50%
Epoch: 3, Epoch time: 0m 19s
	train loss: 0.693, train acc: 50.52%
	valid loss: 0.695, valid acc: 48.50%
Epoch: 4, Epoch time: 0m 19s
	train loss: 0.693, train acc: 50.55%
	valid loss: 0.695, valid acc: 48.50%


In [20]:
model = RNN(INPUT_DIM, EMBDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)
model = model.to(device)

model.load_state_dict(torch.load('bestmodel1.pt'))

test_loss, test_acc = evaluate(model, test_iter, criterion)

print(f'\ttext loss: {test_loss:.3f}, test acc: {test_acc*100:.2f}%')

	text loss: 0.712, test acc: 49.89%
