In [1]:
import torch
from torchtext import data
from torchtext import datasets
import torch.optim as optim
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

In [2]:
%load_ext tensorboard

In [3]:
# include_lengths=True makes each batch's `text` a (tokens, lengths) pair;
# the lengths are later fed to pack_padded_sequence inside the model's forward().
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
# Labels are stored as floats because the loss used below is nn.BCEWithLogitsLoss.
LABEL = data.LabelField(dtype = torch.float)

In [4]:
# Load the IMDB train/test splits using the fields defined above
# (the archive is downloaded on first use, as the output below shows).
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:02<00:00, 33.5MB/s]


Number of training examples: 25000
Number of testing examples: 25000


In [5]:
print(vars(train_data.examples[0]))

{'text': ['Clint', 'Eastwood', 'would', 'star', 'again', 'as', 'the', 'battle', '-', 'weary', 'Detective', 'Harry', 'Callahan', ',', 'but', 'would', 'also', 'direct', 'the', 'fourth', 'entry', 'in', 'the', "'", 'Dirty', 'Harry', "'", 'series', '.', "'", 'Sudden', 'Impact', "'", 'again', 'like', 'the', 'other', 'additions', ',', 'brings', 'its', 'own', 'distinguishable', 'style', 'and', 'tone', ',', 'but', 'if', 'anything', 'it', "'s", 'probably', 'the', 'most', 'similar', 'to', 'the', 'original', 'in', 'it', "'s", 'darker', 'and', 'seedy', 'moments', '(', 'and', 'bestowing', 'a', 'classic', 'line', '"', 'Go', 'ahead', '.', 'Make', 'my', 'day', '"', ')', '\x85 ', 'but', 'some', 'of', 'its', 'humor', 'has', 'to', 'been', 'seen', 'to', 'believe', '.', 'A', 'bulldog', '\x85 ', 'named', 'meathead', 'that', 'pisses', 'and', 'farts', '.', 'Oh', 'yeah', '.', 'However', 'an', 'interesting', 'fact', 'this', 'entry', 'was', 'only', 'one', 'in', 'series', 'to', 'not', 'have', 'it', 'set', 'entirel

In [6]:
# Hold out part of the training set for validation.
# The split is seeded so the same reviews land in train/validation on every
# kernel restart; unseeded, each run silently evaluates on a different subset.
# NOTE: this cell rebinds `train_data`, so run it exactly once per kernel --
# re-running it would split the already-reduced training set again.
import random

SEED = 1234
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

In [7]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 17500
Number of validation examples: 7500


In [11]:
# Cap the vocabulary at the 25,000 most frequent tokens of the training split.
# The special <unk> and <pad> tokens come on top of that cap (see itos below).
vocab_size = 25000

# Vocabularies are built from the training split only.
TEXT.build_vocab(train_data, max_size = vocab_size)
LABEL.build_vocab(train_data)

In [12]:
TEXT.vocab.freqs.most_common(20)

[('the', 204019),
 (',', 193654),
 ('.', 166331),
 ('and', 110329),
 ('a', 110150),
 ('of', 101409),
 ('to', 94242),
 ('is', 76799),
 ('in', 61400),
 ('I', 54426),
 ('it', 54053),
 ('that', 49499),
 ('"', 44109),
 ("'s", 43703),
 ('this', 42479),
 ('-', 37154),
 ('/><br', 36011),
 ('was', 35256),
 ('as', 30408),
 ('with', 30268)]

In [13]:
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']

In [14]:
batch_size = 32
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One iterator per split; batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    device = device
)

In [15]:
# Model / optimisation hyperparameters shared by the experiments below.
input_size = len(TEXT.vocab)      # number of rows in the embedding table
embedding_size = 100              # dimensionality of each token embedding
hidden_size = 256                 # LSTM hidden size (per direction)
output_size = 1                   # a single logit, trained with BCEWithLogitsLoss
learning_rate = 0.001
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]  # index of the padding token in the vocabulary

In [50]:
class RNN(nn.Module):
    """
    LSTM text classifier: embedding -> stacked (optionally bidirectional) LSTM -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of logits produced per example.
        pad_idx: vocabulary index of the padding token (passed to nn.Embedding as padding_idx).
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability used on the embeddings, between LSTM layers
            and on the final hidden state.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx, n_layers=2, 
                 bidirectional=True, dropout=0.5):
        
        super().__init__()
        
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # nn.LSTM only applies dropout between layers; with a single layer it has
        # no effect and PyTorch emits a warning, so it is disabled in that case.
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0.0)
        
        # The head consumes one final hidden state per direction.
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        """
        Args:
            text: token indices, [sent len, batch size].
            text_lengths: unpadded length of every sequence in the batch
                (tensor on any device, or a list of ints).

        Returns:
            Logits of shape [batch size, output_dim].
        """
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence; pack_padded_sequence requires the lengths to live on the CPU,
        #while the iterator yields them on the same device as the tokens
        lengths = torch.as_tensor(text_lengths).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)
        
        #only the final hidden state is used, so the packed output is not unpacked
        _, (hidden, _) = self.rnn(packed_embedded)
        
        #hidden = [num layers * num directions, batch size, hid dim]
        
        if self.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #unidirectional: the last entry is the top layer's final hidden state
            final_hidden = hidden[-1,:,:]
        
        #final_hidden = [batch size, hid dim * num directions]
            
        return self.fc(self.dropout(final_hidden))

# Baseline model: embeddings are randomly initialised and learned from scratch.
model = RNN(input_size, embedding_size, hidden_size, output_size, pad_idx)

In [17]:
# Optimiser, TensorBoard writer (baseline run -> ./logs/rnn) and loss.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
logs_writer = SummaryWriter(log_dir='./logs/rnn')
# BCEWithLogitsLoss takes raw logits, so the model has no final sigmoid.
criterion = nn.BCEWithLogitsLoss()

In [18]:
# Move the model and the loss onto the selected device.
# NOTE(review): the optimizer was constructed before this move; nn.Module.to() is
# expected to move parameters in place so the optimizer should still track them --
# confirm, or construct the optimizer after this cell.
model = model.to(device)
criterion = criterion.to(device)

In [23]:
from tqdm import tqdm

def binary_accuracy(preds, y):
    """
    Fraction of correct binary predictions in a batch (0.8 for 8 out of 10), not a count.

    `preds` are raw logits: they are squashed with a sigmoid and rounded to 0/1
    before being compared element-wise with the targets `y`.
    """
    hard_labels = torch.sigmoid(preds).round()
    hits = hard_labels.eq(y).float()  # float so the sum can be divided
    return hits.sum() / len(hits)


def _train(model, iterator, optimizer, criterion, logs_writer, epoch):
    """
    Run one training epoch.

    Every batch's loss is written to `logs_writer` under 'Iteration Loss/train',
    using a global step of `epoch * len(iterator) + batch index` so consecutive
    epochs form one continuous curve.

    Args:
        model: module called as model(text, text_lengths).
        iterator: batch iterator; each batch exposes `.text` (tokens, lengths) and `.label`.
        optimizer: optimiser stepping the model's parameters.
        criterion: loss taking (predictions, labels).
        logs_writer: object with an `add_scalar(tag, value, step)` method.
        epoch: zero-based epoch index, used only for the logging step.

    Returns:
        (mean per-batch loss, mean per-batch accuracy) over the epoch.
    """
    epoch_loss = 0
    epoch_acc = 0
    num_batches = len(iterator)
    
    model.train()
    
    for i, batch in enumerate(iterator):
        optimizer.zero_grad()
                
        text, text_lengths = batch.text
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        # Pull the scalar out once and log the plain float rather than the
        # graph-attached tensor.
        batch_loss = loss.item()
        epoch_loss += batch_loss
        epoch_acc += acc.item()
        logs_writer.add_scalar('Iteration Loss/train', batch_loss, epoch * num_batches + i)
        
    return epoch_loss / num_batches, epoch_acc / num_batches


def evaluate(model, iterator, criterion):
    """
    Score `model` on every batch of `iterator` without updating it.

    The model is put in eval mode and gradients are disabled for the whole pass.

    Returns:
        (mean per-batch loss, mean per-batch accuracy).
    """
    model.eval()

    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)

            total_loss += criterion(logits, batch.label).item()
            total_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches


def train(model, train_iterator, val_iterator, optimizer, criterion, logs_writer, num_epochs):
    """
    Train for `num_epochs` epochs, validating after each one.

    After every epoch the mean training/validation loss and accuracy are written
    to `logs_writer` with the epoch index as the step. Nothing is returned.
    """
    for epoch in tqdm(range(num_epochs)):
        train_loss, train_acc = _train(model, train_iterator, optimizer, criterion, logs_writer, epoch)
        valid_loss, valid_acc = evaluate(model, val_iterator, criterion)

        epoch_scalars = (
            ('Accuracy/train', train_acc),
            ('Accuracy/validation', valid_acc),
            ('Loss/train', train_loss),
            ('Loss/validation', valid_loss),
        )
        for tag, value in epoch_scalars:
            logs_writer.add_scalar(tag, value, epoch)

In [25]:
# Train the randomly-initialised baseline for 5 epochs (scalars go to ./logs/rnn).
train(model, train_iterator, valid_iterator, optimizer, criterion, logs_writer, 5)

100%|██████████| 5/5 [11:03<00:00, 132.71s/it]


In [27]:
# Final evaluation of the baseline on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.482 | Test Acc: 77.93%


In [51]:
from torchtext.vocab import GloVe
# Rebuild the text vocabulary, this time attaching pretrained GloVe '6B' vectors
# whose dimensionality matches the model's embedding size.
embedding_size = 100
TEXT.build_vocab(train_data, max_size = vocab_size, vectors=GloVe(name='6B', dim=embedding_size))
# NOTE(review): the label vocabulary was already built from the same data earlier;
# this rebuild looks redundant -- confirm before removing.
LABEL.build_vocab(train_data)

In [52]:
# Fresh model for the GloVe experiment. The embedding-table size and padding
# index are read from the vocabulary that was just rebuilt, rather than from
# `input_size` / `pad_idx` values left over from cells executed before the rebuild.
model = RNN(len(TEXT.vocab), embedding_size, hidden_size, output_size, TEXT.vocab.stoi[TEXT.pad_token])

In [53]:
# Matrix of pretrained vectors, one row per vocabulary entry.
pretrained_embeddings = TEXT.vocab.vectors

# Sanity check: the shape must match the model's embedding table.
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [54]:
# Overwrite the randomly initialised embedding table with the pretrained vectors (in place).
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [55]:
# Look up the indices of the two special tokens in the current vocabulary.
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

# Explicitly zero the <unk> and <pad> rows of the embedding table.
model.embedding.weight.data[unk_idx] = torch.zeros(embedding_size)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_size)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])


In [56]:
batch_size = 32
# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs end to end on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Recreate the iterators so batches are numericalised with the rebuilt vocabulary.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = batch_size,
    device = device
)

In [57]:
# Hyperparameters re-declared for the GloVe experiment.
# NOTE(review): the model for this section was already constructed in an earlier
# cell, so input_size/hidden_size/output_size set here only affect later cells.
input_size = len(TEXT.vocab)
hidden_size = 256
output_size = 1
learning_rate = 0.001

In [58]:
# Fresh optimiser for the new model; this run logs to its own directory (./logs/glove)
# so its curves can be compared with the baseline in TensorBoard.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
logs_writer = SummaryWriter(log_dir='./logs/glove')
criterion = nn.BCEWithLogitsLoss()

In [59]:
# Move the GloVe-initialised model and the loss onto the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [60]:
# Train the GloVe-initialised model for 5 epochs (scalars go to ./logs/glove).
train(model, train_iterator, valid_iterator, optimizer, criterion, logs_writer, 5)



  0%|          | 0/5 [00:00<?, ?it/s][A[A

 20%|██        | 1/5 [02:12<08:48, 132.11s/it][A[A

 40%|████      | 2/5 [04:25<06:37, 132.45s/it][A[A

 60%|██████    | 3/5 [06:37<04:24, 132.34s/it][A[A

 80%|████████  | 4/5 [08:51<02:12, 132.91s/it][A[A

100%|██████████| 5/5 [11:06<00:00, 133.23s/it][A[A


In [61]:
# Final evaluation of the GloVe-initialised model on the held-out test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.356 | Test Acc: 87.19%


In [70]:
class RNN2(nn.Module):
    """
    LSTM text classifier with a two-layer head:
    embedding -> stacked (optionally bidirectional) LSTM -> fc1 -> ReLU -> fc2.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction; also the width of the hidden fc layer.
        output_dim: number of logits produced per example.
        pad_idx: vocabulary index of the padding token (passed to nn.Embedding as padding_idx).
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability used on the embeddings, between LSTM layers
            and on the final hidden state.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx, n_layers=2, 
                 bidirectional=True, dropout=0.5):
        
        super().__init__()
        
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        # nn.LSTM only applies dropout between layers; with a single layer it has
        # no effect and PyTorch emits a warning, so it is disabled in that case.
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout if n_layers > 1 else 0.0)
        
        # The head consumes one final hidden state per direction.
        self.fc1 = nn.Linear(hidden_dim * num_directions, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        """
        Args:
            text: token indices, [sent len, batch size].
            text_lengths: unpadded length of every sequence in the batch
                (tensor on any device, or a list of ints).

        Returns:
            Logits of shape [batch size, output_dim].
        """
        
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence; pack_padded_sequence requires the lengths to live on the CPU,
        #while the iterator yields them on the same device as the tokens
        lengths = torch.as_tensor(text_lengths).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths, enforce_sorted=False)
        
        #only the final hidden state is used, so the packed output is not unpacked
        _, (hidden, _) = self.rnn(packed_embedded)
        
        #hidden = [num layers * num directions, batch size, hid dim]
        
        if self.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            #unidirectional: the last entry is the top layer's final hidden state
            final_hidden = hidden[-1,:,:]
        
        final_hidden = self.dropout(final_hidden)
                
        #final_hidden = [batch size, hid dim * num directions]
        
        #without a non-linearity fc2(fc1(x)) collapses to a single linear map,
        #so the extra layer would add parameters but no capacity
        projected = torch.relu(self.fc1(final_hidden))
        return self.fc2(projected)

In [71]:
# Third experiment: same setup as the GloVe run, but with the extra fully connected layer.
model = RNN2(input_size, embedding_size, hidden_size, output_size, pad_idx)

In [72]:
# Initialise the new model's embedding table from the pretrained vectors (in place).
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

# Explicitly zero the <unk> and <pad> rows, as was done for the previous model.
model.embedding.weight.data[unk_idx] = torch.zeros(embedding_size)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_size)

In [73]:
# Fresh optimiser for the RNN2 model; this run logs to ./logs/rnn-additional_fc.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
logs_writer = SummaryWriter(log_dir='./logs/rnn-additional_fc')
criterion = nn.BCEWithLogitsLoss()

In [74]:
# Move the RNN2 model and the loss onto the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [68]:
# Train the RNN2 model for 5 epochs.
# NOTE(review): the traceback recorded under this cell complains about a convolution
# weight, but RNN2 contains no convolution layer, and this cell's execution count (68)
# precedes the cells that define and set up RNN2 (70-74). The output looks stale from
# an earlier experiment -- re-run this cell after the setup cells above.
train(model, train_iterator, valid_iterator, optimizer, criterion, logs_writer, 5)



  0%|          | 0/5 [00:00<?, ?it/s][A[A

RuntimeError: Given groups=1, weight of size 32 32 3, expected input[916, 28, 100] to have 32 channels, but got 28 channels instead

In [None]:
# Final evaluation on the held-out test split.
# NOTE(review): this cell has no recorded execution, so no test score exists yet for RNN2.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')