In [1]:
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torchtext import data
from torchtext import datasets

In [2]:
# Register the TensorBoard notebook extension (provides the %tensorboard magic).
%load_ext tensorboard

In [3]:
# Review text: tokenised with spaCy; include_lengths=True asks torchtext to
# return the sequence lengths together with each padded batch.
TEXT = data.Field(tokenize='spacy', include_lengths=True)

# Sentiment label, stored as a float tensor.
LABEL = data.LabelField(dtype=torch.float)

In [4]:
# Load the IMDB reviews as separate train and test datasets.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

for split_name, split_data in (('training', train_data), ('testing', test_data)):
    print(f'Number of {split_name} examples: {len(split_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [5]:
# Show the first training example as a plain dict: the token list is stored
# under 'text' and the sentiment string under 'label'.
print(vars(train_data.examples[0]))

{'text': ['For', 'a', 'movie', 'that', 'gets', 'no', 'respect', 'there', 'sure', 'are', 'a', 'lot', 'of', 'memorable', 'quotes', 'listed', 'for', 'this', 'gem', '.', 'Imagine', 'a', 'movie', 'where', 'Joe', 'Piscopo', 'is', 'actually', 'funny', '!', 'Maureen', 'Stapleton', 'is', 'a', 'scene', 'stealer', '.', 'The', 'Moroni', 'character', 'is', 'an', 'absolute', 'scream', '.', 'Watch', 'for', 'Alan', '"', 'The', 'Skipper', '"', 'Hale', 'jr', '.', 'as', 'a', 'police', 'Sgt', '.'], 'label': 'pos'}


In [6]:
# Seed the split so the train/validation partition (and the vocabulary built
# from it) is the same on every Restart & Run All. Dataset.split takes a
# `random.getstate()` snapshot as its random_state.
SEED = 1234
random.seed(SEED)

# 70% train / 30% validation.
# NOTE: this rebinds `train_data`, so re-running this cell without re-running
# the loading cell would split the already-reduced training set again.
train_data, valid_data = train_data.split(split_ratio=0.7, random_state=random.getstate())

In [7]:
# Report the sizes of the two partitions produced by the split.
for split_name, split_data in (('training', train_data), ('validation', valid_data)):
    print(f'Number of {split_name} examples: {len(split_data)}')

Number of training examples: 17500
Number of validation examples: 7500


In [8]:
# Upper bound on the number of distinct tokens kept in the text vocabulary.
vocab_size = 5000

# Both vocabularies are built from the training split only.
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, max_size=vocab_size)

In [9]:
# The 20 most frequent training tokens with their counts.
# NOTE(review): the recorded output contains '/><br', which suggests HTML
# line-break tags are still present in the raw reviews.
TEXT.vocab.freqs.most_common(20)

[('the', 202115),
 (',', 192665),
 ('.', 165458),
 ('and', 109176),
 ('a', 109063),
 ('of', 100331),
 ('to', 93604),
 ('is', 76279),
 ('in', 61023),
 ('I', 54270),
 ('it', 53490),
 ('that', 49138),
 ('"', 44222),
 ("'s", 43136),
 ('this', 42564),
 ('-', 36964),
 ('/><br', 35599),
 ('was', 34797),
 ('as', 30176),
 ('movie', 29835)]

In [10]:
# First ten index-to-token entries; per the recorded output, '<unk>' and
# '<pad>' occupy indices 0 and 1, ahead of the real tokens.
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', ',', '.', 'and', 'a', 'of', 'to', 'is']

In [11]:
# Mini-batch size shared by all three iterators, and the device batches are placed on.
batch_size = 32
device = 'cpu'

# One BucketIterator per split, returned in the same order as the datasets.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=batch_size, device=device
)

In [12]:
# Values derived from the vocabulary.
input_size = len(TEXT.vocab)
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

# Model dimensions.
embedding_size = 100
hidden_size = 256
output_size = 1

# Optimiser step size.
learning_rate = 1e-3

In [37]:
class RCNN(nn.Module):
    """Sentiment classifier: embedding -> 1-D convolution over time -> LSTM -> linear.

    The convolution slides along the token axis of each review independently,
    with the embedding dimensions as input channels, so the model works for
    any batch size and never mixes information between reviews in a batch.
    Positions past a review's true length are zeroed before pooling, which
    makes a review's logits independent of how much padding its batch added.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding (convolution input channels).
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits.
        pad_idx: vocabulary index of the padding token.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers, and before the final linear layer.
        conv_channels: number of convolution filters; defaults to embedding_dim.
        kernel_size: odd convolution width ("same" padding keeps the length).

    Raises:
        ValueError: if kernel_size is even.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx, n_layers=2,
                 bidirectional=True, dropout=0.5, conv_channels=None, kernel_size=3):

        super().__init__()

        if kernel_size % 2 == 0:
            raise ValueError('kernel_size must be odd so the convolution preserves the sequence length')
        if conv_channels is None:
            conv_channels = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # Convolve over time: channels are embedding dimensions, not batch entries.
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=conv_channels,
                      kernel_size=kernel_size,
                      padding=kernel_size // 2),
            nn.ReLU()
        )
        # ceil_mode keeps at least one time step even for a length-1 sequence.
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, ceil_mode=True)

        self.rnn = nn.LSTM(conv_channels,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # inter-layer dropout is meaningless (and warns) with one layer
                           dropout=dropout if n_layers > 1 else 0.0)

        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of logits per review.

        Args:
            text: token indices, [sent len, batch size].
            text_lengths: true (un-padded) length of every review, [batch size].

        Returns:
            Tensor of shape [batch size, output dim].
        """

        #text = [sent len, batch size]

        lengths = torch.as_tensor(text_lengths, device=text.device)

        embedded = self.dropout(self.embedding(text))

        #embedded = [sent len, batch size, emb dim]

        #Conv1d expects [batch size, channels, sent len]
        features = self.cnn(embedded.permute(1, 2, 0))

        #features = [batch size, conv channels, sent len]

        #zero the padded time steps; ReLU outputs are >= 0, so the max-pool
        #of a window that straddles the end of a review ignores the padding
        positions = torch.arange(text.size(0), device=text.device)
        valid = positions.unsqueeze(0) < lengths.unsqueeze(1)
        features = features.masked_fill(~valid.unsqueeze(1), 0.0)

        pooled = self.pool(features)

        #pooled = [batch size, conv channels, ceil(sent len / 2)]

        #pooling halves the time axis, so the lengths must shrink with it;
        #pack_padded_sequence wants them on the CPU
        pooled_lengths = torch.ceil(lengths.float() / 2).long().cpu()

        #pack sequence
        packed = nn.utils.rnn.pack_padded_sequence(pooled.permute(2, 0, 1),
                                                   pooled_lengths,
                                                   enforce_sorted=False)

        packed_output, (hidden, cell) = self.rnn(packed)

        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        hidden = self.dropout(hidden)

        #hidden = [batch size, hid dim * num directions]

        return self.fc(hidden)
    
    
# Instantiate the classifier defined in this notebook (RCNN); no `RNN1` is
# defined here, so that name only resolved through stale kernel state.
model = RCNN(input_size, embedding_size, hidden_size, output_size, pad_idx)

In [38]:
# Loss computed directly on the model's raw logits.
criterion = nn.BCEWithLogitsLoss()

# TensorBoard event files for this run are written under ./logs/small.
logs_writer = SummaryWriter(log_dir='./logs/small')

# AdamW over all model parameters.
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [39]:
# Place the model and the loss module on the configured device.
model, criterion = model.to(device), criterion.to(device)

In [40]:
from train import train, evaluate

In [41]:
# Run the training loop from the local `train` module, logging through `logs_writer`.
# The trailing 5 is presumably the number of epochs (the recorded progress bar
# counts to 5) — confirm against train.py.
train(model, train_iterator, valid_iterator, optimizer, criterion, logs_writer, 5)

  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out test iterator with the `evaluate` helper from the
# local `train` module.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

# test_acc is scaled by 100 here, so it is presumably a fraction in [0, 1] — confirm in train.py.
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')