In [1]:
!pip install torchtext==0.11.2 

Collecting torchtext==0.11.2
  Downloading torchtext-0.11.2-cp37-cp37m-manylinux1_x86_64.whl (8.0 MB)
[K     |████████████████████████████████| 8.0 MB 11.6 MB/s 
[?25hCollecting torch==1.10.2
  Downloading torch-1.10.2-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |██████████████████████████████▎ | 834.1 MB 1.4 MB/s eta 0:00:36tcmalloc: large alloc 1147494400 bytes == 0x3a5ca000 @  0x7f580f0bd615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |████████████████████████████████| 881.9 MB 1.9 kB/s 
Installing collected packages: torch, torchtext
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
  Attempting uninstall: torchtext
    Found

# Environment Setup


In [2]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Pytorch version is: ", torch.__version__)
print("You are using: ", device)

Pytorch version is:  1.10.2+cu102
You are using:  cuda



# Recurrent Sequence to Sequence Model



### Preparing Data


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k

# import spacy
import random
import math
import os
import time

SEED = 1

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.enabled = False 
torch.backends.cudnn.deterministic = True

In [4]:
%%capture
! python -m spacy download en
! python -m spacy download de
from torchtext.data.utils import get_tokenizer
de_tokenizer = get_tokenizer('spacy', language='de')
en_tokenizer = get_tokenizer('spacy', language='en')

In [5]:
# get train test split using Multi30k 
train_data, valid_data, test_data = Multi30k()

100%|██████████| 1.21M/1.21M [00:00<00:00, 5.84MB/s]
100%|██████████| 46.3k/46.3k [00:00<00:00, 1.97MB/s]
100%|██████████| 43.9k/43.9k [00:00<00:00, 1.98MB/s]


In [6]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [7]:
# example testing 
example_src, example_trg = next(Multi30k(split="train"))
print(f"Original: {example_src}", f"Tokenized: {de_tokenizer(example_src)}")
print(f"Original: {example_trg}", f"Tokenized: {en_tokenizer(example_trg)}")

Original: Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.
 Tokenized: ['Zwei', 'junge', 'weiße', 'Männer', 'sind', 'im', 'Freien', 'in', 'der', 'Nähe', 'vieler', 'Büsche', '.', '\n']
Original: Two young, White males are outside near many bushes.
 Tokenized: ['Two', 'young', ',', 'White', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.', '\n']


## Building the vocabulary for the source and target languages 



In [8]:
# Only allow tokens that appear at least 2 times to appear in our vocabulary. 
# Tokens that appear only once are converted into an `<unk>` (unknown) token. 

from torchtext.vocab import build_vocab_from_iterator

de_generator = (de_tokenizer(pair[0].strip().lower()) for pair in Multi30k(split="train"))
specials = ["<unk>", "<pad>", "<bos>", "<eos>"]
de_vocab = build_vocab_from_iterator(de_generator, specials=specials, min_freq=2)
en_generator = (en_tokenizer(pair[1].strip().lower()) for pair in Multi30k(split="train"))
en_vocab = build_vocab_from_iterator(en_generator, specials=specials, min_freq=2)

for vocab in (de_vocab, en_vocab):
    vocab.set_default_index(vocab["<unk>"])

In [9]:
print(f"Unique tokens in source (de) vocabulary: {len(de_vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(en_vocab)}")

Unique tokens in source (de) vocabulary: 7855
Unique tokens in target (en) vocabulary: 5894


## Define Data Preprocessing Pipeline.


In [10]:
BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']

from typing import List, Tuple
from torch import Tensor

def data_process(raw_dataset) -> List[Tuple[Tensor, Tensor]]: 
    ret = []
    for pair in raw_dataset: 
      # lower case and strip both German and English tokens in each sentence pair 
      d_tokens = de_tokenizer(pair[0].strip().lower()) 
      e_tokens = de_tokenizer(pair[1].strip().lower()) 

      # reverse sentence order of German tokens 
      d_tokens = d_tokens[::-1]

      # add <bos> and <eos> tokens to both German and English sentences 
      d_tokens.insert(0, '<bos>')
      d_tokens.append('<eos>')
      e_tokens.insert(0, '<bos>')
      e_tokens.append('<eos>') 

      # get encoded tensor tuple from vocabs  
      d_tens = torch.tensor([de_vocab[token] for token in d_tokens], dtype=torch.long)
      e_tens = torch.tensor([en_vocab[token] for token in e_tokens], dtype=torch.long)
      tup = (d_tens, e_tens)

      # add tensor tuple to list 
      ret.append(tup) 

    return ret 

train_data, valid_data, test_data = Multi30k()
train_data_processed = data_process(train_data)
valid_data_processed = data_process(valid_data)
test_data_processed = data_process(test_data) 

In [11]:
# example testing 
de_itos = de_vocab.get_itos()
en_itos = en_vocab.get_itos()
de_encoded, en_encoded = train_data_processed[0]
print(" ".join([de_itos[item] for item in de_encoded]))
print(" ".join([en_itos[item] for item in en_encoded]))

<bos> . büsche vieler nähe der in freien im sind männer weiße junge zwei <eos>
<bos> two young , white males are outside near many bushes . <eos>


In [12]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

BATCH_SIZE = 128
PAD_IDX = de_vocab['<pad>']

def collate_fn(data_batch) -> Tuple[Tensor, List[int], Tensor]: 
    # initialize lists to be added to returned tuple 
    de_batch = [] 
    en_batch = [] 
    sent_lens = [] 
    for pair in data_batch: 
      de = pair[0]
      en = pair[1] 
      # add indices/lengths to corresponding batches/list 
      de_batch.append(de)
      en_batch.append(en)
      sent_lens.append(len(de))
    # pad sequences 
    # send to gpu 
    de_batch = torch.tensor(pad_sequence(de_batch, padding_value=PAD_IDX)).to(device) 
    en_batch = torch.tensor(pad_sequence(en_batch, padding_value=PAD_IDX)).to(device) 
    ret = (de_batch, sent_lens, en_batch)

    return ret 

train_dl = DataLoader(
    train_data_processed,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_dl = DataLoader(
    valid_data_processed,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)
test_dl = DataLoader(
    test_data_processed,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
)

In [13]:
# testing collate_fn 
indices = [0, 1, 2, 3] 
collate_fn([train_data_processed[i] for i in indices])



(tensor([[   2,    2,    2,    2],
         [   4,    4,    4,    4],
         [3171,    0,  499,  248],
         [7649,    5,   56,    5],
         [ 110, 2069, 7316,  681],
         [  15,  831,    5,   10],
         [   7,   11,    7,  535],
         [  88,   30,  217,   14],
         [  20,   76,   25,   12],
         [  84,    3,   66,   29],
         [  30,    1,    5,   40],
         [ 253,    1,    3,   46],
         [  26,    1,    1,    6],
         [  18,    1,    1,    7],
         [   3,    1,    1,   13],
         [   1,    1,    1,    5],
         [   1,    1,    1,    3]], device='cuda:0'),
 [15, 10, 12, 17],
 tensor([[   2,    2,    2,    2],
         [  16,  113,    4,    4],
         [  24,   30,   53,    9],
         [  15,    6,   33,    6],
         [  25,  325,  230,    4],
         [ 778,  279,   69,   29],
         [  17,   17,    4,   23],
         [  57, 1200,  248,   10],
         [  80,    4, 4286,   36],
         [ 202,  715,    5,    8],
         [1312, 4

## Seq2Seq RNN Model



In [14]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_len):
  
        embedded = self.dropout(self.embedding(src))

        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len, enforce_sorted=False)
        packed_outputs, hidden = self.rnn(packed_embedded)       
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs) 
            
        hidden = self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        
        return outputs, hidden

### Decoder



In [15]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers)
        self.out = nn.Linear(hid_dim, output_dim)
        
        
    def forward(self, input, hidden):
        input = input.unsqueeze(0) 
        
        embedded = self.embedding(input) 

        output, hidden = self.rnn(embedded, hidden) 
        
        prediction = self.out(output.squeeze(0))
        
        return prediction, hidden

### Seq2Seq



In [16]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
        assert encoder.enc_hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"


    def forward(self, src, src_len, trg, teacher_forcing_ratio = 0.5): 
        batch_size = trg.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        _, hidden = self.encoder(src, src_len)
        hidden = hidden.unsqueeze(0)
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, max_len): 
            output, hidden = self.decoder(input, hidden)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = (trg[t] if teacher_force else top1)
        
        return outputs

### Seq2Seq Model Training



In [17]:
# specify params 
INPUT_DIM = len(de_vocab)
OUTPUT_DIM = len(en_vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS)

model = Seq2Seq(enc, dec, device).to(device)

In [18]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 10,616,326 trainable parameters


In [19]:
optimizer = optim.Adam(model.parameters())

In [20]:
PAD_IDX = en_vocab['<pad>']
assert PAD_IDX == de_vocab['<pad>']

criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [21]:
# training loop 
def train(model, train_dl, optimizer, criterion, clip):
    
    model.train() 
    
    epoch_loss = 0
    
    for i, (src, src_len, trg) in enumerate(train_dl):
        
        optimizer.zero_grad() 
        output = model(src, src_len, trg) 
        output = output[1:].view(-1, output.shape[-1])
    
        trg = trg[1:].view(-1)
       
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(train_dl)

In [22]:
# evaluation 
def evaluate(model, val_dl, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
    
        for i, (src, src_len, trg) in enumerate(val_dl):

            output = model(src, src_len, trg, 0)  
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1) 
            loss = criterion(output, trg)
            epoch_loss += loss.item()
        
    return epoch_loss / len(val_dl)

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [24]:
# main training loop 
N_EPOCHS = 5
CLIP = 1
SAVE_DIR = 'models'
MODEL_SAVE_PATH = os.path.join(SAVE_DIR, 'cpsc477_hw4_rnn.pt')

best_valid_loss = float('inf')

# save model to reduce future runtime 
if not os.path.isdir(f'{SAVE_DIR}'):
    os.makedirs(f'{SAVE_DIR}')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()

    train_loss = train(model, train_dl, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_dl, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), MODEL_SAVE_PATH)
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')



Epoch: 01 | Time: 0m 57s
	Train Loss: 4.372 | Train PPL:  79.173
	 Val. Loss: 4.180 |  Val. PPL:  65.342
Epoch: 02 | Time: 0m 57s
	Train Loss: 3.487 | Train PPL:  32.695
	 Val. Loss: 3.922 |  Val. PPL:  50.516
Epoch: 03 | Time: 0m 58s
	Train Loss: 3.143 | Train PPL:  23.169
	 Val. Loss: 3.735 |  Val. PPL:  41.904
Epoch: 04 | Time: 0m 57s
	Train Loss: 2.841 | Train PPL:  17.134
	 Val. Loss: 3.743 |  Val. PPL:  42.212
Epoch: 05 | Time: 0m 57s
	Train Loss: 2.641 | Train PPL:  14.030
	 Val. Loss: 3.761 |  Val. PPL:  42.973


In [25]:
model.load_state_dict(torch.load(MODEL_SAVE_PATH))

test_loss = evaluate(model, test_dl, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')



| Test Loss: 3.740 | Test PPL:  42.081 |
