In [1]:
pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l[K     |█████                           | 10 kB 28.3 MB/s eta 0:00:01[K     |██████████▏                     | 20 kB 8.9 MB/s eta 0:00:01[K     |███████████████▎                | 30 kB 8.1 MB/s eta 0:00:01[K     |████████████████████▍           | 40 kB 7.7 MB/s eta 0:00:01[K     |█████████████████████████▌      | 51 kB 3.0 MB/s eta 0:00:01[K     |██████████████████████████████▋ | 61 kB 3.4 MB/s eta 0:00:01[K     |████████████████████████████████| 64 kB 1.7 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 8.7 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.11.0
    Uninstalling torchtext-0.11.0:
      Successfully uninstalled torchtext-0.11.0
Successfully install

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torchtext.data import Field, BucketIterator, TabularDataset 

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import numpy as np

import random
import math
import time
import pandas as pd

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
data_path = '/content/drive/MyDrive/DataSetVoice/' # news_data or dict_data

def tokenize(word): # create a tokenizer function
    # word = word.replace('\n', '')
    return word.split(' ')

# <sos>: start of a sequence; <eos>: end of a sequence.
SRC = Field(tokenize=tokenize, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=True,
            include_lengths = True)

TRG = Field(tokenize=tokenize, 
            init_token='<sos>', 
            eos_token='<eos>', 
            lower=False,
            include_lengths = True)

In [4]:
df = pd.read_csv(data_path+'/parallel_data_sample3.csv')

In [5]:
df.shape

(4000, 3)

In [6]:
df

Unnamed: 0.1,Unnamed: 0,Song,Guitar
0,0,'C11' 'G4' 'D♯-1' 'C11' 'G5' 'D-1' 'C♯11' 'G4'...,'C11' 'D9' 'D-1' 'C♯11' 'D9' 'D-1' 'G♯9' 'D9' ...
1,1,'C♯11' 'D7' 'D-1' 'C♯11' 'F7' 'D-1' 'G♯9' 'G4'...,'C11' 'C9' 'D♯-1' 'C11' 'C♯9' 'D♯-1' 'C11' 'D9...
2,2,'C♯11' 'D6' 'G-1' 'C♯11' 'G6' 'E-1' 'C♯11' 'B6...,'C11' 'G7' 'D-1' 'C♯11' 'G4' 'A-1' 'C♯11' 'G5'...
3,3,'G♯9' 'G4' 'C-1' 'G♯9' 'G5' 'C-1' 'G♯9' 'D6' '...,'C11' 'G5' 'A-1' 'C11' 'G6' 'D-1' 'C♯11' 'G4' ...
4,4,'C♯11' 'G5' 'C♯0' 'C♯11' 'G6' 'D♯-1' 'G♯9' 'G4...,'G♯9' 'B6' 'C-1' 'A9' 'G4' 'C-1' 'A9' 'D6' 'C-...
...,...,...,...
3995,3995,'G♯9' 'G5' 'C-1' 'G♯9' 'D6' 'C-1' 'G♯9' 'G6' '...,'A9' 'G4' 'C-1' 'C11' 'G4' 'D♯-1' 'C♯11' 'G4' ...
3996,3996,'A9' 'G5' 'C-1' 'A9' 'D6' 'C-1' 'A9' 'G6' 'C-1...,'C♯11' 'G4' 'D-1' 'G♯9' 'G4' 'C-1' 'A9' 'G4' '...
3997,3997,'C11' 'G6' 'E-1' 'C11' 'B6' 'D-1' 'C11' 'G7' '...,'A9' 'G5' 'C-1' 'C11' 'G4' 'B-1' 'C♯11' 'G4' '...
3998,3998,'C11' 'G5' 'F♯-1' 'C11' 'D6' 'D♯-1' 'C♯11' 'G4...,'C♯11' 'G4' 'D-1' 'G♯9' 'G4' 'C-1' 'A9' 'G4' '...


In [7]:
df.drop("Unnamed: 0",inplace=True,axis=1)

In [8]:
train = df.iloc[0:3500]
valid = df.iloc[3500:4000]

In [9]:
train.to_csv("train.csv",index=False)
valid.to_csv("valid.csv",index=False)

In [10]:
train

Unnamed: 0,Song,Guitar
0,'C11' 'G4' 'D♯-1' 'C11' 'G5' 'D-1' 'C♯11' 'G4'...,'C11' 'D9' 'D-1' 'C♯11' 'D9' 'D-1' 'G♯9' 'D9' ...
1,'C♯11' 'D7' 'D-1' 'C♯11' 'F7' 'D-1' 'G♯9' 'G4'...,'C11' 'C9' 'D♯-1' 'C11' 'C♯9' 'D♯-1' 'C11' 'D9...
2,'C♯11' 'D6' 'G-1' 'C♯11' 'G6' 'E-1' 'C♯11' 'B6...,'C11' 'G7' 'D-1' 'C♯11' 'G4' 'A-1' 'C♯11' 'G5'...
3,'G♯9' 'G4' 'C-1' 'G♯9' 'G5' 'C-1' 'G♯9' 'D6' '...,'C11' 'G5' 'A-1' 'C11' 'G6' 'D-1' 'C♯11' 'G4' ...
4,'C♯11' 'G5' 'C♯0' 'C♯11' 'G6' 'D♯-1' 'G♯9' 'G4...,'G♯9' 'B6' 'C-1' 'A9' 'G4' 'C-1' 'A9' 'D6' 'C-...
...,...,...
3495,'C♯11' 'G8' 'D-1' 'C♯11' 'G♯8' 'D-1' 'C♯11' 'A...,'A9' 'D7' 'C-1' 'C11' 'G4' 'F♯-1' 'C♯11' 'G4' ...
3496,'A9' 'A7' 'C-1' 'C11' 'G4' 'D♯0' 'C11' 'G5' 'D...,'C11' 'G7' 'E-1' 'C♯11' 'F7' 'G-1' 'C♯11' 'G7'...
3497,'G♯9' 'F7' 'C-1' 'G♯9' 'G7' 'C-1' 'G♯9' 'A7' '...,'C11' 'G6' 'D-1' 'C♯11' 'G4' 'C♯0' 'C♯11' 'G5'...
3498,'G♯9' 'G6' 'C-1' 'G♯9' 'B6' 'C-1' 'G♯9' 'D7' '...,'C♯11' 'G4' 'D-1' 'C♯11' 'G5' 'F♯-1' 'C♯11' 'D...


In [11]:
train_data = TabularDataset(
           path = "train.csv", 
           format='csv',
           skip_header=True, 
           fields=([("Song", SRC), ("Guitar", TRG)]))

valid_data = TabularDataset(
           path = "valid.csv", 
           format='csv',
           skip_header=True, 
           fields=([("Song", SRC), ("Guitar", TRG)]))

In [12]:
SRC.build_vocab(train_data, min_freq=1)
TRG.build_vocab(train_data, min_freq=1)

In [13]:
len(SRC.vocab)

107

In [14]:
len(TRG.vocab)

136

In [15]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of testing examples: {len(valid_data.examples)}")

Number of training examples: 3500
Number of testing examples: 500


In [16]:
print(vars(train_data.examples[0]))

{'Song': ["'c11'", "'g4'", "'d♯-1'", "'c11'", "'g5'", "'d-1'", "'c♯11'", "'g4'", "'d♯-1'", "'c♯11'", "'g5'", "'d-1'", "'g♯9'", "'g4'", "'c-1'", "'g♯9'", "'g5'", "'c-1'", "'a9'", "'g4'", "'c-1'", "'a9'", "'g5'", "'c-1'", "'c11'", "'g4'", "'f♯-1'", "'c11'", "'g5'", "'d-1'", "'c11'", "'d6'", "'d♯-1'", "'c♯11'", "'g4'", "'f♯-1'", "'c♯11'", "'g5'", "'d-1'", "'c♯11'", "'d6'", "'d♯-1'", "'g♯9'", "'g4'", "'c-1'", "'g♯9'", "'g5'", "'c-1'", "'g♯9'", "'d6'", "'c-1'", "'a9'", "'g4'", "'c-1'", "'a9'", "'g5'", "'c-1'", "'a9'", "'d6'", "'c-1'", "'c11'", "'g4'", "'g♯-1'", "'c11'", "'g5'", "'e-1'", "'c11'", "'d6'", "'d-1'", "'c♯11'", "'g4'", "'g♯-1'", "'c♯11'", "'g5'", "'e-1'", "'c♯11'", "'d6'", "'d-1'", "'g♯9'", "'g4'", "'c-1'", "'g♯9'", "'g5'", "'c-1'", "'g♯9'", "'d6'", "'c-1'", "'a9'", "'g4'", "'c-1'", "'a9'", "'g5'", "'c-1'", "'a9'", "'d6'", "'c-1'", "'c11'", "'g4'", "'a♯-1'", "'c11'", "'g5'", "'e-1'", "'c11'", "'d6'", "'d-1'", "'c♯11'", "'g4'", "'a♯-1'", "'c♯11'", "'g5'", "'e-1'", "'c♯11'", "'d6'"

In [17]:
TRG.vocab.freqs

Counter({"'A-1'": 13445,
         "'A0'": 2012,
         "'A1'": 492,
         "'A2'": 177,
         "'A3'": 97,
         "'A4'": 51,
         "'A5'": 53,
         "'A6'": 23,
         "'A7'": 49907,
         "'A8'": 2597,
         "'A9'": 262541,
         "'A♯-1'": 10798,
         "'A♯0'": 1758,
         "'A♯1'": 432,
         "'A♯2'": 142,
         "'A♯3'": 92,
         "'A♯4'": 52,
         "'A♯5'": 39,
         "'A♯6'": 20,
         "'A♯7'": 20,
         "'A♯8'": 2950,
         "'B-1'": 8969,
         "'B0'": 1599,
         "'B1'": 364,
         "'B2'": 161,
         "'B3'": 83,
         "'B4'": 37,
         "'B5'": 33,
         "'B6'": 93445,
         "'B7'": 26488,
         "'B8'": 3554,
         "'C-1'": 530468,
         "'C0'": 7215,
         "'C1'": 1442,
         "'C11'": 267952,
         "'C2'": 332,
         "'C3'": 131,
         "'C4'": 82,
         "'C5'": 57,
         "'C6'": 40,
         "'C7'": 24,
         "'C8'": 18,
         "'C9'": 5583,
         "'C♯0'": 6142,
   

In [18]:
BATCH_SIZE = 1

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, valid_data), 
     batch_size = BATCH_SIZE,
     sort_within_batch = False,
     sort_key = lambda x : len(x.src),
     device = device)

cuda


In [19]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional = True)
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)    
    def forward(self, src, src_len):        
        embedded = self.dropout(self.embedding(src))                
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len,enforce_sorted=False)
                
        packed_outputs, hidden = self.rnn(packed_embedded)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs) 
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))
        return outputs, hidden

In [20]:
class Attention(nn.Module):
    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)
        
    def forward(self, hidden, encoder_outputs, mask):
        batch_size = encoder_outputs.shape[1]
        src_len = encoder_outputs.shape[0]
        hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim = 2))) 
        attention = self.v(energy).squeeze(2)
        attention = attention.masked_fill(mask == 0, -1e10)
        return F.softmax(attention, dim = 1)

In [70]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)
        self.fc_out = nn.Linear((enc_hid_dim * 2) + dec_hid_dim + emb_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
    def forward(self, input, hidden, encoder_outputs, mask):
        input = input.unsqueeze(0)
        embedded = self.dropout(self.embedding(input))
        a = self.attention(hidden, encoder_outputs, mask)
        a = a.unsqueeze(1)
        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        rnn_input = torch.cat((embedded, weighted), dim = 2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))
        print(output.shape)
        print(hidden.shape)
        # assert (output == hidden).all()
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim = 1))
        return prediction, hidden.squeeze(0), a.squeeze(1)

In [71]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, src_pad_idx, device, teacher_forcing_ratio = 0.5):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.device = device
        self.teacher_forcing_ratio = teacher_forcing_ratio 
        
    def create_mask(self, src):
        mask = (src != self.src_pad_idx).permute(1, 0)
        return mask
        
    def forward(self, src, src_len, trg,):
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src, src_len.cpu())
        input = trg[0,:]

        mask = self.create_mask(src)
        for t in range(1, trg_len):
            output, hidden, _ = self.decoder(input, hidden, encoder_outputs, mask)
            outputs[t] = output
            teacher_force = random.random() < self.teacher_forcing_ratio
            top1 = output.argmax(1) 
            input = trg[t] if teacher_force else top1
            
        return outputs

In [72]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 128
DEC_EMB_DIM = 128
ENC_HID_DIM = 256
DEC_HID_DIM = 256
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, DEC_DROPOUT, attn)

model = Seq2Seq(enc, dec, SRC_PAD_IDX, device).to(device)

In [73]:
def init_weights(m):
    for name, param in m.named_parameters():
        if 'weight' in name:
            nn.init.normal_(param.data, mean=0, std=0.01)
        else:
            nn.init.constant_(param.data, 0)
            
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(107, 128)
    (rnn): GRU(128, 256, bidirectional=True)
    (fc): Linear(in_features=512, out_features=256, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (attention): Attention(
      (attn): Linear(in_features=768, out_features=256, bias=True)
      (v): Linear(in_features=256, out_features=1, bias=False)
    )
    (embedding): Embedding(136, 128)
    (rnn): GRU(640, 256)
    (fc_out): Linear(in_features=896, out_features=136, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [74]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,764,104 trainable parameters


In [75]:
learning_rate = 0.003  # 0.003 in paper
patience = 0
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# optimizer = optim.SGD(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
                optimizer=optimizer,
                mode='min', factor=0.9, # 0.9 in paper
                patience=patience)

In [76]:
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [77]:
def update(epoch, valid_loss, valid_acc, 
           best_valid_loss, best_valid_acc, acc_valid_loss,
           update_type='acc'):
    global best_valid_epoch, early_stop_patience, full_patience, best_train_step, train_steps, exp_num
    print("\n---------------------------------------")
    print("[Epoch: {}][Validatiing...]".format(epoch))
    if valid_loss < best_valid_loss:
        print('\t\t Better Valid Loss!')
        best_valid_loss = valid_loss
        if update_type == 'loss':
            torch.save(model.state_dict(), 'loss-model.pt')
        early_stop_patience = full_patience  # restore full patience if obtain new minimum of the loss
    else:
        if early_stop_patience > 0:
            early_stop_patience += -1
    
    if valid_acc > best_valid_acc or (valid_acc == best_valid_acc and valid_loss < acc_valid_loss):
        print('\t\t Better Valid Acc!')
        best_valid_acc = valid_acc
        acc_valid_loss = valid_loss
        best_valid_epoch = epoch
        best_train_step = train_steps
        if update_type == 'acc':
            torch.save(model.state_dict(), 'experiments/exp' + str(exp_num) + '/acc-model-seq2seq.pt')
    print(f'\t patience: {early_stop_patience}/{full_patience}')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
    print(f'\t BEST. Val. Loss: {best_valid_loss:.3f} | BEST. Val. Acc: {best_valid_acc:.3f} | Val. Loss: {acc_valid_loss:.3f} | BEST. Val. Epoch: {best_valid_epoch} | BEST. Val. Step: {best_train_step}')
    print("---------------------------------------\n")
    return best_valid_loss, best_valid_acc, acc_valid_loss

In [78]:
n_examples = len(train_data.examples)

def train(model, iterator, 
          optimizer, criterion, 
          clip, epoch,
          scheduler, valid_iterator):
    
    global best_valid_loss, acc_valid_loss, best_valid_acc, best_valid_epoch, train_steps, report_steps, tfr
    model.train()
    model.teacher_forcing_ratio = tfr
    print("[Train]: Current Teacher Forcing Ratio: {:.3f}".format(model.teacher_forcing_ratio))
    
    epoch_loss = 0
    running_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src, src_len = batch.Song
        trg, trg_len = batch.Guitar
        
        optimizer.zero_grad()
        
        output = model(src, src_len, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        running_loss = epoch_loss / (i + 1)
        
        # print every 50 batches (50 steps)
        if i % report_steps == report_steps - 1:
            train_steps += report_steps  # by doing so, the last batch is neglected
            for param_group in optimizer.param_groups:
                lr = param_group['lr']
            print('[Epoch: {}][#examples: {}/{}][#steps: {}]'.format(epoch, (i+1) * BATCH_SIZE, n_examples, train_steps))
            print(f'\tTrain Loss: {running_loss:.3f} | Train PPL: {math.exp(running_loss):7.3f} | lr: {lr:.3e}')
            
            # eval the validation set for every * steps
            if (train_steps % (10 * report_steps)) == 0:
                print('-----val------')
                valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, scheduler)
                print('-----tst------')
                test_loss, test_acc = evaluate(model, test_iterator, criterion, scheduler, is_test=True)
                best_valid_loss, best_valid_acc, acc_valid_loss = update(epoch, valid_loss, valid_acc, 
                                                         best_valid_loss, best_valid_acc, acc_valid_loss,
                                                         update_type='acc')
                scheduler.step(valid_loss)  # must be placed here otherwise the test acc messes up
                model.train()
                
            
    return epoch_loss / len(iterator)

In [79]:
def evaluate(model, iterator, criterion, scheduler, is_test=False):
    
    model.eval()
    model.teacher_forcing_ratio = 0 #  turn off teacher forcing
    print("[Eval Start]: Current Teacher Forcing Ratio: {:.3f}".format(model.teacher_forcing_ratio))
    
    epoch_loss = 0
    correct = 0
    
    global valid_data, test_data, tfr
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src, src_len = batch.Song
            trg, trg_len = batch.Guitar

            output = model(src, src_len, trg)
            
            # ---------compute acc START----------
            pred = output[1:].argmax(2).permute(1, 0) # [batch_size, trg_len]
            ref = trg[1:].permute(1, 0)
            # consider the last batch as well
            size = pred.shape[0]
            for j in range(size):
                
                pred_j = pred[j, :]
                pred_j_toks = []
                for t in pred_j:
                    tok = TRG.vocab.itos[t]
                    if tok == '<eos>':
                        break
                    else:
                        pred_j_toks.append(tok)
                pred_j = ''.join(pred_j_toks)
                
                ref_j = ref[j, :]
                ref_j_toks = []
                for t in ref_j:
                    tok = TRG.vocab.itos[t]
                    if tok == '<eos>':
                        break
                    else:
                        ref_j_toks.append(tok)
                ref_j = ''.join(ref_j_toks)
                
                if pred_j == ref_j:
                    correct += 1
            # ---------compute acc END----------
            
            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()
        
        # compute loss and acc
        epoch_loss = epoch_loss / len(iterator)
        # sheduler applies on acc
        if not is_test:
            acc = correct / len(valid_data.examples)
            
        else:
            acc = correct / len(test_data.examples)
        
        print('The number of correct predictions: {}'.format(correct))
        
        model.teacher_forcing_ratio = tfr  # restore teacher-forcing ratio
        print("[Eval End]: Current Teacher Forcing Ratio: {:.3f}".format(model.teacher_forcing_ratio))
    
    return epoch_loss, acc

In [80]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 1
CLIP = 1

best_valid_loss = float('inf')
acc_valid_loss = float('inf')
best_valid_acc = float(-1)
best_valid_epoch = -1
best_train_step = -1
full_patience = 20
early_stop_patience = full_patience
train_steps = 0
report_steps = 250
exp_num = 1

try:
    for epoch in range(N_EPOCHS):

        if epoch <= 15:
            early_stop_patience = full_patience

        if early_stop_patience == 0:
            print("Early Stopping!")
            # break
            # abandon early stopping because we found best epoch in a long run

        start_time = time.time()

        tfr = max(1 - (float(10 + epoch * 1.5) / 50), 0.2) 

        train_loss = train(model, train_iterator, optimizer, criterion, CLIP, epoch, scheduler, valid_iterator)

        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, scheduler, is_test=False)
        #test_loss, test_acc = evaluate(model, test_iterator, criterion, scheduler, is_test=True)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        best_valid_loss, best_valid_acc, acc_valid_loss = update(epoch, valid_loss, valid_acc, 
                                                 best_valid_loss, best_valid_acc, acc_valid_loss, update_type='loss')

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc:.3f} | Val. PPL: {math.exp(valid_loss):7.3f}')
        # print(f'\t Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Test ACC: {test_acc:.3f}')
except KeyboardInterrupt:
        print("Exiting loop")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])
torch.Size([1, 1, 256])