# Seq2Seq EqLearn -- First Attempt

### Imports

In [28]:
import torch.optim as optim
import torch
import torch.nn as nn
import numpy as np
import random
from torch.utils.data import TensorDataset, DataLoader,Dataset
from torch.utils.data.sampler import SubsetRandomSampler
import torchtext.datasets 


### Deterministic Results

In [29]:
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### Generate Datasets
As a first experiment we will create a dataset with all the targets padded to same length. Additionally, we discard equation yielding too large values associated with the fixed support. This is to apply a scaling and have data from 0 tpo 1. 
Next stesps should create variable length output and batch them according to their length so that the amount of padding is minimized. See torchtext.dataset.

In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Parameters

In [31]:
#device = torch.device('cpu')

INPUT_DIM = 1
OUTPUT_DIM = 16
EMB_DIM = 256
HID_DIM = 512 # each conv. layer has 2 * hid_dim filters
ENC_LAYERS = 2 # number of conv. blocks in encoder 10 original
DEC_LAYERS = 2 # number of conv. blocks in decoder 10 original
ENC_KERNEL_SIZE = 3 # must be odd!
DEC_KERNEL_SIZE = 3 # can be even or odd
ENC_DROPOUT = 0.25
DEC_DROPOUT = 0.25
TRG_PAD_IDX = 0

## Model Declaration

In [32]:
from eq_learner.architectures.cnn import Decoder, Encoder, Seq2Seq
from eq_learner.architectures.embedding import NaiveEmbedding

In [33]:
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, ENC_LAYERS, ENC_KERNEL_SIZE, ENC_DROPOUT, device)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DEC_LAYERS, DEC_KERNEL_SIZE, DEC_DROPOUT, TRG_PAD_IDX, device)

model = Seq2Seq(enc, dec).to(device)

### Apply initial weights

In [34]:
from eq_learner.architectures import utils
utils.init_weights(model)
        


In [35]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 7,101,712 trainable parameters


In [36]:
optimizer = optim.Adam(model.parameters())

In [37]:
criterion = nn.CrossEntropyLoss(ignore_index = 0)

In [38]:
"""def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        src = batch[0]
        trg = batch[1]
        
        optimizer.zero_grad()
        
        output = model(src, trg)
        
        #trg = [trg len, batch size]
        #output = [trg len, batch size, output dim]
        
        output_dim = output.shape[-1]
        

        output = output[1:].view(-1, output_dim)
        trg = trg.T[1:].contiguous().view(-1)
        #trg = trg.T[1:].reshape(-1)
        
        #trg = [(trg len - 1) * batch size]
        #output = [(trg len - 1) * batch size, output dim]

        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)"""

'def train(model, iterator, optimizer, criterion, clip):\n    \n    model.train()\n    \n    epoch_loss = 0\n    \n    for i, batch in enumerate(iterator):\n        \n        src = batch[0]\n        trg = batch[1]\n        \n        optimizer.zero_grad()\n        \n        output = model(src, trg)\n        \n        #trg = [trg len, batch size]\n        #output = [trg len, batch size, output dim]\n        \n        output_dim = output.shape[-1]\n        \n\n        output = output[1:].view(-1, output_dim)\n        trg = trg.T[1:].contiguous().view(-1)\n        #trg = trg.T[1:].reshape(-1)\n        \n        #trg = [(trg len - 1) * batch size]\n        #output = [(trg len - 1) * batch size, output dim]\n\n        loss = criterion(output, trg)\n        \n        loss.backward()\n        \n        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)\n        \n        optimizer.step()\n        \n        epoch_loss += loss.item()\n        \n    return epoch_loss / len(iterator)'

In [39]:
"""def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch[0]
            trg = batch[1]

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]
            
            output = output[1:].view(-1, output_dim)
            trg = trg.T[1:].contiguous().view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)"""

'def evaluate(model, iterator, criterion):\n    \n    model.eval()\n    \n    epoch_loss = 0\n    \n    with torch.no_grad():\n    \n        for i, batch in enumerate(iterator):\n\n            src = batch[0]\n            trg = batch[1]\n\n            output = model(src, trg, 0) #turn off teacher forcing\n\n            #trg = [trg len, batch size]\n            #output = [trg len, batch size, output dim]\n\n            output_dim = output.shape[-1]\n            \n            output = output[1:].view(-1, output_dim)\n            trg = trg.T[1:].contiguous().view(-1)\n\n            #trg = [(trg len - 1) * batch size]\n            #output = [(trg len - 1) * batch size, output dim]\n\n            loss = criterion(output, trg)\n            \n            epoch_loss += loss.item()\n        \n    return epoch_loss / len(iterator)'

In [40]:
"""def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs"""

'def epoch_time(start_time, end_time):\n    elapsed_time = end_time - start_time\n    elapsed_mins = int(elapsed_time / 60)\n    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))\n    return elapsed_mins, elapsed_secs'

In [41]:
N_EPOCHS = 200
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

NameError: name 'time' is not defined

In [42]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss = evaluate(model, test_loader, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

RuntimeError: Error(s) in loading state_dict for Seq2Seq:
	Missing key(s) in state_dict: "encoder.emb2hid.weight", "encoder.emb2hid.bias", "encoder.hid2emb.weight", "encoder.hid2emb.bias", "encoder.convs.0.weight", "encoder.convs.0.bias", "encoder.convs.1.weight", "encoder.convs.1.bias", "decoder.tok_embedding.weight", "decoder.pos_embedding.weight", "decoder.emb2hid.weight", "decoder.emb2hid.bias", "decoder.hid2emb.weight", "decoder.hid2emb.bias", "decoder.attn_hid2emb.weight", "decoder.attn_hid2emb.bias", "decoder.attn_emb2hid.weight", "decoder.attn_emb2hid.bias", "decoder.convs.0.weight", "decoder.convs.0.bias", "decoder.convs.1.weight", "decoder.convs.1.bias". 
	Unexpected key(s) in state_dict: "encoder.linear.weight", "encoder.linear.bias", "encoder.rnn.weight_ih_l0", "encoder.rnn.weight_hh_l0", "encoder.rnn.bias_ih_l0", "encoder.rnn.bias_hh_l0", "encoder.rnn.weight_ih_l1", "encoder.rnn.weight_hh_l1", "encoder.rnn.bias_ih_l1", "encoder.rnn.bias_hh_l1", "decoder.embedding.weight", "decoder.rnn.weight_ih_l0", "decoder.rnn.weight_hh_l0", "decoder.rnn.bias_ih_l0", "decoder.rnn.bias_hh_l0", "decoder.rnn.weight_ih_l1", "decoder.rnn.weight_hh_l1", "decoder.rnn.bias_ih_l1", "decoder.rnn.bias_hh_l1". 
	size mismatch for decoder.fc_out.weight: copying a param with shape torch.Size([16, 500]) from checkpoint, the shape in current model is torch.Size([16, 256]).

In [43]:
model.eval()
with torch.no_grad():
    a = model(x_test[:1000].cuda(), y_test_p[:1000].long().cuda())

NameError: name 'x_test' is not defined

In [44]:
n = 91

In [45]:
ll = len((y_test[n].numpy()))
print(tokenization.get_string(np.array([12]+list(a.detach().argmax(2)[1:ll,n].cpu().numpy()))))
print(tokenization.get_string(y_test[n].numpy()))

NameError: name 'y_test' is not defined

In [46]:
seq = y_test[n].numpy().astype('int')
print(seq)
l = len(y_test[n].numpy())
seqs = a.detach().argmax(2)[:l,n].cpu().numpy()
print(seqs)
print(seq == seqs)


NameError: name 'y_test' is not defined

In [47]:
for j in range(len(y_test)):
    w=0
    for i in range(len(y_train)):
        try:
            if torch.sum(y_test[j] == y_train[i]) == len(y_test[j]):
                w = w+1
        except RuntimeError:
            pass
    if w == 0:
        print(j)

NameError: name 'y_test' is not defined

In [48]:
print(y_train[2034])
print(y_test[10])

NameError: name 'y_train' is not defined