In [1]:
import torch
import seqgen.seq_gen as g
import random
import matplotlib.pyplot as plt
import seaborn as sns
from seqgen.model import rnn
from seqgen.vocabulary import *
from seqgen.model import transformer, embedding
from seqgen.datasets.sequences import *
from seqgen.datasets.realdata import RealSequencesDataset
import random

torch.autograd.set_detect_anomaly(True)

%load_ext autoreload
%autoreload 2

In [2]:
if torch.cuda.device_count():
    device="cuda"
else:
    device="cpu"
print("Device", device)

Device cuda


# The Transformer

In [3]:
lr=1e-3
num_layers=4
embedding_dim=512
batch_size=512
max_length=10
heads=32
dropout=0.1

vocab_in = Vocabulary(vocab_filename="seqgen/vocab_in.txt")
vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")

dataset = RealSequencesDataset(filename="data/train/label.txt", vocab_in=vocab_in, vocab_out=vocab_out, max_length=max_length-1, batch_size=batch_size, device=device, use_random=False)
dataset_val = RealSequencesDataset(filename="data/val/label.txt", vocab_in=vocab_in, vocab_out=vocab_out, max_length=max_length-1, batch_size=batch_size, device=device, use_random=False)


load_from_checkpoint = True
checkpoint_file = "transformer_2023-02-13_17-11-10.pt"

# Transformer model
model = transformer.Transformer(
    encoder_embedding_type=embedding.EmbeddingType.COORDS_DIRECT,
    src_vocab_size=len(vocab_in),
    trg_vocab_size=len(vocab_out),
    embedding_dim=embedding_dim,
    num_layers=num_layers,
    heads=heads,
    dropout=dropout,
    src_pad_idx=3,
    trg_pad_idx=3,
    max_length=max_length,
    device=device
).to(device)

# Initialize optimizer for encoder and decoder
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 30, gamma=0.95)

# Loss function
criterion = torch.nn.NLLLoss(ignore_index=3) # reduction="sum"

# Load model weights from checkpoint
if load_from_checkpoint:
    checkpoint = torch.load(checkpoint_file, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

In [4]:
input_seqs, coordinates, target_seqs = dataset[0]
output = model(input_seqs, target_seqs, coordinates)

accuracy = 0.0
loss = 0.0
for i in range(max_length-1):
    topv, topi = output[:, i, :].topk(1)
    _loss = criterion(output[:, i, :], target_seqs[:, i+1])
    loss += _loss
    accuracy += float((topi.squeeze() == target_seqs[:, i+1]).sum()) / (target_seqs.size(0) * (target_seqs.size(1)-1))

print(loss.item(), accuracy)

13.323533058166504 0.3151041666666667


In [5]:
def predict(input_seqs, coordinates, target_seqs):
    vocab_in = Vocabulary(vocab_filename="seqgen/vocab_in.txt")
    vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")

    with torch.no_grad():
        output = model(input_seqs.cuda(), target_seqs.cuda(), coordinates.cuda())
        # Get the predicted classes of the model
        topv, topi = output.topk(1, dim=2)
        
        return topi.squeeze()
    
def predict_sequentially(input_seqs, coordinates):
    prediction = torch.zeros((input_seqs.size(0), input_seqs.size(1)-1)).to(torch.int64)
    for i in range(max_length-1):
        output = predict(input_seqs, coordinates, prediction)
        prediction[:, i] = output[:, i]
    return prediction

# Training

In [6]:
history = []
accuracies = []
accuracies_val = []

for epoch in range(1000):    
    # Get a batch of training data
    input_seqs, coordinates, target_seqs = dataset[0]
    input_seqs_val, coordinates_val, target_seqs_val = dataset_val[0]
    
    # Set gradients of all model parameters to zero
    optimizer.zero_grad()

    # Initialize loss
    loss = 0
    accuracy = 0.0
    accuracy_val = 0.0

    #####################
    #    TRANSFORMER    #
    #####################
    
    # Run the input sequences through the model
    output = model(input_seqs, target_seqs, coordinates)
    
    # Iterate over sequence positions to compute the loss
    for i in range(max_length-1):
        # Get the predicted classes of the model
        topv, topi = output[:, i, :].topk(1)
        loss += criterion(output[:, i, :], target_seqs[:, i+1])
        accuracy += float((topi.squeeze() == target_seqs[:, i+1]).sum()) / (target_seqs.size(0) * (target_seqs.size(1)-1))
    
    predictions = predict_sequentially(input_seqs_val, coordinates_val).cuda()
    for i in range(max_length-1):
        topi_val = predictions[:, i]
        accuracy_val += float((topi_val == target_seqs_val[:, i+1]).sum()) / (target_seqs_val.size(0)*(target_seqs_val.size(1)-1))
    
    history.append(loss.item())
    accuracies.append(accuracy)
    accuracies_val.append(accuracy_val)
    
    print_every = 50
    if not epoch % print_every:
        _accuracy = sum(accuracies[-print_every:]) / print_every
        _accuracy_val = sum(accuracies_val[-print_every:]) / print_every
        lr = scheduler.get_last_lr()[0]
        print(f"LOSS after epoch {epoch}", loss.item(), "LR", lr, "ACCURACY", _accuracy, "ACCURACY_Val", _accuracy_val)

    # Compute gradient
    loss.backward()
    accuracy = 0.0

    # Update weights of encoder and decoder
    optimizer.step()
    # scheduler.step()

LOSS after epoch 0 13.374762535095215 LR 0.001 ACCURACY 0.006874999999999999 ACCURACY_Val 0.0013237847222222223
LOSS after epoch 50 13.214398384094238 LR 0.001 ACCURACY 0.32578993055555555 ACCURACY_Val 0.07720486111111109
LOSS after epoch 100 13.07995891571045 LR 0.001 ACCURACY 0.3291970486111112 ACCURACY_Val 0.07569010416666666
LOSS after epoch 150 12.695701599121094 LR 0.001 ACCURACY 0.333381076388889 ACCURACY_Val 0.0756684027777778
LOSS after epoch 200 12.16403579711914 LR 0.001 ACCURACY 0.33809027777777784 ACCURACY_Val 0.07125
LOSS after epoch 250 12.164941787719727 LR 0.001 ACCURACY 0.3403168402777777 ACCURACY_Val 0.07113715277777781
LOSS after epoch 300 11.88941478729248 LR 0.001 ACCURACY 0.34415364583333335 ACCURACY_Val 0.07014756944444447
LOSS after epoch 350 11.602581977844238 LR 0.001 ACCURACY 0.3491710069444444 ACCURACY_Val 0.06879774305555558


KeyboardInterrupt: 

#### Save model history

In [None]:
import pickle
from datetime import datetime

model_data = {
    "history": history,
    "accuracy": accuracies,
    "lr": lr,
    "num_layers": num_layers,
    "embedding_dim": embedding_dim,
    "batch_size": batch_size,
    "max_length": max_length,
    "heads": heads,
    "dropout": dropout,
}

now = datetime.now() # current date and time
date_time = now.strftime("%Y-%m-%d_%H-%M-%S")

torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,
    "embedding_dim": embedding_dim,
    "batch_size": batch_size,
    "max_length": max_length,
    "num_layers": num_layers,
    "heads": heads,
    "dropout": dropout,
}, "transformer_" + date_time + ".pt")


with open("training_" + date_time + '.pkl', 'wb') as f:
    pickle.dump(model_data, f)

## Make predictions

We run our input sequences through the model and get output seuences. Then we decode the output sequences with the Vocabulary class and get our final latex code.

In [None]:
def predict(input_seqs, coordinates, target_seqs):
    vocab_in = Vocabulary(vocab_filename="seqgen/vocab_in.txt")
    vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")

    with torch.no_grad():
        output = model(input_seqs.cuda(), target_seqs.cuda(), coordinates.cuda())
        # Get the predicted classes of the model
        topv, topi = output.topk(1, dim=2)
        
        return topi.squeeze()
    
def predict_sequentially(input_seqs, coordinates):
    prediction = torch.zeros((input_seqs.size(0), input_seqs.size(1)-1)).to(torch.int64)
    for i in range(max_length-1):
        output = predict(input_seqs, coordinates, prediction)
        prediction[:, i] = output[:, i]
    return prediction

In [None]:
dataset_val = RealSequencesDataset(filename="data/val/label.txt", vocab_in=vocab_in, vocab_out=vocab_out, max_length=max_length-1, batch_size=batch_size, device=device)

In [None]:
input_seqs, coordinates, target_seqs = dataset[0]
predictions = predict_sequentially(input_seqs, coordinates)

In [None]:
# Pick random sequence and its prediction from the model


vocab_in = Vocabulary(vocab_filename="seqgen/vocab_in.txt")
vocab_out = Vocabulary(vocab_filename="seqgen/vocab_out.txt")

i = random.randint(0, predictions.size(0)-1)

input = vocab_in.decode_sequence(input_seqs[i].cpu().numpy())
output = vocab_out.decode_sequence(predictions[i].cpu().numpy())
target = vocab_out.decode_sequence(target_seqs[i].cpu().numpy())

#input = list(filter(lambda x:(x != '<pad>') and (x != '<start>') and (x != '<end>'), input))
#output = list(filter(lambda x: (x != '<pad>'), output))
#target = list(filter(lambda x: (x != '<pad>') and (x != '<start>') and (x != '<end>'), target))


print("MODEL INPUT: \t", " ".join(input))
print("MODEL OUTPUT: \t", " ".join(output))
print("TARGET OUTPUT: \t", " ".join(target))