In [None]:
import random
import torch
import scaleformer

## Loading data

In [None]:
# Read the whole parallel corpus: one sentence pair per line, with the two
# sentences separated by a tab. Empty lines are skipped and everything is
# lower-cased before splitting.
with open("data/sentence_pairs.txt", encoding="utf-8") as file:
    data = file.read().split("\n")
    # Transpose the list of [first, second] fields into two parallel tuples:
    # `en` holds the first column of every line, `fr` the second.
    en, fr = zip(*(line.lower().split("\t") for line in data if line))

## Training the input tokenizer

In [None]:
# Reuse the cached input-side tokenizer when it can be loaded; otherwise train
# a new BytePairEncoder on the `en` sentences and cache it for the next run.
try:
    tokenizer_in = scaleformer.BytePairEncoder.load("files/tokenizer_in.json")
except Exception:
    # `except Exception` instead of a bare `except`: KeyboardInterrupt and
    # SystemExit must propagate rather than silently trigger a retraining run.
    # Any ordinary load failure (e.g. missing cache file) still falls back to
    # training from scratch.
    tokenizer_in = scaleformer.BytePairEncoder()
    subwords_en = tokenizer_in.train(en, min_frequency=1.0E-7, max_tokens=5000, prune=True)
    tokenizer_in.save("files/tokenizer_in.json", overwrite=True)

## Training the target tokenizer

In [None]:
# Reuse the cached target-side tokenizer when it can be loaded; otherwise train
# a new BytePairEncoder on the `fr` sentences and cache it for the next run.
try:
    tokenizer_out = scaleformer.BytePairEncoder.load("files/tokenizer_out.json")
except Exception:
    # `except Exception` instead of a bare `except`: KeyboardInterrupt and
    # SystemExit must propagate rather than silently trigger a retraining run.
    # Any ordinary load failure (e.g. missing cache file) still falls back to
    # training from scratch.
    tokenizer_out = scaleformer.BytePairEncoder()
    subwords_fr = tokenizer_out.train(fr, min_frequency=1.0E-7, max_tokens=5000, prune=True)
    tokenizer_out.save("files/tokenizer_out.json", overwrite=True)

In [None]:
# Development aid: re-import the scaleformer package so that edits made to its
# source on disk take effect without restarting the kernel. Objects created
# before this cell (e.g. the tokenizers) are not rebuilt by the reload.
import importlib

reload = importlib.reload  # keep the bare name bound for later use
scaleformer = reload(scaleformer)

## Converting dataset to tensors

In [None]:
# Load the cached train/validation tensors when all four files can be read;
# otherwise encode the corpus, split it 80/20 and cache the result.
SPLIT_SEED = 0  # fixed seed so that a rebuilt cache yields the same split

try:
    x_train = torch.load("files/x_train.pty")
    y_train = torch.load("files/y_train.pty")
    x_val = torch.load("files/x_val.pty")
    y_val = torch.load("files/y_val.pty")
except Exception:
    # `except Exception` instead of a bare `except`: KeyboardInterrupt and
    # SystemExit must propagate rather than silently trigger a full rebuild.
    # If any one of the four loads fails, all four tensors are regenerated so
    # that the splits stay consistent with each other.
    x = scaleformer.strings_to_tensor(en, tokenizer_in)
    y = scaleformer.strings_to_tensor(fr, tokenizer_out)
    indexes = list(range(len(x)))
    # Dedicated seeded generator: the split is reproducible and does not
    # depend on (or disturb) the global `random` state.
    random.Random(SPLIT_SEED).shuffle(indexes)
    lim = int(round(0.8*len(x)))  # first 80% of the shuffled rows -> training
    i_train, i_val = indexes[:lim], indexes[lim:]
    # The same row indices are applied to x and y so pairs stay aligned.
    x_train, y_train = x[i_train], y[i_train]
    x_val, y_val = x[i_val], y[i_val]
    torch.save(x_train, "files/x_train.pty")
    torch.save(y_train, "files/y_train.pty")
    torch.save(x_val, "files/x_val.pty")
    torch.save(y_val, "files/y_val.pty")

## Training the model

In [None]:
# Build the model, train it with early stopping, then persist model and
# optimizer to disk.

# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, so the notebook also runs on machines without a GPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# NOTE(review): the positional arguments 6, 64, 4 are presumably the number of
# stages/layers, the embedding dimension and the number of heads; confirm
# against the scaleformer.Transformer signature.
model = scaleformer.Transformer(tokenizer_in, tokenizer_out, 6, 64, 4, dropout=0., scalable=True)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1.0E-3)
train_losses, val_losses, best_epoch = scaleformer.train_loop(model, optimizer, (x_train, y_train), (x_val, y_val), n_epochs=1000, patience=100, batch_size=100)
# NOTE(review): these calls pickle the whole objects, which ties the files to
# the current scaleformer source layout; saving `state_dict()`s would be more
# portable. Left as is to keep the on-disk format unchanged.
torch.save(model, "files/model.pty")
torch.save(optimizer, "files/optimizer.pty")

## Display results

In [None]:
# Plot the training and validation loss histories returned by the training
# loop, together with the epoch it reported as best.
scaleformer.plot_loss(train_losses, val_losses, best_epoch)