In [1]:
from src.data_preprocessing import load_processed_fairytales_dataset, save_experiment_input, shuffle
from src.my_transformer import MyTransformer
import torch
from tqdm import tqdm

In [2]:
# Load the preprocessed fairytale data through the project loader. It returns,
# in this order, the encoder-side pairs, the decoder-side pairs, the vocabulary
# and its reverse mapping (exact structures are defined in src.data_preprocessing).
# NOTE(review): context_size is 50 here and max_sequence_length in the
# configuration cell is also 50 — presumably the two must agree; confirm.
(
    encoding_pairs,
    decoding_pairs,
    vocab,
    reverse_vocab,
) = load_processed_fairytales_dataset(
    how_many=10,
    context_size=50,
)

Reading files: 2124 files [00:00, 4148.45 files/s]
Preprocessing: 100%|██████████| 10/10 [00:00<00:00, 2502.12it/s]
Converting strings to integers: 100%|██████████| 10/10 [00:00<00:00, 9988.82it/s]
Generating encoded pairs: 100%|██████████| 10/10 [00:00<00:00, 1666.72it/s]


In [3]:
# Turn both encoded pair collections into int64 (torch.long) tensors:
# the encoder-side pairs become src_data, the decoder-side pairs tgt_data.
src_data, tgt_data = (
    torch.tensor(pairs, dtype=torch.long)
    for pairs in (encoding_pairs, decoding_pairs)
)

In [4]:
# Experiment id is hard-coded; the trailing comment keeps the automated
# alternative (next free folder number under ./models) for reference.
# The same id names the ./models/<id>/ folder the training cell saves checkpoints to.
experiment_number = 8  # experiment_number = get_next_folder_number(Path('./models'))
# Hand both tensors and both vocabulary mappings to the project helper, tagged with
# the experiment id. Where and in what format they are stored is decided inside
# save_experiment_input (not visible in this notebook).
save_experiment_input(src_data, tgt_data, vocab, reverse_vocab, experiment_number)

Saved successfully


In [5]:
# configuration
# -- model shape --
d_model = 512  # dimensionality of the model's input vectors
num_heads = 8  # heads used by multi-head attention
num_layers = 5  # how many Transformer layers are stacked
ffn_hidden = 2048  # size of the feed-forward part (the dimensionality is expanded to this)
max_sequence_length = 50  # longest sequence in the corpus (or a limit); inputs are always padded to this length
drop_prob = 0.1  # dropout probability

# -- training --
batch_size = 10

# -- vocabulary: source and target share the same vocabulary, hence the same size --
src_vocab_size = target_vocab_size = len(vocab)

In [6]:
# Build the model from the configuration values. The arguments are passed
# positionally, so their order must match MyTransformer's constructor signature.
optimus_prime = MyTransformer(
    src_vocab_size,
    target_vocab_size,
    d_model,
    num_heads,
    num_layers,
    ffn_hidden,
    max_sequence_length,
    drop_prob,
)

In [None]:
# Training cell: moves the model and the full dataset to the available device,
# then runs `num_epochs` passes of mini-batch training with Adam and
# cross-entropy loss, saving the whole model every 5th epoch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
optimus_prime.to(device)
src_data = src_data.to(device)
tgt_data = tgt_data.to(device)

num_epochs = 50

# ignore_index=0: target positions whose class id is 0 do not contribute to the
# loss (presumably 0 is the padding id — confirm against the vocabulary).
criterion = torch.nn.CrossEntropyLoss(ignore_index=0).to(device)
# NOTE(review): lr=0.01 is kept as-is, but it is a high learning rate for Adam on
# a model of this size; consider a smaller value or a warm-up schedule if the
# loss does not decrease.
optimizer = torch.optim.Adam(optimus_prime.parameters(), lr=0.01)

# Mean batch loss of the most recently finished epoch. It is shown in the initial
# progress-bar label of the next epoch, so it must be refreshed after every epoch
# (previously it was never updated and the label always read 0.0000).
average_loss = 0.0
for epoch in range(1, num_epochs + 1):
    p_bar = tqdm(range(0, len(src_data), batch_size), desc=f"Learning epoch: {epoch}, average loss: {average_loss:.4f}")
    optimus_prime.train()
    # Reshuffle source/target together each epoch (project helper).
    src_data, tgt_data = shuffle(src_data, tgt_data)
    total_loss_in_epoch = 0.0
    batches_done = 0
    for batch in p_bar:
        # `batch` is the start offset of the slice; the last slice may be shorter.
        # .to(device) is a no-op when the tensors already live on `device`; kept in
        # case shuffle() hands back tensors on a different device.
        src_batch = src_data[batch:batch + batch_size].to(device)
        tgt_batch = tgt_data[batch:batch + batch_size].to(device)

        optimizer.zero_grad()

        # NOTE(review): the same tgt_batch is fed to the model and used as the
        # label below. Unless MyTransformer or the preprocessing shifts the target
        # by one position, the model can learn to copy its input — confirm.
        output = optimus_prime(src_batch, tgt_batch)
        # Flatten to (positions, vocab) vs. (positions,) as CrossEntropyLoss expects.
        flat_logits = output.view(-1, target_vocab_size)
        flat_targets = tgt_batch.view(-1)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()

        optimizer.step()

        total_loss_in_epoch += loss.item()
        batches_done += 1

        # Running mean over the batches processed so far in this epoch.
        p_bar.set_description(
            desc=f"Learning epoch: {epoch}, average loss: {(total_loss_in_epoch / batches_done):.4f}")

    # Remember this epoch's mean loss for the next epoch's initial bar label.
    # Guarded so an empty dataset does not divide by zero.
    if batches_done:
        average_loss = total_loss_in_epoch / batches_done

    # Checkpoint the entire model object every 5th epoch.
    if not epoch % 5:
        torch.save(optimus_prime, f'./models/{experiment_number}/model_after_{epoch}_epoch.pth')

Learning epoch: 1, average loss: 7.4212:  26%|██▌       | 130/497 [00:10<00:30, 12.09it/s]
