In [4]:
import sys
sys.path.append('../')  # required to use harmony package

from harmony.models.seq2seq import TransformerEncoder, TransformerDecoder, Seq2SeqTransformer

In [5]:
import os
import h5py
import math
import json
import torch
import librosa

import numpy as np
import torch.nn as nn
import torch.optim as optim
import IPython.display as ipd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import defaultdict
from skimage.util import view_as_windows
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [6]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


## Carregamento dos dados

In [7]:
# The vocabulary size is the number of top-level entries in the vocab file.
# Open it explicitly read-only: the file is only inspected here, and older
# h5py releases default to a writable mode when no mode is given.
with h5py.File('/local/thiago.poppe/beatles_vocab.h5', 'r') as vocab:
    vocab_size = len(vocab)

print('Vocab size:', vocab_size)

Vocab size: 450


In [8]:
class BeatlesDataset(Dataset):
    """Dataset of Beatles chunks stored in one HDF5 file per split.

    Every top-level key of ``beatles_{split}_chunks.h5`` is treated as one
    sample; each sample group is read through its ``'spectrogram'`` and
    ``'harmony'`` datasets.

    Args:
        split: Name interpolated into the file name (e.g. ``'train'``,
            ``'valid'``).
        data_dir: Directory containing the chunk files. Defaults to the
            location that used to be hard-coded, so existing calls behave
            exactly as before.
    """

    def __init__(self, split='train', data_dir='/local/thiago.poppe'):
        self.filepath = os.path.join(data_dir, f'beatles_{split}_chunks.h5')
        # Only the chunk ids are read up front; the file is closed again here
        # and reopened on every __getitem__ call.
        with h5py.File(self.filepath, 'r') as h5:
            self.chunk_ids = list(h5.keys())

    def __len__(self):
        """Return the number of chunks found in the split file."""
        return len(self.chunk_ids)

    def __getitem__(self, idx):
        """Load one chunk and return ``(spec, harmony)`` as in-memory arrays.

        The spectrogram is transposed relative to how it is stored.
        """
        chunk_id = self.chunk_ids[idx]
        with h5py.File(self.filepath, 'r') as h5:
            harmony = h5[chunk_id]['harmony'][:]
            spec = h5[chunk_id]['spectrogram'][:].T  # transformer expects (seq_length, n_features)

        return spec, harmony

# One dataset per split; each one reads its own HDF5 chunk file.
train_dataset, valid_dataset = (BeatlesDataset(split=name) for name in ('train', 'valid'))

In [9]:
def collate_fn(batches):
    """Collate ``(spec, harmony)`` samples into padded batch tensors.

    Args:
        batches: Sequence of ``(spec, harmony)`` pairs of NumPy arrays. Each
            ``spec`` is indexed time-first and may differ in length along that
            axis; all ``harmony`` arrays must share one shape because they are
            stacked.

    Returns:
        Tuple ``(specs, harmony, padding_mask)`` where ``specs`` is a float
        tensor zero-padded to the longest sequence in the batch (batch first),
        ``harmony`` is the stacked targets, and ``padding_mask`` is a boolean
        ``(batch, max_len)`` tensor that is ``True`` on padded time steps.
    """
    specs = [torch.from_numpy(sample[0]).float() for sample in batches]

    # Record the true lengths BEFORE padding. Deriving the mask from
    # `specs == 0` would also flag genuine frames whose first feature happens
    # to be exactly zero.
    lengths = torch.tensor([spec.size(0) for spec in specs])
    specs = pad_sequence(specs, batch_first=True)
    positions = torch.arange(specs.size(1)).unsqueeze(0)
    padding_mask = positions >= lengths.unsqueeze(1)

    harmony = torch.stack([torch.from_numpy(sample[1]) for sample in batches])

    return specs, harmony, padding_mask

# Shuffle only the training data. Validation is read in a fixed order so its
# batches, and therefore the per-batch averaged metrics, do not depend on a
# random permutation.
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

print('Length of train dataloader:', len(train_dataloader))
print('Length of valid dataloader:', len(valid_dataloader))

Length of train dataloader: 50
Length of valid dataloader: 12


## Definindo modelo seq2seq

In [15]:
# Small encoder/decoder pair sharing the same embedding width, wrapped into a
# single seq2seq model that is then moved to the selected device.
encoder = TransformerEncoder(
    in_features=6,
    embedding_size=8,
    num_heads=1,
    dim_feedforward=16,
    num_layers=1,
)
decoder = TransformerDecoder(
    vocab_size=vocab_size,
    embedding_size=8,
    num_heads=1,
    dim_feedforward=16,
    num_layers=1,
)
model = Seq2SeqTransformer(encoder, decoder)
model = model.to(device)

# Total element count over every registered parameter tensor.
print('Number of learnable parameters:', sum(map(torch.numel, model.parameters())))
print(model)

Number of learnable parameters: 9210
Seq2SeqTransformer(
  (encoder): TransformerEncoder(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
          )
          (linear1): Linear(in_features=8, out_features=16, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=16, out_features=8, bias=True)
          (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (embedding): LinearEmbeddingLayer(
      (embedding): Linear(in_features=6, out_features=8, bias=True)
      (pos_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=Fa

Aqui iremos testar o modelo com um dado aleatório, apenas para ver se tudo está funcionando.
- Tamanho da entrada: `(batch_size, seq_length, in_features) -> (4, 128, 6)`
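- Tamanho do alvo (entrada do decoder): `(batch_size, seq_length) -> (4, 64)`
- Tamanho da saída do modelo: `(batch_size, seq_length, vocab_size) -> (4, 64, 450)`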

In [16]:
# Random stand-ins: 4 source sequences of 128 frames x 6 features, and
# 4 target sequences of 64 token ids drawn from the vocabulary.
src = torch.rand((4, 128, 6)).to(device)
tgt = torch.randint(0, vocab_size, (4, 64)).to(device)

# Input padding mask: all False, i.e. no source position is treated as padding.
padding_mask = torch.zeros(4, 128, dtype=torch.bool).to(device)
# The causal mask must match the target sequence length.
tgt_causal_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(device)

outputs = model(src, tgt, tgt_mask=tgt_causal_mask, padding_mask=padding_mask)
print('Tamanho da sáida:', outputs.shape)  # (batch_size, seq_length, vocab_size)

Tamanho da sáida: torch.Size([4, 64, 450])


## Treinando modelo seq2seq

In [21]:
def train(model, train_dataloader, criterion, optimizer):
    """Run one training epoch with teacher forcing.

    For every batch the decoder input is ``<sos> + harmony`` and the target is
    ``harmony + <eos>``, so each position is trained to predict the next token.

    Args:
        model: Called as ``model(specs, decoder_input, tgt_mask=...,
            padding_mask=...)``; its output is arg-maxed over the last
            dimension and transposed on dims (1, 2) before the loss.
        train_dataloader: Iterable yielding ``(specs, harmony, padding_mask)``.
        criterion: Loss called as ``criterion(outputs.transpose(1, 2), target)``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        ``(train_loss, train_acc)`` averaged over batches (not over samples).
        ``train_loss`` is a Python float; ``train_acc`` is a 0-dim tensor
        because it accumulates tensor results.
    """
    # The special-token ids are constant, so read them once per call instead
    # of reopening the vocabulary file for every batch.
    with h5py.File('/local/thiago.poppe/beatles_vocab.h5', 'r') as vocab:
        sos_id = vocab['<sos>'][()]
        eos_id = vocab['<eos>'][()]

    train_loss = 0.0
    train_acc = 0.0

    model.train()
    for specs, harmony, padding_mask in train_dataloader:
        specs = specs.to(device)
        padding_mask = padding_mask.to(device)

        batch_size = harmony.size(0)
        sos_tokens = torch.full((batch_size, 1), fill_value=sos_id)
        eos_tokens = torch.full((batch_size, 1), fill_value=eos_id)

        # Shifted-by-one pair: the input starts with <sos>, the target ends with <eos>.
        decoder_input = torch.cat([sos_tokens, harmony], dim=1).to(device)
        decoder_target = torch.cat([harmony, eos_tokens], dim=1).to(device)

        tgt_causal_mask = nn.Transformer.generate_square_subsequent_mask(decoder_input.size(1)).to(device)

        outputs = model(specs, decoder_input, tgt_mask=tgt_causal_mask, padding_mask=padding_mask)
        # The class dimension is moved to dim 1 before calling the criterion.
        loss = criterion(outputs.transpose(1, 2), decoder_target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Token-level accuracy over every position of the batch.
        predictions = torch.argmax(outputs, dim=-1)
        train_acc += (predictions == decoder_target).sum() / predictions.numel()

    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataloader)

    return train_loss, train_acc


def validate(model, valid_dataloader, criterion):
    """Evaluate the model on a dataloader without computing gradients.

    Uses the same teacher-forced setup as training: the decoder input is
    ``<sos> + harmony`` and the target is ``harmony + <eos>``.

    Args:
        model: Called as ``model(specs, decoder_input, tgt_mask=...,
            padding_mask=...)``.
        valid_dataloader: Iterable yielding ``(specs, harmony, padding_mask)``.
        criterion: Loss called as ``criterion(outputs.transpose(1, 2), target)``.

    Returns:
        ``(valid_loss, valid_acc)`` averaged over batches (not over samples).
        ``valid_loss`` is a Python float; ``valid_acc`` is a 0-dim tensor
        because it accumulates tensor results.
    """
    # The special-token ids are constant, so read them once per call instead
    # of reopening the vocabulary file for every batch.
    with h5py.File('/local/thiago.poppe/beatles_vocab.h5', 'r') as vocab:
        sos_id = vocab['<sos>'][()]
        eos_id = vocab['<eos>'][()]

    valid_loss = 0.0
    valid_acc = 0.0

    # For some reason this did not run with model.eval() here and num_heads = 1...
    # the cause still has to be investigated.
    # NOTE(review): with eval() disabled the model stays in whatever mode the
    # caller left it in, so dropout may still be active during validation.
    # model.eval()

    with torch.no_grad():
        for specs, harmony, padding_mask in valid_dataloader:
            specs = specs.to(device)
            padding_mask = padding_mask.to(device)

            batch_size = harmony.size(0)
            sos_tokens = torch.full((batch_size, 1), fill_value=sos_id)
            eos_tokens = torch.full((batch_size, 1), fill_value=eos_id)

            # Shifted-by-one pair: the input starts with <sos>, the target ends with <eos>.
            decoder_input = torch.cat([sos_tokens, harmony], dim=1).to(device)
            decoder_target = torch.cat([harmony, eos_tokens], dim=1).to(device)

            tgt_causal_mask = nn.Transformer.generate_square_subsequent_mask(decoder_input.size(1)).to(device)

            outputs = model(specs, decoder_input, tgt_mask=tgt_causal_mask, padding_mask=padding_mask)
            loss = criterion(outputs.transpose(1, 2), decoder_target)
            valid_loss += loss.item()

            # Token-level accuracy over every position of the batch.
            predictions = torch.argmax(outputs, dim=-1)
            valid_acc += (predictions == decoder_target).sum() / predictions.numel()

        valid_loss /= len(valid_dataloader)
        valid_acc /= len(valid_dataloader)

        return valid_loss, valid_acc

In [22]:
num_epochs = 100
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Metric histories; entries are recorded only on the epochs that are reported.
train_losses, valid_losses = [], []
train_accs, valid_accs = [], []

for epoch in range(1, num_epochs+1):
    train_loss, train_acc = train(model, train_dataloader, criterion, optimizer)
    valid_loss, valid_acc = validate(model, valid_dataloader, criterion)

    # Report and record only every fifth epoch.
    if epoch % 5 != 0:
        continue

    print(f'Epoch {epoch}/{num_epochs}:')
    print(f' - Train loss: {train_loss:.5f}, train accuracy: {train_acc:.5f}')
    print(f' - Valid loss: {valid_loss:.5f}, valid accuracy: {valid_acc:.5f}', end='\n\n')

    train_losses.append(train_loss)
    train_accs.append(train_acc)

    valid_losses.append(valid_loss)
    valid_accs.append(valid_acc)

Epoch 5/100:
 - Train loss: 4.88566, train accuracy: 0.13386
 - Valid loss: 5.11720, valid accuracy: 0.18678

Epoch 10/100:
 - Train loss: 3.92718, train accuracy: 0.26294
 - Valid loss: 4.52776, valid accuracy: 0.32019

Epoch 15/100:
 - Train loss: 3.31506, train accuracy: 0.36389
 - Valid loss: 4.25788, valid accuracy: 0.36434

Epoch 20/100:
 - Train loss: 2.88806, train accuracy: 0.43604
 - Valid loss: 4.15363, valid accuracy: 0.39367

Epoch 25/100:
 - Train loss: 2.57766, train accuracy: 0.50606
 - Valid loss: 4.13561, valid accuracy: 0.41482

Epoch 30/100:
 - Train loss: 2.32899, train accuracy: 0.56241
 - Valid loss: 4.08987, valid accuracy: 0.43534

Epoch 35/100:
 - Train loss: 2.13842, train accuracy: 0.60636
 - Valid loss: 4.06285, valid accuracy: 0.45729

Epoch 40/100:
 - Train loss: 1.97911, train accuracy: 0.64349
 - Valid loss: 4.14391, valid accuracy: 0.45841

Epoch 45/100:
 - Train loss: 1.85372, train accuracy: 0.67543
 - Valid loss: 4.08670, valid accuracy: 0.48037

Ep