In [1]:
import torch
import torch.nn as nn
import h5py
import sys
sys.path.append('../')  # required to use harmony package

from harmony.models.seq2seq import TransformerEncoder, TransformerDecoder, Seq2SeqTransformer

In [2]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [3]:
# Peek at the first training example: group count, dataset names, array shapes
# and a small sample of the raw values. Each dataset is read from disk once.
with h5py.File("/storage/datasets/thiago.poppe/BPS_FH/dataset/train.h5", "r") as fp:
    first_example = fp['data_0']
    example_labels = first_example['labels'][:]
    example_pianoroll = first_example['pianoroll'][:]

    print(len(fp))
    print(first_example.keys())
    print(example_labels.shape)
    print(example_pianoroll.shape)
    print(example_labels[:, :10])
    print(example_pianoroll)


5316
<KeysViewHDF5 ['labels', 'mask', 'pianoroll']>
(6, 160)
(1, 640, 12)
[[12 12 12 12 12 12 12 12 12 12]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 4  4  4  4  4  4  4  4  4  4]
 [ 0  0  0  0  0  0  0  0  0  0]
 [ 1  1  1  1  1  1  1  1  1  1]
 [ 7  7  7  7  7  7  7  7  7  7]]
[[[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


Carregando Dados


In [4]:
import h5py
import pickle

base_path = "/storage/datasets/thiago.poppe/BPS_FH/dataset/"
splits = ['train', 'test', 'validation']

# Per-split dictionaries are not populated here; the names are kept so that any
# cell that still refers to them keeps working.
label_train_dict = {}
label_test_dict = {}
label_validation_dict = {}

# Collect every distinct label combination that appears in any split, plus the
# two special tokens used to delimit decoder sequences.
all_labels = {'<sos>', '<eos>'}
for nome_split in splits:
    with h5py.File(f"{base_path}{nome_split}.h5", "r") as fp:
        for dt in fp:
            labels = fp[dt]['labels'][:]
            # Transpose so that each row holds the values of one column of
            # `labels`; every such row is collapsed into a single token.
            transposed_labels = labels.T

            for row in transposed_labels:
                # A token is the space-joined string of the row's values.
                label_str = ' '.join(str(num) for num in row)
                all_labels.add(label_str)

# Enumerate in sorted order. Iterating the set directly yields an order that
# depends on the per-process string hash seed, so token ids would change on
# every kernel restart and no longer match a previously trained model.
token_dict = {string: i for i, string in enumerate(sorted(all_labels))}

In [5]:
# Vocabulary size: number of distinct tokens in token_dict (every label
# combination found in the splits plus the '<sos>' and '<eos>' markers).
len(token_dict)

2794

In [6]:
import numpy as np
from torch.utils.data import Dataset, DataLoader

class PianoRollDataSet(Dataset):
    """Map-style dataset over one split of the piano-roll HDF5 files.

    Each example is stored in a group named ``data_<idx>`` that holds the
    ``pianoroll``, ``labels`` and ``mask`` datasets. The file is reopened on
    every access, so no open handle is kept on the instance.

    Label columns are converted to token ids through the module-level
    ``token_dict``, which must be defined before items are fetched.
    """

    def __init__(self, split):
        """Record the file path for ``split`` and count the examples in it.

        Args:
            split: Name of the split; the file read is ``<split>.h5``.
        """
        self.filepath = f'/storage/datasets/thiago.poppe/BPS_FH/dataset/{split}.h5'
        with h5py.File(self.filepath, 'r') as fp:
            self.dataset_size = len(fp.keys())

    def __len__(self):
        """Return the number of examples counted when the dataset was built."""
        return self.dataset_size

    def __getitem__(self, idx):
        """Load one example.

        Args:
            idx: Integer position; negative values count from the end.

        Returns:
            Tuple ``(pianoroll, labels, mask)`` of NumPy arrays, where
            ``labels`` holds one token id per column of the stored label
            matrix.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
            KeyError: If a label combination is missing from ``token_dict``.
        """
        requested = idx
        if idx < 0:
            idx += self.dataset_size
        # Raise IndexError (not h5py's KeyError) for out-of-range positions so
        # that plain iteration and sequence-style consumers terminate cleanly.
        if not 0 <= idx < self.dataset_size:
            raise IndexError(
                f'index {requested} is out of range for dataset of size {self.dataset_size}'
            )

        with h5py.File(self.filepath, 'r') as fp:
            data = fp[f'data_{idx}']
            # The stored pianoroll and mask carry a leading axis; [0] drops it.
            pianoroll = data['pianoroll'][:][0]
            labels = data['labels'][:]
            mask = data['mask'][:][0]

        # Collapse each column of the label matrix into its vocabulary id.
        labels = np.array([
            token_dict[' '.join(str(num) for num in row)]
            for row in labels.T
        ])
        return pianoroll, labels, mask

In [7]:
# Sanity check: how many examples does the validation split contain?
validation_path = '/storage/datasets/thiago.poppe/BPS_FH/dataset/validation.h5'
with h5py.File(validation_path, 'r') as fp:
    num_validation_examples = len(fp)
    print(num_validation_examples)

69


In [8]:
# Build one dataset object per split, in the order train, test, validation.
train_dataset, test_dataset, valid_dataset = (
    PianoRollDataSet(split_name) for split_name in ('train', 'test', 'validation')
)

# Display the number of training examples.
len(train_dataset)


5316

In [9]:
# Fetch the first training example and display the shape of each returned
# array (pianoroll, token ids, mask) as a quick sanity check.
pianoroll, labels, mask = train_dataset[0]
pianoroll.shape, labels.shape, mask.shape

((640, 12), (160,), (160,))

In [10]:
from torch.nn.utils.rnn import pad_sequence

In [17]:
def collate_fn(batches):
    """Merge a list of ``(pianoroll, labels, mask)`` samples into batch tensors.

    Every field is converted from NumPy to a tensor and zero-padded along the
    first axis to the longest sample in the batch.

    Args:
        batches: Sequence of ``(pianoroll, labels, mask)`` tuples of NumPy
            arrays.

    Returns:
        Tuple ``(pianoroll, labels, inverted_mask)``: float piano rolls,
        padded labels, and a boolean tensor that is True wherever the padded
        mask is zero (which includes every position added by padding).
    """
    def _padded(field_index, as_float=False):
        # Gather one field from every sample and pad it to a common length.
        tensors = []
        for sample in batches:
            tensor = torch.from_numpy(sample[field_index])
            tensors.append(tensor.float() if as_float else tensor)
        return pad_sequence(tensors, batch_first=True)

    pianoroll_batch = _padded(0, as_float=True)
    label_batch = _padded(1)
    mask_batch = _padded(2)

    return pianoroll_batch, label_batch, torch.logical_not(mask_batch.bool())


In [18]:
from tqdm.notebook import tqdm

# Batch both splits with the shared collate function. Only the training data
# is shuffled; the validation order is fixed so evaluation is repeatable.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
validation_dataloader = DataLoader(valid_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)

# Report the number of batches per epoch for each loader (a bare `len(...)`
# in the middle of a cell displays nothing).
print('Train batches:', len(train_dataloader))
print('Validation batches:', len(validation_dataloader))

# Smoke test: one full pass over each loader to confirm that every example
# can be read and collated without errors.
for pianoroll, labels, mask in tqdm(train_dataloader):
    continue
for pianoroll, labels, mask in tqdm(validation_dataloader):
    continue

# Shapes of the last validation batch; it can be smaller than batch_size when
# the split size is not a multiple of it.
pianoroll.shape, labels.shape, mask.shape

  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

(torch.Size([1, 640, 12]), torch.Size([1, 160]), torch.Size([1, 160]))

Definindo Modelo

In [19]:

# Deliberately small configuration; encoder and decoder share the same
# embedding width, head count, feed-forward width and depth.
shared_hparams = dict(embedding_size=8, num_heads=1, dim_feedforward=16, num_layers=1)

encoder = TransformerEncoder(in_features=12, **shared_hparams)
decoder = TransformerDecoder(vocab_size=len(token_dict), **shared_hparams)
model = Seq2SeqTransformer(encoder, decoder).to(device)

# Count every parameter tensor registered on the model.
num_parameters = sum(param.numel() for param in model.parameters())
print('Number of learnable parameters:', num_parameters)
print(model)

Number of learnable parameters: 49106
Seq2SeqTransformer(
  (encoder): TransformerEncoder(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=8, out_features=8, bias=True)
          )
          (linear1): Linear(in_features=8, out_features=16, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=16, out_features=8, bias=True)
          (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (embedding): LinearEmbeddingLayer(
      (embedding): Linear(in_features=12, out_features=8, bias=True)
      (pos_encoding): PositionalEncoding(
        (dropout): Dropout(p=0.1, inplace=

In [22]:
# Forward-pass smoke test on random tensors shaped like one training batch.
src = torch.rand(4, 640, 12).to(device)
tgt = torch.randint(low=0, high=len(token_dict), size=(4, 160)).to(device)  # randint already returns int64

# All-False padding mask with the same (batch, length) shape as tgt.
# NOTE(review): presumably False means "not padding" — confirm against Seq2SeqTransformer.
padding_mask = torch.zeros(4, 160, dtype=torch.bool).to(device)
# The causal mask must be square, with side equal to the target sequence length.
tgt_causal_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(device)

outputs = model(src, tgt, tgt_mask=tgt_causal_mask, padding_mask=padding_mask)
print('Tamanho da sáida:', outputs.shape)  # (batch_size, seq_length, vocab_size)

Tamanho da sáida: torch.Size([4, 160, 2794])


Treinando o Modelo

In [34]:
from tqdm.notebook import tqdm

def train(model, train_dataloader, criterion, optimizer):
    """Run one teacher-forced training epoch.

    For every batch the decoder input is ``<sos>`` followed by the labels and
    the decoder target is the labels followed by ``<eos>``. Relies on the
    module-level ``device`` and ``token_dict``.

    Args:
        model: Called as ``model(src, tgt, tgt_mask=..., padding_mask=...)``
            and expected to return logits of shape
            ``(batch, seq_length, vocab_size)``.
        train_dataloader: Yields ``(pianoroll, label, padding_mask)`` batches.
        criterion: Loss called with logits transposed to
            ``(batch, vocab_size, seq_length)`` and the integer targets.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Tuple ``(train_loss, train_acc)``: the mean per-batch loss (float) and
        the mean per-batch token accuracy (a 0-dim tensor).
    """
    sos_id = token_dict['<sos>']
    eos_id = token_dict['<eos>']

    train_loss = 0.0
    train_acc = 0.0

    model.train()
    for pianoroll, label, padding_mask in tqdm(train_dataloader):
        pianoroll = pianoroll.to(device)

        # Take the batch size from the batch itself: the last batch of an
        # epoch may be smaller, and the loader may use any batch_size.
        batch_size = label.size(0)
        sos_tokens = torch.full((batch_size, 1), fill_value=sos_id)
        eos_tokens = torch.full((batch_size, 1), fill_value=eos_id)

        # Shift by one position: the decoder sees <sos> + labels and has to
        # predict labels + <eos>.
        decoder_input = torch.cat([sos_tokens, label], dim=1).to(device)
        decoder_target = torch.cat([label, eos_tokens], dim=1).to(device)

        # Extend the mask with a False entry for the prepended <sos> token so
        # it lines up with decoder_input.
        falses = torch.full((batch_size, 1), fill_value=False)
        padding_mask = torch.cat([falses, padding_mask], dim=1).to(device)
        tgt_causal_mask = nn.Transformer.generate_square_subsequent_mask(decoder_input.size(1)).to(device)

        outputs = model(pianoroll, decoder_input, tgt_mask=tgt_causal_mask, padding_mask=padding_mask)
        # The loss expects the class dimension second: (batch, vocab, seq).
        loss = criterion(outputs.transpose(1, 2), decoder_target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Token accuracy over every position of the batch.
        # NOTE(review): positions flagged in padding_mask are still counted in
        # both loss and accuracy — confirm that this is intended.
        predictions = torch.argmax(outputs, dim=-1)
        train_acc += torch.sum(predictions == decoder_target) / predictions.numel()

    train_loss /= len(train_dataloader)
    train_acc /= len(train_dataloader)

    return train_loss, train_acc

In [35]:
import torch.optim as optim

num_epochs = 20
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Metric history. The validation lists stay empty while validation is disabled.
train_accs, valid_accs = [], []
train_losses, valid_losses = [], []

for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train(model, train_dataloader, criterion, optimizer)
    # TODO: validation is currently disabled; when a validate() function is
    # available, evaluate on validation_dataloader here and log/record
    # valid_loss and valid_acc alongside the training metrics.

    # Report and record only on even epochs.
    # NOTE(review): the history lists therefore hold one entry per two epochs
    # — confirm this is intended before plotting against the epoch number.
    if epoch % 2 != 0:
        continue

    print(f'Epoch {epoch}/{num_epochs}:')
    print(f' - Train loss: {train_loss:.5f}, train accuracy: {train_acc:.5f}')

    train_losses.append(train_loss)
    train_accs.append(train_acc)

  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 2/20:
 - Train loss: 4.19266, train accuracy: 0.31928


  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 4/20:
 - Train loss: 3.29840, train accuracy: 0.46144


  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 6/20:
 - Train loss: 2.87999, train accuracy: 0.52058


  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 8/20:
 - Train loss: 2.62265, train accuracy: 0.55954


  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 10/20:
 - Train loss: 2.44093, train accuracy: 0.58735


  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 12/20:
 - Train loss: 2.30932, train accuracy: 0.60746


  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 14/20:
 - Train loss: 2.20980, train accuracy: 0.62342


  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 16/20:
 - Train loss: 2.12845, train accuracy: 0.63629


  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 18/20:
 - Train loss: 2.06257, train accuracy: 0.64667


  0%|          | 0/1329 [00:00<?, ?it/s]

  0%|          | 0/1329 [00:00<?, ?it/s]

Epoch 20/20:
 - Train loss: 2.00775, train accuracy: 0.65514
