In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [2]:
def loss_fn(output, target):
    pitch_loss = nn.CrossEntropyLoss()(
        output['pitch'].reshape(-1, output['pitch'].shape[-1]),
        target[:, :, 0].reshape(-1).long()  # <-- cast to long
    )
    mse_loss = nn.MSELoss()
    
    velocity_loss = mse_loss(
        output['velocity'].reshape(-1, output['velocity'].shape[-1]),
        target[:, :, 1].reshape(-1, 1)
    )
    time_loss = mse_loss(
        output['time'].reshape(-1, output['time'].shape[-1]),
        target[:, :, 2].reshape(-1, 1)
    )
    duration_loss = mse_loss(
        output['duration'].reshape(-1, output['duration'].shape[-1]),
        target[:, :, 3].reshape(-1, 1)
    )

    return pitch_loss + velocity_loss + time_loss + duration_loss


In [3]:
from src.models.GRU import GRUAutoencoder
from src.dataloader import EventMidiDataset

In [4]:
import os
from tqdm import tqdm

In [5]:
folder_path = 'data/midi_dataset/midis'
file_names = os.listdir(folder_path)
file_names = [os.path.join(folder_path, f) for f in file_names]

In [6]:
model = GRUAutoencoder().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [7]:
dataset = EventMidiDataset(file_names[:1000])

midi_dataloader = DataLoader(
    dataset,
    batch_size=16,
    pin_memory=True,
    num_workers=4,
    shuffle=True,
)

In [11]:
max_vals = None
min_vals = None

for X_batch in midi_dataloader:
    batch_max = X_batch.view(-1, X_batch.shape[-1]).max(dim=0).values
    batch_min = X_batch.view(-1, X_batch.shape[-1]).min(dim=0).values

    if max_vals is None:
        max_vals = batch_max
        min_vals = batch_min
    else:
        max_vals = torch.max(max_vals, batch_max)
        min_vals = torch.min(min_vals, batch_min)

print("Max values across all batches (pitch, velocity, time, duration):", max_vals)
print("Min values across all batches (pitch, velocity, time, duration):", min_vals)


Max values across all batches (pitch, velocity, time, duration): tensor([105.0000,   0.9370, 233.6133,   6.0052])
Min values across all batches (pitch, velocity, time, duration): tensor([2.2000e+01, 3.1496e-02, 0.0000e+00, 1.3021e-03])


In [None]:
def fit(dataloader, model, optimizer, loss_fn, epochs=50):
    for _ in range(epochs):
        for X_batch in tqdm(dataloader):
            X_batch = X_batch.cuda()

            input_seq = X_batch[:, :-1]
            target_seq = X_batch[:, 1:]

            logits = model(input_seq)
            loss = loss_fn(logits, target_seq)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [None]:
fit(midi_dataloader, model, optimizer, loss_fn, 50)

  0%|          | 0/23224 [00:15<?, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)

In [None]:
test_dataset = EventMidiDataset(file_names[1000:1100])

test_dl = DataLoader(
    dataset,
    batch_size=16,
    pin_memory=True,
    num_workers=4,
    shuffle=True,
)

In [None]:
total_loss = 0
num_batches = 0

model.eval()
with torch.no_grad():
    for X_batch in test_dl:
        X_batch = X_batch.cuda()
        input_seq = X_batch[:, :-1]
        target_seq = X_batch[:, 1:]

        output = model(input_seq)
        loss = loss_fn(output, target_seq)

        total_loss += loss.item()
        num_batches += 1

avg_loss = total_loss / num_batches
print(f"Average Test Loss: {avg_loss:.4f}")

Average Test Loss: 0.5023


In [None]:
sample = test_dataset[0]  # shape: (seq_len, 4)
sample = sample.unsqueeze(0).cuda()  # Add batch dimension → shape: (1, seq_len, 4)

In [None]:
model.eval()
with torch.no_grad():
    reconstructed = model(sample)
reconstructed_pitch = torch.argmax(reconstructed['pitch'], dim=-1)
reconstructed_velocity = output['velocity'].squeeze(-1)
reconstructed_time = output['time'].squeeze(-1)
reconstructed_duration = output['duration'].squeeze(-1)

print(reconstructed_pitch.shape, reconstructed_velocity.shape, reconstructed_time.shape, reconstructed_duration.shape)

reconstructed_seq = torch.stack([
    reconstructed_pitch,
    reconstructed_velocity,
    reconstructed_time,
    reconstructed_duration
], dim=-1).squeeze(0)
print("Original vs Reconstructed (first 10 steps):")
for i in range(10):
    orig = sample[0, i].cpu().numpy()
    recon = reconstructed_seq[i].cpu().numpy()
    print(f"{i:02d}: Orig: {orig}  |  Recon: {recon}")
import matplotlib.pyplot as plt

fig, axs = plt.subplots(4, 1, figsize=(10, 6), sharex=True)
labels = ["Pitch", "Velocity", "Time", "Duration"]

for i in range(4):
    axs[i].plot(sample[0, :, i].cpu(), label="Original")
    axs[i].plot(reconstructed_seq[:, i].cpu(), label="Reconstructed", linestyle='--')
    axs[i].set_ylabel(labels[i])
    axs[i].legend()

plt.xlabel("Time Step")
plt.suptitle("Original vs Reconstructed MIDI Features")
plt.tight_layout()
plt.show()


torch.Size([1, 10]) torch.Size([14, 9, 128]) torch.Size([14, 9, 255]) torch.Size([14, 9, 64])


RuntimeError: stack expects each tensor to be equal size, but got [1, 10] at entry 0 and [14, 9, 128] at entry 1