In [None]:
import os
# Move into the notebook's own folder so the relative '../data/...' paths used
# further down resolve (presumably the local `midi_encoding` module is found
# from here too — confirm).
# The guard keeps the cell re-runnable: an unconditional relative chdir would,
# on a second run, look for a nested '4 - Transformer' directory and raise
# FileNotFoundError.
if os.path.basename(os.getcwd()) != '4 - Transformer':
    os.chdir(r'4 - Transformer')
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import random
from itertools import chain
from itertools import groupby
from functools import reduce
from typing import Collection, List
from pathlib import Path
import music21 as m21
# Point music21 at the MuseScore binary under both environment keys it is
# configured with here.
# NOTE(review): absolute, machine-specific path — adjust if MuseScore lives elsewhere.
musescore_path = '/usr/bin/mscore'
for env_key in ('musicxmlPath', 'musescoreDirectPNGPath'):
    m21.environment.set(env_key, musescore_path)
from midi_encoding import *

# Choose the compute device in order of preference: CUDA, then the MPS
# backend, and finally the CPU as the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"Using {device}.")

In [None]:
# When running on CUDA, also report the name of the GPU.
if device == "cuda":
    gpu_name = torch.cuda.get_device_name()
    print(f"Device: {gpu_name}.")

In [None]:
# NOTE(review): this star import repeats the one already done in the first cell.
from midi_encoding import *

# Vocabulary object handed to the MIDI encoding helpers below. `MusicVocab` is
# presumably provided by the star import from `midi_encoding` — confirm.
vocab = MusicVocab()

In [None]:
# Collect the names of the regular files directly inside the dataset folder
# (sub-directories are skipped).
# The names are sorted because directory listings come back in arbitrary,
# filesystem-dependent order, and this list is later sliced by position when
# encoding — without a fixed order, different machines would encode
# different files.
vg_large_path = Path('../data/midi/vg_large')
vg_large_file_names = sorted(
    entry.name for entry in vg_large_path.iterdir() if entry.is_file()
)
len(vg_large_file_names)

### Data loading

Previously we used a fixed context window to split our data into blocks, e.g. 'if we see this block of 8 tokens, what is the ninth?'.

That meant we always had to init the generation with an entire block of context, mostly made of padding. We also needed to add these 'padding init' blocks to training so that the network had seen them before inference time.

Now that we are working with transformers, we will take each block and split it into samples of varying lengths, e.g.

Data - (1, 2, 3, 4)

X(1) -> Y(2)

X(1,2) -> Y(3)

X(1,2,3) -> Y(4)

Andrej loads the entire text dataset into a single 1D tensor, then randomly indexes into it to choose a block.

Because I want to prevent blocks of performances overlapping (i.e. I never want a block with a token *after* `<|eos|>`), I don't want to just append all the sample data into a flat tensor. Instead I will block encode the data upfront and then randomly index into the resulting 2D tensor of blocks.

This is similar to the approach taken in previous notebooks. One difference here is that I am going to encode the entire dataset at once and *then* split into train / test / validation data, rather than split by filenames, because it occurred to me that the MIDI tracks might have very different lengths.

### Position

We are going to need to add positional encodings to our data.

Transformers employ a position embedding layer to distinguish between positions in a sequence.

In addition to this, we want to create embeddings for the bar and beat.

These can all be added together to get a hybrid 'position' vector.

To allow us to calculate the bar and beat we need the absolute position of each token, which we can encode at the same time as the batches.

In [None]:
def block_encode(file_names, vocab, block_size, max_files=1):
    """Cut encoded MIDI scores into fixed-length blocks with next-entry labels.

    Each selected file is encoded with `midifile_to_idx_score`, then a window
    of `block_size` consecutive score entries is slid over the result one
    entry at a time. Every window is one sample; its label is element 0 of the
    entry immediately after the window (presumably the token index, with the
    other elements carrying position information — TODO confirm against
    `midi_encoding`).

    Files are looked up inside the module-level `vg_large_path` directory.

    Args:
        file_names: Sequence of file names relative to `vg_large_path`.
        vocab: Vocabulary passed through to `midifile_to_idx_score`.
        block_size: Number of consecutive score entries per sample.
        max_files: How many leading entries of `file_names` to encode
            (non-negative). Defaults to 1, i.e. only the first file, which is
            what this function has always done. Pass None to encode every file.

    Returns:
        A tuple `(xs, ys)`: `xs` is the `np.stack` of all windows (one sample
        per index along the first axis) and `ys` is a list holding one label
        per sample.

    Raises:
        ValueError: If no sample could be produced, i.e. no file was selected
            or every encoded score has at most `block_size` entries.
    """
    xs, ys = [], []
    for file_name in file_names[:max_files]:
        file_path = Path(vg_large_path, file_name)
        idx_pos_score = midifile_to_idx_score(file_path, vocab)
        # Each window needs one more entry after it to act as the label, so
        # the last valid start is len - block_size - 1.
        for start in range(len(idx_pos_score) - block_size):
            end = start + block_size
            xs.append(idx_pos_score[start:end])
            ys.append(idx_pos_score[end][0])
    if not xs:
        # np.stack would also raise ValueError here, but with a message that
        # says nothing about the data being too short.
        raise ValueError(
            f"No samples produced: need at least one selected file whose "
            f"encoded score has more than block_size={block_size} entries."
        )
    return np.stack(xs), ys  # xs stacked into one array; ys stays a plain list

def load_or_create(file_names, vocab, sample_path, label_path):
    """Return the encoded samples and labels as tensors, using an on-disk cache.

    If both cache files exist they are loaded; otherwise the data is built
    with `block_encode` (using the module-level `block_size`) and written to
    the two paths for next time. A cache where only one of the two files is
    present is treated as missing and both files are rewritten.

    Args:
        file_names: File names forwarded to `block_encode`.
        vocab: Vocabulary forwarded to `block_encode`.
        sample_path: `pathlib.Path` of the `.npy` file holding the samples.
        label_path: `pathlib.Path` of the `.npy` file holding the labels.

    Returns:
        A tuple `(X, Y)` of `torch.Tensor`s created on the module-level
        `device`.
    """
    if sample_path.exists() and label_path.exists():
        # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
        # objects, which can execute code. Only point this at cache files
        # written by this notebook; consider dropping the flag if the cached
        # arrays turn out to be plain numeric.
        xs = np.load(sample_path, allow_pickle=True)
        ys = np.load(label_path, allow_pickle=True)
    else:
        xs, ys = block_encode(file_names, vocab, block_size)
        # block_encode returns the labels as a list; convert now so a fresh run
        # hands torch exactly what a later cached run will load back.
        ys = np.asarray(ys)
        # np.save does not create missing directories.
        for cache_path in (sample_path, label_path):
            cache_path.parent.mkdir(parents=True, exist_ok=True)
        np.save(sample_path, xs)
        np.save(label_path, ys)
    return torch.tensor(xs, device=device), torch.tensor(ys, device=device)

# Number of consecutive score entries that make up one sample.
block_size = 8

# Cache files for the encoded samples and their labels. The block size is part
# of each file name, so caches built with different sizes live side by side.
vg_large_samples_path, vg_large_labels_path = (
    Path(f'../data/numpy/vg_large/block_{block_size}_samples_with_position_all.npy'),
    Path(f'../data/numpy/vg_large/block_{block_size}_labels_all.npy'),
)

X, Y = load_or_create(
    file_names=vg_large_file_names,
    vocab=vocab,
    sample_path=vg_large_samples_path,
    label_path=vg_large_labels_path,
)

X.shape, Y.shape

In [None]:
# Peek at the first three encoded samples.
X[:3]

In [None]:
# Peek at the first three labels.
Y[:3]