In [None]:
import os

# Work from the notebook's own folder so the relative paths used below resolve
# (presumably this is also what makes the local `midi_encoding` import work).
# Guarded so that re-running this cell, or starting the kernel with the working
# directory already set to that folder, is a no-op instead of a FileNotFoundError.
if os.path.basename(os.getcwd()) != '4 - Transformer':
    os.chdir(r'4 - Transformer')
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import random
from itertools import chain
from itertools import groupby
from functools import reduce
from typing import Collection, List
from pathlib import Path
import music21 as m21

# Point music21 at the MuseScore binary for both MusicXML viewing and PNG rendering.
musescore_path = '/usr/bin/mscore'
m21.environment.set('musicxmlPath', musescore_path)
m21.environment.set('musescoreDirectPNGPath', musescore_path)
from midi_encoding import *

# Pick the best available backend: CUDA first, then Apple MPS, falling back to CPU.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

print(f"Using {device}.")

In [None]:
# Only CUDA exposes a device name through torch.cuda; nothing is printed for "mps" or "cpu".
if device == "cuda":
    print(f"Device: {torch.cuda.get_device_name()}.")

In [None]:
# NOTE(review): this star import repeats the one in the first cell; MusicVocab is
# presumably provided by midi_encoding - confirm.
from midi_encoding import *

# Shared vocabulary object, used below for encoding files and for decoding indices back to tokens.
vocab = MusicVocab()

In [None]:
vg_large_path = Path('../data/midi/vg_large')

# Regular files only (sub-directories are skipped). os.listdir returns entries in
# arbitrary, filesystem-dependent order, so sort them to give any later seeded
# shuffle a reproducible starting point.
vg_large_file_names = sorted(
    entry
    for entry in os.listdir(vg_large_path)
    if os.path.isfile(os.path.join(vg_large_path, entry))
)
len(vg_large_file_names)

### Data loading

Previously we used a fixed context window to split our data into blocks, e.g. 'if we see this block of 8 tokens, what is the ninth?'.

That meant we always had to init the generation with an entire block of context, mostly made of padding. We also needed to add these 'padding init' blocks to training so that the network had seen them before inference time.

Now that we are working with transformers, we will take each block and split it into samples of varying lengths, e.g.

Data - (1, 2, 3, 4)

X(1) -> Y(2)

X(1,2) -> Y(3)

X(1,2,3) -> Y(4)

Because we are going to drastically scale up our block size, it is no longer feasible to pre-block encode the data as every token is duplicated up to block size.

We will need to load a single 1D tensor, then randomly index into it to choose a block.

This does mean that at boundaries we will have blocks of performances overlapping (i.e. there will be blocks with a token *after* `<|eos|>`), but hopefully the network will realise that data before and after is correlated.

Alternatively, we could try re-sampling if we get a block with `<|eos|>` anywhere other than the end?

In this notebook we are going to encode the entire dataset at once and *then* split into train / test / validation data, rather than first splitting by filenames, because it occurred to me that the MIDI tracks might have very different lengths.


### Labels

Because we are training every position in the sequence at once, we need a block of labels for each block of data.

This is just the next block in the array (i.e. `X[1]` is the labels for `X[0]`, and `X[2]` is the labels for `X[1]` etc etc), so there is no point in encoding them separately.

### Position

We are going to need to add positional encodings to our data.

Transformers employ a position embedding layer to distinguish between positions in a sequence.

In addition to this, we want to create embeddings for the bar and beat.

These can all be added together to get a hybrid 'position' vector.

To allow us to calculate the bar and beat we need the absolute position of each token, which we can encode at the same time as the batches.

> See `sparse_to_position_enc` - this is where we throw away timestep info, so we now 'snapshot' tidx and package it alongside the token values.

In [None]:
def idx_encode(file_names):
    """Encode a collection of MIDI files into one contiguous array.

    The files are visited in a random order and each one is converted with
    `midifile_to_idx_score`; the per-file arrays are then joined along the
    first axis.

    Args:
        file_names: File names relative to `vg_large_path`. The caller's
            sequence is left untouched (a shuffled copy is iterated).

    Returns:
        The concatenation of all encoded scores, or an empty array of shape
        (0, 2) when no file could be encoded.
    """
    # We can't shuffle the blocks afterwards as we rely on them being contiguous in order to get the targets
    # If we didn't shuffle at all then the train and test splits would contain specific game soundtracks rather than a mixture
    # Shuffle a copy so the caller's list is not reordered as a hidden side effect.
    shuffled_names = list(file_names)
    random.shuffle(shuffled_names)

    xs = []
    for file_name in shuffled_names:
        file_path = Path(vg_large_path, file_name)
        idx_score = midifile_to_idx_score(file_path, vocab)
        if idx_score is not None: # Files which aren't 4/4 are ignored
            xs.append(idx_score)

    if len(xs) == 0:
        return np.empty((0,2))

    # np.concatenate exists in every NumPy release; the np.concat alias only arrived in NumPy 2.0.
    return np.concatenate(xs)

def load_or_create(file_names, sample_path):
    """Return the encoded dataset as a tensor, using an on-disk cache.

    If `sample_path` exists it is loaded; otherwise the files are encoded with
    `idx_encode` and the result is written to `sample_path` for next time.

    Args:
        file_names: File names passed through to `idx_encode` on a cache miss.
        sample_path: `pathlib.Path` of the cache file.

    Returns:
        A torch tensor holding the array, placed on the global `device`.
    """
    if sample_path.exists():
        # NOTE(review): allow_pickle=True will unpickle arbitrary objects from this file.
        # Kept for compatibility with existing caches; only load files you created yourself.
        xs = np.load(sample_path, allow_pickle=True)
    else:
        xs = idx_encode(file_names)
        # np.save does not create missing directories, so make sure the cache folder exists.
        sample_path.parent.mkdir(parents=True, exist_ok=True)
        # Write through an open handle: given a path, np.save appends '.npy' when the
        # suffix is missing, which would make the exists() check above miss forever.
        with open(sample_path, 'wb') as cache_file:
            np.save(cache_file, xs)

    return torch.tensor(xs, device=device)

# Cache file for the encoded dataset; it is created on the first run and reused afterwards.
vg_large_samples_path = Path(f'../data/numpy/vg_large/samples_with_position_all.npy')

# Loads the cached array when the file exists, otherwise encodes every listed file and saves the result.
data = load_or_create(vg_large_file_names, vg_large_samples_path)

# Displayed as a sanity check on the size of the loaded dataset.
data.shape

Maybe experiment with 8 divisions per quarter note (q) to give 32nd-note resolution.

In [None]:
# Inputs: every row except the last, with all columns kept.
X = data[:-1, :]
# Targets: column 0 shifted one row ahead, so Y[i] is the value that follows X[i].
# The position column is left out of the targets.
Y = data[1:, 0]

In [None]:
# First 12 input tokens (column 0 of X), decoded for inspection.
vocab.to_tokens(X[:12, 0])

In [None]:
# First 12 targets, decoded; these should read as the inputs above shifted by one token.
vocab.to_tokens(Y[:12])

In [None]:
# Last 12 input tokens (column 0 of X), decoded for inspection.
vocab.to_tokens(X[-12:, 0])

In [None]:
# Last 12 targets, decoded; the final entry comes from the very last row of `data`.
vocab.to_tokens(Y[-12:])