# NNDL Project - Riccardo De Vidi - 2152869

## A. Preprocessing

### 1. Import

In [None]:
!sudo apt-get install fluidsynth > /dev/null 2>&1
!pip install pretty_midi mir_eval pyfluidsynth > /dev/null 2>&1
import pretty_midi

import IPython.display as display

import numpy as np
import os
import shutil

### 2. Data download

Download *The Lakh MIDI Dataset v0.1* and extract it

In [None]:
!wget http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz
!tar -xzf "/content/clean_midi.tar.gz"

Paths:

* ORIGINAL_DATA_PATH is the folder extracted from the .tar.gz archive (it contains many subfolders, one per artist)
* DATA_PATH will store a flattened copy of the files in ORIGINAL_DATA_PATH (all MIDI files in one folder, without subfolders)
* MELODY_FOLDER will store the melody tracks
* CHORD_FOLDER will store the chord tracks



In [None]:
# Folder layout used by the whole preprocessing pipeline (relative to the
# working directory).
ORIGINAL_DATA_PATH = 'clean_midi'
DATA_PATH = 'midi_files'
MELODY_FOLDER = 'melody'
CHORD_FOLDER = 'chord'

# exist_ok=True keeps the cell idempotent: re-running it after the folders
# were created no longer raises FileExistsError.
os.makedirs(DATA_PATH, exist_ok=True)
os.makedirs(MELODY_FOLDER, exist_ok=True)
os.makedirs(CHORD_FOLDER, exist_ok=True)

Copy all the MIDI files into a single flat folder

In [None]:
# Flatten the per-artist folder tree into DATA_PATH.
# Different artists often have songs with the same file name; copying them
# under the bare file name would silently overwrite earlier copies while still
# counting them. Each copy is therefore prefixed with its (flattened) sub-folder
# path so every source file keeps its own destination.
file_count = 0
for (root, dirs, files) in os.walk(ORIGINAL_DATA_PATH, topdown=True):
    rel_root = os.path.relpath(root, ORIGINAL_DATA_PATH)
    prefix = '' if rel_root == os.curdir else rel_root.replace(os.sep, '_') + '_'
    for f in files:
        stem, ext = os.path.splitext(f)
        # Dots inside the stem are replaced so the name stays unique even for
        # code that derives an identifier from the text before the first dot.
        unique_name = (prefix + stem).replace('.', '_') + ext
        shutil.copy(os.path.join(root, f), os.path.join(DATA_PATH, unique_name))
        file_count += 1

print(f"There are {file_count} files in the dataset")

### 3. Melody and chord data generation

In [None]:
def roll_to_bars(filename, melody_folder, chord_folder, bar_len=16,
                 pitch_range=(60, 84), max_melody_polyphony=2):
    """Split a MIDI file into per-bar melody and chord piano rolls and save them.

    Only files with exactly one time signature (4/4) and exactly one tempo,
    both set at time 0, are processed. Every instrument whose notes all lie
    inside ``pitch_range`` is sampled at ``bar_len`` steps per bar. Each bar of
    such an instrument is merged (logical OR) into the melody roll when at most
    ``max_melody_polyphony`` notes are active at any step of that bar, and into
    the chord roll otherwise.

    Args:
        filename: Path of the MIDI file to read.
        melody_folder: Folder receiving ``<name>.npz`` with the melody bars.
        chord_folder: Folder receiving ``<name>.npz`` with the chord bars.
        bar_len: Number of time steps per bar.
        pitch_range: ``(low, high)`` MIDI pitches kept, ``high`` exclusive.
        max_melody_polyphony: Largest number of simultaneous notes a bar may
            contain and still be treated as melody.

    Returns:
        None. When at least one instrument qualifies, two ``.npz`` files are
        written, each holding one boolean array of shape
        ``(high - low, bar_len)`` per bar. Any exception raised while reading or
        processing the file is caught and reported, and the file is skipped.
    """
    # basename/splitext keep dots that belong to the title ("Song.1.mid" ->
    # "Song.1") so distinct files no longer collapse onto the same output name.
    save_name = os.path.splitext(os.path.basename(filename))[0]
    melody_path = os.path.join(melody_folder, save_name)
    chord_path = os.path.join(chord_folder, save_name)
    low, high = pitch_range

    melody_bars = []
    chord_bars = []

    try:
        midi_data = pretty_midi.PrettyMIDI(filename)
        time_changes = midi_data.time_signature_changes
        tempo_change_times, tempi = midi_data.get_tempo_changes()

        is_plain_four_four = (
            len(time_changes) == 1 and
            time_changes[0].numerator == 4 and
            time_changes[0].denominator == 4 and
            time_changes[0].time == 0.0 and
            len(tempo_change_times) == 1 and
            tempo_change_times[0] == 0.0)
        if not is_plain_four_four:
            return

        quarter_note_length = 60.0 / tempi[0]
        bar_length_s = quarter_note_length * 4
        # Keep the sampling rate as a float: truncating it to an int makes
        # bar_len steps span more than one bar (and yields 0 for slow tempi),
        # so bar boundaries would drift along the song.
        fs = bar_len / bar_length_s

        n_bars = int(midi_data.get_end_time() * fs) // bar_len
        if n_bars == 0:
            # Shorter than one bar: nothing to split or save.
            return
        roll_length = n_bars * bar_len

        for instrument in midi_data.instruments:
            roll = instrument.get_piano_roll(fs) > 0

            # Force every instrument onto the same whole number of bars.
            n_steps = roll.shape[1]
            if n_steps < roll_length:
                roll = np.pad(roll, ((0, 0), (0, roll_length - n_steps)),
                              'constant', constant_values=False)
            elif n_steps > roll_length:
                roll = roll[:, :roll_length]

            # Skip instruments playing outside the kept range. The upper check
            # starts at `high`, matching the slice below (the previous check
            # started one pitch lower and rejected the top kept pitch).
            if roll.shape[0] != 128 or roll[:low].any() or roll[high:].any():
                continue

            roll_bars = np.hsplit(roll[low:high], n_bars)

            if not melody_bars:
                melody_bars = [np.zeros(bar.shape, dtype=bool) for bar in roll_bars]
                chord_bars = [np.zeros(bar.shape, dtype=bool) for bar in roll_bars]

            for bar_index, bar in enumerate(roll_bars):
                # Largest number of notes sounding together in this bar.
                is_melody = np.max(np.sum(bar, axis=0)) <= max_melody_polyphony
                if is_melody:
                    melody_bars[bar_index] = np.logical_or(melody_bars[bar_index], bar)
                else:
                    chord_bars[bar_index] = np.logical_or(chord_bars[bar_index], bar)

        if melody_bars:
            np.savez(melody_path + ".npz", *melody_bars)
            np.savez(chord_path + ".npz", *chord_bars)

    except Exception as e:
        # Best effort: corrupt or unsupported MIDI files are reported and skipped.
        print(f"Skipping {filename}: {e}")

In [None]:
# Convert every file found in DATA_PATH, refreshing a single progress line
# after each one.
preprocessing_file_count = 0
for preprocessing_file_count, midi_entry in enumerate(os.scandir(DATA_PATH), start=1):
    roll_to_bars(midi_entry.path, MELODY_FOLDER, CHORD_FOLDER)
    display.clear_output(wait=True)
    print(f"Processed {preprocessing_file_count}/{file_count} files")

# Archive the melody/ and chord/ folders recursively into melody.zip and
# chord.zip; zip's console output is discarded.
!zip -r melody.zip melody/ > /dev/null 2>&1
!zip -r chord.zip chord/ > /dev/null 2>&1

### 4. Dataset generation

In [None]:
# Build the training items: for each pair of consecutive bars of a song keep
# [previous chord bar, previous melody bar, chord bar, melody bar], plus a
# melody-only and a chord-only variant where the other part is zeroed out.
data = []
th = 3  # a bar must have more than `th` active cells to count as non-empty

# Walk the melody folder once; the list gives both the total and the work items.
melody_files = [(root, f)
                for (root, dirs, files) in os.walk(MELODY_FOLDER, topdown=True)
                for f in files]
melody_file_count = len(melody_files)

data_file_count = 0
for root, f in melody_files:
    # Load inside `with` so each .npz archive is closed right after reading;
    # otherwise two file handles per song stay open for the whole run.
    with np.load(os.path.join(root, f)) as melody_npz:
        melody_bar_list = list(melody_npz.values())
    with np.load(os.path.join(CHORD_FOLDER, f)) as chord_npz:
        chord_bar_list = list(chord_npz.values())

    for i in range(len(melody_bar_list) - 1):
        previous_chord_bar = chord_bar_list[i]
        previous_melody_bar = melody_bar_list[i]
        chord_bar = chord_bar_list[i + 1]
        melody_bar = melody_bar_list[i + 1]
        bars = (previous_chord_bar, previous_melody_bar, chord_bar, melody_bar)
        # Keep the pair only when all four bars contain enough notes.
        if all(np.sum(bar) > th for bar in bars):
            data.append([previous_chord_bar, previous_melody_bar, chord_bar, melody_bar])
            data.append([np.zeros_like(previous_chord_bar), previous_melody_bar, np.zeros_like(chord_bar), melody_bar])
            data.append([previous_chord_bar, np.zeros_like(previous_melody_bar), chord_bar, np.zeros_like(melody_bar)])

    data_file_count = data_file_count + 1
    display.clear_output(wait=True)
    print(f"Processed {data_file_count}/{melody_file_count} files")

np.savez("data.npz", *data)

print(f"There are {len(data)} items in the generated dataset")