In [1]:
import mido
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import pretty_midi
from preprocess_midi import *

#### Standard MIDI File Structure    

A standard MIDI file consists of two main components:

1. Header Chunk: This provides metadata about the file.
   - Chunk Type: 4 bytes, always "MThd".
   - Length: 4 bytes, always 6 (the length of the header data).
   - Format Type: 2 bytes, indicating the file format (0, 1, or 2).
   - Number of Tracks: 2 bytes, indicating the number of track chunks in the file.
   - Time Division: 2 bytes, indicating the timing resolution (ticks per quarter note, or SMPTE frames per second together with ticks per frame).
   

2. Track Chunk(s): Each track contains a sequence of MIDI events.
   - Chunk Type: 4 bytes, always "MTrk".
   - Length: 4 bytes, indicating the length of the track data.
   - Track Event Data: Variable length, consisting of MIDI events such as note on/off, control changes, and meta events.

# 0. PREPROCESSING

## 0.1 Get melodies from full dataset

Dataset source: the Lakh MIDI Dataset (`lmd_full`), available at https://colinraffel.com/projects/lmd/#get

In [15]:
# Collect the path of every '.mid' file located one level below the dataset root.
path_full = 'LAKH/lmd_full'
paths = []

directories = os.listdir(path_full)

for direc in directories:
    directory = f"{path_full}/{direc}"
    # Skip stray regular files in the root (e.g. a README or .DS_Store):
    # calling os.listdir on them would raise NotADirectoryError.
    if not os.path.isdir(directory):
        continue
    midi_files = [name for name in os.listdir(directory) if name.endswith('.mid')]
    paths.extend(f"{directory}/{midi}" for midi in midi_files)

In [16]:
# Number of '.mid' file paths collected in `paths`.
len(paths)

178561

In [17]:
# Classify every collected file with search_melodies:
#   None    -> counted as corrupt
#   == 1    -> kept as a melody file
#   other   -> ignored
melodies = []
corrupt = 0

for midi_path in paths:
    outcome = search_melodies(midi_path)
    if outcome is None:
        corrupt += 1
        continue
    if outcome == 1:
        melodies.append(midi_path)



In [18]:
# Report how many files were kept as melodies and how many were counted as corrupt.
n_melodies = len(melodies)
print('Melodies found: ', n_melodies, '\nCorrupted files: ', corrupt)

Melodies found:  3144 
Corrupted files:  69210


In [19]:
# Directory that receives a copy of every file flagged as a melody.
target_directory = "/home/sara/Scrivania/Physics_of_Data/2nd_Year/NeuralNetworks_DeepLearning/Lab/Lakh_melodies"

# Create the folder up front: when the destination does not exist, shutil.copy
# treats it as a *file name*, so every copy would overwrite the previous one.
os.makedirs(target_directory, exist_ok=True)

for file_path in melodies:
    if os.path.isfile(file_path):
        shutil.copy(file_path, target_directory)
    else:
        print(f"File not found: {file_path}")

print("Files copied successfully.")

Files copied successfully.


## 0.2 Check for no-notes tracks and remove them

In [21]:
# Scan the melody directory, delete unusable files, and collect per-file metadata.
# WARNING: this cell deletes files from `target_directory` (os.remove).
target_directory = "/home/sara/Scrivania/Physics_of_Data/2nd_Year/NeuralNetworks_DeepLearning/Lab/Lakh_melodies"
midi_files = os.listdir(target_directory)
paths = [f"{target_directory}/{midi}" for midi in midi_files]

metadata = []    # one [ticks_per_beat, unnamed-track messages, time] entry per kept file
kept_paths = []  # path of each kept file, appended together with its metadata entry

for midi_path in paths:
    notes, time = extract_note_events(midi_path, track_name='MELODY')
    notes = np.array(notes)

    # No notes extracted for the 'MELODY' track, or a zero `time`: remove the file.
    if (notes.shape[0] == 0) or time == 0:
        os.remove(midi_path)
        continue

    midi_file = mido.MidiFile(midi_path)
    tpb = midi_file.ticks_per_beat
    # Tracks with an empty name; the first one is used as the source of the
    # file-level messages (presumably the meta/header track -- TODO confirm).
    track0 = [track for track in midi_file.tracks if track.name == '']
    if len(track0) == 0:
        os.remove(midi_path)
        continue

    meta = list(track0[0])
    metadata.append([tpb, meta, time])
    kept_paths.append(midi_path)

# Reuse the paths recorded in the loop instead of listing the directory again:
# this guarantees paths[i] and metadata[i] describe the same file, regardless of
# the order os.listdir returns after the deletions.
paths = kept_paths
midi_files = [os.path.basename(kept) for kept in paths]
print('Uncorrupted files remained: ', len(paths))

Uncorrupted files remained:  3140


## 0.4 Select 4/4 time signature, 384 ticks per beat

In [22]:
# Select the subset of files with 384 ticks per beat and a 4/4 signature
# (presumably matched against `metadata` by select_tempo -- TODO confirm).
indexes_, selected_paths, selected_meta = select_tempo(
    paths=paths,
    metadata=metadata,
    tpb=384,
    tempo_num=4,
    tempo_den=4,
)

In [25]:
# Directory holding the subset of the dataset made of the selected files only.
target_directory = "/home/sara/Scrivania/Physics_of_Data/2nd_Year/NeuralNetworks_DeepLearning/Lab/Lakh_melodies_4_4"

# Make sure the destination folder exists; shutil.copy does not create missing
# parent directories and would fail on the first file otherwise.
os.makedirs(target_directory, exist_ok=True)

# Copy each selected file to the target directory.
for file_path in selected_paths:
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        continue

    destination_path = os.path.join(target_directory, os.path.basename(file_path))
    if file_path == destination_path:
        # Source and destination are the same path: nothing to copy.
        print(f"File is already in the target directory: {file_path}")
        continue

    shutil.copy(file_path, destination_path)

print("Files copied successfully.")

Files copied successfully.


In [27]:
# Summarise the selected subset.
n_selected = len(selected_paths)
print('Cleaned melody dataset:\nTempo : 4/4\nTicks per beat : 384\nNumber of files : ', n_selected)

Cleaned melody dataset:
Tempo : 4/4
Ticks per beat : 384
Number of files :  2698


## 0.5 Final formatting

The final phase of formatting consists of:
- Unpack the notes into a time series (one note per time step)
- Remove silences by prolonging the previous note
- Restrict the notes to 2 octaves (note: the `formatting` call below passes `restrict=False`)
- Represent each note as a one-hot vector of length 128 (all zeros except at the index of the note)
- Augment the dataset through transposition and rotation

At the end of this phase, an array of shape $[\text{augmented songs}\times\text{bars}\times\text{notes per bar}\times \text{notes range}]=[43168\times 8 \times 16 \times 128]$ is obtained.

In [28]:
# Build the list of file paths stored in the 4/4 dataset directory.
target_directory = "/home/sara/Scrivania/Physics_of_Data/2nd_Year/NeuralNetworks_DeepLearning/Lab/Lakh_melodies_4_4"
midi_files = os.listdir(target_directory)
paths_4_4 = [f"{target_directory}/{midi}" for midi in midi_files]

In [30]:
# Run the final formatting step on the 4/4 files.
# NOTE(review): the section text lists a 2-octave restriction, yet
# restrict=False is passed here -- confirm which one is intended.
dataset = formatting(
    paths_4_4,
    tpb=384,
    tempo=4,
    npb=16,
    n_bars=8,
    restrict=False,
    shift_range=8,
    n_aug=15,
)

0 / 2698  midis processed successfully.

500 / 2698  midis processed successfully.

1000 / 2698  midis processed successfully.

1500 / 2698  midis processed successfully.

2000 / 2698  midis processed successfully.

2500 / 2698  midis processed successfully.



In [32]:
# Check one-hot encoding: along the last axis of `dataset`, every time step
# must contain exactly one entry equal to 1.
#
# The previous version tested len(np.where(...)); np.where(condition) returns a
# tuple with one index array per dimension, so that length was always 1 and the
# check could never fail. Here the ones are counted explicitly.
unenc = 0        # number of (track, bar, step) positions that are not one-hot
bad_tracks = 0   # number of tracks containing at least one such position

for i in range(dataset.shape[0]):
    # Count the ones per time step for a single track; working track by track
    # avoids materialising a boolean copy of the whole dataset.
    ones_per_step = np.count_nonzero(dataset[i] == 1, axis=-1)
    bad_steps = np.argwhere(ones_per_step != 1)
    if bad_steps.shape[0] > 0:
        bad_tracks += 1
    for j, k in bad_steps:
        unenc += 1
        print('Uncorrectly encoded track:', i, int(j), int(k))

# Report tracks (first axis), not time steps, so the difference is meaningful.
print('Tracks correctly formatted:', dataset.shape[0] - bad_tracks)

Tracks correctly formatted: 43168


In [34]:
# Persist the formatted dataset array to disk in NumPy's .npy format.
np.save(file='lakh_pfa_16.npy', arr=dataset)