In [1]:
import mido
import matplotlib.pyplot as plt
import numpy as np
import os
import shutil
import pretty_midi
from preprocess_midi import *

#### Standard MIDI File Structure    

A standard MIDI file consists of three main components:

1. Header Chunk: This provides metadata about the file.
   - Chunk Type: 4 bytes, always "MThd".
   - Length: 4 bytes, always 6 (the length of the header data).
   - Format Type: 2 bytes, indicating the file format (0, 1, or 2).
   - Number of Tracks: 2 bytes, indicating the number of track chunks in the file.
   - Time Division: 2 bytes, indicating the timing information (ticks per quarter note or frames per second).    
   

2. Track Chunk(s): Each track contains a sequence of MIDI events.
   - Chunk Type: 4 bytes, always "MTrk".
   - Length: 4 bytes, indicating the length of the track data.
   - Track Event Data: Variable length, consisting of MIDI events such as note on/off, control changes, and meta events.

# 0. PREPROCESSING

## 0.1 Get melodies from full dataset

The full Lakh MIDI Dataset (`lmd_full`) can be downloaded from https://colinraffel.com/projects/lmd/#get

In [None]:
# Collect the path of every ``.mid`` file stored one level below the dataset root.
path_full = 'LAKH/lmd_full'
paths = []

# os.listdir returns entries in arbitrary order; sorting keeps the resulting
# list (and everything derived from it) reproducible across runs and machines.
directories = sorted(os.listdir(path_full))

for direc in directories:
    directory = f"{path_full}/{direc}"
    # Skip stray files sitting at the dataset root: calling os.listdir on a
    # non-directory raises NotADirectoryError and would abort the whole scan.
    if not os.path.isdir(directory):
        continue
    midi_files = sorted(name for name in os.listdir(directory) if name.endswith('.mid'))
    paths.extend(f"{directory}/{midi}" for midi in midi_files)

In [3]:
# Number of MIDI file paths collected from the dataset directories.
len(paths)

178561

In [5]:
# Run search_melodies on every collected path and sort the outcome into:
#   - melodies: paths for which the result equals 1
#   - corrupt:  count of paths for which the result is None
#     (presumably files that could not be read — confirm in preprocess_midi)
# Any other result is ignored.
melodies = []
corrupt = 0

# The index `i` is not used in this loop; it is kept so the cell leaves the
# same names in the notebook namespace as before.
for i, path in enumerate(paths):
    res = search_melodies(path)
    if res is None:
        corrupt += 1
    elif res == 1:
        melodies.append(path)

Iteration:  0   , n =  0   melody tracks found




Iteration:  1000   , n =  27   melody tracks found
Iteration:  2000   , n =  41   melody tracks found
Iteration:  3000   , n =  61   melody tracks found
Iteration:  4000   , n =  73   melody tracks found
Iteration:  5000   , n =  90   melody tracks found
Iteration:  6000   , n =  113   melody tracks found
Iteration:  7000   , n =  133   melody tracks found
Iteration:  8000   , n =  148   melody tracks found
Iteration:  9000   , n =  160   melody tracks found
Iteration:  10000   , n =  178   melody tracks found
Iteration:  11000   , n =  191   melody tracks found
Iteration:  12000   , n =  212   melody tracks found
Iteration:  13000   , n =  227   melody tracks found
Iteration:  14000   , n =  241   melody tracks found
Iteration:  15000   , n =  251   melody tracks found
Iteration:  16000   , n =  264   melody tracks found
Iteration:  17000   , n =  284   melody tracks found
Iteration:  18000   , n =  301   melody tracks found
Iteration:  19000   , n =  323   melody tracks found
Iterati

Iteration:  154000   , n =  2705   melody tracks found
Iteration:  155000   , n =  2728   melody tracks found
Iteration:  156000   , n =  2741   melody tracks found
Iteration:  157000   , n =  2760   melody tracks found
Iteration:  158000   , n =  2776   melody tracks found
Iteration:  159000   , n =  2795   melody tracks found
Iteration:  160000   , n =  2817   melody tracks found
Iteration:  161000   , n =  2833   melody tracks found
Iteration:  162000   , n =  2857   melody tracks found
Iteration:  163000   , n =  2879   melody tracks found
Iteration:  164000   , n =  2898   melody tracks found
Iteration:  165000   , n =  2920   melody tracks found
Iteration:  166000   , n =  2941   melody tracks found
Iteration:  167000   , n =  2949   melody tracks found
Iteration:  168000   , n =  2970   melody tracks found
Iteration:  169000   , n =  2991   melody tracks found
Iteration:  170000   , n =  3005   melody tracks found
Iteration:  171000   , n =  3016   melody tracks found
Iteration:

In [6]:
# Number of files for which search_melodies returned 1.
len(melodies)

3144

In [7]:
# Number of files for which search_melodies returned None.
corrupt

69210

In [8]:
# Write the selected melody paths to a text file, one path per line.
with open("melodies_clean.txt", "w") as f:
    f.writelines(f"{line}\n" for line in melodies)

In [9]:
# Read the melody paths back from the text file, stripping surrounding
# whitespace (including the trailing newline) from each line.
with open("melodies_clean.txt", "r") as f1:
    melodies_lakh = [row.strip() for row in f1]
        

In [10]:
# Directory that receives a copy of every selected melody file.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
target_directory = "/home/sara/Scrivania/Physics_of_Data/2nd_Year/NeuralNetworks_DeepLearning/Lab/Lakh_clean_2.0"

# Make sure the destination is a real directory. If it does not exist,
# shutil.copy treats the path as a file name and every copy overwrites the
# previous one, leaving a single file instead of a folder of MIDIs.
os.makedirs(target_directory, exist_ok=True)

missing = 0
for file_path in melodies_lakh:
    if os.path.isfile(file_path):
        shutil.copy(file_path, target_directory)
    else:
        missing += 1
        print(f"File not found: {file_path}")

# Only claim success when every listed file was actually copied.
if missing == 0:
    print("Files copied successfully.")
else:
    print(f"Copy finished with {missing} missing file(s).")

Files copied successfully.


## 0.2 Check for tracks without notes and remove them

In [5]:
# Build the full path of every entry in the working directory.
# NOTE(review): this directory (another user's home, "Lakh_clean_4_4") is not
# the one the copy cell of section 0.1 writes to ("Lakh_clean_2.0") — confirm
# which directory is intended here.
# NOTE(review): this rebinds `paths`, replacing the full-dataset list built in
# section 0.1.
target_directory = "/home/gloria/Scrivania/Neural networks and deep learning/Project/lahck/Lakh_clean_4_4"

midi_files = os.listdir(target_directory)
paths = [f"{target_directory}/{midi}" for midi in midi_files]

In [7]:
# Flag every file whose 'MELODY' track has a total time of zero.
# Only the first track named 'MELODY' is considered; a file without such a
# track also ends up with t == 0 and is therefore flagged as well.
path_toremove = []
for path in paths:
    midi_file = mido.MidiFile(path)

    # First track called 'MELODY', or None when the file has none.
    melody = next((trk for trk in midi_file.tracks if trk.name == 'MELODY'), None)

    # Total time = sum of the `time` attribute of every message in the track.
    t = sum(msg.time for msg in melody) if melody is not None else 0

    if t == 0:
        path_toremove.append(path)

In [9]:
# Delete every flagged file from disk, reporting the outcome for each path.
# Errors are printed rather than raised so one failure does not stop the loop.
for file_path in path_toremove:
    try:
        if not os.path.exists(file_path):
            print(f"File does not exist: {file_path}")
            continue
        os.remove(file_path)
        print(f"Removed: {file_path}")
    except Exception as err:
        print(f"Error removing {file_path}: {err}")


## 0.3 Define metadata and flag tracks with zero total time

In [10]:
# Re-list the working directory after the clean-up step.
midi_files = os.listdir(target_directory)
paths_new = [f"{target_directory}/{midi}" for midi in midi_files]

# Build one metadata entry per file:
#   [ticks per beat, messages of the first unnamed track, total MELODY time in ticks]
# Files whose MELODY track still has zero total time are collected in
# path_toremove instead (they are only listed here, not deleted).
metadata = []
path_toremove = []

for path in paths_new:
    midi_file = mido.MidiFile(path)
    tpb = midi_file.ticks_per_beat

    # Tracks whose name is the empty string; the first one supplies `meta`.
    track0 = [trk for trk in midi_file.tracks if trk.name == '']

    # Total time of the first track named 'MELODY' (0 when there is none).
    melody = next((trk for trk in midi_file.tracks if trk.name == 'MELODY'), None)
    t = sum(msg.time for msg in melody) if melody is not None else 0

    # Copy of all messages in the first unnamed track.
    # NOTE(review): raises IndexError when a file has no unnamed track.
    meta = list(track0[0])

    if t == 0:
        path_toremove.append(path)
    else:
        metadata.append([tpb, meta, t])


In [12]:
# Number of entries currently listed in target_directory.
len(paths_new)

2699

## 0.4 Select 4/4 time signature, 384 ticks per beat

In [14]:
# Select the files with 384 ticks per beat and a 4/4 signature (tempo_num / tempo_den).
# NOTE(review): the call passes neither the paths nor the metadata built in the
# previous section, so select_tempo must obtain its input elsewhere — confirm
# in preprocess_midi.
indexes_, selected_paths, selected_meta = select_tempo(tpb=384, tempo_num=4, tempo_den=4) 

2699

In [None]:
# Directory holding the subset of the dataset selected in the previous step.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
target_directory = "/home/sara/Scrivania/Physics_of_Data/2nd_Year/NeuralNetworks_DeepLearning/Lab/Lakh_clean_4_4"

# Create the destination up front so the copies cannot fail on a missing folder.
os.makedirs(target_directory, exist_ok=True)

# Copy each selected file into the target directory.
missing = 0
for file_path in selected_paths:
    if not os.path.isfile(file_path):
        missing += 1
        print(f"File not found: {file_path}")
        continue

    destination_path = os.path.join(target_directory, os.path.basename(file_path))

    # Compare the files themselves rather than the path strings: a relative or
    # differently spelled path to the same file would slip through a string
    # comparison and make shutil.copy raise SameFileError.
    if os.path.exists(destination_path) and os.path.samefile(file_path, destination_path):
        print(f"File is already in the target directory: {file_path}")
    else:
        shutil.copy(file_path, destination_path)

# Only claim success when every selected file was found.
if missing == 0:
    print("Files copied successfully.")
else:
    print(f"Copy finished with {missing} missing file(s).")

## 0.5 Final formatting

The final formatting phase consists of the following steps:
- Unpack the notes into a time series (one note for each tick).
- Remove silences by prolonging the preceding note.
- Restrict the notes to a range of 2 octaves.
- Represent each note as a 128-dimensional one-hot vector (all zeros except at the index of the note).

At the end of this phase, an array $[\text{songs}\times\text{bars}\times\text{notes per bar}\times \text{notes range}]=[2698\times 8 \times 16 \times 128]$ is obtained.

In [2]:
# Build the full path of every entry in the 4/4 dataset directory.
target_directory = "/home/sara/Scrivania/Physics_of_Data/2nd_Year/NeuralNetworks_DeepLearning/Lab/Lakh_clean_4_4"
midi_files = os.listdir(target_directory)

paths_4_4 = [f"{target_directory}/{midi}" for midi in midi_files]

In [4]:
# Number of entries found in the 4/4 dataset directory.
len(midi_files)

2698

In [4]:
# Build the formatted dataset from the selected files.
# NOTE(review): tpb / tempo / npb / n_bars presumably mean ticks per beat,
# beats per bar, notes per bar and bars per song — confirm against
# preprocess_midi.formatting.
dataset = formatting(paths_4_4, tpb=384, tempo=4, npb=16, n_bars=8)

0 / 2698  midis processed successfully.

500 / 2698  midis processed successfully.

1000 / 2698  midis processed successfully.

1500 / 2698  midis processed successfully.

2000 / 2698  midis processed successfully.

2500 / 2698  midis processed successfully.



In [5]:
# Convert the result of formatting() into a NumPy array.
# NOTE(review): this rebinds `dataset`, so the original return value is no
# longer reachable after this cell.
dataset = np.array(dataset)

In [8]:
# Per the section text, the axes are (songs, bars, notes per bar, note range).
dataset.shape

(2698, 8, 16, 128)

In [9]:
# Write the preprocessed array to 'lakh_preprocessed.npy' in the current working directory.
np.save('lakh_preprocessed.npy', dataset)