# Install the required dependencies

In [1]:
# Install the MIDI-processing dependencies into the kernel's environment.
# NOTE(review): versions are unpinned, so a re-run may resolve different releases
# than the ones captured in the output below -- consider pinning them.
%pip install miditok
%pip install miditoolkit
%pip install pretty_midi
%pip install tqdm

Collecting miditok
  Downloading miditok-3.0.1-py3-none-any.whl.metadata (8.0 kB)
Collecting huggingface-hub>=0.16.4 (from miditok)
  Downloading huggingface_hub-0.21.4-py3-none-any.whl.metadata (13 kB)
Collecting symusic>=0.3.2 (from miditok)
  Downloading symusic-0.4.2-cp312-cp312-win_amd64.whl.metadata (9.1 kB)
Collecting tokenizers>=0.13.0 (from miditok)
  Downloading tokenizers-0.15.2-cp312-none-win_amd64.whl.metadata (6.8 kB)
Collecting tqdm (from miditok)
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     --------------------- ------------------ 30.7/57.6 kB 1.4 MB/s eta 0:00:01
     ---------------------------------------- 57.6/57.6 kB 1.0 MB/s eta 0:00:00
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.16.4->miditok)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pySmartDL (from symusic>=0.3.2->miditok)
  Downloading pySmartDL-1.3.4-py3-none-any.whl.metadata 

# Import required packages and load the dataset

In [2]:
from copy import deepcopy
from math import ceil
from pathlib import Path
from typing import Dict, List, Tuple

import pretty_midi
from miditoolkit import MidiFile
from tqdm import tqdm

### Define the parameters for chunking

In [5]:
# Length of one chunk, in bars. create_chunks converts it to ticks as
# MAX_NB_BAR * ticks_per_beat * 4, i.e. it assumes 4 beats per bar.
MAX_NB_BAR = 8
# Minimum note count: used by load_valid_midi_files to validate an input file
# and by create_chunks to decide whether a chunk is worth saving.
MIN_NB_NOTES = 16
# Dataset directory; it is searched recursively for *.mid and *.midi files.
path = "test/prompt"

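`load_valid_midi_files` scans the dataset and flags each MIDI file as valid or invalid; `load_dataset` then creates the new output directory and returns the paths of the valid MIDI files.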

In [6]:
def load_valid_midi_files(path: str) -> Dict[str, bool]:
    """
    Scan a dataset directory for MIDI files and flag each one as valid or not.

    A file counts as valid when it loads without raising and at least one of
    its instruments holds MIN_NB_NOTES notes or more. A file that raises while
    being processed is reported on stdout and flagged as invalid.

    Parameters:
    - path (str): The path to the dataset directory; it is searched recursively
      for ``*.mid`` and ``*.midi`` files.

    Returns:
    - Dict[str, bool]: Maps every MIDI file path found (as a string) to a
      boolean indicating validity.
    """
    root = Path(path)
    # Collect every candidate up front: all ``.mid`` matches, then all ``.midi``.
    candidates = [*root.glob("**/*.mid"), *root.glob("**/*.midi")]

    validity_by_path = {}
    for midi_path in candidates:
        key = str(midi_path)
        try:
            instruments = pretty_midi.PrettyMIDI(key).instruments
            is_valid = False
            for instrument in instruments:
                # One sufficiently long instrument is enough.
                if len(instrument.notes) >= MIN_NB_NOTES:
                    is_valid = True
                    break
        except Exception as e:
            print(f"Error loading {midi_path}: {e}")
            is_valid = False  # An unreadable file is treated as invalid
        validity_by_path[key] = is_valid

    return validity_by_path

In [7]:
def load_dataset(path: str) -> Tuple[Path, List[Path]]:
    """
    Prepare the output directory for chunking and collect the valid MIDI files.

    A directory named "<dataset>-chunked" is created next to the dataset
    directory (nothing happens if it already exists), where <dataset> is the
    final component of ``path``.

    Parameters:
    - path (str): The path to the dataset directory.

    Returns:
    - Tuple[Path, List[Path]]: The chunk output directory, and the paths of the
      MIDI files that ``load_valid_midi_files`` flagged as valid.
    """
    dataset_path = Path(path)
    parent_directory = dataset_path.parent
    print("=====> Parent directory:", parent_directory)

    # Path.name ignores a trailing separator and honours the platform's
    # separator; path.split("/")[-1] returned "" for "test/prompt/" and the
    # whole string for backslash-separated paths.
    new_directory = parent_directory / f"{dataset_path.name}-chunked"
    new_directory.mkdir(parents=True, exist_ok=True)
    print("=====> Created new directory:", new_directory)

    valid_midi_dict = load_valid_midi_files(path)
    valid_midi_paths = [
        Path(midi_path) for midi_path, is_valid in valid_midi_dict.items() if is_valid
    ]

    return new_directory, valid_midi_paths

In [8]:
# Validate every MIDI file in the dataset and report the outcome per file.
midi_files = load_valid_midi_files(path)
print("Total midi files: ", len(midi_files))
print("Valid midi files =======>")
for midi_file, is_valid in midi_files.items():
  # Valid files are listed by path alone; invalid ones get an explicit prefix.
  print(midi_file if is_valid else f"Not valid midi: {midi_file}")

Total midi files:  1
test\prompt\0_prompt.wav.mid


In [9]:
# Create the "<dataset>-chunked" output directory and collect the valid MIDI paths.
new_directory, midi_paths = load_dataset(path)

=====> Parent directory: test
=====> Created new directory: test\prompt-chunked


In [10]:
# Number of valid MIDI files that will be passed to the chunking step.
len(midi_paths)

1

# Create chunks and store them in the new folder

In [11]:
def _chunks_already_exist(output_directory: Path, stem: str) -> bool:
    """Return True if output_directory holds a "<stem>_<index>.mid" chunk file."""
    prefix = f"{stem}_"
    for candidate in output_directory.iterdir():
        if candidate.suffix != ".mid" or not candidate.stem.startswith(prefix):
            continue
        # Only a purely numeric remainder is a chunk of *this* file; this keeps
        # "a_b_0.mid" (a chunk of "a_b") from being mistaken for a chunk of "a".
        if candidate.stem[len(prefix):].isdigit():
            return True
    return False


def create_chunks(path, new_directory: Path, midi_paths: List[Path]):
    """
    Split each MIDI file into chunks of MAX_NB_BAR bars and save them.

    Chunks are written to ``new_directory`` under the same relative
    sub-directory the source file has inside ``path``, named
    "<stem>_<chunk index>.mid". A file is skipped when chunks for it already
    exist or when it spans fewer than two chunks. A chunk is only saved when it
    contains at least MIN_NB_NOTES notes across all its tracks. Any error while
    processing a file is printed and the loop moves on to the next file.

    Parameters:
    - path: The dataset directory that every entry of ``midi_paths`` lives under.
    - new_directory: The directory the chunks are written to.
    - midi_paths: The paths to the MIDI files to be chunked.
    """
    for midi_path in tqdm(midi_paths, desc="CHUNKING MIDIS"):
        try:
            midi_path = Path(midi_path)

            # Mirror the dataset's sub-directory layout in the output directory
            relative_path = midi_path.relative_to(path)
            output_directory = Path(new_directory) / relative_path.parent
            output_directory.mkdir(parents=True, exist_ok=True)

            # Exact-name check instead of a glob: the stem may contain glob
            # metacharacters or be a prefix of another file's stem.
            if _chunks_already_exist(output_directory, midi_path.stem):
                print(f"\n=====> Chunks already created for {midi_path}, skipping")
                continue

            midi = MidiFile(str(midi_path))
            # Chunk length in ticks, assuming 4 beats per bar
            ticks_per_cut = MAX_NB_BAR * midi.ticks_per_beat * 4
            nb_cuts = ceil(midi.max_tick / ticks_per_cut)
            if nb_cuts < 2:
                print(f"=====> Less than two chunks for {midi_path}, skipping....")
                continue

            print(f"Processing {midi_path}")
            # Every chunk starts as a full copy of the source so that everything
            # except the notes is carried over unchanged.
            midis = [deepcopy(midi) for _ in range(nb_cuts)]

            for j, track in enumerate(midi.instruments):
                track.notes.sort(key=lambda x: x.start)
                for midi_short in midis:
                    midi_short.instruments[j].notes = []
                for note in track.notes:
                    # A note belongs to the chunk its start falls in; both ends
                    # are shifted so the chunk starts at tick 0 (the end is not
                    # clipped to the chunk boundary).
                    cut_id = note.start // ticks_per_cut
                    offset = cut_id * ticks_per_cut
                    note_copy = deepcopy(note)
                    note_copy.start -= offset
                    note_copy.end -= offset
                    midis[cut_id].instruments[j].notes.append(note_copy)

            # Save only the chunks that hold enough notes
            for j, midi_short in enumerate(midis):
                if sum(len(track.notes) for track in midi_short.instruments) < MIN_NB_NOTES:
                    continue
                midi_short.dump(str(output_directory / f"{midi_path.stem}_{j}.mid"))

        except Exception as e:
            # The try block covers the whole chunking step, not just mkdir
            print(f"\nError chunking {midi_path}: {e}")

In [12]:
# Split every valid MIDI file into MAX_NB_BAR-bar chunks saved under new_directory.
create_chunks(path, new_directory, midi_paths)

CHUNKING MIDIS: 100%|██████████| 1/1 [00:00<00:00, 250.02it/s]

=====> Less than two chunks for test\prompt\0_prompt.wav.mid, skipping....





* The complete English songs dataset had around 330 songs, of which 7 were corrupted. Skipping those leaves around 323 songs in the complete dataset.
* Only 50 Hindi songs were chosen for training because more songs were not available. Future work includes gathering more songs for training.
