In [1]:
# !pip install transformers
# !pip install note_seq
# !pip install pygame
# !pip install torch==2.0.1
# !pip install miditoolkit # library used to read MIDI files via MidiFile()
# !pip install accelerate -U

[0mCollecting note_seq
  Downloading note_seq-0.0.5-py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 kB[0m [31m443.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting absl-py (from note_seq)
  Obtaining dependency information for absl-py from https://files.pythonhosted.org/packages/a2/ad/e0d3c824784ff121c03cc031f944bc7e139a8f1870ffd2845cc2dd76f6c4/absl_py-2.1.0-py3-none-any.whl.metadata
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting bokeh>=0.12.0 (from note_seq)
  Obtaining dependency information for bokeh>=0.12.0 from https://files.pythonhosted.org/packages/39/ba/aefd7aacc9e086e2c7f3bb42e99cb8e2a8f24dcb4bf78519ef25a9102988/bokeh-3.3.4-py3-none-any.whl.metadata
  Downloading bokeh-3.3.4-py3-none-any.whl.metadata (12 kB)
Collecting intervaltree>=2.1.0 (from note_seq)
  Downloading intervaltree-3.1.0.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting librosa>=0.6.2 (from n

In [2]:
from copy import deepcopy
from math import ceil
from pathlib import Path
from miditoolkit import MidiFile
import os
from tqdm import tqdm
import pandas as pd

In [3]:
# Length of one chunk in bars; the chunking cell turns it into ticks as
# MAX_NB_BAR * ticks_per_beat * 4.
MAX_NB_BAR = 20
# Chunks of a split file holding fewer notes than this (summed over all
# tracks) are not saved.
MIN_NB_NOTES = 20

In [4]:
# Collect every .mid file of the dataset folder.
# glob() yields entries in a filesystem-dependent order, so sort the paths to
# get the same list on every machine and every run.
midi_paths = sorted(Path('./jazz-midi-366-songs').glob('*.mid'))
len(midi_paths)

366

노래 이름이 너무 지저분해서 전처리해 줍니다 (orig_stem -> new_stem)

In [5]:
# Metadata table: one row per MIDI file, keeping both the original file stem
# and a cleaned-up stem (the chunking cell uses it as the output folder name).
midi_data = pd.DataFrame(columns=['midi_path'], data=midi_paths)
midi_data['orig_stem'] = midi_data['midi_path'].apply(lambda path: path.stem)

# Normalise the stem in three steps:
#   1. drop the characters , ! ? # @
#   2. turn '-' and ' ' into '_'
#   3. collapse every run of underscores into a single '_'
# The collapse runs last and uses a regex so that runs of any length, including
# the ones created by removing punctuation (e.g. 'a_#_b'), end up as one '_'.
midi_data['new_stem'] = (
    midi_data['orig_stem']
    .str.replace(r'[,!?#@]', '', regex=True)
    .str.replace(r'[- ]', '_', regex=True)
    .str.replace(r'_+', '_', regex=True)
)

# NOTE(review): two different files could normalise to the same new_stem and
# would then share one chunk folder - confirm the stems are unique.

# Order the rows by the cleaned stem and renumber the index from 0.
midi_data = midi_data.sort_values(by='new_stem').reset_index(drop=True)
midi_data

Unnamed: 0,midi_path,orig_stem,new_stem
0,jazz-midi-366-songs/001_11pmtheme.mid,001_11pmtheme,001_11pmtheme
1,jazz-midi-366-songs/002_16goingon17.mid,002_16goingon17,002_16goingon17
2,jazz-midi-366-songs/003_20thcenturystomp.mid,003_20thcenturystomp,003_20thcenturystomp
3,jazz-midi-366-songs/004_2ndtime.mid,004_2ndtime,004_2ndtime
4,jazz-midi-366-songs/005_4thAvenueTheme.mid,005_4thAvenueTheme,005_4thAvenueTheme
...,...,...,...
361,jazz-midi-366-songs/362_youstep.mid,362_youstep,362_youstep
362,jazz-midi-366-songs/363_youdbe.mid,363_youdbe,363_youdbe
363,jazz-midi-366-songs/364_YBSN2CH2.mid,364_YBSN2CH2,364_YBSN2CH2
364,jazz-midi-366-songs/365_zanzibar.mid,365_zanzibar,365_zanzibar


In [6]:
# Order the rows alphabetically by the cleaned stem, then renumber the index
# from 0 so that the row label equals the row position.
midi_data = midi_data.sort_values(by='new_stem').reset_index(drop=True)
midi_data

Unnamed: 0,midi_path,orig_stem,new_stem
0,jazz-midi-366-songs/001_11pmtheme.mid,001_11pmtheme,001_11pmtheme
1,jazz-midi-366-songs/002_16goingon17.mid,002_16goingon17,002_16goingon17
2,jazz-midi-366-songs/003_20thcenturystomp.mid,003_20thcenturystomp,003_20thcenturystomp
3,jazz-midi-366-songs/004_2ndtime.mid,004_2ndtime,004_2ndtime
4,jazz-midi-366-songs/005_4thAvenueTheme.mid,005_4thAvenueTheme,005_4thAvenueTheme
...,...,...,...
361,jazz-midi-366-songs/362_youstep.mid,362_youstep,362_youstep
362,jazz-midi-366-songs/363_youdbe.mid,363_youdbe,363_youdbe
363,jazz-midi-366-songs/364_YBSN2CH2.mid,364_YBSN2CH2,364_YBSN2CH2
364,jazz-midi-366-songs/365_zanzibar.mid,365_zanzibar,365_zanzibar


In [8]:
# Persist the path / orig_stem / new_stem mapping next to the notebook.
# to_csv keeps its default of also writing the DataFrame index as the first column.
midi_data.to_csv('./metadata.csv')

청킹: 각 MIDI 파일을 최대 MAX_NB_BAR 마디 길이의 조각으로 나눕니다.

In [10]:
CHUNK_PATH = './jazz_chunk'


def _chunk_midi_file(midi_path, out_dir, max_nb_bar, min_nb_notes):
    """Split one MIDI file into chunks of at most `max_nb_bar` bars and save them.

    Args:
        midi_path: path of the MIDI file to read.
        out_dir: directory that receives the chunk files (created if missing).
        max_nb_bar: chunk length in bars. A bar is counted as 4 beats; the
            file's time signature is not consulted.
        min_nb_notes: chunks of a split file with fewer notes than this
            (summed over all tracks) are not written.

    Returns:
        The number of files written to `out_dir`.

    Raises:
        Whatever `MidiFile(...)`, `dump(...)` or the filesystem raise; the
        caller decides how to handle a file that cannot be processed.

    A file that fits into a single chunk is written unchanged as `0.mid`
    (the `min_nb_notes` filter is not applied to it); the chunks of a longer
    file are numbered from `1.mid`.
    """
    os.makedirs(out_dir, exist_ok=True)

    midi = MidiFile(midi_path)
    ticks_per_cut = max_nb_bar * midi.ticks_per_beat * 4
    nb_cut = ceil(midi.max_tick / ticks_per_cut)

    if nb_cut < 2:
        midi.dump(f'{out_dir}/0.mid')
        return 1

    # Detach the notes before copying: every chunk then starts as a note-free
    # copy of the file, instead of deep-copying all notes nb_cut times only to
    # throw them away again.
    notes_per_track = []
    for track in midi.instruments:
        notes_per_track.append(sorted(track.notes, key=lambda note: note.start))
        track.notes = []
    midi_cuts = [deepcopy(midi) for _ in range(nb_cut)]

    # NOTE(review): only notes are redistributed and re-timed. Everything else
    # carried over by the deep copy stays identical in every chunk - confirm
    # this is acceptable for the downstream use.
    for j, track_notes in enumerate(notes_per_track):
        for note in track_notes:
            # A note starting exactly on the last tick of a file whose length
            # is a whole number of chunks would index one past the end; keep
            # it in the last chunk instead of failing the whole file.
            cut_idx = min(note.start // ticks_per_cut, nb_cut - 1)
            offset = cut_idx * ticks_per_cut
            note_copy = deepcopy(note)
            # Make the note times relative to the start of its chunk.
            note_copy.start -= offset
            note_copy.end -= offset
            midi_cuts[cut_idx].instruments[j].notes.append(note_copy)

    # Save the chunks that contain enough notes, numbering them from 1.
    short_cnt = 0
    for midi_short in midi_cuts:
        if sum(len(track.notes) for track in midi_short.instruments) < min_nb_notes:
            continue
        short_cnt += 1
        midi_short.dump(f'{out_dir}/{short_cnt}.mid')
    return short_cnt


# num_of_cuts[i] is the number of chunk files written for row i of midi_data;
# it stays 0 for files that could not be processed.
num_of_cuts = [0] * len(midi_data)
# (midi_path, error) pairs of the files that raised while being chunked.
failed_files = []

rows = zip(midi_data['midi_path'], midi_data['new_stem'])
for i, (midi_path, new_stem) in enumerate(tqdm(rows, total=len(midi_data))):
    try:
        num_of_cuts[i] = _chunk_midi_file(
            midi_path, f'{CHUNK_PATH}/{new_stem}', MAX_NB_BAR, MIN_NB_NOTES
        )
    except Exception as exc:
        # Best effort: one unreadable file must not abort the whole run, but
        # the failure is recorded instead of being silently dropped.
        failed_files.append((midi_path, repr(exc)))

if failed_files:
    print(f'{len(failed_files)} file(s) could not be chunked:')
    for failed_path, reason in failed_files:
        print(f'  {failed_path}: {reason}')

100%|██████████| 366/366 [06:34<00:00,  1.08s/it]


청킹하면서 MIDI 파일마다 cut이 몇 개 생겼는지 기록합니다. (다만 코드를 잘못 짠 탓에 정작 preprocess-train에서는 이 값이 쓰이지 않습니다.)

In [9]:
# Attach the per-file chunk counts; num_of_cuts is filled by the chunking
# cell, which therefore has to be run before this one.
midi_data['num_of_cuts'] = num_of_cuts

# to_csv does not create missing folders, so make sure the target directory
# exists before writing.
chunked_metadata_path = Path('../ym-midis/metadata/chunked-metadata.csv')
chunked_metadata_path.parent.mkdir(parents=True, exist_ok=True)
midi_data.to_csv(chunked_metadata_path)

[0m