In [10]:
import os
import numpy as np
import librosa
import soundfile as sf
from basic_pitch.inference import predict
from basic_pitch import ICASSP_2022_MODEL_PATH
import music21
from pathlib import Path
from basic_pitch import constants as bp_constants
HOP_LENGTH = bp_constants.FFT_HOP
SAMPLING_RATE = bp_constants.AUDIO_SAMPLE_RATE

# MuseScore 4のパスを明示的に設定
# ユーザーが確認したパス: /Applications/MuseScore4.app/Contents/MacOS/mscore
musescore_path = Path('/Applications/MuseScore 3.app/Contents/MacOS/mscore')

if musescore_path.exists():
    us = music21.environment.UserSettings()
    us['musicxmlPath'] = str(musescore_path)
    us['musescoreDirectPNGPath'] = str(musescore_path)
    
    print(f"MuseScore path successfully set to: {musescore_path}")
else:
    print(f"Warning: MuseScore not found at {musescore_path}. Please check the installation path.")

MuseScore path successfully set to: /Applications/MuseScore 3.app/Contents/MacOS/mscore


In [36]:
data_path = os.path.join(os.path.dirname(os.getcwd()), 'data')
transcription_path = os.path.join(data_path, "processed", "transcribed", "sample_kaze")
# 元の音声ファイル
original_file = os.path.join(data_path, "original", "sample_kaze.mp4")
# 分離済みのボーカル音声ファイル
input_file = os.path.join(data_path, "processed", "UVR", "sample_kaze", "sample_kaze_(Vocals)_UVR_MDXNET_KARA_(No Reverb)_UVR-DeEcho-DeReverb.wav")
inst_file = os.path.join(data_path, "processed", "UVR", "sample_kaze", "sample_kaze_(Instrumental)_UVR_MDXNET_KARA.wav")
harmonic_file = os.path.join(transcription_path, "sample_kaze_Harmonic.wav")
output_midi_path = os.path.join(transcription_path, 'vocals_transcribed.mid')

In [67]:
# 1. チューニング補正量を計算 (元の音源から)
y_inst_raw, sr = librosa.load(inst_file)
tuning_offset = librosa.estimate_tuning(y=y_inst_raw, sr=sr)
print(f"Estimated tuning offset: {tuning_offset:.3f} bins")

# 2. リサンプリングでピッチ補正 (音質劣化なし・時間軸が変わる)
tuning_rate = 2 ** (-tuning_offset / 12)
print(f"Tuning rate: {tuning_rate:.4f}")

y_inst = librosa.resample(y_inst_raw, orig_sr=sr, target_sr=sr/tuning_rate)
print(f"Inst duration: {len(y_inst_raw)/sr:.2f}s -> {len(y_inst)/sr:.2f}s (resampled)")

# 3. リサンプリング後の音源からビート検出
tempo = librosa.feature.tempo(y=y_inst, sr=sr)
tempo, beat_track = librosa.beat.beat_track(y=y_inst, sr=sr, hop_length=HOP_LENGTH, start_bpm=tempo[0], units='frames')
print(f"Detected tempo: {float(tempo[0]):.1f} BPM, {len(beat_track)} beats")

Estimated tuning offset: -0.310 bins
Tuning rate: 1.0181
Inst duration: 69.24s -> 68.01s (resampled)
Detected tempo: 94.0 BPM, 106 beats


In [68]:
# 4. ボーカル音源も同じtuning_rateでリサンプリング
y_vocal_raw, sr = librosa.load(input_file)
if sr != SAMPLING_RATE:
    raise ValueError(f"Sample rate of input file ({sr}) does not match expected sample rate ({SAMPLING_RATE})")

y_vocal = librosa.resample(y_vocal_raw, orig_sr=sr, target_sr=sr/tuning_rate)
print(f"Vocal duration: {len(y_vocal_raw)/sr:.2f}s -> {len(y_vocal)/sr:.2f}s (resampled)")

# Harmonic/Percussive Source Separation (HPSS)
y_harmonic, _ = librosa.effects.hpss(y_vocal, margin=1.0)

sf.write(harmonic_file, y_harmonic, sr)
print(f"Saved harmonic component to {harmonic_file}")

Vocal duration: 69.24s -> 68.01s (resampled)
Saved harmonic component to /Users/kpome/github/solfege-gen/data/processed/transcribed/sample_kaze/sample_kaze_Harmonic.wav


In [69]:
model_output, midi_data, _ = predict(
    harmonic_file,
    model_or_model_path=ICASSP_2022_MODEL_PATH,
    onset_threshold=0.5, # 無関係
    frame_threshold=0.3, # 無関係
    minimum_note_length=20, # 無関係
    maximum_frequency=2000,
    multiple_pitch_bends=False,
    melodia_trick=True,
    midi_tempo=float(tempo) # 無関係
    )
# MIDIデータをファイルに保存する
output_midi_path = os.path.join(os.path.dirname(os.getcwd()), 'data', 'processed', 'vocals_transcribed.mid')
midi_data.write(output_midi_path)
print(f"saved to {output_midi_path}")

Predicting MIDI for /Users/kpome/github/solfege-gen/data/processed/transcribed/sample_kaze/sample_kaze_Harmonic.wav...
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32


  midi_tempo=float(tempo) # 無関係


isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844, 1)
dtype: float32
isfinite: True
shape: (1, 43844

In [70]:
# 1秒あたりのフレーム数を取得
# FPS = bp_constants.ANNOTATIONS_FPS
SPF = HOP_LENGTH / SAMPLING_RATE
onsets_np = model_output['onset']  # shape: (num_frames, num_pitches)
print(f"onsets_np shape: {onsets_np.shape}")
pitches_np = model_output['note']  # shape: (num_frames, num_pitches)
print(f"pitches_np shape: {pitches_np.shape}")

onsets_np shape: (5848, 88)
pitches_np shape: (5848, 88)


In [71]:
def quantize_to_grid(data, beat_track, subdivisions=4, agg_func=np.mean):
    """
    フレームデータをビートグリッドに量子化する．
    data: (num_frames, num_features)
    beat_track: ビートのフレームインデックス配列
    subdivisions: 1ビートあたりの分割数
    agg_func: 集約関数 (np.max, np.mean等). signature: func(array, axis=int)
    Returns: (grid_values, grid_frames)
    """
    end_frame = data.shape[0]
    grid_values = []
    grid_frames = []
    prev_beat_frame = 0

    for beat_frame in list(beat_track):
        sub_frames = np.linspace(prev_beat_frame, beat_frame, subdivisions + 1, dtype=int)
        for j in range(subdivisions):
            sf_start, sf_end = sub_frames[j], sub_frames[j + 1]
            if sf_end <= sf_start:
                continue
            sf_start = min(sf_start, end_frame)
            sf_end = min(sf_end, end_frame)
            
            if end_frame <= sf_start:
                grid_values.append(np.zeros(data.shape[1]))
            else:
                chunk = data[sf_start:sf_end]
                if chunk.shape[0] == 0:
                    grid_values.append(np.zeros(data.shape[1]))
                else:
                    grid_values.append(agg_func(chunk, axis=0))
            grid_frames.append(sf_start)
        prev_beat_frame = beat_frame

    # 最後のビート以降
    if prev_beat_frame < end_frame:
        sub_frames = np.linspace(prev_beat_frame, end_frame, subdivisions + 1, dtype=int)
        for j in range(subdivisions):
            sf_start, sf_end = sub_frames[j], sub_frames[j + 1]
            if sf_end <= sf_start:
                continue
            sf_start = min(sf_start, end_frame)
            sf_end = min(sf_end, end_frame)
            if sf_start < end_frame:
                grid_values.append(agg_func(data[sf_start:sf_end], axis=0))
            else:
                grid_values.append(np.zeros(data.shape[1]))
            grid_frames.append(sf_start)

    return np.array(grid_values), np.array(grid_frames)

# フレームをビートに合わせてグリッド化
SUBDIVISIONS = 4

# beat_trackはすでにリサンプリング後の時間軸で取得されているため、補正不要
grid_onsets_np, grid_frames_np = quantize_to_grid(onsets_np, beat_track, SUBDIVISIONS, agg_func=np.max)
grid_notes_np, _ = quantize_to_grid(pitches_np, beat_track, SUBDIVISIONS, agg_func=np.mean)

print(f"grid_onsets_np shape: {grid_onsets_np.shape}, grid_notes_np shape: {grid_notes_np.shape}")

grid_onsets_np shape: (428, 88), grid_notes_np shape: (428, 88)


### HMMによるノート推定

#### キー推定

In [113]:
from scipy.ndimage import uniform_filter1d, gaussian_filter1d
from collections import Counter

# キー推定用のクロマベクトル抽出
y_target_raw, sr = librosa.load(original_file, sr=sr)
y_target = librosa.resample(y_target_raw, orig_sr=sr, target_sr=sr/tuning_rate)
print(f"Target duration: {len(y_target_raw)/sr:.2f}s -> {len(y_target)/sr:.2f}s (resampled)")
y_target_harmonic, _ = librosa.effects.hpss(y_target, margin=1.0)

# クロマベクトルをCQTで抽出
chroma = librosa.feature.chroma_cqt(
    y=y_target_harmonic, 
    sr=sr, 
    hop_length=HOP_LENGTH, 
    fmin=librosa.note_to_hz('C1'), 
    n_octaves=7
)
chroma = chroma.T  # (num_frames, 12)
print(f"chroma shape: {chroma.shape}")

# クロマベクトルをビートグリッドに量子化
grid_chroma_np, _ = quantize_to_grid(chroma, beat_track, SUBDIVISIONS, agg_func=np.mean)

# 5. 平滑化フィルタの適用
filter_size = 88

# A. 単純移動平均 (Uniform Filter) - 減衰なし
# grid_chroma_filtered = uniform_filter1d(grid_chroma_np, size=filter_size, axis=0, mode='nearest')

# B. ガウシアンフィルタ (Gaussian Filter) - 滑らかな減衰あり (推奨)
# sigma = filter_size / 6 とすることで、filter_sizeの範囲に分布の約99%が収まるようにします
# sigma = filter_size / 6
# grid_chroma_filtered = gaussian_filter1d(grid_chroma_np, sigma=sigma, axis=0, mode='nearest')
grid_chroma_filtered = uniform_filter1d(grid_chroma_np, size=filter_size, axis=0, mode='nearest')
# print(f"grid_chroma_filtered shape: {grid_chroma_filtered.shape} (Applied Gaussian Filter, sigma={sigma:.1f})")

  y_target_raw, sr = librosa.load(original_file, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Target duration: 69.24s -> 68.01s (resampled)
chroma shape: (5858, 12)


In [None]:
# 6. メジャー/マイナー両方のテンプレートを用意
# Krumhansl-Schmuckler Key Profiles
major_profile = np.array([6.35, 2.23, 3.48, 2.33, 4.38, 4.09, 2.52, 5.19, 2.39, 3.66, 2.29, 2.88])
minor_profile = np.array([6.33, 2.68, 3.52, 5.38, 2.60, 3.53, 2.54, 4.75, 3.98, 2.69, 3.34, 3.17])

templates_major = np.array([np.roll(major_profile, i) for i in range(12)])
templates_minor = np.array([np.roll(minor_profile, i) for i in range(12)])
key_templates = np.vstack([templates_major, templates_minor])  # Shape: (24, 12)

# ラベル定義 (0-11: Major, 12-23: Minor)
KEY_NAMES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
FULL_KEY_LABELS = [f"{k} Major" for k in KEY_NAMES] + [f"{k} Minor" for k in KEY_NAMES]

# ピアソン相関係数 (中心化コサイン類似度)
chroma_centered = grid_chroma_filtered - grid_chroma_filtered.mean(axis=1, keepdims=True)
template_centered = key_templates - key_templates.mean(axis=1, keepdims=True)

chroma_norm = chroma_centered / (np.linalg.norm(chroma_centered, axis=1, keepdims=True) + 1e-10)
template_norm = template_centered / (np.linalg.norm(template_centered, axis=1, keepdims=True) + 1e-10)

correlation = chroma_norm @ template_norm.T

# 確率変換
temperature = 1.0
key_prob = np.exp(correlation / temperature) / np.sum(np.exp(correlation / temperature), axis=1, keepdims=True)
print(f"key_prob shape: {key_prob.shape}")

# 7. HMM (24状態: 12 Major + 12 Minor)
n_states = 24
transition = np.full((n_states, n_states), 0.001) # ベース確率

for i in range(12):
    # --- Major Key (i) からの遷移 ---
    transition[i, i] = 0.90             # 自己遷移
    
    # 属調・下属調 (Major)
    transition[i, (i + 7) % 12] = 0.01
    transition[i, (i + 5) % 12] = 0.01
    
    # 平行調 (Relative Minor): 3半音下
    rel_min = 12 + (i - 3) % 12
    transition[i, rel_min] = 0.01
    
    # 同主調 (Parallel Minor)
    para_min = 12 + i
    transition[i, para_min] = 0.003

    # --- Minor Key (12+i) からの遷移 ---
    m = 12 + i
    transition[m, m] = 0.90             # 自己遷移
    
    # 属調・下属調 (Minor)
    transition[m, 12 + (i + 7) % 12] = 0.01
    transition[m, 12 + (i + 5) % 12] = 0.01
    
    # 平行調 (Relative Major): 3半音上
    rel_maj = (i + 3) % 12
    transition[m, rel_maj] = 0.01
    
    # 同主調 (Parallel Major)
    transition[m, i] = 0.003

transition /= transition.sum(axis=1, keepdims=True)

# 1回のみのViterbi実行 (Uniform初期化)
p_init = np.ones(n_states) / n_states
key_sequence = librosa.sequence.viterbi(key_prob.T, transition, p_init=p_init)

# 出力
key_counts = Counter([FULL_KEY_LABELS[k] for k in key_sequence])

print(f"\n推定キー分布 (Final):")
for key, count in key_counts.most_common():
    print(f"  {key}: {count} grids ({100*count/len(key_sequence):.1f}%)")

print(f"\n推定キー系列 (Head):")
for i in range(min(300, len(key_sequence))):
    print(f"  Grid {i:4d}: {FULL_KEY_LABELS[key_sequence[i]]}")

key_prob shape: (428, 24)

推定キー分布 (Sticky Transition):
  E Major: 193 grids (45.1%)
  C# Minor: 120 grids (28.0%)
  B Major: 115 grids (26.9%)

推定キー系列 (Head):
  Grid    0: B Major
  Grid    1: B Major
  Grid    2: B Major
  Grid    3: B Major
  Grid    4: B Major
  Grid    5: B Major
  Grid    6: B Major
  Grid    7: B Major
  Grid    8: B Major
  Grid    9: B Major
  Grid   10: B Major
  Grid   11: B Major
  Grid   12: B Major
  Grid   13: B Major
  Grid   14: B Major
  Grid   15: B Major
  Grid   16: B Major
  Grid   17: B Major
  Grid   18: B Major
  Grid   19: B Major
  Grid   20: B Major
  Grid   21: B Major
  Grid   22: B Major
  Grid   23: B Major
  Grid   24: B Major
  Grid   25: B Major
  Grid   26: B Major
  Grid   27: B Major
  Grid   28: B Major
  Grid   29: B Major
  Grid   30: B Major
  Grid   31: B Major
  Grid   32: B Major
  Grid   33: B Major
  Grid   34: B Major
  Grid   35: B Major
  Grid   36: B Major
  Grid   37: B Major
  Grid   38: B Major
  Grid   39: B Major
 

In [35]:
temperature = 0.8
rest_prob = np.expand_dims(np.ones(grid_notes_np.shape[0]) - grid_notes_np.max(axis=1), axis=1)
grid_notes_prob_np = np.concatenate([grid_notes_np, rest_prob], axis=1)
grid_notes_prob_np = np.exp(grid_notes_prob_np / temperature) / np.sum(np.exp(grid_notes_prob_np / temperature), axis=1, keepdims=True)
print(f"grid_notes_prob_np shape: {grid_notes_prob_np.shape}")

grid_notes_prob_np shape: (428, 89)


In [81]:
import pretty_midi

MIDI_OFFSET = 21 # basic_pitchのピッチ配列のMIDIノート番号オフセット (=21, A0)

def save_grid_to_midi(grid_onsets_np, grid_notes_np, grid_frames_np, spf, onset_threshold=0.60, pitch_threshold=0.33, output_filename="output.mid"):
    """
    grid_onsets_np: (num_grids, num_pitches) - 各グリッドのonset確率
    grid_notes_np:  (num_grids, num_pitches) - 各グリッドのnote確率
    grid_frames_np: (num_grids,) - 各グリッドの開始フレーム
    spf: seconds per frame
    """
    if grid_onsets_np.shape != grid_notes_np.shape:
        raise ValueError("grid_onsets_np and grid_notes_np must have the same shape")
    if grid_onsets_np.shape[0] != len(grid_frames_np):
        raise ValueError("grid count and grid_frames length must match")
    
    pm = pretty_midi.PrettyMIDI()
    instrument = pretty_midi.Instrument(program=0)
    
    active_note_start_time = None
    active_pitch = None
    num_grids = grid_onsets_np.shape[0]
    print(f"grid_onsets_np shape: {grid_onsets_np.shape}, grid_notes_np shape: {grid_notes_np.shape}")
    
    for i in range(num_grids):
        onset = grid_onsets_np[i]
        pitch = grid_notes_np[i]
        grid_time = grid_frames_np[i] * spf
        
        detected_onset = (np.argmax(onset) + MIDI_OFFSET) if np.max(onset) >= onset_threshold else None
        detected_pitch = (np.argmax(pitch) + MIDI_OFFSET) if np.max(pitch) >= pitch_threshold else None

        # --- 1. 終了判定 (Note Off) ---
        if active_pitch is not None:
            should_close = (
                (detected_pitch is None) or 
                (detected_pitch != active_pitch) or 
                (detected_onset is not None)
            )
            if should_close:
                note = pretty_midi.Note(
                    velocity=50,

                    pitch=int(active_pitch),
                    start=active_note_start_time,
                    end=grid_time
                )
                instrument.notes.append(note)
                active_pitch = None
                active_note_start_time = None

        # --- 2. 開始判定 (Note On) ---
        if active_pitch is None and detected_pitch is not None:
            active_pitch = detected_pitch
            active_note_start_time = grid_time

    # --- 3. 後処理 ---
    if active_pitch is not None:
        end_time = (grid_frames_np[-1] + (grid_frames_np[-1] - grid_frames_np[-2])) * spf if num_grids > 1 else grid_frames_np[-1] * spf
        note = pretty_midi.Note(
            velocity=50,
            pitch=int(active_pitch),
            start=active_note_start_time,
            end=end_time
        )
        instrument.notes.append(note)


    pm.instruments.append(instrument)
    pm.write(output_filename)
    print(f"Saved MIDI to {output_filename} ({len(instrument.notes)} notes)")

# 実行
save_grid_to_midi(grid_onsets_np, grid_notes_np, grid_frames_np, SPF, output_filename="transcribed_output.mid")

grid_onsets_np shape: (428, 88), grid_notes_np shape: (428, 88)
Saved MIDI to transcribed_output.mid (223 notes)


In [16]:
beat_track

array([  24,   82,  137,  194,  250,  307,  363,  416,  468,  526,  583,
        638,  693,  748,  803,  859,  914,  970, 1025, 1079, 1136, 1189,
       1247, 1301, 1359, 1412, 1470, 1523, 1581, 1635, 1692, 1747, 1803,
       1856, 1914, 1968, 2025, 2079, 2137, 2191, 2248, 2301, 2359, 2412,
       2470, 2524, 2581, 2634, 2691, 2745, 2804, 2857, 2915, 2969, 3026,
       3079, 3137, 3190, 3248, 3301, 3360, 3414, 3471, 3524, 3582, 3634,
       3692, 3746, 3804, 3858, 3915, 3969, 4027, 4079, 4137, 4190, 4248,
       4302, 4360, 4415, 4471, 4526, 4582, 4637, 4693, 4747, 4804, 4858,
       4916, 4969, 5027, 5080, 5137, 5192, 5249, 5303, 5360, 5414, 5471,
       5525, 5582, 5638, 5694, 5749, 5805, 5858])

In [5]:
import music21
# 楽譜として表示する (music21ライブラリを使用)
# 注意: きれいな楽譜を表示するには、OSに MuseScore などの楽譜ソフトがインストールされている必要があります。

score = music21.converter.parse(output_midi_path)

In [9]:
print(type(score))
score.show('midi')  # 'musicxml.png' を使用してPNG画像として表示

<class 'music21.stream.base.Score'>


In [None]:

try:
    # 保存したMIDIファイルを読み込む

    
    # 楽譜を外部ビューアで表示

    # MuseScoreなどがインストールされていれば起動します
    score.show()
    
    # ノートブック上で簡易的に確認したい場合はテキスト形式で表示できます
    # score.show('text')
    
except Exception as e:
    print(f"楽譜の表示に失敗しました: {e}")
    print("MuseScoreなどがインストールされていない場合は、'music21' の設定が必要な場合があります。")