<a href="https://colab.research.google.com/github/Unknown-Turtle/stem_iso_testing/blob/main/stem_testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using spleeter

In [None]:
!apt install ffmpeg

In [None]:
pip install spleeter

In [None]:
from IPython.display import Audio

# Separate from command line

In [None]:
# Note: I've only tested this on macOS inside a conda environment

In [None]:
!wget https://github.com/deezer/spleeter/raw/master/audio_example.mp3

In [None]:
# Play the example clip fetched by the wget cell (saved as audio_example.mp3)
Audio('audio_example.mp3')

In [None]:
!spleeter separate -h

In [None]:
!spleeter separate -o output/ audio_example.mp3

In [None]:
!ls output/audio_example

In [None]:
# The stems live in output/audio_example/ (the directory listed by the ls cell)
Audio('output/audio_example/vocals.wav')

In [None]:
# The stems live in output/audio_example/ (the directory listed by the ls cell)
Audio('output/audio_example/accompaniment.wav')

# Using demucs

In [None]:
!pip install demucs -q
!apt-get install ffmpeg -y

In [None]:
# Basic usage: separate into 4 stems (vocals, drums, bass, other)
!demucs audio.mp3

In [None]:
# ## 4. Using different models
# htdemucs (Hybrid Transformer) model, higher quality
!demucs -n htdemucs -o output audio.mp3

In [None]:
# 6-stem model for more detailed separation (vocals, drums, bass, piano, guitar, other)
!demucs -n htdemucs_6s -o output audio.mp3

In [None]:
# ## 5. Only separate vocals vs everything else
!demucs --two-stems=vocals -o output audio.mp3

# Using Music21


This is a more fleshed-out example that includes more analysis. It still needs some fixing up, though: the unused imports should probably be removed and the main guard edited.

In [None]:
import json, math, pathlib
import numpy as np
import librosa, librosa.display
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from music21 import key as m21key

# Configs
SR = 22050  # target sample rate (Hz) that load_audio passes to librosa.load
HOP = 512  # hop length in samples, shared by every framed feature in this cell
N_FFT = 2048  # FFT window size used for the mel spectrogram
# Chord decoding params
FRAME_SEC = HOP / SR  # duration of one analysis frame, in seconds
SMOOTH_FRAMES = 9  # default median-filter window (in frames) for chord labels

# Audio loading
def load_audio(path):
    """Load an audio file as a mono signal and normalize it.

    The file is resampled to the module-level ``SR`` while loading, then
    passed through ``librosa.util.normalize`` with its default settings.

    Returns:
        A ``(samples, sample_rate)`` tuple.
    """
    signal, rate = librosa.load(path, sr=SR, mono=True)
    return librosa.util.normalize(signal), rate

# spectrogram
def compute_mel_spec(y, sr):
    """Return a 128-band mel power spectrogram of *y*, converted to decibels.

    The dB conversion is referenced to the spectrogram's own maximum
    (``ref=np.max``). Framing uses the module-level ``N_FFT`` and ``HOP``.
    """
    mel_power = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_fft=N_FFT,
        hop_length=HOP,
        n_mels=128,
        power=2.0,
    )
    return librosa.power_to_db(mel_power, ref=np.max)

# Onset Detection
def detect_onsets(y, sr):
    """Detect note onsets in *y*.

    Returns:
        A ``(onset_times, onset_envelope)`` tuple: the onset positions as a
        plain list of seconds, and the onset-strength envelope they were
        picked from.
    """
    envelope = librosa.onset.onset_strength(
        y=y, sr=sr, hop_length=HOP, aggregate=np.median
    )
    frames = librosa.onset.onset_detect(
        onset_envelope=envelope,
        sr=sr,
        hop_length=HOP,
        units='frames',
        backtrack=True,
    )
    times = librosa.frames_to_time(frames, sr=sr, hop_length=HOP)
    return times.tolist(), envelope

# Key Estimation (simple chroma template)
# Each template holds 12 weights; estimate_key rotates it with np.roll(template, tonic),
# so index i is the weight of the pitch class i semitones above the tonic.
# NOTE(review): these look hand-tuned rather than a standard key profile. In the major
# template index 5 (perfect fourth) is weighted below index 6 (tritone), and in the
# minor template index 7 (perfect fifth) is weighted below index 8 -- confirm intended.
MAJOR_TEMPLATE = np.array([6,2,3,2,4,2,3,6,2,4,2,3], dtype=float)
MINOR_TEMPLATE = np.array([6,2,3,6,2,4,2,3,6,2,4,2], dtype=float)

def estimate_key(y, sr):
    """Guess the key of *y* by matching its average chroma to key templates.

    The time-averaged, sum-normalized chroma vector is scored (dot product)
    against ``MAJOR_TEMPLATE`` and ``MINOR_TEMPLATE`` rotated to each of the
    12 possible tonics; the highest score wins.

    Returns:
        dict with ``'tonic_index'`` (0-11, index into the pitch-name table
        below), ``'mode'`` (``'major'`` or ``'minor'``) and ``'name'`` (the
        tonic name as reported by music21, a space, then the mode).
    """
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr, hop_length=HOP, bins_per_octave=36)
    profile = chroma.mean(axis=1)
    profile = profile / (profile.sum() + 1e-9)

    top_score = -1
    winner = None
    for tonic in range(12):
        # Major is scored before minor and only a strictly higher score
        # replaces the current winner, so ties keep the earlier candidate.
        for mode, template in (('major', MAJOR_TEMPLATE), ('minor', MINOR_TEMPLATE)):
            rotated = np.roll(template, tonic)
            score = np.dot(profile, rotated / rotated.sum())
            if score > top_score:
                top_score, winner = score, (tonic, mode)

    pitch_names = ['C','C#','D','Eb','E','F','F#','G','Ab','A','Bb','B']
    tonic_name = pitch_names[winner[0]]
    label = m21key.Key(tonic_name, winner[1]).tonic.name + ' ' + winner[1]
    return {'tonic_index': winner[0], 'mode': winner[1], 'name': label}

# Chord Estimation (triad template matching per frame)
# 24 triad templates (12 major + 12 minor).
# might extend to 7ths later...
def build_triad_templates():
    """Build L1-normalized chroma templates for the 24 major and minor triads.

    Returns:
        ``(T, names)`` where ``T`` is a ``(24, 12)`` float array with one
        template per row and ``names`` is the matching list of chord labels.
        Rows 0-11 are the major triads on C..B (labels ``'C'``, ``'C#'``, ...),
        rows 12-23 the minor triads (labels ``'Cm'``, ``'C#m'``, ...).
    """
    root_names = ['C','C#','D','Eb','E','F','F#','G','Ab','A','Bb','B']
    # (semitone offsets above the root, label suffix) for each triad quality
    triads = [((0, 4, 7), ''), ((0, 3, 7), 'm')]

    T = []
    names = []
    for intervals, suffix in triads:
        for tonic in range(12):
            tpl = np.zeros(12)
            for iv in intervals:
                # wrap around the octave so every chord tone lands in 0-11
                tpl[(tonic + iv) % 12] = 1.0
            T.append(tpl)
            names.append(root_names[tonic] + suffix)
    T = np.array(T, dtype=float)
    T = normalize(T, norm='l1', axis=1)
    return T, names

# Built once when the cell runs; frame_chords reuses both on every call.
TEMPLATES, CHORD_NAMES = build_triad_templates()

def frame_chords(y, sr):
    """Label every chroma frame of *y* with its best-matching triad template.

    Returns:
        ``(idx, names, chroma)``: the per-frame index of the winning row of
        ``TEMPLATES``, the ``CHORD_NAMES`` list those indices refer to, and
        the column-normalized chroma matrix used for matching.
    """
    raw_chroma = librosa.feature.chroma_cqt(
        y=y, sr=sr, hop_length=HOP, bins_per_octave=36
    )  # (12, frames)
    # small offset added before each frame (column) is L1-normalized
    chroma = normalize(raw_chroma + 1e-6, norm='l1', axis=0)
    # dot product of each L1-normalized template with each frame
    scores = np.matmul(TEMPLATES, chroma)  # (24, frames)
    best = scores.argmax(axis=0)
    return best, CHORD_NAMES, chroma

def smooth_labels(idx, k=SMOOTH_FRAMES):
    """Median-filter the label sequence *idx* over a window of *k* frames.

    Edges are handled with scipy's ``'nearest'`` mode.
    """
    from scipy import ndimage

    smoothed = ndimage.median_filter(idx, size=k, mode='nearest')
    return smoothed

def chord_segments(idx_smooth, names):
    """Collapse per-frame chord labels into contiguous segments.

    Args:
        idx_smooth: sequence of per-frame indices into *names*.
        names: chord label for each index.

    Returns:
        A list of ``(start_time, end_time, chord)`` tuples, one per run of
        identical consecutive labels, with times in seconds (frame index
        times ``FRAME_SEC``). An empty input yields an empty list.
    """
    n_frames = len(idx_smooth)
    if n_frames == 0:
        # nothing to segment; avoids indexing idx_smooth[-1] on empty input
        return []

    segs = []
    start = 0
    for i in range(1, n_frames):
        if idx_smooth[i] != idx_smooth[i - 1]:
            # the label changed at frame i: close the run that began at `start`
            segs.append((start * FRAME_SEC, i * FRAME_SEC, names[idx_smooth[i - 1]]))
            start = i
    # close the final run, which extends to the end of the sequence
    segs.append((start * FRAME_SEC, n_frames * FRAME_SEC, names[idx_smooth[-1]]))
    return segs

# Pattern mining
def ngram_stats(chord_seq, n=2):
    """Count the most common chord n-grams in *chord_seq*.

    Args:
        chord_seq: sequence of chord-name strings.
        n: n-gram length.

    Returns:
        Up to 12 dicts, most frequent first, each with ``'pattern'`` (the
        chords joined by a right arrow), ``'count'`` and ``'pct'`` (share of
        all n-grams, as a percentage rounded to 2 decimals). Empty when the
        sequence is shorter than *n*.
    """
    from collections import Counter

    # U+2192 RIGHTWARDS ARROW, written as an escape so a wrong file encoding
    # cannot turn the separator into mojibake again.
    arrow = ' \u2192 '
    grams = [tuple(chord_seq[i:i + n]) for i in range(len(chord_seq) - n + 1)]
    counts = Counter(grams)
    total = sum(counts.values())
    return [
        {'pattern': arrow.join(g), 'count': cnt, 'pct': round(100 * cnt / total, 2)}
        for g, cnt in counts.most_common(12)
    ]

# plot graphs
def plot_spectrogram_with_onsets(S_db, onset_times, sr, hop_length=None):
    """Draw a mel spectrogram with a vertical line at each onset time.

    Args:
        S_db: mel spectrogram to display (rendered with a dB colorbar).
        onset_times: onset positions in seconds.
        sr: sample rate used for the time axis.
        hop_length: hop length in samples used for the time axis; defaults
            to the module-level ``HOP``. Optional so that callers passing
            the hop explicitly (as ``analyze`` does) and callers passing
            three arguments both work.
    """
    if hop_length is None:
        hop_length = HOP
    plt.figure(figsize=(12,4))
    librosa.display.specshow(S_db, sr=sr, hop_length=hop_length, x_axis='time', y_axis='mel')
    for t in onset_times:
        plt.axvline(t, linewidth=0.8, alpha=0.7)
    plt.title('Mel Spectrogram with Onset Lines')
    plt.colorbar(format="%+2.0f dB")
    plt.tight_layout()

def plot_chromagram(chroma, hop_length=None):
    """Draw a chromagram with a colorbar.

    Args:
        chroma: chroma matrix to display.
        hop_length: hop length in samples used for the time axis; defaults
            to the module-level ``HOP``. Optional so that callers passing
            the hop explicitly (as ``analyze`` does) and callers passing
            only the chroma both work.
    """
    if hop_length is None:
        hop_length = HOP
    plt.figure(figsize=(12,3))
    # No sr is passed, so the time axis relies on specshow's default sample rate.
    librosa.display.specshow(chroma, x_axis='time', y_axis='chroma', hop_length=hop_length)
    plt.title('Chromagram')
    plt.colorbar()
    plt.tight_layout()

# ---------- MAIN ----------
def analyze(piano_path, out_json='analysis.json', show_plots=True):
    """Run the full analysis pipeline on one recording and save a JSON report.

    Steps: pick the audio source, load it, compute a mel spectrogram, detect
    onsets, estimate the key, decode and smooth per-frame chords, mine common
    chord bigrams/trigrams, optionally plot, and write the results.

    Args:
        piano_path: path handed to ``pick_source`` to choose the audio file.
        out_json: file name of the JSON report, written inside ``OUT_DIR``.
        show_plots: when False, the plotting step is skipped.

    Returns:
        The result dict that was serialized to JSON.

    NOTE(review): ``pick_source``, ``plot_chord_timeline`` and ``OUT_DIR`` are
    not defined in the visible part of this file; they must be provided
    elsewhere before this function can run.
    """
    print("[1/8] Selecting audio source...")
    src, used_nv = pick_source(piano_path)
    print(f"Using: {src}")

    print("[2/8] Loading audio...")
    y, sr = load_audio(src)
    print(f"Loaded {len(y)/sr:.1f} seconds @ {sr} Hz")

    print("[3/8] Computing mel spectrogram...")
    S_db = compute_mel_spec(y, sr)

    print("[4/8] Detecting onsets...")
    onset_times, onset_env = detect_onsets(y, sr)
    print(f"Found {len(onset_times)} onsets")

    print("[5/8] Estimating key signature...")
    key_info = estimate_key(y, sr)
    print(f"Estimated key: {key_info['name']}")

    print("[6/8] Detecting chords and smoothing...")
    idx, names, chroma = frame_chords(y, sr)
    idx_s = smooth_labels(idx)
    segments = chord_segments(idx_s, names)
    print(f"{len(segments)} chord segments found")

    print("[7/8] Extracting common progressions...")
    chord_seq = [n for _, _, n in segments]
    bigrams = ngram_stats(chord_seq, n=2)
    trigrams = ngram_stats(chord_seq, n=3) if len(chord_seq) >= 3 else []

    print("[8/8] Generating plots and saving output...")
    if show_plots:
        # Called with exactly the arguments the plotting helpers declare;
        # both already use the module-level HOP internally.
        plot_spectrogram_with_onsets(S_db, onset_times, sr)
        plot_chromagram(chroma)
        plot_chord_timeline(idx_s, segments, FRAME_SEC)

    result = {
        "source_used": "no_vocals.wav" if used_nv else pathlib.Path(src).name,
        "sample_rate": sr,
        "hop_length": HOP,
        "onset_times_sec": onset_times,
        "key": key_info,
        "chord_segments": [{"start": round(a, 3), "end": round(b, 3), "chord": c} for a, b, c in segments],
        "progressions_top_bigrams": bigrams,
        "progressions_top_trigrams": trigrams,
    }

    pathlib.Path(OUT_DIR / out_json).write_text(json.dumps(result, indent=2))
    print(f"Analysis complete. Results saved in: {OUT_DIR.resolve()}")
    return result


if __name__ == "__main__":
    import sys

    # Script entry point: the first command-line argument is the audio path.
    # NOTE(review): inside a notebook kernel this guard also fires, and
    # sys.argv presumably holds the kernel's own launch arguments rather than
    # an audio path -- confirm before running this cell in Jupyter/Colab.
    if len(sys.argv) >= 2:
        analyze(sys.argv[1])
    else:
        print("Usage: python vookley_piano_analyzer.py piano.wav")
        sys.exit(1)
