### 환경 설정

In [1]:
import google.colab as colab
colab.drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# === beethoven만 6:2:2로 분리 + flattened_metadata_with_split.json 생성 ===

from pathlib import Path
import os, json, re, math, shutil, glob, random

# --- 경로 설정 ---
PROJ = Path("/content/drive/MyDrive/DL/aria-midi-v1-unique-ext").resolve()
DATA_ROOT = PROJ / "data"
META_JSON = PROJ / "metadata.json"

SPLIT_ROOT = Path("/content/drive/MyDrive/DL/beethoven_dataset")
SPLIT_ROOT.mkdir(parents=True, exist_ok=True)

# 여러 번 실행 시, 아래를 True로 두면 train/validation/test 폴더를 비우고 시작합니다.
CLEAN_SPLIT = True
if CLEAN_SPLIT:
    for sub in ["train", "validation", "test"]:
        shutil.rmtree(SPLIT_ROOT / sub, ignore_errors=True)

for sub in ["train", "validation", "test"]:
    (SPLIT_ROOT / sub).mkdir(parents=True, exist_ok=True)

# --- 유틸 ---
def parse_filename(fp: Path):
    m = re.match(r"^(\d{6})_(\d+)\.mid$", fp.name)
    if not m:
        return None, None
    id_num = int(m.group(1))
    take = m.group(2)
    return str(id_num), take  # metadata.json의 키는 '4' 같은 형태

def pick_audio_score(meta_entry: dict, take: str):
    aud = meta_entry.get("audio_scores", {})
    if isinstance(aud, dict) and aud:
        if take in aud:
            return aud[take]
        try:
            return next(iter(aud.values()))
        except StopIteration:
            return None
    return None

def is_beethoven(composer_val):
    if composer_val is None:
        return False
    return "beethoven" in str(composer_val).lower()

# --- 메타 로드 ---
assert META_JSON.exists(), f"metadata.json not found: {META_JSON}"
with open(META_JSON, "r") as f:
    meta_raw = json.load(f)

# --- 데이터 스캔: 모든 .mid 파일 ---
all_mid_paths = [Path(p) for p in glob.glob(str(DATA_ROOT / "**" / "*.mid"), recursive=True)]

# --- Beethoven만 필터링 ---
beet_items = []
skipped_no_meta = 0
skipped_bad_name = 0

for fp in all_mid_paths:
    id_str, take = parse_filename(fp)
    if not id_str:
        skipped_bad_name += 1
        continue
    entry = meta_raw.get(id_str)
    if not entry:
        skipped_no_meta += 1
        continue

    md = entry.get("metadata", {})
    if not is_beethoven(md.get("composer")):
        continue

    beet_items.append((fp, id_str, take, entry))

print(f"총 MIDI: {len(all_mid_paths)}개")
print(f"beethoven 후보: {len(beet_items)}개")
print(f"메타 없음으로 스킵: {skipped_no_meta}개, 파일명 규칙 불일치 스킵: {skipped_bad_name}개")

# --- 6:2:2 분할 ---
SEED = 42
random.Random(SEED).shuffle(beet_items)

N = len(beet_items)
n_train = math.floor(N * 0.6)
n_val   = math.floor(N * 0.2)
n_test  = N - n_train - n_val

splits = (
    [("train", 0.6)] * n_train +
    [("validation", 0.2)] * n_val +
    [("test", 0.2)] * n_test
)

# --- 복사 & 플래튼 메타 구성 ---
flat_meta = {}  # key: 파일명, val: 메타 dict
missing_optionals = {"music_period": 0, "difficulty": 0, "genre": 0, "opus": 0}

for (item, (split_name, split_ratio)) in zip(beet_items, splits):
    fp, id_str, take, entry = item
    md = entry.get("metadata", {})

    basename = fp.name
    audio_score = pick_audio_score(entry, take)

    music_period = md.get("music_period")
    difficulty   = md.get("difficulty")
    genre        = md.get("genre")
    opus         = md.get("opus")

    if music_period is None: missing_optionals["music_period"] += 1
    if difficulty   is None: missing_optionals["difficulty"]   += 1
    if genre        is None: missing_optionals["genre"]        += 1
    if opus         is None: missing_optionals["opus"]         += 1

    dst = SPLIT_ROOT / split_name / basename
    shutil.copy2(fp, dst)  # 같은 이름이면 덮어씀

    flat_meta[basename] = {
        "file_path": basename,       # ex) 000004_0.mid
        "split": split_name,         # train / validation / test
        "composer": md.get("composer"),
        "music_period": music_period,
        "difficulty": difficulty,
        "genre": genre,
        "audio_score": audio_score,
        "opus": opus,
        "split_ratio": split_ratio,
    }

# --- JSON/CSV 저장: 쓰기 가능한 SPLIT_ROOT에 저장 ---
OUT_JSON = SPLIT_ROOT / "flattened_metadata_with_split.json"
OUT_CSV  = SPLIT_ROOT / "flattened_metadata_with_split.csv"

with open(OUT_JSON, "w") as f:
    json.dump(flat_meta, f, ensure_ascii=False, indent=2)

# CSV도 같이 저장(편의)
import pandas as pd
pd.DataFrame.from_dict(flat_meta, orient="index").reset_index(drop=True).to_csv(OUT_CSV, index=False)

print("\n=== 분할 결과 ===")
print(f"train: {n_train}, validation: {n_val}, test: {n_test}")
print(f"저장(JSON): {OUT_JSON}")
print(f"저장(CSV):  {OUT_CSV}")
print(f"출력 폴더: {SPLIT_ROOT} (train/ validation/ test)")
print("\n(참고) optional 필드 결측 개수 →", {k:v for k,v in missing_optionals.items() if v>0})


총 MIDI: 32522개
beethoven 후보: 450개
메타 없음으로 스킵: 0개, 파일명 규칙 불일치 스킵: 0개

=== 분할 결과 ===
train: 270, validation: 90, test: 90
저장(JSON): /content/drive/MyDrive/DL/beethoven_dataset/flattened_metadata_with_split.json
저장(CSV):  /content/drive/MyDrive/DL/beethoven_dataset/flattened_metadata_with_split.csv
출력 폴더: /content/drive/MyDrive/DL/beethoven_dataset (train/ validation/ test)

(참고) optional 필드 결측 개수 → {'music_period': 30, 'difficulty': 215, 'genre': 2, 'opus': 25}


In [3]:
# [1] Runtime check for Colab T4 (CUDA)
import sys, random, os
import numpy as np
import torch

print("Python:", sys.version.split()[0])
print("PyTorch:", torch.__version__)

# GPU 켜졌는지 확인 (런타임 ▸ Change runtime type ▸ GPU)
assert torch.cuda.is_available(), "GPU가 비활성화되어 있어요. 런타임 ▸ 런타임 유형 변경 ▸ GPU 로 바꿔주세요."
device = torch.device("cuda")
print("CUDA device:", torch.cuda.get_device_name(0))
print("CUDA version:", torch.version.cuda)
print("cuDNN:", torch.backends.cudnn.version())

# 재현성 (T4/CUDA)
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = True   # 성능 ↑
torch.backends.cudnn.deterministic = False

# AMP(혼합정밀) 사용 시: PyTorch 2.x 권장 API
from torch.amp import autocast, GradScaler
AMP_DEVICE = "cuda"          # train_one_epoch 안에서 with autocast(AMP_DEVICE): 로 사용
# 필요 시 학습 코드에서 scaler = GradScaler(AMP_DEVICE) 생성


Python: 3.12.11
PyTorch: 2.8.0+cu126
CUDA device: Tesla T4
CUDA version: 12.6
cuDNN: 91002


In [4]:
# [2] 라이브러리
!pip -q install pretty_midi miditoolkit music21 datasets --progress-bar off
# (선택) 시각화/로그: wandb or tensorboard 원하면 추가

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pretty_midi (setup.py) ... [?25l[?25hdone


In [5]:
### 음악 샘플 테스트
!apt-get -q install -y fluidsynth
!pip install midi2audio

Reading package lists...
Building dependency tree...
Reading state information...
The following additional packages will be installed:
  fluid-soundfont-gm libevdev2 libfluidsynth3 libgudev-1.0-0 libinput-bin
  libinput10 libinstpatch-1.0-2 libmd4c0 libmtdev1 libqt5core5a libqt5dbus5
  libqt5gui5 libqt5network5 libqt5svg5 libqt5widgets5 libwacom-bin
  libwacom-common libwacom9 libxcb-icccm4 libxcb-image0 libxcb-keysyms1
  libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xinput0 libxcb-xkb1
  libxkbcommon-x11-0 qsynth qt5-gtk-platformtheme qttranslations5-l10n
  timgm6mb-soundfont
Suggested packages:
  fluid-soundfont-gs qt5-image-formats-plugins qtwayland5 jackd
The following NEW packages will be installed:
  fluid-soundfont-gm fluidsynth libevdev2 libfluidsynth3 libgudev-1.0-0
  libinput-bin libinput10 libinstpatch-1.0-2 libmd4c0 libmtdev1 libqt5core5a
  libqt5dbus5 libqt5gui5 libqt5network5 libqt5svg5 libqt5widgets5 libwacom-bin
  libwacom-common libwacom9 libxcb-icccm4 libxc

In [6]:
PROJ = "/content/drive/MyDrive/DL/beethoven_dataset"
print("PROJ:", PROJ)

PROJ: /content/drive/MyDrive/DL/beethoven_dataset


In [7]:
import os, glob, json, pandas as pd

# 1) 후보 루트 폴더들: beethoven_dataset 우선, 실패시 프로젝트 폴더도 검색
SEARCH_ROOTS = [
    "/content/drive/MyDrive/DL/beethoven_dataset",            # 당신이 저장한 곳(권장)
    "/content/drive/MyDrive/DL/aria-midi-v1-unique-ext",      # 원본 프로젝트 폴더
]

candidates = []
for root in SEARCH_ROOTS:
    if os.path.isdir(root):
        candidates += glob.glob(os.path.join(root, "**", "flattened_metadata_with_split.json"),
                                recursive=True)

if not candidates:
    raise FileNotFoundError("flattened_metadata_with_split.json 을 찾지 못했습니다.")

META_JSON = sorted(candidates, key=len)[0]  # 가장 짧은 경로를 우선 사용
print("META_JSON:", META_JSON)

# 2) JSON 로드 → DataFrame (dict/list 모두 지원)
with open(META_JSON, "r") as f:
    meta_raw = json.load(f)

if isinstance(meta_raw, dict):
    meta_df = pd.DataFrame.from_dict(meta_raw, orient="index").reset_index(drop=True)
elif isinstance(meta_raw, list):
    meta_df = pd.DataFrame(meta_raw)
else:
    raise TypeError("알 수 없는 JSON 구조입니다 (dict 또는 list여야 함).")

# 3) 보기 좋은 컬럼 순서(존재하는 것만 선택)
cols = [c for c in ["file_path","split","composer","music_period","difficulty",
                    "genre","audio_score","opus","split_ratio"] if c in meta_df.columns]
if cols:
    meta_df = meta_df[cols]

print("meta_df shape:", meta_df.shape)

# 4) full_path 생성: JSON이 위치한 폴더 기준으로 split/train|validation|test 하위에 파일이 있다고 가정
#    META_JSON이 .../beethoven_dataset/flattened_metadata_with_split.json 형태라면 그 상위가 SPLIT_ROOT
SPLIT_ROOT = os.path.dirname(META_JSON)
if "split" in meta_df.columns and "file_path" in meta_df.columns:
    meta_df["full_path"] = meta_df.apply(
        lambda r: os.path.join(SPLIT_ROOT, r["split"], r["file_path"])
        if isinstance(r.get("split"), str) and isinstance(r.get("file_path"), str)
        else None,
        axis=1
    )
    meta_df["exists"] = meta_df["full_path"].apply(lambda p: os.path.isfile(p) if isinstance(p, str) else False)

    print("\nSplit 분포:")
    print(meta_df["split"].value_counts())
    print("\nfull_path 존재 개수:", meta_df["exists"].sum(), "/", len(meta_df))

meta_df.head()


META_JSON: /content/drive/MyDrive/DL/beethoven_dataset/flattened_metadata_with_split.json
meta_df shape: (450, 9)

Split 분포:
split
train         270
validation     90
test           90
Name: count, dtype: int64

full_path 존재 개수: 450 / 450


Unnamed: 0,file_path,split,composer,music_period,difficulty,genre,audio_score,opus,split_ratio,full_path,exists
0,177669_0.mid,train,beethoven,classical,advanced,classical,0.996,120.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True
1,177275_0.mid,train,beethoven,classical,advanced,classical,0.9775,1.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True
2,077238_0.mid,train,beethoven,classical,advanced,classical,0.9814,110.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True
3,203711_0.mid,train,beethoven,classical,advanced,classical,0.999,70.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True
4,003263_0.mid,train,beethoven,classical,,classical,0.9984,33.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True


In [8]:
import os, pandas as pd
from pathlib import Path
from collections import defaultdict
import re

# PROJ 아래의 .mid/.midi 재귀 인덱싱
all_midis = [p for p in Path(PROJ).rglob("*.mid")]
print("Indexed MIDI files:", len(all_midis))

# 실제 파일명을 소문자로 정규화하여 인덱싱
by_name = defaultdict(list)
for p in all_midis:
    by_name[p.name.lower()].append(str(p))

def repad_and_match(file_path_from_json: str):
    """
    JSON의 file_path에서 숫자부분을 정수로 파싱 → 6자리로 zfill → 실제 파일 찾기
    """
    if not isinstance(file_path_from_json, str):
        return None

    base = os.path.basename(file_path_from_json).strip()
    # 패턴 매칭: 숫자_숫자.확장자
    m = re.match(r"^(\d+)_([0-9]+)\.(mid|midi)$", base, flags=re.IGNORECASE)

    if not m:
        # 패턴이 다르면 기존 방식으로 시도
        hits = by_name.get(base.lower(), [])
        return hits[0] if hits else None

    num_str, suffix, ext = m.groups()
    try:
        num = int(num_str)  # 앞의 0을 무시하고 정수로 해석
    except ValueError:
        return None

    # 6자리로 재패딩 (실제 파일명 패턴에 맞춤)
    repadded = f"{str(num).zfill(6)}_{suffix}.{ext.lower()}"

    # 1) 재패딩 이름으로 직접 조회
    hits = by_name.get(repadded.lower(), [])
    if hits:
        return hits[0]

    # 2) 확장자 스왑(.mid <-> .midi)까지 시도
    if repadded.lower().endswith(".mid"):
        alt = repadded[:-4] + ".midi"
    else:
        alt = repadded[:-5] + ".mid"

    hits = by_name.get(alt.lower(), [])
    return hits[0] if hits else None

# 매칭 실행
meta_df["full_path"] = meta_df["file_path"].map(repad_and_match)

# 결과 확인
matched_count = meta_df["full_path"].notna().sum()
total_count = len(meta_df)
print(f"매칭 성공: {matched_count} / 총: {total_count}")

# split별 DataFrame/리스트 생성
train_df = meta_df[meta_df["split"] == "train"].dropna(subset=["full_path"])
val_df   = meta_df[meta_df["split"] == "validation"].dropna(subset=["full_path"])
test_df  = meta_df[meta_df["split"] == "test"].dropna(subset=["full_path"])

train_files = train_df["full_path"].tolist()
val_files   = val_df["full_path"].tolist()
test_files  = test_df["full_path"].tolist()

print("train:", len(train_files), " | val:", len(val_files), " | test:", len(test_files))
train_df.head()

Indexed MIDI files: 460
매칭 성공: 450 / 총: 450
train: 270  | val: 90  | test: 90


Unnamed: 0,file_path,split,composer,music_period,difficulty,genre,audio_score,opus,split_ratio,full_path,exists
0,177669_0.mid,train,beethoven,classical,advanced,classical,0.996,120.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True
1,177275_0.mid,train,beethoven,classical,advanced,classical,0.9775,1.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True
2,077238_0.mid,train,beethoven,classical,advanced,classical,0.9814,110.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True
3,203711_0.mid,train,beethoven,classical,advanced,classical,0.999,70.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True
4,003263_0.mid,train,beethoven,classical,,classical,0.9984,33.0,0.6,/content/drive/MyDrive/DL/beethoven_dataset/tr...,True


### EDA
	•	이 데이터로 LSTM을 안정적으로 학습시킬 수 있는가?
	•	토큰화 규칙(시간 분할, 벨로시티 bin, max_len)을 어떻게 정할 것인가?
	•	학습 전 배제해야 할 샘플(너무 짧음/깨짐/이상치)은 있는가?

  	•	TIME_SHIFT 분할: IOI 분포 기반 32 또는 64 결정
	•	VEL bin 개수: 벨로시티 분포 기반 8/16 중 택1
	•	max_len/TBPTT: 길이 P95 기반(예: 512)
	•	폴리포니 처리: 동시 발음 분포에 맞춰 간단/확장 설계
	•	배제 규칙: 짧은 곡/무음/깨짐 사례 기준치 확정
	•	샘플링 제약: 반복률 높으면 반복 페널티/노리핏 n-gram 도입

In [9]:
import pretty_midi
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_midi(path):
    try:
        pm = pretty_midi.PrettyMIDI(path)
    except Exception as e:
        print("Parse error:", path, e)
        return None

    # (B) 길이/밀도
    duration = pm.get_end_time()
    events = sum(len(inst.notes) for inst in pm.instruments)
    density = events / duration if duration > 0 else 0

    # (E) 리듬(IOI 분포)
    iois = []
    for inst in pm.instruments:
        starts = sorted([n.start for n in inst.notes])
        iois += np.diff(starts).tolist() if len(starts) > 1 else []
    iois = np.array(iois)

    # (D) 다이내믹스(벨로시티 분포)
    velocities = [n.velocity for inst in pm.instruments for n in inst.notes]

    # (F) 폴리포니(동시 발음 수)
    note_times = []
    for inst in pm.instruments:
        for n in inst.notes:
            note_times.append((n.start, +1))  # note_on
            note_times.append((n.end, -1))   # note_off
    note_times.sort()
    active, max_poly, poly_hist = 0, 0, []
    for t, ev in note_times:
        active += ev
        max_poly = max(max_poly, active)
        poly_hist.append(active)

    return {
        "file": path,
        "duration": duration,
        "events": events,
        "density": density,
        "iois": iois,
        "velocities": velocities,
        "max_poly": max_poly,
        "poly_hist": poly_hist,
    }

In [10]:
import pandas as pd
from tqdm import tqdm

all_files = train_files + val_files + test_files
results = []
ioi_all, vel_all, poly_all = [], [], []

for f in tqdm(all_files[:100]):  # 처음엔 일부만 테스트 (예: 100개)
    r = analyze_midi(f)
    if r:
        results.append({
            "file": r["file"],
            "duration": r["duration"],
            "events": r["events"],
            "density": r["density"],
            "max_poly": r["max_poly"]
        })
        ioi_all += r["iois"].tolist()
        vel_all += r["velocities"]
        poly_all += r["poly_hist"]

eda_df = pd.DataFrame(results)
eda_df.head()

100%|██████████| 100/100 [00:05<00:00, 18.32it/s]


Unnamed: 0,file,duration,events,density,max_poly
0,/content/drive/MyDrive/DL/beethoven_dataset/tr...,60.38,831,13.762835,10
1,/content/drive/MyDrive/DL/beethoven_dataset/tr...,263.64,3148,11.940525,15
2,/content/drive/MyDrive/DL/beethoven_dataset/tr...,659.42,3780,5.73231,19
3,/content/drive/MyDrive/DL/beethoven_dataset/tr...,322.77,2464,7.633919,11
4,/content/drive/MyDrive/DL/beethoven_dataset/tr...,134.2,884,6.587183,6


In [11]:
# 필요 라이브러리
import pretty_midi, numpy as np, math
from collections import Counter, defaultdict
from statistics import median

# ---- 공통 유틸 ----
def safe_load_midi(path):
    try:
        return pretty_midi.PrettyMIDI(path)
    except Exception as e:
        return None

def median_beat_period(pm: pretty_midi.PrettyMIDI):
    """곡의 박(quarter-note) 길이(초) 추정: beat 간격의 중앙값 사용."""
    beats = pm.get_beats()  # tempo 변화 반영됨
    if len(beats) >= 2:
        return float(np.median(np.diff(beats)))
    # 예외: 비트 추정 실패 → 템포 변화에서 근사
    times, tempi = pm.get_tempo_changes()
    if len(tempi) > 0:
        return float(np.median(60.0 / tempi))
    # 최후의 기본값(120bpm)
    return 0.5

def all_notes(pm, include_drums=False):
    notes = []
    for inst in pm.instruments:
        if (not include_drums) and inst.is_drum:
            continue
        notes.extend(inst.notes)
    # 시작시각 기준 정렬
    notes.sort(key=lambda n: (n.start, n.end))
    return notes

# ---- (1) TIME_SHIFT 분해능 평가: IOI 스냅 오차 ----
def ioi_snap_report(files, limit=None):
    """
    IOI(인접 note-on 간 시간차)를 박 격자(32/64분할)에 스냅했을 때
    스텝 대비 오차의 P95를 계산해 추천 분해능을 반환.
    """
    errs = {32: [], 64: []}
    n_files = 0
    for i, path in enumerate(files):
        if limit and i >= limit:
            break
        pm = safe_load_midi(path)
        if pm is None:
            continue
        n_files += 1
        bp = median_beat_period(pm)  # one beat (quarter-note) seconds
        starts = [n.start for n in all_notes(pm)]
        if len(starts) < 2:
            continue
        iois = np.diff(sorted(starts))
        for div in (32, 64):
            step = bp / div  # seconds per sub-beat
            x = iois / step  # 스텝 단위로 표시
            frac_err = np.abs(x - np.round(x))  # 최근접 스텝과의 차이(스텝 단위)
            errs[div].extend(frac_err.tolist())

    rep = {}
    for div in (32, 64):
        if len(errs[div]) == 0:
            rep[div] = np.nan
        else:
            rep[div] = float(np.percentile(errs[div], 95))  # P95 (스텝 단위, 0~0.5)
    # 추천 규칙: P95 < 0.25 이면 해당 격자 OK. 둘 다 OK면 더 단순한 32를 채택.
    if math.isnan(rep[32]) and math.isnan(rep[64]):
        choice = None
    elif (not math.isnan(rep[32]) and rep[32] <= 0.25) and (not math.isnan(rep[64]) and rep[64] <= 0.25):
        choice = 32
    elif (not math.isnan(rep[64]) and rep[64] <= 0.25):
        choice = 64
    else:
        # 둘 다 크면 64가 상대적으로 유리(더 촘촘)
        choice = 64

    print(f"[IOI 스냅 오차] P95(스텝 단위) → 32분할:{rep[32]:.3f}, 64분할:{rep[64]:.3f}  | 추천:{choice}")
    return {"p95_32": rep[32], "p95_64": rep[64], "choice": choice}

# ---- (2) Velocity 분포 요약: IQR 기반 bin 추천 ----
def velocity_report(files, limit=None):
    vels = []
    parsed = 0
    for i, path in enumerate(files):
        if limit and i >= limit:
            break
        pm = safe_load_midi(path)
        if pm is None:
            continue
        parsed += 1
        for n in all_notes(pm):
            vels.append(n.velocity)
    if len(vels) == 0:
        print("[Velocity] 수집된 벨로시티가 없습니다.")
        return {"iqr": None, "choice": None}
    v = np.array(vels, dtype=float)
    q25, q75 = np.percentile(v, [25, 75])
    iqr = float(q75 - q25)
    # 추천 규칙: IQR < 20 → 8bin, 그 외엔 16bin
    choice = 8 if iqr < 20 else 16
    print(f"[Velocity] IQR={iqr:.1f}  (Q25={q25:.1f}, Q75={q75:.1f})  | 추천 bin={choice}")
    return {"iqr": iqr, "q25": float(q25), "q75": float(q75), "choice": choice}

# ---- (3) 반복률: 3–5그램 상위 점유율 & no-repeat 권고 ----
def ngram_repetition_report(files, n_vals=(3,4,5), limit=None):
    """
    간단 토큰열: (pitch, duration_bin) 시퀀스.
    duration_bin은 한 박을 8등분한 스텝으로 반올림하여 사용.
    전역 n-gram 카운트를 모아 상위 n-gram의 점유율을 계산.
    """
    global_counts = {n: Counter() for n in n_vals}
    total = {n: 0 for n in n_vals}

    for i, path in enumerate(files):
        if limit and i >= limit:
            break
        pm = safe_load_midi(path)
        if pm is None:
            continue
        bp = median_beat_period(pm)
        step = bp / 8.0  # 프레이즈 거칠게 보기: 1박 8분할
        seq = []
        for n in all_notes(pm):
            dur_bin = int(round(max((n.end - n.start) / step, 0)))
            dur_bin = min(dur_bin, 31)  # 과도한 길이는 클램프
            seq.append((n.pitch, dur_bin))
        if len(seq) == 0:
            continue
        # n-gram 생성
        for n in n_vals:
            if len(seq) < n:
                continue
            for j in range(len(seq) - n + 1):
                tup = tuple(seq[j:j+n])
                global_counts[n][tup] += 1
                total[n] += 1

    report = {}
    for n in n_vals:
        if total[n] == 0 or len(global_counts[n]) == 0:
            report[n] = {"top_ratio": None, "suggest": None}
            print(f"[n={n}-gram] 데이터 부족")
            continue
        top_ng, top_ct = global_counts[n].most_common(1)[0]
        top_ratio = top_ct / total[n]
        # 권고: 상위 n-gram 점유율이 2.5% 이상이면 no-repeat n-gram 적용
        suggest = (f"no_repeat_ngram_size={n}" if top_ratio >= 0.025 else "optional")
        print(f"[n={n}-gram] 상위 점유율={top_ratio*100:.2f}%  | 권고: {suggest}")
        report[n] = {"top_ratio": top_ratio, "suggest": suggest}
    return report

# ---- 실행: 파일 목록을 넣어 한 번에 리포트 ----
def run_added_eda(train_files, val_files, test_files, limit_per_split=None):
    files = []
    for L in (train_files, val_files, test_files):
        files.extend(L[:limit_per_split] if limit_per_split else L)

    print("파일 수:", len(files))
    out = {}
    out["ioi"] = ioi_snap_report(files)
    out["vel"] = velocity_report(files)
    out["rep"] = ngram_repetition_report(files)
    print("\n요약:")
    print(" - TIME_SHIFT 추천:", out['ioi']['choice'])
    print(" - Velocity bin 추천:", out['vel']['choice'])
    best_rep = max((v["top_ratio"] for v in out["rep"].values() if v["top_ratio"] is not None), default=None)
    if best_rep is not None and best_rep >= 0.025:
        # 가장 강한 n-gram의 n을 찾아서 표시
        pick_n = max(out["rep"], key=lambda k: (out["rep"][k]["top_ratio"] or -1))
        print(f" - 반복 억제 권고: {out['rep'][pick_n]['suggest']} (상위 점유율={out['rep'][pick_n]['top_ratio']*100:.2f}%)")
    else:
        print(" - 반복 억제: optional (강한 반복 패턴 증거 약함)")
    return out

# 사용 예:
# result = run_added_eda(train_files, val_files, test_files, limit_per_split=50)

In [12]:
result = run_added_eda(train_files, val_files, test_files, limit_per_split=50)

파일 수: 150
[IOI 스냅 오차] P95(스텝 단위) → 32분할:0.440, 64분할:0.480  | 추천:64
[Velocity] IQR=25.0  (Q25=50.0, Q75=75.0)  | 추천 bin=16
[n=3-gram] 상위 점유율=0.02%  | 권고: optional
[n=4-gram] 상위 점유율=0.01%  | 권고: optional
[n=5-gram] 상위 점유율=0.01%  | 권고: optional

요약:
 - TIME_SHIFT 추천: 64
 - Velocity bin 추천: 16
 - 반복 억제: optional (강한 반복 패턴 증거 약함)


### 토큰화
- 이벤트 토큰 방식

In [13]:
# ==== One Cell: Beethoven-aware 토크나이저 (BAR + KEY + PEDAL + ACCENT + LH TEXTURE) ====
import math, os, json, hashlib
import numpy as np
import pretty_midi
from collections import defaultdict

# ------------------
# 0) 설정/상수
# ------------------
TS_DIV   = 64          # 1박(quarter)을 64 등분
VEL_BINS = 16          # velocity 양자화 bin
TS_MAX   = 16          # 한 번에 전진 가능한 TS 최대 스텝
PROGRAM  = 0           # 피아노

# 특수 토큰
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2

# 기본 이벤트 토큰 영역
VEL_BASE = 3
TS_BASE  = VEL_BASE + VEL_BINS
NON_BASE = TS_BASE  + TS_MAX         # NOTE_ON 0..127
NOFF_BASE= NON_BASE + 128            # NOTE_OFF 0..127

# 확장 토큰: BAR/KEY/PEDAL/ACCENT/LH-TEXTURE
BAR_BASE = NOFF_BASE + 128
BAR_ID   = BAR_BASE

KEY_BASE = BAR_BASE + 1      # 24개 예약(12 Major + 12 Minor) - 실제 입력에 Minor가 없으면 Major만 사용됨
PED_BASE = KEY_BASE + 24     # Sustain Pedal ON/OFF
ACCENT_BASE = PED_BASE + 2   # 강한 악센트 마커
TEXTURE_BASE = ACCENT_BASE + 1  # 왼손 텍스처 5종

# LH 텍스처 토큰
TEXTURE_UNKNOWN  = TEXTURE_BASE + 0
TEXTURE_ALBERTI  = TEXTURE_BASE + 1
TEXTURE_OOMPAH   = TEXTURE_BASE + 2
TEXTURE_OCTAVE   = TEXTURE_BASE + 3
TEXTURE_SCALE    = TEXTURE_BASE + 4

# KEY 토큰 (0~11: Major C..B, 12~23: Minor C..B)
KEY_IDS_MAJOR = {i: KEY_BASE + i for i in range(12)}
KEY_IDS_MINOR = {i: KEY_BASE + 12 + i for i in range(12)}

PED_ON_ID  = PED_BASE + 0
PED_OFF_ID = PED_BASE + 1
ACCENT_ID  = ACCENT_BASE + 0

# 최종 Vocab
VOCAB_SIZE = TEXTURE_BASE + 5

# 🔧 토글
ADD_BAR_TOKENS     = True
ADD_KEY_TOKENS     = True
ADD_PEDAL_TOKENS   = True
ADD_ACCENT_TOKENS  = True
ADD_TEXTURE_TOKENS = True

# 파라미터
PITCH_LH_SPLIT = 60                 # 왼손/오른손 경계 (C4=60)
ACCENT_VB_MIN  = int(0.85 * VEL_BINS)  # 상위 15% 벨로시티 bin 이상이면 악센트로 표시

# ------------------
# 0-1) 헬퍼 함수
# ------------------
def vel_to_bin(v: int, bins: int = VEL_BINS):
    v = max(1, min(127, int(v)))     # 0은 드물므로 1~127로 클램프
    step = 128 / bins
    b = int(math.ceil(v / step))
    return max(1, min(bins, b))

def bin_to_vel(b: int, bins: int = VEL_BINS):
    step = 128 / bins
    lo = int((b - 1) * step)
    hi = int(b * step - 1)
    return int((lo + hi) // 2)

def beat_period_seconds(pm: pretty_midi.PrettyMIDI):
    """한 박(quarter)의 평균 길이(초) 추정."""
    beats = pm.get_beats()
    if len(beats) >= 2:
        return float(np.median(np.diff(beats)))
    t, tempi = pm.get_tempo_changes()
    if len(tempi):
        return float(np.median(60.0 / tempi))
    return 0.5  # 120bpm fallback

def _id_vel(b):      return VEL_BASE + (b - 1)
def _id_ts(k):       return TS_BASE  + (k - 1)
def _id_non(pitch):  return NON_BASE + pitch
def _id_noff(pitch): return NOFF_BASE + pitch

def _is_vel(tok):    return VEL_BASE <= tok < VEL_BASE + VEL_BINS
def _is_ts(tok):     return TS_BASE  <= tok < TS_BASE  + TS_MAX
def _is_non(tok):    return NON_BASE <= tok < NON_BASE + 128
def _is_noff(tok):   return NOFF_BASE<= tok < NOFF_BASE+ 128
def _is_bar(tok):    return tok == BAR_ID

def _pc(p): return p % 12

# ------------------
# 0-2) 마디·조성·페달·텍스처 전처리
# ------------------
def _compute_bar_boundaries(pm: pretty_midi.PrettyMIDI, step_sec: float, ts_div: int, max_grid: int):
    """
    time_signature_changes 기반으로 그리드 인덱스(정수) 형태의 마디 시작 지점들을 반환.
    """
    tsc = [(ts.time, ts.numerator, ts.denominator) for ts in pm.time_signature_changes]
    tsc.sort(key=lambda x: x[0])
    if not tsc or tsc[0][0] > 0.0:
        tsc = [(0.0, 4, 4)] + tsc

    bar_grids = []
    end_time = pm.get_end_time()
    for i, (t0, num, den) in enumerate(tsc):
        t1 = tsc[i+1][0] if i+1 < len(tsc) else end_time
        start_g = int(round(t0 / step_sec))
        end_g   = max(start_g, int(round(t1 / step_sec)))
        measure_qn = num * (4.0 / max(1, den))
        bar_steps  = max(1, int(round(measure_qn * ts_div)))
        g = start_g
        while g <= end_g and g <= max_grid:
            bar_grids.append(g)
            g += bar_steps
    bar_grids = sorted(set(x for x in bar_grids if 0 <= x <= max_grid))
    return bar_grids

def _key_events(pm: pretty_midi.PrettyMIDI, step_sec: float):
    """
    (grid_index, key_token) 리스트 반환.
    pretty_midi의 KeySignature가 보통 major 12키(0..11)만 제공할 수 있어, minor는 없으면 생략.
    """
    events = []
    try:
        ksc = getattr(pm, "key_signature_changes", None)
        if not ksc:
            return events
        for ks in ksc:
            time = float(ks.time)
            grid = int(round(time / step_sec))
            kn = int(getattr(ks, "key_number", 0))  # 0..11 (Major로 가정)
            mode = int(getattr(ks, "mode", 0))      # 있으면 0=maj, 1=min (없으면 0)
            if mode == 1:
                tok = KEY_IDS_MINOR.get(kn % 12)
            else:
                tok = KEY_IDS_MAJOR.get(kn % 12)
            if tok is not None:
                events.append((grid, tok))
        events.sort(key=lambda x: x[0])
    except Exception:
        pass
    return events

def _pedal_events(pm: pretty_midi.PrettyMIDI, step_sec: float):
    """(grid_index, PED_ON/OFF) 리스트."""
    ev = []
    try:
        for inst in pm.instruments:
            for cc in getattr(inst, "control_changes", []):
                if cc.number == 64:  # sustain
                    grid = int(round(float(cc.time) / step_sec))
                    if int(cc.value) >= 64:
                        ev.append((grid, PED_ON_ID))
                    else:
                        ev.append((grid, PED_OFF_ID))
        ev.sort(key=lambda x: x[0])
        # 동일 그리드 중복 정리(마지막 것만)
        dedup = {}
        for g, tok in ev:
            dedup[g] = tok
        ev = sorted(dedup.items(), key=lambda x: x[0])
    except Exception:
        return []
    return ev

def _classify_lh_texture(bar_on_events, bar_chord_sizes):
    """
    bar_on_events: [(grid, [pitches...]), ...] (LH만)
    bar_chord_sizes: 해당 그리드에서 LH 동시에 켜진 개수
    """
    if not bar_on_events:
        return TEXTURE_UNKNOWN
    total_on = sum(len(ps) for _, ps in bar_on_events)
    if total_on == 0:
        return TEXTURE_UNKNOWN

    # 옥타브 비율
    octave_pairs = 0
    for _, ps in bar_on_events:
        s = set(ps)
        for p in ps:
            if (p+12) in s or (p-12) in s:
                octave_pairs += 1
    pct_oct = octave_pairs / max(1, total_on)

    # 코드(3음 이상 동시) 비율
    chord_steps = sum(1 for c in bar_chord_sizes if c >= 3)
    pct_chord_steps = chord_steps / max(1, len(bar_chord_sizes))

    # 그리드 간격 모드값(촘촘함의 조악한 지표)
    grids = [g for g,_ in bar_on_events]
    grids.sort()
    if len(grids) >= 2:
        diffs = np.diff(grids)
        step_mode = int(np.bincount(diffs).argmax()) if len(diffs) > 0 else 0
    else:
        step_mode = 0

    if pct_oct >= 0.30:
        return TEXTURE_OCTAVE
    if pct_chord_steps >= 0.30:
        return TEXTURE_OOMPAH
    if step_mode and step_mode <= 4:  # 촘촘(16분음표 근처) + 단음 위주
        return TEXTURE_ALBERTI
    return TEXTURE_UNKNOWN

# ------------------
# 1) 토큰화
# ------------------
def tokenize_midi(path, ts_div=TS_DIV, vel_bins=VEL_BINS, ts_max=TS_MAX):
    pm = pretty_midi.PrettyMIDI(path)
    bp = beat_period_seconds(pm)
    step_sec = bp / ts_div

    # 노트 수집
    notes = []
    for inst in pm.instruments:
        if inst.is_drum:
            continue
        for n in inst.notes:
            notes.append(n)

    # 그리드 스냅
    for n in notes:
        n._grid_start = int(round(n.start / step_sec))
        n._grid_end   = max(n._grid_start + 1, int(round(n.end / step_sec)))

    bucket_on, bucket_off = defaultdict(list), defaultdict(list)
    for n in notes:
        vb = vel_to_bin(n.velocity, vel_bins)
        bucket_on[n._grid_start].append((n.pitch, vb))
        bucket_off[n._grid_end].append(n.pitch)

    timeline = sorted(set(list(bucket_on.keys()) + list(bucket_off.keys())))
    max_grid = timeline[-1] if timeline else 0

    # 보조 이벤트(마디/조성/페달)
    bar_grids = _compute_bar_boundaries(pm, step_sec, ts_div, max_grid) if ADD_BAR_TOKENS else []
    key_events = _key_events(pm, step_sec) if ADD_KEY_TOKENS else []
    ped_events = _pedal_events(pm, step_sec) if ADD_PEDAL_TOKENS else []

    bar_idx = 0
    key_idx = 0
    ped_idx = 0

    tokens = [BOS_ID]
    cur_t = 0

    # 시작 시점의 조성 토큰(있다면)
    if ADD_KEY_TOKENS and key_events and key_events[0][0] == 0:
        tokens.append(key_events[0][1]); key_idx = 1

    for t in timeline:
        if t < cur_t:
            continue
        gap = t - cur_t

        # 시간전진(TS) 토큰을 쪼개며 BAR/KEY/PED 이벤트에 맞춰 끊기
        while gap > 0:
            next_bar = None
            if ADD_BAR_TOKENS and bar_idx < len(bar_grids):
                while bar_idx < len(bar_grids) and bar_grids[bar_idx] <= cur_t:
                    bar_idx += 1
                if bar_idx < len(bar_grids):
                    next_bar = bar_grids[bar_idx]

            step = min(ts_max, gap)
            if ADD_BAR_TOKENS and (next_bar is not None):
                steps_to_bar = next_bar - cur_t
                if steps_to_bar > 0:
                    step = min(step, steps_to_bar)

            tokens.append(_id_ts(step))
            cur_t += step
            gap   -= step

            # BAR 경계 도달
            if ADD_BAR_TOKENS and (next_bar is not None) and (cur_t == next_bar):
                tokens.append(BAR_ID)

                # 왼손 텍스처 한 토큰 추가(선택)
                if ADD_TEXTURE_TOKENS:
                    bar_start = next_bar
                    bar_end = bar_grids[bar_idx+1] if (bar_idx+1 < len(bar_grids)) else (timeline[-1] if timeline else bar_start)
                    lh_on, lh_chord_sizes = [], []
                    for g in range(bar_start, min(bar_end, max_grid)+1):
                        if g in bucket_on:
                            lh_pitches = [p for (p, vb) in bucket_on[g] if p < PITCH_LH_SPLIT]
                            if lh_pitches:
                                lh_on.append((g, lh_pitches))
                                lh_chord_sizes.append(len(lh_pitches))
                    tex = _classify_lh_texture(lh_on, lh_chord_sizes)
                    tokens.append(tex)

                bar_idx += 1

            # 같은 그리드의 KEY/PED 이벤트도 동시에 토해냄
            if ADD_KEY_TOKENS:
                while key_idx < len(key_events) and key_events[key_idx][0] == cur_t:
                    tokens.append(key_events[key_idx][1]); key_idx += 1
            if ADD_PEDAL_TOKENS:
                while ped_idx < len(ped_events) and ped_events[ped_idx][0] == cur_t:
                    tokens.append(ped_events[ped_idx][1]); ped_idx += 1

        # 동시 발음: "ACCENT?(옵션) → VEL → NOTE_ON" 반복
        if t in bucket_on:
            for pitch, vb in sorted(bucket_on[t], key=lambda x: x[0]):
                if ADD_ACCENT_TOKENS and vb >= ACCENT_VB_MIN:
                    tokens.append(ACCENT_ID)
                tokens.append(_id_vel(vb))
                tokens.append(_id_non(pitch))

        # NOTE_OFF
        if t in bucket_off:
            for pitch in sorted(bucket_off[t]):
                tokens.append(_id_noff(pitch))

    tokens.append(EOS_ID)
    aux = {
        "step_sec": step_sec, "program": PROGRAM,
        "ts_div": ts_div, "vel_bins": vel_bins, "ts_max": ts_max,
        "add_bar_tokens": bool(ADD_BAR_TOKENS),
        "add_key_tokens": bool(ADD_KEY_TOKENS),
        "add_pedal_tokens": bool(ADD_PEDAL_TOKENS),
        "add_accent_tokens": bool(ADD_ACCENT_TOKENS),
        "add_texture_tokens": bool(ADD_TEXTURE_TOKENS),
        "pitch_lh_split": PITCH_LH_SPLIT
    }
    return tokens, aux

# ------------------
# 2) 디토큰화 (확장 토큰은 소리에 영향 X)
# ------------------
def detokenize_to_pretty_midi(tokens, aux):
    step_sec = float(aux.get("step_sec", 0.5/TS_DIV))
    program  = int(aux.get("program", PROGRAM))

    pm = pretty_midi.PrettyMIDI()
    inst = pretty_midi.Instrument(program=program, is_drum=False)
    pm.instruments.append(inst)

    cur_grid = 0
    current_vel_bin = vel_to_bin(64, VEL_BINS)
    open_notes = {}

    def grid_to_time(g): return g * step_sec

    i, N = 0, len(tokens)
    while i < N:
        tok = tokens[i]; i += 1
        if tok == BOS_ID: continue
        if tok == EOS_ID: break

        # 메타/스타일 토큰은 재생에 반영하지 않음
        if tok == BAR_ID or (KEY_BASE <= tok < KEY_BASE+24) or (tok in (PED_ON_ID, PED_OFF_ID)) \
           or (tok == ACCENT_ID) or (TEXTURE_BASE <= tok < TEXTURE_BASE+5):
            continue

        if _is_ts(tok):
            k = (tok - TS_BASE) + 1
            cur_grid += k
            continue
        if _is_vel(tok):
            current_vel_bin = (tok - VEL_BASE) + 1
            continue
        if _is_non(tok):
            pitch = (tok - NON_BASE)
            start = grid_to_time(cur_grid)
            vel   = bin_to_vel(current_vel_bin, VEL_BINS)
            if pitch in open_notes:
                inst.notes.append(pretty_midi.Note(velocity=vel, pitch=pitch,
                                                   start=open_notes[pitch], end=start + step_sec))
            open_notes[pitch] = start
            continue
        if _is_noff(tok):
            pitch = (tok - NOFF_BASE)
            if pitch in open_notes:
                start = open_notes.pop(pitch)
                end   = max(start + 1e-3, grid_to_time(cur_grid))
                vel   = bin_to_vel(current_vel_bin, VEL_BINS)
                inst.notes.append(pretty_midi.Note(velocity=vel, pitch=pitch, start=start, end=end))
            continue
        # 기타 토큰 무시

    # 열린 노트 정리
    end_time = grid_to_time(cur_grid)
    for pitch, st in list(open_notes.items()):
        inst.notes.append(pretty_midi.Note(
            velocity=bin_to_vel(current_vel_bin, VEL_BINS), pitch=pitch,
            start=st, end=max(st + step_sec, end_time)
        ))
    return pm

def detokenize_to_midi_file(tokens, aux, out_path):
    pm = detokenize_to_pretty_midi(tokens, aux)
    pm.write(out_path)
    return out_path

# ------------------
# 3) 라운드트립 테스트/리포트 (+확장 토큰 카운트)
# ------------------
def tokenize_and_reconstruct(path, out_midi_path=None, count_meta=True):
    toks, aux = tokenize_midi(path)
    pm_orig  = pretty_midi.PrettyMIDI(path)
    pm_recon = detokenize_to_pretty_midi(toks, aux)

    dur_o = pm_orig.get_end_time()
    dur_r = pm_recon.get_end_time()
    cnt_o = sum(len(inst.notes) for inst in pm_orig.instruments  if not inst.is_drum)
    cnt_r = sum(len(inst.notes) for inst in pm_recon.instruments if not inst.is_drum)

    report = {
        "tokens": int(len(toks)),
        "orig_duration": float(dur_o),
        "recon_duration": float(dur_r),
        "dur_rel_err_%": float((abs(dur_o - dur_r) / max(1e-6, dur_o)) * 100.0),
        "orig_events": int(cnt_o),
        "recon_events": int(cnt_r),
        "evt_rel_err_%": float((abs(cnt_o - cnt_r) / max(1, cnt_o)) * 100.0),
        "add_bar_tokens": bool(aux.get("add_bar_tokens", False)),
        "add_key_tokens": bool(aux.get("add_key_tokens", False)),
        "add_pedal_tokens": bool(aux.get("add_pedal_tokens", False)),
        "add_accent_tokens": bool(aux.get("add_accent_tokens", False)),
        "add_texture_tokens": bool(aux.get("add_texture_tokens", False)),
    }

    if count_meta:
        report.update({
            "BAR_tokens": int(sum(1 for t in toks if t == BAR_ID)),
            "KEY_tokens": int(sum(1 for t in toks if KEY_BASE <= t < KEY_BASE+24)),
            "PED_tokens": int(sum(1 for t in toks if t in (PED_ON_ID, PED_OFF_ID))),
            "ACCENT_tokens": int(sum(1 for t in toks if t == ACCENT_ID)),
            "TEXTURE_tokens": int(sum(1 for t in toks if TEXTURE_BASE <= t < TEXTURE_BASE+5)),
        })

    if out_midi_path:
        pm_recon.write(out_midi_path)
        report["saved"] = out_midi_path
    return toks, aux, report

print("VOCAB_SIZE:", VOCAB_SIZE, "| TS_DIV:", TS_DIV, "| VEL_BINS:", VEL_BINS)


VOCAB_SIZE: 324 | TS_DIV: 64 | VEL_BINS: 16


In [14]:
# 예시: 실제 MIDI 경로로 테스트
sample_mid = train_files[0]  # 이미 리스트가 있다면
# sample_mid = "/content/drive/MyDrive/DL/beethoven/beethoven_midis/train/xxxx.mid"  # 직접 지정도 가능

toks, aux, rep = tokenize_and_reconstruct(sample_mid)
print(rep)


{'tokens': 4009, 'orig_duration': 60.38, 'recon_duration': 60.3828125, 'dur_rel_err_%': 0.004657999337524746, 'orig_events': 831, 'recon_events': 831, 'evt_rel_err_%': 0.0, 'add_bar_tokens': True, 'add_key_tokens': True, 'add_pedal_tokens': True, 'add_accent_tokens': True, 'add_texture_tokens': True, 'BAR_tokens': 30, 'KEY_tokens': 0, 'PED_tokens': 0, 'ACCENT_tokens': 19, 'TEXTURE_tokens': 30}


### Dataset/DataLoader

In [15]:
# 전제: 앞서 정의한 tokenize_midi()가 이미 세션에 존재한다고 가정합니다.
# 필요시: from your_module import tokenize_midi

import os, csv, json, math, hashlib, random, time
from pathlib import Path
from typing import List, Dict, Any, Tuple
import numpy as np
import pretty_midi
import torch
from torch.utils.data import Dataset, DataLoader

# =========================
# 설정(필요시 수정)
# =========================
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2

# 토큰화 규칙 버전(캐시 구분용): 토큰 규칙 바뀌면 꼭 바꿔주세요.
TOKEN_RULE_VERSION = "evt_ts64_vel16_tsm16_v1"

# 필터 규칙(권장 기본)
MIN_EVENTS = 200          # 너무 짧은 곡 제외
MAX_DENSITY = 10.0        # events/sec 상한
MIN_DURATION_SEC = 30.0   # 30초 미만 제외

# 캐시/로그 디렉토리
PROJ = "/content/drive/MyDrive/DL/beethoven/beethoven_dataset"
CACHE_DIR = f"{PROJ}/data/processed"
LOG_DIR   = f"{PROJ}/logs"
REPORT_DIR= f"{PROJ}/reports"
os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(REPORT_DIR, exist_ok=True)

FILTER_REPORT_CSV = f"{LOG_DIR}/filter_report.csv"

# =========================
# 유틸
# =========================
def sha1_text(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

def safe_midi_stats(path: str) -> Dict[str, Any]:
    """
    필터 판단을 위한 빠른 통계: duration, events(#notes), density.
    """
    try:
        pm = pretty_midi.PrettyMIDI(path)
        duration = pm.get_end_time()
        events = sum(len(inst.notes) for inst in pm.instruments if not inst.is_drum)
        density = (events / max(1e-6, duration)) if duration > 0 else float("inf")
        return {"ok": True, "duration": duration, "events": events, "density": density}
    except Exception as e:
        return {"ok": False, "error": str(e)}

def cache_paths_for(midipath: str) -> Tuple[str, str]:
    """
    캐시 파일 경로: .npy(토큰), .json(aux)
    캐시 키 = sha1(토큰규칙버전 + 절대경로 + 최종수정시각)
    """
    p = Path(midipath)
    stat = p.stat()
    key_src = f"{TOKEN_RULE_VERSION}|{str(p.resolve())}|{stat.st_mtime_ns}"
    key = sha1_text(key_src)
    npy_path = os.path.join(CACHE_DIR, f"{key}.npy")
    js_path  = os.path.join(CACHE_DIR, f"{key}.json")
    return npy_path, js_path

def load_or_tokenize(midipath: str):
    """
    캐시가 있으면 로드, 없으면 토큰화 후 저장.
    반환: (tokens: np.ndarray[int], aux: dict)
    """
    npy_path, js_path = cache_paths_for(midipath)
    if os.path.exists(npy_path) and os.path.exists(js_path):
        toks = np.load(npy_path)
        with open(js_path, "r") as f:
            aux = json.load(f)
        return toks, aux, True  # from_cache=True

    # 캐시 없음 → 토큰화
    tokens, aux = tokenize_midi(midipath)  # 앞서 제공한 함수 사용
    tokens = np.asarray(tokens, dtype=np.int32)

    # 저장
    np.save(npy_path, tokens)
    with open(js_path, "w") as f:
        json.dump(aux, f)

    return tokens, aux, False

def append_filter_report(rows: List[Dict[str, Any]]):
    """
    제외된 파일/사유를 CSV로 기록.
    """
    write_header = not os.path.exists(FILTER_REPORT_CSV)
    with open(FILTER_REPORT_CSV, "a", newline="") as f:
        w = csv.DictWriter(f, fieldnames=["path", "reason", "duration", "events", "density", "error"])
        if write_header:
            w.writeheader()
        for r in rows:
            w.writerow(r)

# =========================
# Dataset
# =========================
class MidiTokenDataset(Dataset):
    def __init__(self,
                 paths: List[str],
                 max_len: int = 512,
                 pad_id: int = PAD_ID,
                 apply_filters: bool = True,
                 seed: int = 42):
        """
        paths: MIDI 파일 경로 리스트
        max_len: Truncated BPTT 창 길이
        """
        super().__init__()
        self.paths_all = list(paths)
        self.max_len = int(max_len)
        self.pad_id = int(pad_id)
        self.rng = random.Random(seed)

        # 1) 필터(선택)
        self.paths = []
        filtered_rows = []
        if apply_filters:
            for p in self.paths_all:
                st = safe_midi_stats(p)
                if not st["ok"]:
                    filtered_rows.append({"path": p, "reason": "parse_error",
                                          "duration": None, "events": None, "density": None,
                                          "error": st.get("error")})
                    continue
                if st["events"] < MIN_EVENTS:
                    filtered_rows.append({"path": p, "reason": "too_few_events",
                                          "duration": st["duration"], "events": st["events"],
                                          "density": st["density"], "error": None})
                    continue
                if st["duration"] < MIN_DURATION_SEC:
                    filtered_rows.append({"path": p, "reason": "too_short_duration",
                                          "duration": st["duration"], "events": st["events"],
                                          "density": st["density"], "error": None})
                    continue
                if st["density"] > MAX_DENSITY:
                    filtered_rows.append({"path": p, "reason": "too_high_density",
                                          "duration": st["duration"], "events": st["events"],
                                          "density": st["density"], "error": None})
                    continue
                # 통과
                self.paths.append(p)
        else:
            self.paths = self.paths_all

        # 로그 기록
        if filtered_rows:
            append_filter_report(filtered_rows)

        if len(self.paths) == 0:
            raise RuntimeError("유효한 학습 샘플이 없습니다. 필터 기준을 조정하세요.")

        # 2) 각 파일의 토큰 길이 메타(빠른 슬라이싱을 위해)
        #    (캐시가 없으면 생성하면서 길이 파악)
        self.lengths = []
        t0 = time.time()
        for p in self.paths:
            toks, aux, from_cache = load_or_tokenize(p)
            self.lengths.append(int(len(toks)))
        t1 = time.time()
        print(f"[MidiTokenDataset] 캐시/토큰 길이 준비 완료: {len(self.paths)}개, {t1-t0:.1f}s")

    def __len__(self):
        # 샘플 = "파일 단위"가 아니라 "슬라이스 단위"로 보려면 IterableDataset 설계가 필요하지만,
        # 여기선 간단히 파일 단위로 두고, __getitem__에서 무작위 슬라이스를 뽑습니다.
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        toks, aux, _ = load_or_tokenize(path)
        L = len(toks)

        # 최소 길이: BOS, ..., EOS → 학습 시 x=t[:-1], y=t[1:]
        if L < 2:
            # 빈에 가까운 곡이면, 아주 짧은 더미 반환(필터에서 걸러지는 게 일반적)
            x = np.array([BOS_ID], dtype=np.int64)
            y = np.array([EOS_ID], dtype=np.int64)
            mask = np.array([1], dtype=np.int64)
            return torch.from_numpy(x), torch.from_numpy(y), torch.from_numpy(mask)

        # Truncated BPTT: max_len+1 창을 랜덤 연속 슬라이스로 선택
        T = self.max_len + 1
        if L <= T:
            slice_tokens = toks  # 짧으면 전체 사용
        else:
            start = self.rng.randint(0, L - T)
            slice_tokens = toks[start:start+T]

        # x, y 분리
        x = slice_tokens[:-1].astype(np.int64)
        y = slice_tokens[1: ].astype(np.int64)

        # 아직 패딩 전(개별 시퀀스) → collate에서 패딩
        return torch.from_numpy(x), torch.from_numpy(y), torch.tensor(1, dtype=torch.int64)  # dummy mask flag

In [16]:
def collate_pad(batch, pad_id: int = PAD_ID):
    """
    batch: list of (x, y, _)
    - 동적 패딩: 배치 내 최장 길이에 맞춰 PAD 채움
    - 마스크: (x != PAD)
    반환: xpad, ypad, mask  (shape: [B, T])
    """
    xs, ys, _ = zip(*batch)
    lens = [len(x) for x in xs]
    T = max(lens)
    B = len(xs)

    xpad = torch.full((B, T), pad_id, dtype=torch.long)
    ypad = torch.full((B, T), pad_id, dtype=torch.long)

    for i, (x, y) in enumerate(zip(xs, ys)):
        t = len(x)
        xpad[i, :t] = x
        ypad[i, :t] = y

    mask = (xpad != pad_id).to(torch.bool)
    return xpad, ypad, mask

In [17]:
from torch.utils.data import DataLoader
from functools import partial

max_len = 512
batch_size = 32

train_ds = MidiTokenDataset(train_files, max_len=max_len, apply_filters=True, seed=42)
val_ds   = MidiTokenDataset(val_files,   max_len=max_len, apply_filters=True, seed=43)

# lambda 대신 최상위 함수 + partial 사용 (피클 가능)
collate = partial(collate_pad, pad_id=PAD_ID)

train_dl = DataLoader(
    train_ds, batch_size=batch_size, shuffle=True,
    num_workers=0,                 # macOS+Jupyter 안전
    pin_memory=False,              # MPS는 pin_memory 미지원
    collate_fn=collate,
    persistent_workers=False
)

val_dl = DataLoader(
    val_ds, batch_size=batch_size, shuffle=False,
    num_workers=0,
    pin_memory=False,
    collate_fn=collate,
    persistent_workers=False
)

xb, yb, mb = next(iter(train_dl))
xb.shape, yb.shape, mb.shape, xb.dtype

[MidiTokenDataset] 캐시/토큰 길이 준비 완료: 195개, 291.8s
[MidiTokenDataset] 캐시/토큰 길이 준비 완료: 62개, 86.4s


(torch.Size([32, 512]),
 torch.Size([32, 512]),
 torch.Size([32, 512]),
 torch.int64)

In [18]:
import os, json, numpy as np, pandas as pd, torch, time
from collections import Counter

# 0) 환경 요약 (Colab T4 / CUDA)
cuda_ok = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if cuda_ok else "cpu"
device = torch.device("cuda" if cuda_ok else "cpu")
print(f"CUDA: {cuda_ok} | device: {gpu_name}")

# 1) split 파일/경로 확인
print("\n[split CSV 확인]")
for name, df in [("train", train_df), ("val", val_df), ("test", test_df)]:
    print(f"{name:<5} rows={len(df):4d},  has_full_path={df['full_path'].notna().sum():4d}")

# 2) 캐시 디렉토리 요약
print("\n[캐시 파일 요약]")
cache_dir = CACHE_DIR  # 당신이 설정한 CACHE_DIR 사용
n_npy = len([f for f in os.listdir(cache_dir) if f.endswith(".npy")])
n_js  = len([f for f in os.listdir(cache_dir) if f.endswith(".json")])
print("CACHE_DIR:", cache_dir)
print("npy:", n_npy, "json:", n_js)

# 3) 토큰 길이 통계(샘플 200곡)
print("\n[토큰 길이 통계 (샘플)]")
sample_files = (train_files[:100] + val_files[:50] + test_files[:50])[:200]
lens = []
t0=time.time()
for p in sample_files:
    toks, aux, from_cache = load_or_tokenize(p)
    lens.append(len(toks))
print(f"샘플 {len(lens)}개, 로드 {time.time()-t0:.1f}s  |  P50={np.percentile(lens,50):.0f}, P95={np.percentile(lens,95):.0f}, MAX={np.max(lens)}")

# 4) DataLoader 배치 무결성
print("\n[DataLoader 배치 무결성]")
xb, yb, mb = next(iter(train_dl))
print("xb/yb/mb shapes:", xb.shape, yb.shape, mb.shape, "| dtype:", xb.dtype)
assert xb.shape == yb.shape == mb.shape, "배치 텐서 shape 불일치"
assert xb.dtype == torch.long, "토큰 dtype은 torch.long이어야 합니다"
pad_id = PAD_ID
pad_frac = (xb==pad_id).float().mean().item()
print(f"PAD 비율(배치 평균): {pad_frac*100:.1f}%")

# 5) 라운드트립(토큰화↔복원) 빠른 점검 3곡
print("\n[라운드트립 테스트 3곡]")
from random import sample
for p in sample(train_files, k=min(3,len(train_files))):
    _, aux, rep = tokenize_and_reconstruct(p, out_midi_path=None)
    print(os.path.basename(p), "| dur_err%={:.2f}, evt_err%={:.2f}, tokens={}".format(
        rep["dur_rel_err_%"], rep["evt_rel_err_%"], rep["tokens"]))

# 6) 간단 손실계산 드라이런(모델 없이 mask/ignore 논리 확인)
print("\n[ignore_index 논리 점검]")
ce = torch.nn.CrossEntropyLoss(ignore_index=pad_id, reduction="mean")
# vocab_size는 당신의 VOCAB_SIZE 변수 사용
vocab_size = VOCAB_SIZE
with torch.no_grad():
    # 가짜 로짓: [B,T,V]
    logits = torch.randn(xb.size(0), xb.size(1), vocab_size)
    loss = ce(logits.view(-1, vocab_size), yb.reshape(-1))
print("dummy CE(loss, ignore PAD) =", float(loss))

print("\n[요약]")
print("- split CSV/파일 경로 OK")
print("- 캐시 파일 개수:", n_npy, "(npy) /", n_js, "(json)")
print("- DataLoader 배치/마스크 OK, PAD ignore CE OK")
print("- 라운드트립 dur/evt 오차가 매우 크면 토큰화 규칙 재점검 필요")

CUDA: True | device: Tesla T4

[split CSV 확인]
train rows= 270,  has_full_path= 270
val   rows=  90,  has_full_path=  90
test  rows=  90,  has_full_path=  90

[캐시 파일 요약]
CACHE_DIR: /content/drive/MyDrive/DL/beethoven/beethoven_dataset/data/processed
npy: 345 json: 345

[토큰 길이 통계 (샘플)]
샘플 200개, 로드 10.9s  |  P50=7694, P95=30319, MAX=105180

[DataLoader 배치 무결성]
xb/yb/mb shapes: torch.Size([32, 512]) torch.Size([32, 512]) torch.Size([32, 512]) | dtype: torch.int64
PAD 비율(배치 평균): 0.0%

[라운드트립 테스트 3곡]
130012_0.mid | dur_err%=0.00, evt_err%=0.00, tokens=2251
199638_0.mid | dur_err%=0.00, evt_err%=0.00, tokens=4483
050987_0.mid | dur_err%=0.00, evt_err%=0.00, tokens=6487

[ignore_index 논리 점검]
dummy CE(loss, ignore PAD) = 6.275530815124512

[요약]
- split CSV/파일 경로 OK
- 캐시 파일 개수: 345 (npy) / 345 (json)
- DataLoader 배치/마스크 OK, PAD ignore CE OK
- 라운드트립 dur/evt 오차가 매우 크면 토큰화 규칙 재점검 필요


## 모델링 (1)

In [19]:
# ===== 하이퍼파라미터/경로 =====
import os, math, csv, time, random
from dataclasses import dataclass
from typing import Optional
import torch
import torch.nn as nn

# 기본 경로 (필요시 수정)
PROJ = "/Users/igangsan/Desktop/MLClassic/aria-midi-v1-unique-ext/beethoven_dataset"
CKPT_DIR   = f"{PROJ}/ckpt"
LOG_DIR    = f"{PROJ}/logs"
SAMPLES_DIR= f"{PROJ}/samples"
os.makedirs(CKPT_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(SAMPLES_DIR, exist_ok=True)

# ✅ MPS 우선, 없으면 CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

# AMP는 CUDA 전용이므로 MPS/CPU에서는 비활성화
autocast, GradScaler = None, None

# 필수 토큰 상수 (없을 때의 안전 장치)
try: PAD_ID
except NameError: PAD_ID = 0
try: BOS_ID
except NameError: BOS_ID = 1
try: EOS_ID
except NameError: EOS_ID = 2
try: VOCAB_SIZE
except NameError: VOCAB_SIZE = 3 + 16 + 16 + 128 + 128  # 폴백

@dataclass
class TrainConfig:
    vocab_size: int = VOCAB_SIZE
    pad_id: int = PAD_ID
    d_model: int = 512
    lstm_hidden: int = 768
    lstm_layers: int = 2
    dropout: float = 0.25
    max_len: int = 512
    lr: float = 3e-4
    weight_decay: float = 0.01
    grad_clip: float = 1.0
    epochs: int = 100
    log_every: int = 200
    val_every: int = 1000
    amp: bool = False   # ✅ MPS/CPU 환경에서는 AMP 비활성화
    seed: int = 42
cfg = TrainConfig()

# ===== 모델 정의 =====
class EventLSTM(nn.Module):
    def __init__(self, vocab_size, d_model=512, hidden=768, layers=2, dropout=0.2, pad_id=0):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(
            input_size=d_model,
            hidden_size=hidden,
            num_layers=layers,
            batch_first=True,
            dropout=dropout
        )
        self.norm = nn.LayerNorm(hidden)
        self.head = nn.Linear(hidden, vocab_size)

    def forward(self, x, hidden=None):
        x = self.embed(x)
        x = self.dropout(x)
        out, hidden = self.lstm(x, hidden)
        out = self.norm(out)
        logits = self.head(out)
        return logits, hidden

# ===== 유틸 =====
def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def perplexity(nll):
    try: return math.exp(nll)
    except OverflowError: return float("inf")

def save_checkpoint(model, opt, scaler, step, path):
    torch.save({
        "model": model.state_dict(),
        "opt": opt.state_dict(),
        "scaler": scaler.state_dict() if scaler is not None else None,
        "step": step
    }, path)

def sample_and_save(model, start_token=BOS_ID, max_tokens=1024, temperature=1.0, top_k: Optional[int]=50,
                    out_midi_path: Optional[str]=None, aux_for_detok: Optional[dict]=None):
    model.eval()
    toks = [start_token]
    hidden = None
    with torch.no_grad():
        for _ in range(max_tokens-1):
            x = torch.tensor(toks[-cfg.max_len:], dtype=torch.long, device=device).unsqueeze(0)
            logits, hidden = model(x, hidden=None)
            logits = logits[:, -1, :] / max(1e-6, temperature)
            if top_k is not None and top_k > 0:
                topv, topi = torch.topk(logits, k=min(top_k, logits.size(-1)), dim=-1)
                probs = torch.softmax(topv, dim=-1)
                idx = topi.gather(-1, torch.multinomial(probs, num_samples=1))
                next_id = int(idx.item())
            else:
                probs = torch.softmax(logits, dim=-1)
                next_id = int(torch.multinomial(probs, num_samples=1).item())
            toks.append(next_id)
            if next_id == EOS_ID:
                break
    if out_midi_path is not None and aux_for_detok is not None:
        try: detokenize_to_midi_file(toks, aux_for_detok, out_midi_path)
        except Exception as e: print("[WARN] detokenize failed:", e)
    return toks

# ===== 학습/검증 루프 =====
def train_one_epoch(model, dl, opt, scheduler, scaler, ce, step0=0,
                    log_path=f"{LOG_DIR}/train_val_curve.csv"):
    model.train()
    running_loss, step = 0.0, step0
    t0 = time.time()

    for xb, yb, mb in dl:
        xb, yb = xb.to(device, non_blocking=True), yb.to(device, non_blocking=True)
        opt.zero_grad(set_to_none=True)

        logits, _ = model(xb)
        loss = ce(logits.reshape(-1, cfg.vocab_size), yb.reshape(-1))
        loss.backward()
        if cfg.grad_clip is not None:
            nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
        opt.step()

        if scheduler is not None:
            scheduler.step()

        step += 1
        running_loss += float(loss.item())

        if step % cfg.log_every == 0:
            avg_nll = running_loss / cfg.log_every
            ppl = perplexity(avg_nll)
            print(f"[train] step {step}  nll={avg_nll:.3f}  ppl={ppl:.1f}  ({time.time()-t0:.1f}s)")
            append_log(log_path, {"step": step, "split": "train", "nll": avg_nll, "ppl": ppl})
            running_loss = 0.0

        if step % cfg.val_every == 0:
            nll, ppl = evaluate(model, val_dl)
            print(f"[val]   step {step}  nll={nll:.3f}  ppl={ppl:.1f}")
            append_log(log_path, {"step": step, "split": "val", "nll": nll, "ppl": ppl})

            aux = {"step_sec": 0.5/64, "program": 0}
            out_mid = os.path.join(SAMPLES_DIR, f"step{step}_sample.mid")
            _ = sample_and_save(model, out_midi_path=out_mid, aux_for_detok=aux)

            model.train()
    return step

@torch.no_grad()
def evaluate(model, dl):
    model.eval()
    ce = nn.CrossEntropyLoss(ignore_index=cfg.pad_id, reduction="mean")
    total_loss, total_tokens = 0.0, 0
    for xb, yb, mb in dl:
        xb, yb = xb.to(device, non_blocking=True), yb.to(device, non_blocking=True)
        logits, _ = model(xb)
        loss = ce(logits.reshape(-1, cfg.vocab_size), yb.reshape(-1))
        total_loss += float(loss.item()) * xb.size(0)
        total_tokens += xb.size(0)
    nll = total_loss / max(1, total_tokens)
    ppl = perplexity(nll)
    return nll, ppl

def append_log(csv_path, row: dict):
    write_header = not os.path.exists(csv_path)
    with open(csv_path, "a", newline="") as f:
        w = csv.DictWriter(f, fieldnames=["step", "split", "nll", "ppl"])
        if write_header: w.writeheader()
        w.writerow(row)

Using device: cpu


In [None]:
# ===== 실제 실행 (MPS/CPU 전용) =====
torch.manual_seed(cfg.seed)
random.seed(cfg.seed)

model = EventLSTM(
    vocab_size=cfg.vocab_size,
    d_model=cfg.d_model,
    hidden=cfg.lstm_hidden,
    layers=cfg.lstm_layers,
    dropout=cfg.dropout,      # ← 당신 노트북의 EventLSTM 시그니처가 dropout 하나인 버전이면 그대로 사용
    pad_id=cfg.pad_id
).to(device)

opt = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)

# CosineAnnealingLR: 배치마다 step() 호출 → T_max는 총 스텝 수로 설정
steps_per_epoch = max(1, len(train_dl))
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    opt, T_max=max(1, cfg.epochs * steps_per_epoch)
)

# ✅ MPS/CPU에서는 AMP/GradScaler 사용 안 함
cfg.amp = False
scaler = None

ce = nn.CrossEntropyLoss(ignore_index=cfg.pad_id, reduction="mean")

print(f"Model params: {count_params(model)/1e6:.2f}M")
print(f"steps/epoch: {steps_per_epoch}")

global_step = 0
for ep in range(1, cfg.epochs + 1):
    print(f"\n=== EPOCH {ep}/{cfg.epochs} ===")
    global_step = train_one_epoch(
        model, train_dl, opt, scheduler, scaler, ce, step0=global_step
    )

    # 에폭 끝 검증 + 체크포인트
    nll, ppl = evaluate(model, val_dl)
    print(f"[val @epoch{ep}] nll={nll:.3f}  ppl={ppl:.1f}")
    append_log(
        f"{LOG_DIR}/train_val_curve.csv",
        {"step": global_step, "split": f"val_ep{ep}", "nll": nll, "ppl": ppl}
    )
    save_checkpoint(
        model, opt, scaler, global_step,
        os.path.join(CKPT_DIR, f"lstm_ep{ep}_step{global_step}.pt")
    )

Model params: 9.04M
steps/epoch: 6

=== EPOCH 1/100 ===
[val @epoch1] nll=4.196  ppl=66.4

=== EPOCH 2/100 ===
[val @epoch2] nll=3.854  ppl=47.2

=== EPOCH 3/100 ===
[val @epoch3] nll=3.709  ppl=40.8

=== EPOCH 4/100 ===
[val @epoch4] nll=3.334  ppl=28.0

=== EPOCH 5/100 ===
[val @epoch5] nll=3.262  ppl=26.1

=== EPOCH 6/100 ===
[val @epoch6] nll=3.085  ppl=21.9

=== EPOCH 7/100 ===
[val @epoch7] nll=3.103  ppl=22.3

=== EPOCH 8/100 ===
[val @epoch8] nll=3.001  ppl=20.1

=== EPOCH 9/100 ===
[val @epoch9] nll=3.005  ppl=20.2

=== EPOCH 10/100 ===
[val @epoch10] nll=2.982  ppl=19.7

=== EPOCH 11/100 ===
[val @epoch11] nll=2.914  ppl=18.4

=== EPOCH 12/100 ===
[val @epoch12] nll=2.846  ppl=17.2

=== EPOCH 13/100 ===
[val @epoch13] nll=2.827  ppl=16.9

=== EPOCH 14/100 ===
[val @epoch14] nll=2.770  ppl=16.0

=== EPOCH 15/100 ===
[val @epoch15] nll=2.740  ppl=15.5

=== EPOCH 16/100 ===
[val @epoch16] nll=2.711  ppl=15.0

=== EPOCH 17/100 ===
[val @epoch17] nll=2.683  ppl=14.6

=== EPOCH 18/

### 결과

In [None]:
# 2) 샘플 토큰 → MIDI → (가능하면) WAV → 재생  [안전 버전]
from IPython.display import Audio, display
import os, time, shutil
from midi2audio import FluidSynth

# 1) 토큰 생성
toks = sample_and_save(
    model,
    start_token=BOS_ID,
    max_tokens=2048,
    temperature=1.0,
    top_k=50,
    out_midi_path=None,  # 여기서는 저장하지 않고 토큰만 받음
    aux_for_detok={"step_sec": 0.5/64, "program": 0}
)

# 2) 파일 경로
os.makedirs(SAMPLES_DIR, exist_ok=True)
sample_idx = int(time.time())
out_mid_path = os.path.join(SAMPLES_DIR, f"generated_sample_{sample_idx}.mid")
out_wav_path = out_mid_path.replace(".mid", ".wav")

# 3) MIDI 저장
toks_clean = [t for t in toks if t not in (PAD_ID, BOS_ID)]
detokenize_to_midi_file(toks_clean, {"step_sec": 0.5/64, "program": 0}, out_mid_path)
print("Saved MIDI:", out_mid_path)

# 4) fluidsynth/사운드폰트 자동 탐색
def find_fluidsynth():
    # PATH 내 검색 + Homebrew 기본 경로
    cand = [
        shutil.which("fluidsynth"),
        "/opt/homebrew/bin/fluidsynth",  # macOS (Apple Silicon)
        "/usr/local/bin/fluidsynth",     # macOS (Intel/Homebrew)
        "/usr/bin/fluidsynth",           # Linux
    ]
    return next((p for p in cand if p and os.path.exists(p)), None)

def find_sf2():
    cands = [
        "/opt/homebrew/share/sounds/sf2/FluidR3_GM.sf2",  # macOS Homebrew soundfont-fluid
        "/usr/share/sounds/sf2/FluidR3_GM.sf2",           # Ubuntu/Colab
        "/Library/Audio/Sounds/Banks/FluidR3_GM.sf2",     # macOS 수동 설치
    ]
    return next((p for p in cands if os.path.exists(p)), None)

fluidsynth_bin = find_fluidsynth()
sf2_path = find_sf2()

# 5) WAV 변환 시도 (둘 다 있어야 가능)
if fluidsynth_bin and sf2_path:
    try:
        # midi2audio는 내부적으로 PATH의 fluidsynth를 호출하므로,
        # Homebrew 경로가 PATH에 없다면 임시로 추가
        old_path = os.environ.get("PATH", "")
        bin_dir = os.path.dirname(fluidsynth_bin)
        if bin_dir not in old_path.split(os.pathsep):
            os.environ["PATH"] = bin_dir + os.pathsep + old_path

        fs = FluidSynth(sf2_path)
        fs.midi_to_audio(out_mid_path, out_wav_path)
        display(Audio(out_wav_path))
        print("WAV:", out_wav_path)
    except Exception as e:
        print("[WARN] WAV 변환 실패:", e)
        print("→ fluidsynth / SF2 설치/경로를 확인하세요.")
        print("  macOS:  brew install fluidsynth soundfont-fluid")
        print("  그리고 sf2 경로를 find_sf2() 후보들 중 하나로 맞추세요.")
else:
    print("[INFO] fluidsynth 또는 SoundFont(.sf2)를 찾지 못했습니다.")
    print("→ MIDI 파일은 저장됐으며, GarageBand/MuseScore 등에서 바로 재생할 수 있습니다.")
    print("→ macOS 설치 예:  brew install fluidsynth soundfont-fluid")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False

def _read_log_safe(log_path):
    if not os.path.exists(log_path):
        raise FileNotFoundError(f"로그 파일을 찾을 수 없습니다: {log_path}")
    # 잘못된 행 스킵, 파이썬 엔진 사용
    df = pd.read_csv(
        log_path,
        engine="python",
        on_bad_lines="skip"
    )
    # 필요한 컬럼만 남기기 (여분 헤더/깨진 행 제거)
    keep = [c for c in ["step", "split", "nll", "ppl"] if c in df.columns]
    df = df[keep]
    # 숫자형 강제 변환
    if "step" in df.columns:
        df["step"] = pd.to_numeric(df["step"], errors="coerce")
    if "nll" in df.columns:
        df["nll"] = pd.to_numeric(df["nll"], errors="coerce")
    if "ppl" in df.columns:
        df["ppl"] = pd.to_numeric(df["ppl"], errors="coerce")
    # 유효한 행만
    df = df.dropna(subset=["step", "split", "nll", "ppl"])
    # step 정렬
    df = df.sort_values("step").reset_index(drop=True)
    return df

def plot_training_curves(log_path=f"{LOG_DIR}/train_val_curve.csv", save_path=None):
    df = _read_log_safe(log_path)

    train_data = df[df['split'] == 'train']
    val_data   = df[df['split'].str.contains('val', na=False)]

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('LSTM Music Generation Training Results', fontsize=16, fontweight='bold')

    # 1) NLL
    ax1 = axes[0, 0]
    if len(train_data): ax1.plot(train_data['step'], train_data['nll'], 'b-', label='Train NLL', alpha=0.7, linewidth=1)
    if len(val_data):   ax1.plot(val_data['step'],   val_data['nll'],   'r-', label='Validation NLL', linewidth=2)
    ax1.set_xlabel('Training Steps'); ax1.set_ylabel('NLL'); ax1.set_title('Loss Curve (NLL)')
    ax1.legend(); ax1.grid(True, alpha=0.3)

    # 2) PPL
    ax2 = axes[0, 1]
    if len(train_data): ax2.plot(train_data['step'], train_data['ppl'], 'b-', label='Train PPL', alpha=0.7, linewidth=1)
    if len(val_data):   ax2.plot(val_data['step'],   val_data['ppl'],   'r-', label='Validation PPL', linewidth=2)
    ax2.set_xlabel('Training Steps'); ax2.set_ylabel('Perplexity'); ax2.set_title('Perplexity Curve')
    ax2.legend(); ax2.grid(True, alpha=0.3); ax2.set_yscale('log')

    # 3) Epoch-wise validation (raw string으로 정규식)
    ax3 = axes[1, 0]
    epoch_val_data = df[df['split'].str.contains('val_ep', na=False)].copy()
    if len(epoch_val_data):
        epoch_val_data['epoch'] = epoch_val_data['split'].str.extract(r'val_ep(\d+)').astype(int)
        epoch_val_data = epoch_val_data.sort_values('epoch')
        ax3.plot(epoch_val_data['epoch'], epoch_val_data['nll'], 'ro-', label='Val NLL', linewidth=2, markersize=6)
        ax3_twin = ax3.twinx()
        ax3_twin.plot(epoch_val_data['epoch'], epoch_val_data['ppl'], 'go-', label='Val PPL', linewidth=2, markersize=6)
        ax3.set_xlabel('Epoch'); ax3.set_ylabel('Validation NLL', color='red')
        ax3_twin.set_ylabel('Validation Perplexity', color='green')
        ax3.set_title('Epoch-wise Validation Performance'); ax3.grid(True, alpha=0.3)

    # 4) 최근 10 에폭 PPL
    ax4 = axes[1, 1]
    if len(epoch_val_data):
        recent = epoch_val_data.tail(10)
        x = np.arange(len(recent))
        bars = ax4.bar(x, recent['ppl'], alpha=0.7, color='skyblue', edgecolor='navy')
        ax4.set_xlabel('Recent Epochs'); ax4.set_ylabel('Perplexity')
        ax4.set_title('Recent Validation Perplexity (Last 10 Epochs)')
        ax4.set_xticks(x); ax4.set_xticklabels(recent['epoch'], rotation=45)
        for bar, v in zip(bars, recent['ppl']):
            ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1, f'{v:.1f}', ha='center', va='bottom', fontsize=8)

    plt.tight_layout()
    if save_path is None:
        save_path = os.path.join(LOG_DIR, 'training_curves.png')
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"그래프가 저장되었습니다: {save_path}")
    plt.show()

def plot_detailed_analysis(log_path=f"{LOG_DIR}/train_val_curve.csv"):
    df = _read_log_safe(log_path)

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Detailed Training Analysis', fontsize=16, fontweight='bold')

    # 1) 이동평균 손실
    ax1 = axes[0, 0]
    train_data = df[df['split'] == 'train']
    val_data   = df[df['split'].str.contains('val', na=False)]
    if len(train_data):
        window = max(1, len(train_data) // 20)
        smooth = train_data['nll'].rolling(window=window, center=True).mean()
        ax1.plot(train_data['step'], smooth, 'b-', linewidth=2, label=f'Train NLL (MA-{window})')
        ax1.plot(train_data['step'], train_data['nll'], 'b-', alpha=0.3, linewidth=0.5)
    if len(val_data):
        ax1.plot(val_data['step'], val_data['nll'], 'r-', linewidth=2, label='Validation NLL')
    ax1.set_xlabel('Training Steps'); ax1.set_ylabel('NLL'); ax1.set_title('Loss Convergence (with Moving Average)')
    ax1.legend(); ax1.grid(True, alpha=0.3)

    # 2) Val PPL 분포
    ax2 = axes[0, 1]
    val_only = df[df['split'].str.contains('val', na=False)]
    if len(val_only):
        ax2.hist(val_only['ppl'], bins=20, alpha=0.7, color='red', edgecolor='black')
        ax2.axvline(val_only['ppl'].mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {val_only["ppl"].mean():.1f}')
        ax2.axvline(val_only['ppl'].median(), color='orange', linestyle='--', linewidth=2, label=f'Median: {val_only["ppl"].median():.1f}')
    ax2.set_xlabel('Perplexity'); ax2.set_ylabel('Frequency'); ax2.set_title('Validation Perplexity Distribution')
    ax2.legend(); ax2.grid(True, alpha=0.3)

    # 3) 에폭별 개선률
    ax3 = axes[1, 0]
    epoch_val = df[df['split'].str.contains('val_ep', na=False)].copy()
    if len(epoch_val):
        epoch_val['epoch'] = epoch_val['split'].str.extract(r'val_ep(\d+)').astype(int)
        epoch_val = epoch_val.sort_values('epoch')
        epoch_val['ppl_impr'] = epoch_val['ppl'].diff()
        ax3.bar(epoch_val['epoch'][1:], epoch_val['ppl_impr'][1:], alpha=0.7, color='green')
        ax3.axhline(0, color='black', linestyle='-', alpha=0.5)
        ax3.set_xlabel('Epoch'); ax3.set_ylabel('Perplexity Change'); ax3.set_title('Epoch-wise Perplexity Improvement')
        ax3.grid(True, alpha=0.3)

    # 4) 학습 안정성
    ax4 = axes[1, 1]
    if len(train_data) > 10:
        group_size = max(1, len(train_data) // 10)
        groups = [train_data['nll'].iloc[i:i+group_size] for i in range(0, len(train_data), group_size)]
        stds = [g.std() for g in groups if len(g) > 1]
        xs = [i * group_size for i in range(len(stds))]
        ax4.plot(xs, stds, 'bo-', linewidth=2, markersize=6)
        ax4.set_xlabel('Training Steps (Grouped)'); ax4.set_ylabel('NLL Std'); ax4.set_title('Training Stability (Loss Variance)')
        ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

print("=== 학습 곡선 시각화 ===")
plot_training_curves()

print("\n=== 상세 분석 ===")
plot_detailed_analysis()

## 모델링 (2)
- 어텐션
- 계층적 구조

In [21]:
# ===== 개선버전: LSTM(+Attention) 실험 프레임 (MPS/CPU 전용) =====
import os, math, csv, time, json, random, itertools
from dataclasses import dataclass
from typing import Optional, Dict, Any, List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F

# AMP(CUDA) 모듈은 선택 임포트: MPS/CPU에서는 사용 안 함
try:
    from torch.cuda.amp import autocast, GradScaler  # CUDA 전용
except Exception:
    autocast, GradScaler = None, None

# ========== 경로/디바이스 ==========
PROJ = "/content/drive/MyDrive/DL/beethoven_dataset"
CKPT_DIR    = f"{PROJ}/ckpt"
LOG_DIR     = f"{PROJ}/logs"
SAMPLES_DIR = f"{PROJ}/samples"
RESULTS_DIR = f"{PROJ}/results"
for d in [CKPT_DIR, LOG_DIR, SAMPLES_DIR, RESULTS_DIR]:
    os.makedirs(d, exist_ok=True)


if torch.cuda.is_available():
    device = torch.device("cuda")
    AMP_DEVICE = "cuda"
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    AMP_DEVICE = None
else:
    device = torch.device("cpu")
    AMP_DEVICE = None
print("Using device:", device)

# ===== 필수 토큰 상수 (폴백) =====
try: PAD_ID
except NameError: PAD_ID = 0
try: BOS_ID
except NameError: BOS_ID = 1
try: EOS_ID
except NameError: EOS_ID = 2
try: VOCAB_SIZE
except NameError: VOCAB_SIZE = 3 + 16 + 16 + 128 + 128  # PAD/BOS/EOS + VEL(16) + TS(16) + ON(128) + OFF(128)

# ========== 설정 ==========
@dataclass
class TrainConfig:
    vocab_size: int = VOCAB_SIZE
    pad_id: int = PAD_ID
    d_model: int = 512
    lstm_hidden: int = 768
    lstm_layers: int = 2
    max_len: int = 512

    # lr/opt
    lr: float = 3e-4
    weight_decay: float = 0.01
    epochs: int = 50
    grad_clip: float = 1.0

    # AMP는 CUDA 전용 → MPS/CPU에서는 비활성화
    amp: bool = False

    # logging
    log_every: int = 200
    val_every: int = 1000
    seed: int = 42

    # dropout (분리)
    dropout_emb: float = 0.1
    dropout_lstm: float = 0.25
    dropout_attn: float = 0.1
    dropout_ffn: float = 0.1

    # Attention 옵션
    use_attention: bool = False
    num_attention_heads: int = 8

    # 랜덤 서치
    n_trials: int = 6   # 실험 개수 (시간 절약용 소수)
    # 샘플링
    default_temperature: float = 1.0
    default_top_k: int = 50
    default_top_p: float = 0.0
    default_no_repeat_ngram: int = 0

cfg = TrainConfig()

def set_seed(seed: int):
    import numpy as np
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # CUDA 시드/백엔드는 사용하지 않음 (MPS/CPU 전용)

set_seed(cfg.seed)

# ========== 모델 ==========
class EventLSTM(nn.Module):
    def __init__(self, vocab_size, d_model=512, hidden=768, layers=2,
                 dropout_emb=0.1, dropout_lstm=0.25, pad_id=0):
        super().__init__()
        self.use_attention = False
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.drop_emb = nn.Dropout(dropout_emb)
        self.lstm = nn.LSTM(
            input_size=d_model, hidden_size=hidden, num_layers=layers,
            batch_first=True, dropout=dropout_lstm if layers > 1 else 0.0
        )
        self.norm = nn.LayerNorm(hidden)
        self.drop_out = nn.Dropout(dropout_lstm)
        self.head = nn.Linear(hidden, vocab_size)

    def forward(self, x: torch.Tensor, hidden: Optional[Tuple[torch.Tensor,torch.Tensor]]=None):
        x = self.drop_emb(self.embed(x))      # (B,T,d_model)
        out, hidden = self.lstm(x, hidden)    # (B,T,H)
        out = self.drop_out(self.norm(out))   # (B,T,H)
        logits = self.head(out)               # (B,T,V)
        return logits, hidden

class EventLSTMWithAttention(nn.Module):
    def __init__(self, vocab_size, d_model=512, hidden=768, layers=2,
                 dropout_emb=0.1, dropout_lstm=0.25,
                 num_heads=8, dropout_attn=0.1, dropout_ffn=0.1, pad_id=0):
        super().__init__()
        self.use_attention = True
        self.embed = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)
        self.drop_emb = nn.Dropout(dropout_emb)

        self.lstm = nn.LSTM(
            input_size=d_model, hidden_size=hidden, num_layers=layers,
            batch_first=True, dropout=dropout_lstm if layers > 1 else 0.0
        )
        self.attn = nn.MultiheadAttention(embed_dim=hidden, num_heads=num_heads,
                                          dropout=dropout_attn, batch_first=True)
        self.dropout_attn = nn.Dropout(dropout_attn)
        self.dropout_ffn  = nn.Dropout(dropout_ffn)

        self.norm1 = nn.LayerNorm(hidden)
        self.norm2 = nn.LayerNorm(hidden)
        self.ffn = nn.Sequential(
            nn.Linear(hidden, 4*hidden),
            nn.GELU(),
            nn.Linear(4*hidden, hidden)
        )
        self.head = nn.Linear(hidden, vocab_size)

    def _causal_mask(self, T: int, device):
        return torch.triu(torch.ones(T, T, dtype=torch.bool, device=device), diagonal=1)

    def forward(self, x: torch.Tensor, hidden: Optional[Tuple[torch.Tensor,torch.Tensor]]=None):
        x = self.drop_emb(self.embed(x))            # (B,T,d_model)
        lstm_out, hidden = self.lstm(x, hidden)     # (B,T,H)

        T = lstm_out.size(1)
        attn_mask = self._causal_mask(T, lstm_out.device)  # (T,T) boolean

        attn_out, _ = self.attn(lstm_out, lstm_out, lstm_out, attn_mask=attn_mask)  # (B,T,H)
        y = self.norm1(lstm_out + self.dropout_attn(attn_out))                      # (B,T,H)

        ffn_out = self.ffn(y)                                   # (B,T,H)
        y = self.norm2(y + self.dropout_ffn(ffn_out))           # (B,T,H)

        logits = self.head(y)                                   # (B,T,V)
        return logits, hidden

# ========== 유틸 ==========
def count_params(model): return sum(p.numel() for p in model.parameters() if p.requires_grad)

def perplexity(nll):
    try: return math.exp(nll)
    except OverflowError: return float("inf")

def save_checkpoint(model, opt, scaler, step, path):
    torch.save({
        "model": model.state_dict(),
        "opt": opt.state_dict(),
        "scaler": None,  # MPS/CPU에서는 AMP/스케일러 미사용
        "step": step
    }, path)

# nucleus(top-p) & top-k & no-repeat-ngram
def _apply_sampling_filters(logits: torch.Tensor, top_k: int=0, top_p: float=0.0) -> torch.Tensor:
    probs = torch.softmax(logits, dim=-1)

    if top_k and top_k > 0 and top_k < probs.numel():
        topv, topi = torch.topk(probs, k=top_k)
        filtered = torch.full_like(probs, 0.0)
        filtered.scatter_(0, topi, topv)
        probs = filtered

    if top_p and 0.0 < top_p < 1.0:
        sorted_probs, sorted_idx = torch.sort(probs, descending=True)
        cumsum = torch.cumsum(sorted_probs, dim=-1)
        mask = cumsum > top_p
        mask[0] = False
        sorted_probs[mask] = 0.0
        probs = torch.zeros_like(probs)
        probs.scatter_(0, sorted_idx, sorted_probs)

    s = probs.sum()
    if s.item() > 0:
        probs = probs / s
    else:
        probs = torch.full_like(probs, 1.0 / probs.numel())
    return probs

def _forbidden_next_tokens_by_ngram(prefix: List[int], n: int) -> set:
    if n <= 1 or len(prefix) < n-1:
        return set()
    mapping = {}
    for i in range(len(prefix)-n+1):
        key = tuple(prefix[i:i+n-1])
        nxt = prefix[i+n-1]
        mapping.setdefault(key, set()).add(nxt)
    key = tuple(prefix[-(n-1):])
    return mapping.get(key, set())

@torch.no_grad()
def sample_and_save(model, start_token=BOS_ID, max_tokens=1024,
                    temperature: float=1.0, top_k: Optional[int]=50, top_p: float=0.0,
                    no_repeat_ngram_size: int=0,
                    out_midi_path: Optional[str]=None, aux_for_detok: Optional[dict]=None):
    model.eval()
    toks: List[int] = [start_token]
    hidden = None

    for _ in range(max_tokens-1):
        if getattr(model, "use_attention", False):
            x = torch.tensor(toks[-cfg.max_len:], dtype=torch.long, device=device).unsqueeze(0)
            logits, hidden = model(x, hidden=None)
            last = logits[:, -1, :].squeeze(0)
        else:
            x = torch.tensor([[toks[-1]]], dtype=torch.long, device=device)
            logits, hidden = model(x, hidden)
            last = logits.squeeze(0).squeeze(0)

        last = last / max(1e-6, temperature)

        forbids = _forbidden_next_tokens_by_ngram(toks, no_repeat_ngram_size) if no_repeat_ngram_size > 1 else set()
        probs = _apply_sampling_filters(last, top_k=top_k or 0, top_p=top_p)

        if forbids:
            probs[list(forbids)] = 0.0
            s = probs.sum()
            probs = probs / s if s.item() > 0 else torch.full_like(probs, 1.0 / probs.numel())

        next_id = int(torch.multinomial(probs, num_samples=1).item())
        toks.append(next_id)
        if next_id == EOS_ID:
            break

    if out_midi_path is not None and aux_for_detok is not None:
        try:
            detokenize_to_midi_file(toks, aux_for_detok, out_midi_path)
        except Exception as e:
            print("[WARN] detokenize failed:", e)

    return toks

# ========== 학습/검증 루프 ==========
def append_log(csv_path, row: Dict[str, Any]):
    write_header = not os.path.exists(csv_path)
    with open(csv_path, "a", newline="") as f:
        w = csv.DictWriter(f, fieldnames=["step", "split", "nll", "ppl"])
        if write_header: w.writeheader()
        w.writerow(row)

def train_one_epoch(model, dl, opt, scheduler, scaler, ce, step0=0, log_path=f"{LOG_DIR}/train_val_curve.csv",
                    val_dl=None):
    model.train()
    running_loss, step = 0.0, step0
    t0 = time.time()

    for xb, yb, mb in dl:
        xb, yb = xb.to(device, non_blocking=True), yb.to(device, non_blocking=True)
        opt.zero_grad(set_to_none=True)

        # AMP 비활성화 경로 (MPS/CPU)
        logits, _ = model(xb)
        loss = ce(logits.reshape(-1, cfg.vocab_size), yb.reshape(-1))
        loss.backward()
        if cfg.grad_clip is not None:
            nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip)
        opt.step()

        if scheduler is not None:
            scheduler.step()

        step += 1
        running_loss += float(loss.item())

        if step % cfg.log_every == 0:
            avg_nll = running_loss / cfg.log_every
            ppl = perplexity(avg_nll)
            print(f"[train] step {step}  nll={avg_nll:.3f}  ppl={ppl:.1f}  ({time.time()-t0:.1f}s)")
            append_log(log_path, {"step": step, "split": "train", "nll": avg_nll, "ppl": ppl})
            running_loss = 0.0

        if (val_dl is not None) and (step % cfg.val_every == 0):
            nll, ppl = evaluate(model, val_dl)
            print(f"[val]   step {step}  nll={nll:.3f}  ppl={ppl:.1f}")
            append_log(log_path, {"step": step, "split": "val", "nll": nll, "ppl": ppl})
            model.train()
    return step

@torch.no_grad()
def evaluate(model, dl):
    model.eval()
    ce = nn.CrossEntropyLoss(ignore_index=cfg.pad_id, reduction="none")
    total_loss, total_tokens = 0.0, 0

    for xb, yb, mb in dl:
        xb, yb = xb.to(device, non_blocking=True), yb.to(device, non_blocking=True)
        logits, _ = model(xb)  # (B,T,V)
        V = cfg.vocab_size
        loss_vec = ce(logits.reshape(-1, V), yb.reshape(-1))  # (B*T,)
        mask = (yb.reshape(-1) != cfg.pad_id)
        total_loss += float(loss_vec[mask].sum().item())
        total_tokens += int(mask.sum().item())

    nll = total_loss / max(1, total_tokens)
    ppl = perplexity(nll)
    return nll, ppl

# ========== 랜덤 서치 ==========
def sample_config_space() -> Dict[str, Any]:
    return {
        "use_attention": random.choice([False, True]),
        "num_attention_heads": random.choice([4, 8, 16]),
        "lr": random.choice([1e-4, 3e-4, 1e-3]),
        "weight_decay": random.choice([0.01, 0.1, 0.5]),
        "dropout_emb": random.choice([0.05, 0.1, 0.2]),
        "dropout_lstm": random.choice([0.2, 0.25, 0.3]),
        "dropout_attn": random.choice([0.05, 0.1, 0.2]),
        "dropout_ffn": random.choice([0.05, 0.1, 0.2]),
    }

def build_model_from_cfg() -> nn.Module:
    if cfg.use_attention:
        model = EventLSTMWithAttention(
            vocab_size=cfg.vocab_size,
            d_model=cfg.d_model,
            hidden=cfg.lstm_hidden,
            layers=cfg.lstm_layers,
            dropout_emb=cfg.dropout_emb,
            dropout_lstm=cfg.dropout_lstm,
            num_heads=cfg.num_attention_heads,
            dropout_attn=cfg.dropout_attn,
            dropout_ffn=cfg.dropout_ffn,
            pad_id=cfg.pad_id
        ).to(device)
    else:
        model = EventLSTM(
            vocab_size=cfg.vocab_size,
            d_model=cfg.d_model,
            hidden=cfg.lstm_hidden,
            layers=cfg.lstm_layers,
            dropout_emb=cfg.dropout_emb,
            dropout_lstm=cfg.dropout_lstm,
            pad_id=cfg.pad_id
        ).to(device)
    return model

def run_random_search(train_dl, val_dl, n_trials: int = None):
    n_trials = n_trials or cfg.n_trials
    results = []
    best_perplexity = float("inf")
    best_config = None

    print(f"총 {n_trials}개 랜덤 서치를 실행합니다...")

    for i in range(1, n_trials+1):
        conf = sample_config_space()
        cfg.use_attention      = conf["use_attention"]
        cfg.num_attention_heads= conf["num_attention_heads"]
        cfg.lr                 = conf["lr"]
        cfg.weight_decay       = conf["weight_decay"]
        cfg.dropout_emb        = conf["dropout_emb"]
        cfg.dropout_lstm       = conf["dropout_lstm"]
        cfg.dropout_attn       = conf["dropout_attn"]
        cfg.dropout_ffn        = conf["dropout_ffn"]

        print(f"\n=== 실험 {i}/{n_trials} ===")
        print(f"설정: {conf}")

        set_seed(cfg.seed + i)
        model = build_model_from_cfg()
        print(f"모델 파라미터: {count_params(model)/1e6:.2f}M")

        opt = torch.optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
        steps_per_epoch = max(1, len(train_dl))
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=max(1, cfg.epochs * steps_per_epoch))
        scaler = None  # AMP 미사용
        ce = nn.CrossEntropyLoss(ignore_index=cfg.pad_id, reduction="mean")

        global_step = 0
        best_val_ppl = float("inf")

        for ep in range(1, cfg.epochs + 1):
            global_step = train_one_epoch(model, train_dl, opt, scheduler, scaler, ce, step0=global_step, val_dl=val_dl)
            nll, ppl = evaluate(model, val_dl)
            print(f"[val @epoch{ep}] nll={nll:.3f}  ppl={ppl:.2f}")
            if ppl < best_val_ppl:
                best_val_ppl = ppl

        results.append({
            "config": conf,
            "best_val_perplexity": float(best_val_ppl),
            "model_params": int(count_params(model)),
            "experiment_id": i
        })
        print(f"최고 검증 Perplexity: {best_val_ppl:.2f}")

        if best_val_ppl < best_perplexity:
            best_perplexity = best_val_ppl
            best_config = conf
            print(f"★ 새로운 최고 성능! Perplexity: {best_perplexity:.2f}")

    results_file = f"{RESULTS_DIR}/random_search_results.json"
    with open(results_file, "w") as f:
        json.dump(results, f, indent=2)

    print("\n=== 랜덤 서치 완료 ===")
    print(f"총 {len(results)}개 실험 완료")
    print(f"최고 성능: Perplexity {best_perplexity:.2f}")
    print(f"최고 설정: {best_config}")
    print(f"결과 저장: {results_file}")

    analyze_results(results)
    return results, best_config

def analyze_results(results: List[Dict[str,Any]]):
    print("\n=== 결과 분석 ===")
    attn = [r for r in results if r["config"]["use_attention"]]
    base = [r for r in results if not r["config"]["use_attention"]]
    if attn and base:
        avg_attn = sum(r["best_val_perplexity"] for r in attn) / len(attn)
        avg_base = sum(r["best_val_perplexity"] for r in base) / len(base)
        imp = (avg_base - avg_attn) / max(1e-9, avg_base) * 100.0
        print(f"Baseline 평균 PPL: {avg_base:.2f}")
        print(f"Attention 평균 PPL: {avg_attn:.2f}")
        print(f"개선율: {imp:.1f}%")

    top5 = sorted(results, key=lambda x: x["best_val_perplexity"])[:5]
    print("\n상위 5개 결과:")
    for i, r in enumerate(top5, 1):
        print(f"{i}. PPL {r['best_val_perplexity']:.2f}  | 설정: {r['config']}")

# ========== 메인 ==========
if __name__ == "__main__":
    print(":fire: LSTM 업그레이드 1차: Attention + 랜덤 서치")
    print(f"디바이스: {device}")
    print(f"VOCAB_SIZE: {VOCAB_SIZE}")
    # train_dl, val_dl 은 사전에 준비되어 있어야 합니다.
    results, best_config = run_random_search(train_dl, val_dl, n_trials=cfg.n_trials)

    # 샘플 생성 예시
    # sample_and_save(model, start_token=BOS_ID, max_tokens=1024,
    #                 temperature=cfg.default_temperature, top_k=cfg.default_top_k,
    #                 top_p=cfg.default_top_p, no_repeat_ngram_size=cfg.default_no_repeat_ngram,
    #                 out_midi_path=f"{SAMPLES_DIR}/sample.mid", aux_for_detok=aux_dict)

Using device: cuda
:fire: LSTM 업그레이드 1차: Attention + 랜덤 서치
디바이스: cuda
VOCAB_SIZE: 324
총 6개 랜덤 서치를 실행합니다...

=== 실험 1/6 ===
설정: {'use_attention': False, 'num_attention_heads': 4, 'lr': 0.001, 'weight_decay': 0.1, 'dropout_emb': 0.05, 'dropout_lstm': 0.2, 'dropout_attn': 0.05, 'dropout_ffn': 0.2}
모델 파라미터: 9.08M
[val @epoch1] nll=4.002  ppl=54.73
[val @epoch2] nll=3.374  ppl=29.19
[val @epoch3] nll=3.121  ppl=22.68
[val @epoch4] nll=3.056  ppl=21.24
[val @epoch5] nll=2.987  ppl=19.83
[val @epoch6] nll=2.915  ppl=18.44
[val @epoch7] nll=2.851  ppl=17.30
[val @epoch8] nll=2.785  ppl=16.21
[val @epoch9] nll=2.730  ppl=15.34
[val @epoch10] nll=2.701  ppl=14.90
[val @epoch11] nll=2.646  ppl=14.09
[val @epoch12] nll=2.607  ppl=13.55
[val @epoch13] nll=2.586  ppl=13.27
[val @epoch14] nll=2.541  ppl=12.69
[val @epoch15] nll=2.540  ppl=12.67
[val @epoch16] nll=2.512  ppl=12.33
[val @epoch17] nll=2.511  ppl=12.31
[val @epoch18] nll=2.463  ppl=11.74
[val @epoch19] nll=2.460  ppl=11.70
[val @epoch20]

In [22]:
# === 베스트 설정으로 단일 학습 → 베스트 가중치 저장/로드 → 샘플 생성/재생 (MPS/CPU 전용) ===
import os, json, time
from IPython.display import Audio, display
import torch
import torch.nn as nn

# 경로 (앞선 셀에서 정의된 디렉토리들을 재사용)
BEST_CKPT_PATH = f"{CKPT_DIR}/best_model.pt"
RESULTS_JSON   = f"{RESULTS_DIR}/random_search_results.json"

# 0) best_config 확보: 변수 없으면 results.json에서 최저 PPL 자동 선택
try:
    best_config  # 이미 존재하면 그대로 사용
except NameError:
    with open(RESULTS_JSON, "r") as f:
        results = json.load(f)
    best_row = min(results, key=lambda r: r["best_val_perplexity"])
    best_config = best_row["config"]
print("[best_config]", best_config)

# 1) cfg 적용
cfg.epochs = 50
cfg.use_attention       = best_config['use_attention']
cfg.num_attention_heads = best_config['num_attention_heads']
cfg.lr                  = best_config['lr']
cfg.weight_decay        = best_config['weight_decay']
cfg.dropout_emb         = best_config['dropout_emb']
cfg.dropout_lstm        = best_config['dropout_lstm']
cfg.dropout_attn        = best_config['dropout_attn']
cfg.dropout_ffn         = best_config['dropout_ffn']

# 2) 모델/옵티마이저/스케줄러
best_model = build_model_from_cfg()
opt = torch.optim.AdamW(best_model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
steps_per_epoch = max(1, len(train_dl))
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=max(1, cfg.epochs * steps_per_epoch))
scaler = None  # AMP 미사용
ce = nn.CrossEntropyLoss(ignore_index=cfg.pad_id, reduction="mean")

# 3) 단일 학습 루프 + 최고 PPL 가중치만 저장
best_val_ppl = float("inf")
global_step = 0
for ep in range(1, cfg.epochs + 1):
    global_step = train_one_epoch(best_model, train_dl, opt, scheduler, scaler, ce,
                                  step0=global_step, val_dl=val_dl)
    nll, ppl = evaluate(best_model, val_dl)
    print(f"[single-train @epoch{ep}] nll={nll:.3f}  ppl={ppl:.2f}")
    if ppl < best_val_ppl:
        best_val_ppl = ppl
        torch.save(best_model.state_dict(), BEST_CKPT_PATH)
        print(f"✔ 베스트 갱신: PPL={best_val_ppl:.2f}  → 저장: {BEST_CKPT_PATH}")

# 4) 가장 좋은 가중치 로드(안전)
best_model.load_state_dict(torch.load(BEST_CKPT_PATH, map_location=device))
best_model.eval()
print("✅ 베스트 모델 준비 완료")

# 5) 토큰 생성 → MIDI/WAV 저장 → 재생
toks = sample_and_save(
    best_model,
    start_token=BOS_ID,
    max_tokens=512,
    temperature=1.0,
    top_k=50,
    out_midi_path=None,  # detokenize에서 저장
    aux_for_detok={"step_sec": 0.5/64, "program": 0}
)

# 파일 경로
os.makedirs(SAMPLES_DIR, exist_ok=True)
mid = os.path.join(SAMPLES_DIR, f"best_{int(time.time())}.mid")
wav = mid.replace(".mid", ".wav")

# detokenize (BOS/PAD 제거)
toks = [t for t in toks if t not in (PAD_ID, BOS_ID)]
detokenize_to_midi_file(toks, {"step_sec": 0.5/64, "program": 0}, mid)

# WAV 변환 & 재생: SoundFont 경로 자동 탐색
sf2_candidates = [
    "/usr/share/sounds/sf2/FluidR3_GM.sf2",                 # (리눅스/Colab)
    "/opt/homebrew/share/sounds/sf2/FluidR3_GM.sf2",        # (macOS Homebrew)
    "/Library/Audio/Sounds/Banks/FluidR3_GM.sf2",           # (macOS 수동설치)
]
sf2 = next((p for p in sf2_candidates if os.path.exists(p)), None)

try:
    from midi2audio import FluidSynth
    if sf2 is None:
        raise FileNotFoundError("SoundFont(.sf2) not found in candidates.")
    FluidSynth(sf2).midi_to_audio(mid, wav)
    display(Audio(wav))
    print("MIDI:", mid, "| WAV:", wav)
except Exception as e:
    print("[WARN] WAV 변환 건너뜀:", e)
    print("→ MIDI는 저장되었습니다:", mid)
    print("  macOS 기준: `brew install fluidsynth` 후 SF2 경로를 위 후보 중 하나에 배치하세요.")

[best_config] {'use_attention': True, 'num_attention_heads': 16, 'lr': 0.001, 'weight_decay': 0.5, 'dropout_emb': 0.05, 'dropout_lstm': 0.2, 'dropout_attn': 0.1, 'dropout_ffn': 0.05}
[single-train @epoch1] nll=4.320  ppl=75.19
✔ 베스트 갱신: PPL=75.19  → 저장: /content/drive/MyDrive/DL/beethoven_dataset/ckpt/best_model.pt
[single-train @epoch2] nll=4.138  ppl=62.70
✔ 베스트 갱신: PPL=62.70  → 저장: /content/drive/MyDrive/DL/beethoven_dataset/ckpt/best_model.pt
[single-train @epoch3] nll=3.835  ppl=46.32
✔ 베스트 갱신: PPL=46.32  → 저장: /content/drive/MyDrive/DL/beethoven_dataset/ckpt/best_model.pt
[single-train @epoch4] nll=3.333  ppl=28.03
✔ 베스트 갱신: PPL=28.03  → 저장: /content/drive/MyDrive/DL/beethoven_dataset/ckpt/best_model.pt
[single-train @epoch5] nll=3.251  ppl=25.82
✔ 베스트 갱신: PPL=25.82  → 저장: /content/drive/MyDrive/DL/beethoven_dataset/ckpt/best_model.pt
[single-train @epoch6] nll=3.106  ppl=22.34
✔ 베스트 갱신: PPL=22.34  → 저장: /content/drive/MyDrive/DL/beethoven_dataset/ckpt/best_model.pt
[single-train

MIDI: /content/drive/MyDrive/DL/beethoven_dataset/samples/best_1757089053.mid | WAV: /content/drive/MyDrive/DL/beethoven_dataset/samples/best_1757089053.wav


In [None]:
### 결과(1)

### 결과

In [23]:
!apt-get -yq install fluidsynth fluid-soundfont-gm
!pip -q install midi2audio


Reading package lists...
Building dependency tree...
Reading state information...
fluid-soundfont-gm is already the newest version (3.1-5.3).
fluid-soundfont-gm set to manually installed.
fluidsynth is already the newest version (2.2.5-1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [24]:
from midi2audio import FluidSynth

In [25]:
# --- deps (Colab) ---
try:
    from midi2audio import FluidSynth
except ImportError:
    !apt-get -yq install fluidsynth fluid-soundfont-gm
    !pip -q install midi2audio
    from midi2audio import FluidSynth

from IPython.display import Audio, display
import os, time

# 5) 토큰 생성 → MIDI/WAV 저장 → 재생 (최소 옵션)
toks = sample_and_save(
    best_model,
    start_token=BOS_ID,
    max_tokens=512,
    temperature=1.0,
    top_k=50,
    out_midi_path=None,
    aux_for_detok={"step_sec": 0.5/64, "program": 0}  # TS_DIV=64 가정(학습 설정과 일치 필요)
)

# 파일 경로
os.makedirs(SAMPLES_DIR, exist_ok=True)
mid = os.path.join(SAMPLES_DIR, f"best_{int(time.time())}.mid")
wav = mid.replace(".mid", ".wav")

# detokenize (BOS/PAD 최소 정리)
toks = [t for t in toks if t not in (PAD_ID, BOS_ID)]
detokenize_to_midi_file(toks, {"step_sec": 0.5/64, "program": 0}, mid)

# WAV 변환 & 재생 (SoundFont 경로만 확인)
sf2 = "/usr/share/sounds/sf2/FluidR3_GM.sf2"  # Colab 기본 경로(없으면 apt 설치 필요)
FluidSynth(sf2).midi_to_audio(mid, wav)

display(Audio(wav))
print("MIDI:", mid, "| WAV:", wav)

MIDI: /content/drive/MyDrive/DL/beethoven_dataset/samples/best_1757089122.mid | WAV: /content/drive/MyDrive/DL/beethoven_dataset/samples/best_1757089122.wav
