<a href="https://colab.research.google.com/github/SEOUL-ABSS/SHIPSHIP/blob/main/SONAR6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# -*- coding: utf-8 -*-
# ==============================================================================
#                 DeepShip/MBARI 수중 음향 분류 프로젝트 (RAM 최적화 v2 최종본)
# ==============================================================================
# 최종 전문가 검토 의견을 반영하여, 후보 탐색 과정의 RAM 사용량을 최소화하는 스트리밍 및
# 최소 힙(min-heap) 방식을 도입한 최종 버전입니다.
#
# [주요 개선 사항]
# 1. RAM 사용량 최소화: 모든 후보를 메모리에 저장하는 대신, Top-K개만 유지하는 힙 자료구조를
#                      사용하여 대용량 MBARI 데이터셋을 안정적으로 처리합니다.
# 2. 속도 향상: 후보 임베딩을 배치 단위로 예측하여 탐색 속도를 개선합니다.
# 3. 제어 강화: 후보 탐색 간격, 배치 크기 등을 CONFIG에서 쉽게 조절할 수 있도록 파라미터화.
# ==============================================================================


# ==============================================================================
# ## 1. 환경 설정 및 라이브러리 임포트
# ==============================================================================
print("1. 환경 설정 및 라이브러리 임포트 중...")

# --- 라이브러리 설치 ---
!pip install -q tensorflow tensorflow_hub soundfile librosa boto3 noisereduce umap-learn

# --- 모든 라이브러리 임포트 ---
import os, sys, subprocess, random, tempfile, shutil, gc, math, warnings, heapq
from collections import defaultdict
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import librosa, librosa.display, soundfile as sf
import boto3
from botocore import UNSIGNED
from botocore.client import Config
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder
import umap.umap_ as umap
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import classification_report, confusion_matrix, f1_score, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

# Hide UserWarning-category warnings for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

# --- Global seeding (for reproducibility) ---
# SEED is also passed as random_state to the GroupShuffleSplit calls further down.
SEED = 42
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting
# it here presumably does not change hashing in the already-running kernel — confirm
# whether this line is needed for anything other than child processes.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)         # Python `random` (used for undersampling and augmentation picks)
np.random.seed(SEED)      # NumPy global RNG
tf.random.set_seed(SEED)  # TensorFlow global seed

# --- Matplotlib 한글 폰트 (선택) ---
!sudo apt-get -y install fonts-nanum > /dev/null
!sudo fc-cache -fv > /dev/null
# Register the NanumGothic font with Matplotlib so Korean plot labels render.
import matplotlib.font_manager as fm
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
if os.path.exists(font_path):
    fm.fontManager.addfont(font_path)
    plt.rc('font', family='NanumGothic')
    # Draw the minus sign with the ASCII hyphen instead of the Unicode minus glyph.
    plt.rcParams['axes.unicode_minus'] = False
    print("\nMatplotlib 폰트 설정 완료: NanumGothic")
else:
    # Font file is missing: only a warning is printed and the default font stays active.
    print("\n경고: 나눔고딕 폰트를 찾을 수 없습니다.")
# --- Global constants ---
print("\n전역 상수 정의 중...")
# Sample rate every segment is resampled to before it is fed to YAMNet.
YAMNET_SAMPLE_RATE = 16000
# Local checkout location of the DeepShip repository.
DEEPSHIP_BASE_PATH = '/content/DeepShip'
# Directory that receives the downloaded MBARI recordings.
MBARI_NOISE_BASE_DIR = '/content/MBARI_noise_data'
CANDIDATE_DIR = '/content/review_candidates'   # where candidate audio clips and the HTML report are written
VERIFIED_DIR  = '/content/verified_ships'      # where the user manually places verified ship clips
# NOTE(review): not referenced anywhere else in the visible code.
MODELS_TO_PROCESS = ['YAMNet']
print("전역 상수 정의 완료.")

# ==============================================================================
# ## 2. 데이터 확보 (DeepShip 클론 + MBARI 일부 샘플 다운로드)
# ==============================================================================
print("\n2. 데이터 확보...")

# DeepShip: shallow-clone the repository unless the target directory already exists.
if os.path.exists(DEEPSHIP_BASE_PATH):
    print("DeepShip이 이미 존재합니다.")
else:
    try:
        # check=True turns a non-zero git exit status into an exception;
        # git's own output is captured rather than echoed into the notebook.
        subprocess.run(
            [
                'git', 'clone', '--depth', '1',
                'https://github.com/irfankamboh/DeepShip.git',
                DEEPSHIP_BASE_PATH,
            ],
            check=True,
            capture_output=True,
        )
    except Exception as e:
        print(f"오류: DeepShip 클론 실패: {e}")
    else:
        print("DeepShip 클론 완료.")

# MBARI: download up to MAX_DL sample recordings, but only when the target
# directory is still empty.
os.makedirs(MBARI_NOISE_BASE_DIR, exist_ok=True)
if not os.listdir(MBARI_NOISE_BASE_DIR):
    print("MBARI 노이즈 데이터 다운로드 시도 중...")
    try:
        # Unsigned requests: the client is configured to send no credentials.
        s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
        pages = s3.get_paginator('list_objects_v2').paginate(
            Bucket='pacific-sound-16khz',
            Prefix='2018/01/',
        )
        dl_count, MAX_DL = 0, 10
        for page in pages:
            for obj in page.get('Contents', []):
                # Only non-empty .wav objects are considered.
                if obj['Key'].endswith('.wav') and obj.get('Size', 0) > 0:
                    local_path = os.path.join(
                        MBARI_NOISE_BASE_DIR, os.path.basename(obj['Key'])
                    )
                    if not os.path.exists(local_path):
                        s3.download_file('pacific-sound-16khz', obj['Key'], local_path)
                        dl_count += 1
                if dl_count >= MAX_DL:
                    break
            # Leave the page loop as well once the quota has been reached.
            if dl_count >= MAX_DL:
                break
        print(f"MBARI 다운로드 완료. (파일 수: {dl_count})")
    except Exception as e:
        print(f"오류: MBARI 노이즈 데이터 다운로드 실패: {e}")
else:
    print("MBARI 노이즈 데이터가 이미 존재합니다.")
print("2. 데이터 확보 단계 완료.")

# ==============================================================================
# ## 3. 세그먼테이션 (활동/비활동 초 단위) + 데이터 로더
# ==============================================================================

def get_activity_intervals(file_path, target_sr, top_db=25):
    """Split an audio file into active and inactive intervals, in seconds.

    The file is loaded at ``target_sr`` and passed to ``librosa.effects.split``
    (``frame_length=2048``, ``hop_length=512``) with the given ``top_db``.

    Returns:
        ``(active, inactive)``: two lists of ``(start_sec, end_sec)`` tuples.
        ``inactive`` is the complement of ``active`` within
        ``[0, file duration]``. If anything raises, ``([], [])`` is returned.
    """
    try:
        samples, _ = librosa.load(file_path, sr=target_sr)
        sample_spans = librosa.effects.split(
            samples, top_db=top_db, frame_length=2048, hop_length=512
        )
        active = [(begin / target_sr, end / target_sr) for begin, end in sample_spans]

        # Walk the active spans in order and collect the gaps between them.
        total_sec = len(samples) / target_sr
        inactive = []
        cursor = 0.0
        for span_start, span_end in active:
            if cursor < span_start:
                inactive.append((cursor, span_start))
            cursor = span_end
        # Trailing gap after the last active span, if any.
        if cursor < total_sec:
            inactive.append((cursor, total_sec))
    except Exception:
        return [], []
    return active, inactive

def _iter_wav_paths(folder_path):
    """Yield the path of every '.wav' file under folder_path (names sorted per directory)."""
    for root, _, files in os.walk(folder_path):
        for fn in sorted(f for f in files if f.endswith('.wav')):
            yield os.path.join(root, fn)


def _window_starts(begin_sec, end_sec, segment_duration, hop_length):
    """Return start times of the fixed-length windows that fit inside [begin_sec, end_sec]."""
    if end_sec - begin_sec < segment_duration:
        return []
    # The 1e-9 slack keeps the last window whose end coincides with end_sec.
    return [
        float(start)
        for start in np.arange(begin_sec, end_sec - segment_duration + 1e-9, hop_length)
    ]


def load_and_segment_data_final(
    ship_paths, noise_paths, verified_ship_path,
    segment_duration=5.0, segment_overlap=0.5, undersample=True
):
    """Build the labelled segment list from ship folders and noise folders.

    Every '.wav' file under a ship folder (``ship_paths`` plus
    ``verified_ship_path`` when it is set and exists) is split with
    ``get_activity_intervals``: windows inside active intervals are labelled
    ``'ship'``, windows inside inactive intervals ``'noise'``. Every '.wav'
    file under a noise folder is windowed over its whole duration and
    labelled ``'noise'``. Files that ``sf.info`` cannot read are skipped.

    Args:
        ship_paths: iterable of directories containing ship recordings.
        noise_paths: iterable of directories containing noise recordings.
        verified_ship_path: optional extra ship directory (may be ``None``).
        segment_duration: window length in seconds; must be > 0.
        segment_overlap: fractional overlap between consecutive windows;
            must be < 1 so that the hop is positive.
        undersample: when both classes are non-empty, randomly sample each
            class down to the smaller class size and shuffle the result.

    Returns:
        ``(infos, labels, ok)`` where ``infos`` is a list of
        ``(file_path, start_sec, native_samplerate)`` tuples, ``labels`` the
        matching list of ``'ship'``/``'noise'`` strings, and ``ok`` is
        ``False`` (with two empty lists) when no segment was found.

    Raises:
        ValueError: if the duration/overlap combination gives a non-positive hop.
    """
    hop_length = segment_duration * (1 - segment_overlap)
    # Fail early with a clear message instead of a cryptic np.arange error later.
    if segment_duration <= 0 or hop_length <= 0:
        raise ValueError(
            "segment_duration must be > 0 and segment_overlap must be < 1 "
            f"(got segment_duration={segment_duration}, segment_overlap={segment_overlap})"
        )
    ship_segments, noise_segments = [], []

    # None-safe: the verified folder is optional.
    all_ship_folders = list(ship_paths)
    if verified_ship_path and os.path.exists(verified_ship_path):
        all_ship_folders.append(verified_ship_path)

    # Ship folders: active intervals -> 'ship', inactive intervals -> 'noise'.
    for folder_path in all_ship_folders:
        print(f"'ship' 클래스 데이터 처리 중: {folder_path}")
        if not os.path.exists(folder_path):
            continue
        for fp in _iter_wav_paths(folder_path):
            try:
                info = sf.info(fp)
            except Exception:
                # Unreadable file: skip it (best effort), but let Ctrl-C through.
                continue
            active, inactive = get_activity_intervals(fp, YAMNET_SAMPLE_RATE)
            for intervals, label, bucket in (
                (active, 'ship', ship_segments),
                (inactive, 'noise', noise_segments),
            ):
                for s_sec, e_sec in intervals:
                    for seg_start in _window_starts(s_sec, e_sec, segment_duration, hop_length):
                        bucket.append(((fp, seg_start, info.samplerate), label))

    # Noise folders: the whole file is 'noise'.
    for folder_path in noise_paths:
        print(f"'noise' 클래스 데이터 처리 중: {folder_path}")
        if not os.path.exists(folder_path):
            continue
        for fp in _iter_wav_paths(folder_path):
            try:
                info = sf.info(fp)
            except Exception:
                continue
            for seg_start in _window_starts(0, info.duration, segment_duration, hop_length):
                noise_segments.append(((fp, seg_start, info.samplerate), 'noise'))

    print(f"  총 'ship' 세그먼트: {len(ship_segments)}개, 총 'noise' 세그먼트: {len(noise_segments)}개")
    all_data = ship_segments + noise_segments

    if undersample and ship_segments and noise_segments:
        k = min(len(ship_segments), len(noise_segments))
        print(f"\n클래스 불균형 -> 언더샘플링({k}개) 수행.")
        all_data = random.sample(ship_segments, k) + random.sample(noise_segments, k)
        random.shuffle(all_data)

    if not all_data:
        return [], [], False
    infos, labels = zip(*all_data)
    print("\n데이터 로드 및 세그먼테이션 완료.")
    return list(infos), list(labels), True

# ==============================================================================
# ## 4. 전처리/임베딩/모델/유틸 함수
# ==============================================================================

def mix_at_snr(clean, noise, snr_db):
    """Add ``noise`` to ``clean`` so that the mix has the requested SNR in dB.

    Both inputs are truncated to the shorter length and cast to float32.
    The noise gain is derived from the two RMS levels. When the noise RMS is
    below 1e-8 the truncated clean signal is returned without any mixing.
    """
    n_samples = min(len(clean), len(noise))
    signal = clean[:n_samples].astype(np.float32)
    interference = noise[:n_samples].astype(np.float32)

    signal_rms = np.sqrt(np.mean(signal ** 2))
    noise_rms = np.sqrt(np.mean(interference ** 2))
    if noise_rms < 1e-8:
        # Effectively silent noise: nothing to mix in.
        return signal

    # Amplitude ratio for the target SNR (hence the /20 rather than /10).
    gain = signal_rms / (noise_rms * (10 ** (snr_db / 20)))
    return signal + gain * interference

def load_and_process_segment_efficient(file_info, duration, target_sr, config):
    """Read one segment from disk and preprocess it for embedding extraction.

    Only the requested frame range is read from the file. The segment is then
    down-mixed to mono, resampled to ``target_sr`` if needed, optionally
    noise-reduced, and optionally RMS-normalised.

    Args:
        file_info: ``(file_path, start_time_sec, native_samplerate)``.
        duration: segment length in seconds.
        target_sr: sample rate of the returned signal.
        config: mapping with the optional keys
            ``"apply_noise_reduction"`` (default ``False``),
            ``"apply_rms_norm"`` (default ``True``) and
            ``"target_rms_db"`` (default ``-20.0``, the level the RMS is
            normalised to).

    Returns:
        The processed 1-D float32 array, or ``None`` when the read yields no
        samples or any processing step raises (best effort; this includes a
        missing ``noisereduce`` package when noise reduction is requested).
    """
    file_path, start_time, orig_sr = file_info
    try:
        start_frame = int(start_time * orig_sr)
        num_frames = int(duration * orig_sr)
        y_segment, _ = sf.read(
            file_path, start=start_frame, stop=start_frame + num_frames,
            dtype='float32', always_2d=False
        )
        # Multi-channel audio: average the channels down to mono.
        if y_segment.ndim > 1:
            y_segment = np.mean(y_segment, axis=1)
        # A start offset at or past the end of the file yields no samples.
        if y_segment.size == 0:
            return None
        if orig_sr != target_sr:
            y_segment = librosa.resample(y=y_segment, orig_sr=orig_sr, target_sr=target_sr)
        if config.get("apply_noise_reduction", False):
            # Imported here because the file never imports noisereduce at module
            # level; previously the undefined name `nr` made this branch always
            # fail and silently return None.
            import noisereduce as nr
            y_segment = nr.reduce_noise(y=y_segment, sr=target_sr)
        if config.get("apply_rms_norm", True):
            rms = np.sqrt(np.mean(y_segment ** 2))
            # Skip near-silent segments to avoid amplifying numerical noise.
            if rms > 1e-6:
                target_rms = 10.0 ** (config.get("target_rms_db", -20.0) / 20.0)
                y_segment = y_segment * (target_rms / rms)
        return y_segment
    except Exception:
        return None

def extract_yamnet_embedding(audio_info, model, config, noise_audio_infos=None, augment=False):
    """Return the time-averaged embedding of one segment, or ``None``.

    The segment is loaded via ``load_and_process_segment_efficient``. When
    ``augment`` is true and ``noise_audio_infos`` is non-empty, one randomly
    chosen noise segment is mixed in: at a random SNR in
    ``[snr_min_db, snr_max_db]`` if ``config["use_snr_augmentation"]`` is set,
    otherwise scaled by ``config["noise_level"]`` (default 0.05) and added.
    The second output of ``model(waveform)`` is averaged over axis 0.

    ``None`` is returned if the segment cannot be loaded, the model yields no
    embedding rows, or anything raises.
    """
    try:
        waveform = load_and_process_segment_efficient(
            audio_info, config["segment_duration"], YAMNET_SAMPLE_RATE, config
        )
        if waveform is None:
            return None

        if augment and noise_audio_infos:
            chosen_noise = random.choice(noise_audio_infos)
            # The noise clip is never noise-reduced, but it is RMS-normalised.
            noise_cfg = {"apply_noise_reduction": False, "apply_rms_norm": True}
            noise_wave = load_and_process_segment_efficient(
                chosen_noise, config["segment_duration"], YAMNET_SAMPLE_RATE, noise_cfg
            )
            if noise_wave is not None:
                if config.get("use_snr_augmentation", False):
                    target_snr = random.uniform(config["snr_min_db"], config["snr_max_db"])
                    waveform = mix_at_snr(waveform, noise_wave, target_snr)
                else:
                    common = min(len(waveform), len(noise_wave))
                    waveform = (
                        waveform[:common]
                        + config.get("noise_level", 0.05) * noise_wave[:common]
                    )

        _, embeddings, _ = model(waveform)
        if embeddings.shape[0] > 0:
            return tf.reduce_mean(embeddings, axis=0).numpy()
        return None
    except Exception:
        return None

def load_audio_models():
    """Load the YAMNet model from TensorFlow Hub.

    Returns:
        ``({'YAMNet': model}, True)`` on success, ``({}, False)`` if loading
        raises (the error is printed).
    """
    print("\n오디오 모델 로드 중...")
    try:
        yamnet = hub.load('https://tfhub.dev/google/yamnet/1')
    except Exception as e:
        print(f"  모델 로드 실패: {e}")
        return {}, False
    print("  YAMNet 모델 로드: 성공")
    return {'YAMNet': yamnet}, True

def build_classifier_model(input_shape, num_classes, learning_rate):
    """Build and compile the dense classifier that sits on top of the embeddings.

    Architecture: input of size ``input_shape`` -> Dense(256, relu) ->
    Dropout(0.5) -> Dense(128, relu) -> Dropout(0.5) ->
    Dense(num_classes, softmax). Compiled with Adam at ``learning_rate``,
    categorical cross-entropy loss and an accuracy metric.
    """
    embedding_in = Input(shape=(input_shape,), name='embedding_input')

    hidden = embedding_in
    for units in (256, 128):
        hidden = Dense(units, activation='relu')(hidden)
        hidden = Dropout(0.5)(hidden)

    class_probs = Dense(num_classes, activation='softmax')(hidden)

    classifier = Model(inputs=embedding_in, outputs=class_probs)
    classifier.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

def embed_infos(infos, labels, model_hub, config, label_encoder,
                noise_infos_for_train=None, augment_ship_only=True):
    """Embed a list of segments and one-hot encode their labels.

    Args:
        infos: segment descriptors accepted by ``extract_yamnet_embedding``.
        labels: integer-encoded labels aligned with ``infos``.
        model_hub: embedding model passed through to ``extract_yamnet_embedding``.
        config: preprocessing/augmentation settings (passed through).
        label_encoder: fitted encoder; its ``classes_`` define the class count
            and which encoded label means ``'ship'``.
        noise_infos_for_train: noise segments available for augmentation;
            when empty or ``None`` no augmentation is done.
        augment_ship_only: when ``True`` (default) only ``'ship'`` segments are
            augmented; when ``False`` every segment is augmented.

    Returns:
        ``(X, y_onehot, kept_infos)``. Segments whose embedding is ``None``
        are dropped. If nothing could be embedded, two empty arrays and an
        empty list are returned.
    """
    # Resolve the 'ship' class index once instead of calling inverse_transform per item.
    class_names = list(label_encoder.classes_)
    ship_idx = class_names.index('ship') if 'ship' in class_names else None
    have_noise = bool(noise_infos_for_train)

    embs, labs, kept_infos = [], [], []
    for info, lab in zip(infos, labels):
        is_ship = ship_idx is not None and bool(lab == ship_idx)
        # Previously `augment_ship_only and is_ship`, which switched augmentation
        # off entirely when augment_ship_only was False.
        use_aug = have_noise and (is_ship or not augment_ship_only)
        noise_src = noise_infos_for_train if use_aug else None
        emb = extract_yamnet_embedding(info, model_hub, config,
                                       noise_audio_infos=noise_src, augment=use_aug)
        if emb is not None:
            embs.append(emb)
            labs.append(lab)
            kept_infos.append(info)

    if not embs:
        return np.array([]), np.array([]), []
    X = np.asarray(embs, dtype=np.float32)
    y = tf.keras.utils.to_categorical(np.asarray(labs), num_classes=len(label_encoder.classes_))
    return X, y, kept_infos

def summarize_metrics(y_true_onehot, y_prob, label_encoder, title=""):
    """Print and return accuracy, macro-F1 and the ROC-AUC of the 'ship' class.

    Class indices are taken as the argmax of each row. The AUC is NaN when
    the true labels contain only one of ship / not-ship, or when its
    computation raises (for example when there is no 'ship' class).

    Returns:
        ``(accuracy, macro_f1, auc_ship)``.
    """
    true_idx = y_true_onehot.argmax(axis=1)
    pred_idx = y_prob.argmax(axis=1)
    acc = (pred_idx == true_idx).mean()
    f1m = f1_score(true_idx, pred_idx, average='macro')

    auc_score = float('nan')
    try:
        ship_idx = list(label_encoder.classes_).index('ship')
        ship_vs_rest = (true_idx == ship_idx).astype(int)
        # AUC is undefined unless both classes are present in the ground truth.
        if len(np.unique(ship_vs_rest)) >= 2:
            auc_score = roc_auc_score(ship_vs_rest, y_prob[:, ship_idx])
    except Exception:
        auc_score = float('nan')

    print(f"{title}Acc: {acc:.4f}, Macro-F1: {f1m:.4f}, AUC(ship): {auc_score:.4f}")
    return acc, f1m, auc_score

def plot_confusion(y_true_onehot, y_prob, label_encoder, title="Confusion Matrix"):
    """Draw the confusion matrix as an annotated heat map.

    Args:
        y_true_onehot: one-hot ground truth, one row per sample.
        y_prob: predicted class probabilities, one row per sample.
        label_encoder: fitted encoder whose ``classes_`` label both axes.
        title: figure title.
    """
    class_names = list(label_encoder.classes_)
    y_true = y_true_onehot.argmax(axis=1)
    y_pred = y_prob.argmax(axis=1)
    # Pin the label set so the matrix always has one row/column per class;
    # without it a class missing from both y_true and y_pred shrinks the
    # matrix and the tick labels no longer line up.
    cm = confusion_matrix(y_true, y_pred, labels=np.arange(len(class_names)))
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d',
                xticklabels=class_names, yticklabels=class_names,
                cmap='Blues')
    plt.xlabel('예측 레이블')
    plt.ylabel('실제 레이블')
    plt.title(title)
    plt.show()

def generate_candidate_report_zip(candidate_dir, candidates, sr=YAMNET_SAMPLE_RATE):
    """Write candidate clips plus an HTML review page, then zip the directory.

    Args:
        candidate_dir: output directory (created if missing). Everything in
            it ends up in the archive.
        candidates: list of dicts with the keys ``'name'`` (output file name),
            ``'wav'`` (audio samples) and, optionally, ``'ship_prob'``
            (probability shown in the report; the line is omitted when absent).
        sr: sample rate used when writing the clips.

    Side effects:
        Writes one file per candidate and ``candidate_report.html`` into
        ``candidate_dir``, and creates
        ``/content/review_candidates_bundle.zip``.
    """
    import html  # local import: only this report writer needs it

    os.makedirs(candidate_dir, exist_ok=True)
    html_path = os.path.join(candidate_dir, "candidate_report.html")
    report = ["<h1>선박 소리 후보 (검증 필요)</h1><hr>"]

    for item in candidates:
        out_wav_path = os.path.join(candidate_dir, item['name'])
        sf.write(out_wav_path, item['wav'], sr)
        # File names come from external data; escape them for both text and
        # attribute context. The src stays relative so the report still plays
        # after the bundle is downloaded and unpacked.
        safe_name = html.escape(item['name'], quote=True)
        report.append(f"<h3>{safe_name}</h3>")
        ship_prob = item.get('ship_prob')
        if ship_prob is not None:
            report.append(f"<p>Ship Probability: {ship_prob:.2%}</p>")
        report.append(f"<audio controls src='{safe_name}'></audio><br>")

    with open(html_path, "w", encoding="utf-8") as f:
        f.write("\n".join(report))

    bundle_path = "/content/review_candidates_bundle"
    shutil.make_archive(bundle_path, 'zip', candidate_dir)
    print(f"\n[작업 요청] '{bundle_path}.zip' 파일을 다운로드하여 압축을 풀고, HTML 리포트를 확인하세요.")
    print(f"검증된 파일을 '{VERIFIED_DIR}' 폴더로 옮기거나, 마지막 단계에서 파일명을 직접 입력해도 됩니다.")

def _index_wav_files(base_dir):
    """Map each '.wav' file name under base_dir to the first path os.walk finds it at."""
    index = {}
    for root, _, files in os.walk(base_dir):
        for fn in files:
            if fn.endswith('.wav'):
                index.setdefault(fn, os.path.join(root, fn))
    return index


def parse_verified_filenames_to_infos(verified_filenames, base_dir):
    """Turn verified candidate file names back into labelled segment infos.

    Candidate names have the form ``<original_base>__<start>s_prob<p>.wav``.
    For each name the original recording ``<original_base>.wav`` is looked up
    under ``base_dir`` and its sample rate is read with ``sf.info``.

    Args:
        verified_filenames: iterable of candidate file names (a leading
            directory part is ignored).
        base_dir: directory tree containing the original recordings.

    Returns:
        List of ``((original_path, start_sec, samplerate), 'ship')`` tuples.
        Names that cannot be parsed or resolved are skipped with a printed
        warning.
    """
    # Walk the tree once instead of once per file name.
    wav_index = _index_wav_files(base_dir)

    out_infos = []
    for fname in verified_filenames:
        try:
            base = os.path.basename(fname).rsplit('.', 1)[0]
            # Split on the LAST '__' so original names that contain '__' survive.
            original_base, tail = base.rsplit('__', 1)
            start_sec = float(tail.split('s')[0])

            original_path = wav_index.get(f"{original_base}.wav")
            if original_path:
                info = sf.info(original_path)
                out_infos.append(((original_path, start_sec, info.samplerate), 'ship'))
            else:
                print(f"[경고] 원본 파일 탐색 실패: {original_base}.wav")
        except Exception as e:
            print(f"[경고] 검증 파일 파싱 실패: {fname} ({e})")
    return out_infos

# ==============================================================================
# ## 5. 메인 파이프라인 1단계: 스카우트 모델 학습 & 후보 탐색
# ==============================================================================
# Settings for the scout stage: segmentation, preprocessing, augmentation,
# training and the candidate search.
CONFIG_SCOUT = {
    "segment_duration": 5.0,
    "segment_overlap": 0.5,
    "undersample": True,
    "apply_noise_reduction": False,
    "apply_rms_norm": True,
    "use_snr_augmentation": True,
    "snr_min_db": 0,
    "snr_max_db": 15,
    "noise_level": 0.05,
    "test_size": 0.2,
    "epochs": 20,
    "batch_size": 32,
    "learning_rate": 0.0005,
    "top_k_candidates": 50,
    "candidate_stride": 5.0,
    "candidate_batch": 32,
    "max_mbari_files": 10,
}

print("\n>>> [1단계] 스카우트 모델 학습 및 후보 탐색 시작...")

scout_infos, scout_labels, scout_ok = load_and_segment_data_final(
    ship_paths=[DEEPSHIP_BASE_PATH],
    noise_paths=[MBARI_NOISE_BASE_DIR],
    verified_ship_path=None,
    segment_duration=CONFIG_SCOUT["segment_duration"],
    segment_overlap=CONFIG_SCOUT["segment_overlap"],
    undersample=CONFIG_SCOUT["undersample"],
)
scout_model = None
if scout_ok:
    le_scout = LabelEncoder()
    y_all = le_scout.fit_transform(scout_labels)
    # Group by source file so segments of one recording never straddle the split.
    groups = [info[0] for info in scout_infos]
    gss = GroupShuffleSplit(n_splits=1, test_size=CONFIG_SCOUT["test_size"], random_state=SEED)
    tr_idx, te_idx = next(gss.split(scout_infos, y_all, groups))
    Xtr_infos = [scout_infos[i] for i in tr_idx]
    ytr_enc = y_all[tr_idx]
    Xte_infos = [scout_infos[i] for i in te_idx]
    yte_enc = y_all[te_idx]

    models_hub, ok = load_audio_models()
    if ok:
        print("\n초기 모델 학습 시작...")
        # Training-split noise segments double as the augmentation pool.
        noise_tr = [
            info
            for info, lab in zip(Xtr_infos, ytr_enc)
            if le_scout.inverse_transform([lab])[0] == 'noise'
        ]
        Xtr, ytr, _ = embed_infos(
            Xtr_infos, ytr_enc, models_hub['YAMNet'], CONFIG_SCOUT, le_scout,
            noise_infos_for_train=noise_tr,
        )
        Xte, yte, _ = embed_infos(Xte_infos, yte_enc, models_hub['YAMNet'], CONFIG_SCOUT, le_scout)
        if len(Xtr) == 0 or len(Xte) == 0:
            print("오류: 임베딩 추출 실패(학습/평가 샘플이 없음).")
            scout_model = None
        else:
            input_dim = Xtr.shape[-1]
            num_classes = ytr.shape[-1]
            clf = build_classifier_model(input_dim, num_classes, CONFIG_SCOUT["learning_rate"])
            # NOTE(review): the held-out split is also the validation set that
            # drives early stopping, so the metrics printed below are optimistic.
            hist = clf.fit(
                Xtr, ytr,
                validation_data=(Xte, yte),
                epochs=CONFIG_SCOUT["epochs"],
                batch_size=CONFIG_SCOUT["batch_size"],
                callbacks=[
                    EarlyStopping(patience=5, restore_best_weights=True),
                    ReduceLROnPlateau(patience=3),
                ],
                verbose=1,
            )
            scout_model = clf
            prob_te = scout_model.predict(Xte, verbose=0)
            summarize_metrics(yte, prob_te, le_scout, title="[스카우트] ")

def stream_topk_candidates(scout_model, yamnet_model, label_encoder, mbari_dir, config):
    """Scan MBARI recordings and keep the K segments most likely to be a ship.

    Files are processed one at a time. Segment embeddings are scored in
    batches and only the current top-K ``(probability, segment_info)`` pairs
    are held in a min-heap, so memory use does not grow with the number of
    segments scanned.

    Args:
        scout_model: classifier whose ``predict`` returns class probabilities.
        yamnet_model: embedding model passed to ``extract_yamnet_embedding``.
        label_encoder: fitted encoder; must contain the class ``'ship'``.
        mbari_dir: directory whose top-level '.wav' files are scanned.
        config: needs ``"segment_duration"``; optional ``"top_k_candidates"``
            (50), ``"candidate_stride"`` (defaults to the segment duration),
            ``"candidate_batch"`` (32) and ``"max_mbari_files"`` (1000000).

    Returns:
        List of ``(ship_probability, (file_path, start_sec, samplerate))``
        sorted by descending probability; empty when ``top_k_candidates <= 0``.

    Raises:
        ValueError: if ``'ship'`` is not a known class or the stride is not positive.
    """
    ship_idx = list(label_encoder.classes_).index('ship')
    top_k = int(config.get("top_k_candidates", 50))
    segment_duration = config["segment_duration"]
    stride = float(config.get("candidate_stride", segment_duration))
    batch_size = int(config.get("candidate_batch", 32))
    max_files = int(config.get("max_mbari_files", 1000000))

    if stride <= 0:
        raise ValueError(f"candidate_stride must be positive (got {stride})")
    # Nothing to keep: avoids indexing heap[0] on an empty heap later on.
    if top_k <= 0:
        return []

    heap = []  # min-heap: heap[0] is the weakest candidate currently kept
    batch_embs, batch_infos = [], []

    def flush_batch():
        """Score the pending embeddings and fold them into the top-K heap."""
        if not batch_embs:
            return
        probs = scout_model.predict(np.asarray(batch_embs, dtype=np.float32), verbose=0)
        for p, info in zip(probs, batch_infos):
            prob_ship = float(p[ship_idx])
            if len(heap) < top_k:
                heapq.heappush(heap, (prob_ship, info))
            elif prob_ship > heap[0][0]:
                heapq.heapreplace(heap, (prob_ship, info))
        batch_embs.clear()
        batch_infos.clear()
        gc.collect()

    mbari_files = sorted(f for f in os.listdir(mbari_dir) if f.endswith('.wav'))[:max_files]
    print(f"  총 {len(mbari_files)}개의 MBARI 파일을 스트리밍으로 처리합니다.")
    for fi, fn in enumerate(mbari_files, 1):
        fp = os.path.join(mbari_dir, fn)
        try:
            info = sf.info(fp)
        except Exception:
            # Unreadable file: skip it, but do not swallow KeyboardInterrupt.
            continue

        starts = np.arange(0, info.duration - segment_duration + 1e-9, stride)

        for seg_start in starts:
            seg_info = (fp, float(seg_start), info.samplerate)
            emb = extract_yamnet_embedding(seg_info, yamnet_model, config, augment=False)
            if emb is None:
                continue
            batch_embs.append(emb)
            batch_infos.append(seg_info)
            if len(batch_embs) >= batch_size:
                flush_batch()
        # Score whatever is left for this file before moving to the next one.
        flush_batch()
        if fi % 5 == 0:
            print(f"  진행 상황: {fi}/{len(mbari_files)}개 파일 처리 완료…")
            gc.collect()

    heap.sort(reverse=True)
    return heap

# Run the candidate search only when the scout stage produced a model.
if not scout_model:
    print("\n스카우트 모델이 없어 후보 탐색을 건너뜁니다.")
else:
    print("\n>>> MBARI 후보 탐색 (RAM 절약 스트리밍 모드)…")
    topk_heap = stream_topk_candidates(
        scout_model, models_hub['YAMNet'], le_scout, MBARI_NOISE_BASE_DIR, CONFIG_SCOUT
    )
    print(f"  Top-{len(topk_heap)} 후보 확보.")

    cand_items = []
    for prob, info in topk_heap:
        # Re-read the audio here: the search keeps only segment descriptors.
        yseg = load_and_process_segment_efficient(
            info, CONFIG_SCOUT["segment_duration"], YAMNET_SAMPLE_RATE, CONFIG_SCOUT
        )
        if yseg is not None:
            # Name layout: <source stem>__<start>s_prob<probability>.wav
            base_name = (
                f"{os.path.basename(info[0]).replace('.wav','')}"
                f"__{info[1]:.2f}s_prob{prob:.2f}.wav"
            )
            cand_items.append({'name': base_name, 'wav': yseg, 'ship_prob': prob})

    generate_candidate_report_zip(CANDIDATE_DIR, cand_items, sr=YAMNET_SAMPLE_RATE)
    # Make sure the folder the user is asked to fill exists.
    os.makedirs(VERIFIED_DIR, exist_ok=True)


# ==============================================================================
# ## 6. 메인 파이프라인 2단계: 검증된 데이터로 최종 학습
# ==============================================================================
# Banner with the manual steps the user has to complete before this stage.
print("\n" + "="*60)
print("## 6. 최종 모델 학습 단계")
print("  - 1단계에서 생성된 'review_candidates_bundle.zip' 파일을 확인하세요.")
print(f"  - 선박 소음이 확실한 WAV 파일들을 Colab의 '{VERIFIED_DIR}' 폴더로 업로드해주세요.")
print("  - 준비가 되면 이 셀을 실행하세요.")
print("="*60)

# Warn, but carry on, when no verified clip is present.
if not (os.path.exists(VERIFIED_DIR) and any(f.endswith('.wav') for f in os.listdir(VERIFIED_DIR))):
    print(f"\n경고: '{VERIFIED_DIR}' 폴더에 검증된 파일이 없습니다. 검증 데이터 없이 학습을 진행합니다.")

# Same settings as the scout stage, except for a longer training budget.
CONFIG_FINAL = {**CONFIG_SCOUT, 'epochs': 50}
print("\n>>> [2단계] 최종 데이터셋 로드...")
final_infos, final_labels, ok = load_and_segment_data_final(
    ship_paths=[DEEPSHIP_BASE_PATH],
    noise_paths=[MBARI_NOISE_BASE_DIR],
    verified_ship_path=VERIFIED_DIR,
    segment_duration=CONFIG_FINAL['segment_duration'],
    segment_overlap=CONFIG_FINAL['segment_overlap'],
    undersample=CONFIG_FINAL['undersample'],
)

if not ok:
    print("최종 데이터 준비 실패")
else:
    le_final = LabelEncoder()
    y_all = le_final.fit_transform(final_labels)
    # Group by source file so segments of one recording never straddle the split.
    groups = [info[0] for info in final_infos]
    gss = GroupShuffleSplit(n_splits=1, test_size=CONFIG_FINAL["test_size"], random_state=SEED)
    tr_idx, te_idx = next(gss.split(final_infos, y_all, groups))
    Xtr_infos = [final_infos[i] for i in tr_idx]
    ytr_enc = y_all[tr_idx]
    Xte_infos = [final_infos[i] for i in te_idx]
    yte_enc = y_all[te_idx]

    # The embedding model is loaded again here rather than reused from stage 1.
    models_final, ok2 = load_audio_models()
    if not ok2:
        print("최종 모델 로드 실패")
    else:
        noise_tr = [
            info
            for info, lab in zip(Xtr_infos, ytr_enc)
            if le_final.inverse_transform([lab])[0]=='noise'
        ]
        Xtr, ytr, _ = embed_infos(
            Xtr_infos, ytr_enc, models_final['YAMNet'], CONFIG_FINAL, le_final,
            noise_infos_for_train=noise_tr,
        )
        Xte, yte, _ = embed_infos(Xte_infos, yte_enc, models_final['YAMNet'], CONFIG_FINAL, le_final)
        if len(Xtr) == 0 or len(Xte) == 0:
            print("최종 임베딩 추출 실패")
        else:
            input_dim = Xtr.shape[-1]
            num_classes = ytr.shape[-1]
            clf = build_classifier_model(input_dim, num_classes, CONFIG_FINAL["learning_rate"])
            # NOTE(review): as in stage 1, the held-out split also serves as the
            # validation set for early stopping.
            hist = clf.fit(
                Xtr, ytr,
                validation_data=(Xte, yte),
                epochs=CONFIG_FINAL["epochs"],
                batch_size=CONFIG_FINAL["batch_size"],
                callbacks=[
                    EarlyStopping(patience=8, restore_best_weights=True),
                    ReduceLROnPlateau(patience=4),
                ],
                verbose=1,
            )
            probs = clf.predict(Xte, verbose=0)
            summarize_metrics(yte, probs, le_final, title="[최종] ")
            plot_confusion(yte, probs, le_final, title="최종 모델 혼동 행렬")

print("\n🎉 전체 파이프라인 실행 완료.")

1. 환경 설정 및 라이브러리 임포트 중...

Matplotlib 폰트 설정 완료: NanumGothic

전역 상수 정의 중...
전역 상수 정의 완료.

2. 데이터 확보...
DeepShip이 이미 존재합니다.
MBARI 노이즈 데이터가 이미 존재합니다.
2. 데이터 확보 단계 완료.

>>> [1단계] 스카우트 모델 학습 및 후보 탐색 시작...
'ship' 클래스 데이터 처리 중: /content/DeepShip
'noise' 클래스 데이터 처리 중: /content/MBARI_noise_data
  총 'ship' 세그먼트: 2201개, 총 'noise' 세그먼트: 345590개

클래스 불균형 -> 언더샘플링(2201개) 수행.

데이터 로드 및 세그먼테이션 완료.

오디오 모델 로드 중...
  YAMNet 모델 로드: 성공

초기 모델 학습 시작...
