# Ainu 話者(コレクション)特定 クリーン統合ノートブック

このノートブックは Ainu 音声コレクション（語り手グループ）を識別するエンドツーエンド実験パイプラインを一つに統合したクリーン版です。

構成 (Outline):
1. 環境セットアップ & 基本設定  
2. データ収集 & メタ情報構築  
3. ロバスト音声ローダー & 検証  
4. ラベルエンコード & Train/Test 分割  
5. データバランス戦略 (WeightedRandomSampler)  
6. Mel/特徴量設定 & キャッシュ設定  
7. グローバル Mel 統計計算  
8. Augmentation & 変換ユーティリティ  
9. Mel 生成 & 正規化パイプライン  
10. Dataset / DataLoader & キャッシュウォーム  
11. モデル定義 (可変 in_channels CNN)  
12. 学習ユーティリティ (EarlyStopping 等)  
13. 統合トレーニング関数 run_training()  
14. 評価 & 指標算出  
15. 学習履歴可視化  
16. 推論ヘルパー  
17. 性能 & I/O 診断  
18. 再現性 & 実験記録(JSON)  
19. オプション: main() 実行パイプライン

再学習時は training セルだけ再実行で OK。キャッシュを無視して再生成したい場合は CACHE_CFG['recompute']=True に設定してください。

In [1]:
# 1. 環境セットアップ & 基本設定 (Imports / Config / Seed)
import os, sys, math, json, random, time, hashlib
from pathlib import Path
import numpy as np
import pandas as pd
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

# 主要ディレクトリ設定
PROJECT_ROOT = Path('..').resolve()
DATA_ROOT = PROJECT_ROOT / 'data' / 'samples'
AINU_DIRS = [
    DATA_ROOT / 'asai_take_stories',
    DATA_ROOT / 'kimura_kimi_stories',
    DATA_ROOT / 'oda_ito_stories'
]
AUDIO_EXT = {'.wav', '.flac', '.mp3'}

# データ使用設定
DATA_USAGE_CFG = {
    'max_per_collection': None,      # None で全件
    'deterministic_shuffle': True,
    'balance_strategy': 'weighted',  # 'none' or 'weighted'
    'min_samples_per_class': 1
}

# DataLoader & 性能設定
DATA_LOADER_CFG = {
    'batch_size': 8,
    'num_workers': 4,
    'pin_memory': True,
    'prefetch_factor': 2,
    'persistent_workers': True,
    'cache_warm': True,
    'log_interval': 50,
    'cudnn_benchmark': True
}
if torch.cuda.is_available() and DATA_LOADER_CFG.get('cudnn_benchmark'):
    torch.backends.cudnn.benchmark = True

print('DATA_USAGE_CFG:', DATA_USAGE_CFG)
print('DATA_LOADER_CFG:', DATA_LOADER_CFG)

Device: cuda
DATA_USAGE_CFG: {'max_per_collection': None, 'deterministic_shuffle': True, 'balance_strategy': 'weighted', 'min_samples_per_class': 1}
DATA_LOADER_CFG: {'batch_size': 8, 'num_workers': 4, 'pin_memory': True, 'prefetch_factor': 2, 'persistent_workers': True, 'cache_warm': True, 'log_interval': 50, 'cudnn_benchmark': True}


In [2]:
# 2. データ収集 & メタ情報構築

def list_audio_files(root: Path):
    return [f for f in root.rglob('*') if f.suffix.lower() in AUDIO_EXT]

rows = []
for d in AINU_DIRS:
    if not d.exists():
        print('[WARN] missing dir:', d)
        continue
    files = list_audio_files(d)
    if not files:
        print('[WARN] empty dir:', d)
        continue
    files_sorted = sorted(files)
    if DATA_USAGE_CFG['deterministic_shuffle']:
        rnd = random.Random(SEED)
        rnd.shuffle(files_sorted)
    limit = DATA_USAGE_CFG['max_per_collection']
    subset = files_sorted if limit is None else files_sorted[:limit]
    if limit is not None and len(files) > limit:
        print(f'[Info] {d.name}: using {len(subset)}/{len(files)} files (limited).')
    for fp in subset:
        rows.append({'file_path': str(fp.resolve()), 'collection': d.name})

meta_df = pd.DataFrame(rows)
print('Indexed total files:', len(meta_df))
if not meta_df.empty:
    print(meta_df.groupby('collection').size())

Indexed total files: 2854
collection
asai_take_stories      2816
kimura_kimi_stories      23
oda_ito_stories          15
dtype: int64


In [3]:
# 3. ロバスト音声ローダー定義 & 検証スキャン
from typing import Optional, Tuple
import warnings

TARGET_SR = 16000
VALIDATION_SAMPLE_LIMIT = None  # デバッグ短縮用 (int を設定)
load_failures = []

# multi-backend safe_load

def safe_load(path: str, sr: int = TARGET_SR, mono: bool = True) -> Tuple[Optional[np.ndarray], Optional[int]]:
    # Strategy 1: librosa.load (内部で soundfile -> audioread fallback)
    try:
        y, s = librosa.load(path, sr=sr, mono=mono)
        if y is not None and len(y) > 0:
            return y, s
    except Exception:
        pass
    # Strategy 2: soundfile 直接
    try:
        import soundfile as sf
        with sf.SoundFile(path) as f:
            data = f.read(always_2d=False)
            s = f.samplerate
        if data is not None and data.size > 0:
            if mono and data.ndim > 1:
                data = data.mean(axis=0)
            if s != sr:
                data = librosa.resample(data, orig_sr=s, target_sr=sr)
                s = sr
            return data.astype(np.float32), s
    except Exception:
        pass
    # Strategy 3: audioread バイト復号
    try:
        import audioread
        with audioread.audio_open(path) as af:
            raw = b''.join([buf for buf in af])
            audio16 = np.frombuffer(raw, dtype=np.int16)
            if audio16.size == 0:
                raise ValueError('Empty decode')
            y = audio16.astype(np.float32) / 32768.0
            if mono and af.channels > 1:
                y = y.reshape(-1, af.channels).mean(axis=1)
            if af.samplerate != sr:
                y = librosa.resample(y, orig_sr=af.samplerate, target_sr=sr)
            return y, sr
    except Exception:
        pass
    return None, None

if os.environ.get('SKIP_AUDIO_VALIDATION','0') != '1' and not meta_df.empty:
    valid_rows = []
    for i, row in meta_df.iterrows():
        if VALIDATION_SAMPLE_LIMIT and i >= VALIDATION_SAMPLE_LIMIT:
            break
        fp = row['file_path']
        y, s = safe_load(fp)
        if y is None or len(y) == 0:
            load_failures.append(fp)
        else:
            valid_rows.append(row)
    if load_failures:
        print(f'[Audio Validation] {len(load_failures)} / {len(meta_df)} failed -> dropped')
        with open('audio_load_failures.json','w') as f:
            json.dump(load_failures, f, indent=2)
        meta_df = pd.DataFrame(valid_rows)
    else:
        print('[Audio Validation] All files readable')
else:
    print('[Audio Validation] Skipped or no data')

[Audio Validation] 2 / 2854 failed -> dropped


In [4]:
# 4. ラベルエンコード & Stratified Train/Test Split
if meta_df.empty:
    collection_to_id = {}
    train_files, test_files = [], []
    print('[Split] No data to split')
else:
    collections = sorted(meta_df['collection'].unique())
    collection_to_id = {c:i for i,c in enumerate(collections)}
    meta_df['label'] = meta_df['collection'].map(collection_to_id)
    train_df, test_df = train_test_split(
        meta_df, test_size=0.25, random_state=SEED, stratify=meta_df['label']
    )
    train_files = train_df.to_dict('records')
    test_files = test_df.to_dict('records')
    print('[Split] Train/Test sizes:', len(train_files), len(test_files))
    print('[Split] Collections:', collections)


[Split] Train/Test sizes: 2139 713
[Split] Collections: ['asai_take_stories', 'kimura_kimi_stories', 'oda_ito_stories']


In [5]:
# 5. データバランス戦略 (WeightedRandomSampler)
train_sampler = None
if train_files and DATA_USAGE_CFG.get('balance_strategy') == 'weighted':
    labels_series = pd.Series([r['label'] for r in train_files])
    class_counts = labels_series.value_counts().to_dict()
    class_weights = {c: 1.0 / max(1, class_counts[c]) for c in class_counts}
    sample_weights = labels_series.map(class_weights).values
    train_sampler = WeightedRandomSampler(weights=sample_weights,
                                         num_samples=len(sample_weights),
                                         replacement=True)
    print('[Balancing] WeightedRandomSampler enabled')
    print(' Class counts:', class_counts)
else:
    print('[Balancing] Using default shuffle (no weighted strategy)')

[Balancing] WeightedRandomSampler enabled
 Class counts: {0: 2111, 1: 17, 2: 11}


In [6]:
# 6. Mel/特徴量設定 & キャッシュ設定
TARGET_N_MELS = 128
TARGET_FRAMES = 1000

CACHE_CFG = {
    'enable': True,
    'cache_dir': '_mel_cache',
    'recompute': False,
    'compress': False,
    'verbose': True
}

IMPROVE_CFG = {
    'random_crop_frames': 800,        # train 時のみランダムクロップ (eval はセンター)
    'random_crop_jitter': 20,
    'specaug_time_masks': 1,
    'specaug_time_max_frac': 0.10,
    'specaug_freq_masks': 1,
    'specaug_freq_max_bins': 16,
    'use_delta': True,
    'use_delta_delta': True,
    'global_norm': True,
    'augment_prob': 0.8,
    'enable': True
}

_cache_root = Path(CACHE_CFG['cache_dir'])
if CACHE_CFG['enable']:
    _cache_root.mkdir(parents=True, exist_ok=True)
print('CACHE_CFG:', CACHE_CFG)
print('IMPROVE_CFG:', IMPROVE_CFG)

CACHE_CFG: {'enable': True, 'cache_dir': '_mel_cache', 'recompute': False, 'compress': False, 'verbose': True}
IMPROVE_CFG: {'random_crop_frames': 800, 'random_crop_jitter': 20, 'specaug_time_masks': 1, 'specaug_time_max_frac': 0.1, 'specaug_freq_masks': 1, 'specaug_freq_max_bins': 16, 'use_delta': True, 'use_delta_delta': True, 'global_norm': True, 'augment_prob': 0.8, 'enable': True}


In [7]:
# 7. グローバル Mel 統計計算 (mean/std) オプション
GLOBAL_MEL_MEAN = None
GLOBAL_MEL_STD = None
if IMPROVE_CFG.get('global_norm') and train_files:
    mel_sums = None
    mel_sq_sums = None
    count = 0
    sample_limit = None  # 例: 50 にすると高速ポーリング
    for i, rec in enumerate(train_files):
        if sample_limit and i >= sample_limit:
            break
        y, sr = safe_load(rec['file_path'], sr=TARGET_SR, mono=True)
        if y is None or len(y)==0:
            continue
        mel = librosa.feature.melspectrogram(y=y, sr=TARGET_SR, n_fft=1024, hop_length=256, n_mels=TARGET_N_MELS)
        mel_db = librosa.power_to_db(mel, ref=np.max)
        if mel_sums is None:
            mel_sums = np.zeros((TARGET_N_MELS,), dtype=np.float64)
            mel_sq_sums = np.zeros((TARGET_N_MELS,), dtype=np.float64)
        mel_sums += mel_db.mean(axis=1)
        mel_sq_sums += (mel_db**2).mean(axis=1)
        count += 1
    if count>0:
        GLOBAL_MEL_MEAN = mel_sums / count
        GLOBAL_MEL_STD = np.sqrt(np.maximum(mel_sq_sums / count - GLOBAL_MEL_MEAN**2, 1e-6))
        print('[GlobalNorm] stats computed over', count, 'files')
    else:
        print('[GlobalNorm] no valid files; fallback to per-sample')
else:
    print('[GlobalNorm] disabled or no train_files')

[GlobalNorm] stats computed over 2139 files


In [8]:
# 8. Augmentation & 変換ユーティリティ

def add_noise(y, std=0.003):
    if std <= 0: return y
    return y + np.random.randn(*y.shape)*std

def time_shift(y, max_frac=0.2):
    if max_frac <= 0: return y
    shift = int(len(y) * random.uniform(-max_frac, max_frac))
    return np.roll(y, shift)

def _random_crop_train(mel_db):
    crop_len = IMPROVE_CFG.get('random_crop_frames')
    if not (IMPROVE_CFG.get('enable') and crop_len):
        return mel_db
    if mel_db.shape[1] < crop_len:
        mel_db = np.pad(mel_db, ((0,0),(0, crop_len - mel_db.shape[1])), mode='constant')
    if mel_db.shape[1] == crop_len:
        return mel_db
    max_start = mel_db.shape[1] - crop_len
    jitter = IMPROVE_CFG.get('random_crop_jitter', 0)
    start = np.random.randint(0, max_start + 1)
    if jitter>0:
        start = max(0, min(max_start, start + np.random.randint(-jitter, jitter+1)))
    return mel_db[:, start:start+crop_len]

def _center_crop_eval(mel_db):
    crop_len = IMPROVE_CFG.get('random_crop_frames')
    if not (IMPROVE_CFG.get('enable') and crop_len):
        return mel_db
    if mel_db.shape[1] < crop_len:
        mel_db = np.pad(mel_db, ((0,0),(0, crop_len - mel_db.shape[1])), mode='constant')
    if mel_db.shape[1] == crop_len:
        return mel_db
    start = (mel_db.shape[1] - crop_len)//2
    return mel_db[:, start:start+crop_len]

def _spec_augment(mel_tensor):
    if not IMPROVE_CFG.get('enable'):
        return mel_tensor
    if np.random.rand() > IMPROVE_CFG.get('augment_prob', 1.0):
        return mel_tensor
    C, M, T = mel_tensor.shape
    # time mask
    for _ in range(IMPROVE_CFG.get('specaug_time_masks', 0)):
        t_max = int(T * IMPROVE_CFG.get('specaug_time_max_frac', 0.0))
        if t_max>0:
            t_len = np.random.randint(1, t_max+1)
            t_start = np.random.randint(0, max(1, T - t_len + 1))
            mel_tensor[:, :, t_start:t_start+t_len] = 0.0
    # freq mask
    for _ in range(IMPROVE_CFG.get('specaug_freq_masks', 0)):
        f_max = IMPROVE_CFG.get('specaug_freq_max_bins', 0)
        if f_max>0:
            f_len = np.random.randint(1, f_max+1)
            f_start = np.random.randint(0, max(1, M - f_len + 1))
            mel_tensor[:, f_start:f_start+f_len, :] = 0.0
    return mel_tensor

def _apply_global_norm(mel_db):
    if IMPROVE_CFG.get('global_norm') and GLOBAL_MEL_MEAN is not None and GLOBAL_MEL_STD is not None:
        return (mel_db - GLOBAL_MEL_MEAN[:, None]) / (GLOBAL_MEL_STD[:, None] + 1e-6)
    return (mel_db - mel_db.mean()) / (mel_db.std() + 1e-6)

def _add_deltas(mel_db):
    if not IMPROVE_CFG.get('enable') or not IMPROVE_CFG.get('use_delta'):
        return mel_db[None, ...]
    delta = librosa.feature.delta(mel_db)
    if IMPROVE_CFG.get('use_delta_delta'):
        delta2 = librosa.feature.delta(mel_db, order=2)
        return np.stack([mel_db, delta, delta2], axis=0)
    return np.stack([mel_db, delta], axis=0)

In [9]:
# 9. Mel 生成 & 正規化パイプライン (キャッシュ + Δ/ΔΔ + SpecAugment)
FAILED_MEL_FILES = []
CACHE_HITS = 0
CACHE_MISSES = 0
RESIZED_CACHED = 0

def _cache_path(path_str: str):
    if not CACHE_CFG.get('enable'):
        return None
    h = hashlib.md5(path_str.encode('utf-8')).hexdigest()
    ext = '.npy' if not CACHE_CFG.get('compress') else '.npz'
    return _cache_root / f'{h}{ext}'

def wav_to_mel_tensor(path: str, train_mode: bool=True):
    global CACHE_HITS, CACHE_MISSES, RESIZED_CACHED
    y, sr = safe_load(path, sr=TARGET_SR, mono=True)
    if y is None or len(y)==0:
        FAILED_MEL_FILES.append(path)
        base_mel_db = np.zeros((TARGET_N_MELS, TARGET_FRAMES), dtype=np.float32)
    else:
        cache_file = _cache_path(path)
        base_mel_db = None
        if cache_file and cache_file.exists() and not CACHE_CFG.get('recompute'):
            try:
                if CACHE_CFG.get('compress'):
                    data = np.load(cache_file)
                    base_mel_db = data['mel']
                else:
                    base_mel_db = np.load(cache_file)
                CACHE_HITS += 1
            except Exception:
                base_mel_db = None
        if base_mel_db is None:
            mel = librosa.feature.melspectrogram(y=y, sr=TARGET_SR, n_fft=1024, hop_length=256, n_mels=TARGET_N_MELS)
            mel_db = librosa.power_to_db(mel, ref=np.max)
            # pad/truncate to TARGET_FRAMES
            if mel_db.shape[1] > TARGET_FRAMES:
                mel_db = mel_db[:, :TARGET_FRAMES]
            elif mel_db.shape[1] < TARGET_FRAMES:
                mel_db = np.pad(mel_db, ((0,0),(0, TARGET_FRAMES - mel_db.shape[1])), mode='constant')
            base_mel_db = mel_db.astype(np.float32)
            if cache_file:
                try:
                    if CACHE_CFG.get('compress'):
                        np.savez_compressed(cache_file, mel=base_mel_db)
                    else:
                        np.save(cache_file, base_mel_db)
                    CACHE_MISSES += 1
                except Exception:
                    pass
        else:
            if base_mel_db.shape[1] != TARGET_FRAMES:
                if base_mel_db.shape[1] > TARGET_FRAMES:
                    base_mel_db = base_mel_db[:, :TARGET_FRAMES]
                else:
                    base_mel_db = np.pad(base_mel_db, ((0,0),(0, TARGET_FRAMES - base_mel_db.shape[1])), mode='constant')
                RESIZED_CACHED += 1
    mel_db = base_mel_db
    mel_db = _random_crop_train(mel_db) if train_mode else _center_crop_eval(mel_db)
    mel_db = _apply_global_norm(mel_db)
    channels = _add_deltas(mel_db)
    if train_mode:
        channels = _spec_augment(channels)
    return channels.astype(np.float32)

In [10]:
# 10. Dataset / DataLoader 実装 & キャッシュウォーム
class AinuMelDataset(Dataset):
    def __init__(self, records, train_mode=True):
        self.records = records
        self.train_mode = train_mode
    def __len__(self):
        return len(self.records)
    def __getitem__(self, idx):
        r = self.records[idx]
        try:
            arr = wav_to_mel_tensor(r['file_path'], train_mode=self.train_mode)
        except Exception as e:
            print(f'[Dataset Error] {r.get("file_path")} -> {e}')
            # Fallback shape inference
            chans = 1
            if IMPROVE_CFG.get('enable') and IMPROVE_CFG.get('use_delta'):
                chans = 2 + (1 if IMPROVE_CFG.get('use_delta_delta') else 0)
            crop_len = IMPROVE_CFG.get('random_crop_frames') or TARGET_FRAMES
            arr = np.zeros((chans, TARGET_N_MELS, crop_len), dtype=np.float32)
        x = torch.from_numpy(arr)
        y = r['label']
        return {'mel': x, 'label': torch.tensor(y, dtype=torch.long)}

if train_files:
    train_dataset = AinuMelDataset(train_files, train_mode=True)
    test_dataset = AinuMelDataset(test_files, train_mode=False)
    dl_kwargs = dict(batch_size=DATA_LOADER_CFG['batch_size'],
                     num_workers=DATA_LOADER_CFG.get('num_workers',0),
                     pin_memory=DATA_LOADER_CFG.get('pin_memory', False),
                     persistent_workers=DATA_LOADER_CFG.get('persistent_workers', False))
    if train_sampler is not None:
        train_loader = DataLoader(train_dataset, sampler=train_sampler, **dl_kwargs)
    else:
        train_loader = DataLoader(train_dataset, shuffle=True, **dl_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **dl_kwargs)
    print('[Loader] train batches =', len(train_loader), 'workers =', dl_kwargs['num_workers'])
else:
    train_loader = test_loader = None
    print('[Loader] No data available')

# オプション: キャッシュウォーム (メル計算済み化)
if DATA_LOADER_CFG.get('cache_warm') and CACHE_CFG.get('enable') and train_files:
    t0 = time.time()
    for i in range(len(train_dataset)):
        _ = train_dataset[i]
    dt = time.time() - t0
    print(f'[Warmup] Done in {dt:.1f}s ({len(train_dataset)/max(dt,1e-6):.1f} samples/s)')
else:
    print('[Warmup] Skipped')

[Loader] train batches = 268 workers = 4
[Warmup] Done in 34.6s (61.8 samples/s)


In [11]:
# 11. モデル定義 (可変 in_channels CNN)
class ConvBlock(nn.Module):
    def __init__(self, c_in, c_out, k=3, p=1):
        super().__init__()
        self.conv = nn.Conv2d(c_in, c_out, k, padding=p)
        self.bn = nn.BatchNorm2d(c_out)
    def forward(self, x):
        return F.relu(self.bn(self.conv(x)))

class BestCNN(nn.Module):
    def __init__(self, n_classes, in_channels=1, dropout=0.35):
        super().__init__()
        self.block1 = ConvBlock(in_channels, 32)
        self.block2 = ConvBlock(32, 64)
        self.block3 = ConvBlock(64, 128)
        self.pool = nn.AdaptiveAvgPool2d((1,1))
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(128, n_classes)
    def forward(self, x):
        x = self.block1(x)
        x = self.block2(x)
        x = self.block3(x)
        x = self.pool(x).squeeze(-1).squeeze(-1)
        x = self.dropout(x)
        return self.fc(x)

IN_CHANNELS = 1
if IMPROVE_CFG.get('enable') and IMPROVE_CFG.get('use_delta'):
    IN_CHANNELS = 2 + (1 if IMPROVE_CFG.get('use_delta_delta') else 0)

num_classes = len(collection_to_id)
model = BestCNN(num_classes, in_channels=IN_CHANNELS).to(device) if num_classes>0 else None
if model:
    total_params = sum(p.numel() for p in model.parameters())
    print(f'Model params: {total_params} | in_channels={IN_CHANNELS}')
else:
    print('No model instantiated (no data).')

Model params: 94083 | in_channels=3


In [12]:
# 12. 学習ユーティリティ (EarlyStopping / Checkpoint)
class EarlyStopping:
    def __init__(self, patience=20, min_delta=0.1):  # min_delta は精度(%)差分
        self.patience = patience
        self.min_delta = min_delta
        self.best = -float('inf')
        self.wait = 0
        self.stopped = False
    def step(self, metric):  # metric は精度%
        if metric > self.best + self.min_delta:
            self.best = metric
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                self.stopped = True
        return self.stopped

def save_checkpoint(model, path):
    torch.save(model.state_dict(), path)

def load_checkpoint(model, path, map_location=None, strict=True):
    sd = torch.load(path, map_location=map_location)
    model.load_state_dict(sd, strict=strict)
    return model

In [21]:
# 13. 統合トレーニング関数 run_training()  (GPU 使用状況ログ追加)

def run_training(model, train_loader, test_loader, *, epochs=80, lr=1e-3, grad_clip=5.0,
                 patience=40, min_delta=0.1, log_interval=None, gpu_monitor_interval=100):
    if model is None or train_loader is None or test_loader is None:
        print('[Train] Missing model or loaders')
        return None
    use_amp = torch.cuda.is_available()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=4)
    try:
        scaler = torch.amp.GradScaler('cuda' if use_amp else 'cpu', enabled=use_amp)
    except TypeError:
        scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    early = EarlyStopping(patience=patience, min_delta=min_delta)
    history = {
        'train_loss': [], 'test_loss': [],
        'train_acc': [], 'test_acc': [],
        'best_epoch': 0, 'best_acc': 0.0
    }
    best_path = 'best_ainu_model.pth'

    def _gpu_stats(tag=""):
        if torch.cuda.is_available():
            torch.cuda.synchronize()
            alloc = torch.cuda.memory_allocated() / (1024**2)
            reserved = torch.cuda.memory_reserved() / (1024**2)
            return f"GPU[{torch.cuda.current_device()}]{tag} mem_alloc={alloc:.1f}MB mem_reserved={reserved:.1f}MB"
        return 'GPU not available'

    print('[Train] use_amp =', use_amp, '| device =', device)
    if torch.cuda.is_available():
        print('[Train] CUDA name:', torch.cuda.get_device_name(0))
        print('[Train]', _gpu_stats('(start)'))

    for ep in range(1, epochs+1):
        t0 = time.time()
        model.train()
        batch_losses = []
        correct = 0
        total = 0
        for bi, batch in enumerate(train_loader, start=1):
            x = batch['mel'].to(device, non_blocking=True)
            y = batch['label'].to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            with torch.amp.autocast(device_type='cuda', enabled=use_amp):
                logits = model(x)
                loss = F.cross_entropy(logits, y)
            scaler.scale(loss).backward()
            if grad_clip is not None:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(optimizer)
            scaler.update()
            batch_losses.append(loss.item())
            preds = logits.argmax(1)
            correct += (preds == y).sum().item()
            total += y.size(0)

            if log_interval and (bi % log_interval == 0 or bi == 1):
                gpu_msg = ''
                if torch.cuda.is_available():
                    gpu_msg = ' | ' + _gpu_stats()
                print(f'[Ep {ep}][{bi}/{len(train_loader)}] loss={loss.item():.4f} acc={(correct/total)*100:.2f}%{gpu_msg}')
            elif gpu_monitor_interval and torch.cuda.is_available() and (bi % gpu_monitor_interval == 0):
                # 軽量 GPU 状態表示 (log_interval が設定されていない場合でも)
                print(f'[GPU Monitor] Ep {ep} Batch {bi}:', _gpu_stats())

        epoch_train_loss = float(np.mean(batch_losses)) if batch_losses else 0.0
        epoch_train_acc = 100.0 * correct / max(1,total)

        # Eval
        model.eval()
        test_losses = []
        all_preds, all_y = [], []
        with torch.no_grad():
            for batch in test_loader:
                x = batch['mel'].to(device, non_blocking=True)
                y = batch['label'].to(device, non_blocking=True)
                with torch.amp.autocast(device_type='cuda', enabled=use_amp):
                    logits = model(x)
                    loss = F.cross_entropy(logits, y)
                test_losses.append(loss.item())
                preds = logits.argmax(1).cpu().numpy()
                all_preds.extend(preds)
                all_y.extend(y.cpu().numpy())
        epoch_test_loss = float(np.mean(test_losses)) if test_losses else 0.0
        epoch_test_acc = 100.0 * accuracy_score(all_y, all_preds) if all_y else 0.0
        acc_fraction = epoch_test_acc / 100.0
        scheduler.step(acc_fraction)

        history['train_loss'].append(epoch_train_loss)
        history['test_loss'].append(epoch_test_loss)
        history['train_acc'].append(epoch_train_acc)
        history['test_acc'].append(epoch_test_acc)

        elapsed = time.time() - t0
        gpu_ep_msg = ''
        if torch.cuda.is_available():
            gpu_ep_msg = ' | ' + _gpu_stats('(epoch end)')
        print(f'Epoch {ep:02d} | {elapsed/60:.2f}m | train_loss={epoch_train_loss:.4f} test_loss={epoch_test_loss:.4f} train_acc={epoch_train_acc:.2f}% test_acc={epoch_test_acc:.2f}% lr={optimizer.param_groups[0]['lr']:.2e}{gpu_ep_msg}')

        improved = epoch_test_acc > history['best_acc'] + min_delta
        if improved:
            history['best_acc'] = epoch_test_acc
            history['best_epoch'] = ep
            save_checkpoint(model, best_path)
            print(f'  * New best model saved ({epoch_test_acc:.2f}%)')
            early.wait = 0
        else:
            early.wait += 1
        if early.wait >= early.patience:
            print(f'[EarlyStopping] Stop at epoch {ep}')
            break

    print(f'Best accuracy {history['best_acc']:.2f}% at epoch {history['best_epoch']} (path={best_path})')
    if torch.cuda.is_available():
        print('[Train] Final GPU state:', _gpu_stats('(final)'))
    return history, best_path

In [14]:
# 14. 評価 & 指標算出

def evaluate(model, loader):
    model.eval()
    use_amp = torch.cuda.is_available()
    losses = []
    all_y, all_preds = [], []
    with torch.no_grad():
        for batch in loader:
            x = batch['mel'].to(device, non_blocking=True)
            y = batch['label'].to(device, non_blocking=True)
            with torch.amp.autocast(device_type='cuda', enabled=use_amp):
                logits = model(x)
                loss = F.cross_entropy(logits, y)
            losses.append(loss.item())
            preds = logits.argmax(1).cpu().numpy()
            all_preds.extend(preds)
            all_y.extend(y.cpu().numpy())
    avg_loss = float(np.mean(losses)) if losses else 0.0
    acc = 100.0 * accuracy_score(all_y, all_preds) if all_y else 0.0
    return {
        'loss': avg_loss,
        'accuracy': acc,
        'y_true': all_y,
        'y_pred': all_preds
    }

def compute_class_stats(eval_dict, collection_to_id):
    idx_to_collection = {v:k for k,v in collection_to_id.items()}
    y_true = np.array(eval_dict['y_true'])
    y_pred = np.array(eval_dict['y_pred'])
    labels = sorted(collection_to_id.values())
    cm = confusion_matrix(y_true, y_pred, labels=labels) if len(y_true)>0 else None
    report = classification_report(y_true, y_pred, labels=labels,
                                   target_names=[idx_to_collection[l] for l in labels], digits=2,
                                   zero_division=0) if len(y_true)>0 else ''
    per_class_acc = {}
    if len(y_true)>0:
        for l in labels:
            mask = (y_true == l)
            correct = (y_pred[mask] == l).sum()
            total = mask.sum()
            per_class_acc[l] = 100.0 * correct / total if total>0 else 0.0
    return {
        'confusion_matrix': cm.tolist() if cm is not None else None,
        'classification_report': report,
        'per_class_accuracy': per_class_acc,
        'idx_to_collection': idx_to_collection
    }

In [15]:
# 15. 学習履歴可視化

def plot_history(history, class_stats=None, save_prefix='ainu_cnn'):
    epochs_range = range(1, len(history['train_loss'])+1)
    fig, axes = plt.subplots(2,2, figsize=(14,9))
    fig.suptitle('Ainu Collection Identification Training', fontsize=16, fontweight='bold')
    # Loss
    axes[0,0].plot(epochs_range, history['train_loss'], label='Train Loss', linestyle='--')
    axes[0,0].plot(epochs_range, history['test_loss'], label='Test Loss')
    axes[0,0].set_title('Loss'); axes[0,0].legend(); axes[0,0].grid(alpha=0.3)
    # Accuracy
    axes[0,1].plot(epochs_range, history['train_acc'], label='Train Acc', linestyle='--')
    axes[0,1].plot(epochs_range, history['test_acc'], label='Test Acc')
    axes[0,1].axvline(history['best_epoch'], color='red', linestyle=':', label='Best')
    axes[0,1].set_title('Accuracy (%)'); axes[0,1].legend(); axes[0,1].grid(alpha=0.3)
    # Test accuracy progression
    axes[1,0].plot(epochs_range, history['test_acc'], marker='o', markersize=4)
    axes[1,0].set_title('Test Accuracy per Epoch'); axes[1,0].grid(alpha=0.3)
    # Per-class accuracy bar
    if class_stats and class_stats.get('per_class_accuracy'):
        pc = class_stats['per_class_accuracy']
        idx_to_collection = class_stats['idx_to_collection']
        keys = sorted(pc.keys())
        vals = [pc[k] for k in keys]
        axes[1,1].bar(range(len(vals)), vals)
        axes[1,1].set_xticks(range(len(vals)))
        axes[1,1].set_xticklabels([idx_to_collection[k] for k in keys], rotation=45, ha='right')
        axes[1,1].set_title('Per-Class Accuracy')
        axes[1,1].grid(alpha=0.3)
    else:
        axes[1,1].text(0.5,0.5,'No class stats', ha='center', va='center')
    plt.tight_layout()
    fig.savefig(f'{save_prefix}_training_curves.png', dpi=300, bbox_inches='tight')
    plt.show()
    print(f'[Plot] Saved {save_prefix}_training_curves.png')

In [16]:
# 16. 推論ヘルパー (単一 / バッチ)
IDX_TO_COLLECTION = {v:k for k,v in collection_to_id.items()}

def load_best_model(path='best_ainu_model.pth'):
    if not os.path.exists(path) or model is None:
        print('[WARN] best model file not found')
        return None
    m = BestCNN(num_classes, in_channels=IN_CHANNELS)
    m.load_state_dict(torch.load(path, map_location=device))
    m.to(device).eval()
    return m

def _inference_preprocess(y, sr):
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=1024, hop_length=256, n_mels=TARGET_N_MELS)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    if mel_db.shape[1] > TARGET_FRAMES:
        mel_db = mel_db[:, :TARGET_FRAMES]
    elif mel_db.shape[1] < TARGET_FRAMES:
        mel_db = np.pad(mel_db, ((0,0),(0, TARGET_FRAMES - mel_db.shape[1])), mode='constant')
    # 学習時 random_crop_frames < TARGET_FRAMES の場合でも inference では中央切り出しを使うことで整合性
    mel_db = _center_crop_eval(mel_db)
    mel_db = _apply_global_norm(mel_db)
    if IMPROVE_CFG.get('enable') and IMPROVE_CFG.get('use_delta'):
        delta = librosa.feature.delta(mel_db)
        chans = [mel_db, delta]
        if IMPROVE_CFG.get('use_delta_delta'):
            delta2 = librosa.feature.delta(mel_db, order=2)
            chans.append(delta2)
        arr = np.stack(chans, axis=0)
    else:
        arr = mel_db[None, ...]
    return arr.astype(np.float32)

def predict_collection(audio_path: str, model_path='best_ainu_model.pth'):
    m = load_best_model(model_path)
    if m is None:
        return None
    try:
        y, sr = librosa.load(audio_path, sr=TARGET_SR, mono=True)
        arr = _inference_preprocess(y, sr)
        x = torch.from_numpy(arr).unsqueeze(0).to(device)
        with torch.no_grad():
            logits = m(x)
            probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
            pred_id = int(np.argmax(probs))
        return {
            'prediction': IDX_TO_COLLECTION.get(pred_id, 'UNKNOWN'),
            'probs': {IDX_TO_COLLECTION.get(i, str(i)): float(p) for i,p in enumerate(probs)}
        }
    except Exception as e:
        print('[Inference Error]', e)
        return None

def batch_predict(file_paths, model_path='best_ainu_model.pth'):
    m = load_best_model(model_path)
    if m is None:
        return []
    outputs = []
    with torch.no_grad():
        for fp in file_paths:
            try:
                y, sr = librosa.load(fp, sr=TARGET_SR, mono=True)
                arr = _inference_preprocess(y, sr)
                x = torch.from_numpy(arr).unsqueeze(0).to(device)
                logits = m(x)
                probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
                pred_id = int(np.argmax(probs))
                outputs.append({'file': fp, 'prediction': IDX_TO_COLLECTION.get(pred_id, 'UNKNOWN'), 'probs': probs.tolist()})
            except Exception as e:
                outputs.append({'file': fp, 'error': str(e)})
    return outputs

In [17]:
# 17. 性能 & I/O 診断
if train_files:
    # 個別サンプル取得時間
    N = min(48, len(train_files))
    t0 = time.time()
    for i in range(N):
        _ = wav_to_mel_tensor(train_files[i]['file_path'], train_mode=True)
    dt = time.time() - t0
    print(f'[Perf] {N} single feature builds: {dt:.2f}s -> {N/max(dt,1e-6):.1f} samples/s')
    # DataLoader バッチ計測
    if 'train_loader' in globals() and train_loader is not None:
        K = min(40, len(train_loader))
        t1 = time.time()
        it = iter(train_loader)
        for k in range(K):
            b = next(it)
            _x = b['mel'].to(device, non_blocking=True)
            _y = b['label'].to(device, non_blocking=True)
        dtb = time.time() - t1
        print(f'[Perf] {K} loader batches: {dtb:.2f}s -> {K/max(dtb,1e-6):.2f} batches/s')
print(f'[Cache] Hits={CACHE_HITS} Misses={CACHE_MISSES} Resized={RESIZED_CACHED} FailedMel={len(FAILED_MEL_FILES)}')

[Perf] 48 single feature builds: 1.20s -> 39.8 samples/s
[Perf] 40 loader batches: 69.88s -> 0.57 batches/s
[Cache] Hits=2187 Misses=0 Resized=0 FailedMel=0


In [18]:
# 18. 再現性 & 実験記録 (config + metrics JSON 保存)
import datetime

def save_experiment_record(history, class_stats, eval_dict, config_extra=None, path_prefix='ainu_experiment'):
    ts = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
    record = {
        'timestamp': ts,
        'history': history,
        'class_stats': class_stats,
        'evaluation': eval_dict,
        'CACHE_CFG': CACHE_CFG,
        'IMPROVE_CFG': IMPROVE_CFG,
        'DATA_USAGE_CFG': DATA_USAGE_CFG,
        'DATA_LOADER_CFG': DATA_LOADER_CFG,
        'num_classes': num_classes,
        'collections': list(collection_to_id.keys())
    }
    if config_extra:
        record['extra'] = config_extra
    out_path = f'{path_prefix}_{ts}.json'
    with open(out_path, 'w') as f:
        json.dump(record, f, indent=2)
    print('[Record] Saved experiment JSON ->', out_path)
    return out_path

In [1]:
# 19. オプション: main() パイプライン (セルを実行すると学習〜評価まで自動)
RUN_MAIN = True  # True にすると下セル実行時 main() 呼び出し

def main():
    if model is None or train_loader is None or test_loader is None:
        print('[Main] Model or loaders missing; abort')
        return
    print('--- Start Training ---')
    history, best_path = run_training(
        model, train_loader, test_loader,
        epochs=80, lr=1e-3, grad_clip=5.0,
        patience=40, min_delta=0.1,
        log_interval=DATA_LOADER_CFG.get('log_interval')
    )
    print('--- Evaluate Best Model ---')
    best_model = BestCNN(num_classes, in_channels=IN_CHANNELS).to(device)
    load_checkpoint(best_model, best_path, map_location=device)
    eval_dict = evaluate(best_model, test_loader)
    class_stats = compute_class_stats(eval_dict, collection_to_id)
    plot_history(history, class_stats)
    save_experiment_record(history, class_stats, eval_dict)
    print('Prediction example (first test file):')
    if test_files:
        ex = predict_collection(test_files[0]['file_path'], model_path=best_path)
        print(ex)
    return history, eval_dict, class_stats

if RUN_MAIN:
    _ = main()

NameError: name 'model' is not defined

In [22]:
# GPU 使用確認ユーティリティ (手動チェック用)
if torch.cuda.is_available():
    print('[GPU Check] Device count:', torch.cuda.device_count())
    print('[GPU Check] Current device:', torch.cuda.current_device())
    print('[GPU Check] Name:', torch.cuda.get_device_name(0))
    print('[GPU Check] Memory Allocated (MB):', torch.cuda.memory_allocated()/(1024**2))
    print('[GPU Check] Memory Reserved  (MB):', torch.cuda.memory_reserved()/(1024**2))
else:
    print('[GPU Check] CUDA not available. Falling back to CPU.')

[GPU Check] Device count: 1
[GPU Check] Current device: 0
[GPU Check] Name: NVIDIA GeForce RTX 4070 SUPER
[GPU Check] Memory Allocated (MB): 36.453125
[GPU Check] Memory Reserved  (MB): 1550.0
