In [11]:
# pip install torch torchaudio librosa pandas
import os, torch, librosa, numpy as np, pandas as pd
import torchaudio as ta
from torch.utils.data import Dataset, DataLoader
from torch import nn

# --- Configuration ----------------------------------------------------------
# NOTE(review): CSV_PATH and WAV_DIR are absolute, machine-specific paths; edit
# them (or derive them from a configurable data directory) before running on
# another machine.
CSV_PATH = "C:\\Users\\ipl1\\Desktop\\IPL\\세미나(25하계)\\data\\sound\\esc50.csv"
WAV_DIR  = "C:\\Users\\ipl1\\Desktop\\IPL\\세미나(25하계)\\data\\sound\\audio\\audio"   # folder containing the wav files
DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"

# Fixed label mapping (ESC-10): class id -> category name, plus its inverse.
df_label = {0:'dog', 1:'chainsaw', 2:'crackling_fire', 3:'helicopter', 4:'rain',
            5:'crying_baby', 6:'clock_tick', 7:'sneezing', 8:'rooster', 9:'sea_waves'}
label2id = {name: idx for idx, name in df_label.items()}

# --- Prepare the dataframe (ESC-10 rows only) --------------------------------
df = pd.read_csv(CSV_PATH)
df = df[df["esc10"].eq(True)].reset_index(drop=True)
if df.empty:
    raise ValueError(f"No rows with esc10 == True found in {CSV_PATH!r}")

# Series.map leaves categories missing from label2id as NaN (and silently turns
# the column into float). Fail here with a clear message instead of much later
# inside a DataLoader when the label is converted with int().
df["y"] = df["category"].map(label2id)
if df["y"].isna().any():
    _unmapped = sorted(df.loc[df["y"].isna(), "category"].astype(str).unique())
    raise ValueError(f"Categories without an ESC-10 label id: {_unmapped}")
df["y"] = df["y"].astype("int64")

class ESC10CRNNDataset(Dataset):
    """Dataset of standardized log-mel spectrograms with integer class labels.

    For each row of ``frame`` the wav file ``<wav_dir>/<filename>`` is loaded,
    mixed down to mono, resampled to 16 kHz when its sample rate differs,
    zero-padded or truncated to 5 seconds, converted to a 128-bin log-mel
    spectrogram and standardized per mel bin along the time axis.

    Args:
        frame: DataFrame providing at least the ``filename`` and ``y`` columns.
        augment: If True, apply frequency and time masking (a simple
            SpecAugment) to the standardized spectrogram.
        wav_dir: Directory containing the wav files. When None (default) the
            module-level ``WAV_DIR`` is used, looked up at item-access time.
    """

    def __init__(self, frame, augment=False, wav_dir=None):
        self.frame = frame.reset_index(drop=True)
        self.sr = 16000                      # target sample rate (Hz)
        self.fixed_len = self.sr * 5         # every clip is forced to 5 s of samples
        self.augment = augment
        self.wav_dir = wav_dir

        self.mel = ta.transforms.MelSpectrogram(
            sample_rate=self.sr, n_fft=1024, hop_length=320, n_mels=128, center=True, power=2.0
        )
        self.to_db = ta.transforms.AmplitudeToDB(top_db=80)
        # Simple SpecAugment
        self.fmask = ta.transforms.FrequencyMasking(freq_mask_param=12)
        self.tmask = ta.transforms.TimeMasking(time_mask_param=24)

    def __len__(self):
        """Return the number of clips in the dataset."""
        return len(self.frame)

    def __getitem__(self, i):
        """Return ``(log_mel, label)`` for row ``i``.

        ``log_mel`` is the spectrogram with the leading channel dimension
        squeezed away, i.e. ``(n_mels, time)``; ``label`` is a Python int.
        """
        r = self.frame.iloc[i]
        base_dir = WAV_DIR if self.wav_dir is None else self.wav_dir
        wav_path = os.path.join(base_dir, r["filename"])

        y, sr = ta.load(wav_path)           # (ch, T)
        # Mix down before resampling: resampling is linear, so the result is the
        # same up to rounding, but only one channel has to be filtered.
        y = y.mean(0, keepdim=True)         # mono (1, T)
        if sr != self.sr:
            y = ta.functional.resample(y, sr, self.sr)

        # Force the clip to exactly 5 seconds: right-pad with zeros or truncate.
        if y.shape[1] < self.fixed_len:
            y = nn.functional.pad(y, (0, self.fixed_len - y.shape[1]))
        else:
            y = y[:, :self.fixed_len]

        # Log-mel
        mel = self.mel(y)                   # (1, n_mels, time)
        mel_db = self.to_db(mel)

        # Standardize each mel bin over the time axis (the original author notes
        # this empirically stabilized performance).
        m = mel_db.mean(dim=2, keepdim=True)
        s = mel_db.std(dim=2, keepdim=True).clamp_min(1e-6)
        mel_db = (mel_db - m) / s

        # Mask AFTER standardization. The masking transforms fill with 0.0 by
        # default; on the standardized spectrogram that is each bin's mean, so
        # masked regions are neutral. Masking the raw dB values first let the
        # 0 dB time-masked frames skew the per-bin mean/std and turned them into
        # bin-dependent non-zero values after standardization.
        if self.augment:
            mel_db = self.fmask(mel_db)
            mel_db = self.tmask(mel_db)

        return mel_db.squeeze(0), int(r["y"])   # (n_mels, time), label


In [12]:
class CRNN(nn.Module):
    """Convolutional-recurrent classifier for (n_mels, time) spectrograms.

    Three 3x3 conv/batch-norm/ReLU stages (the first two followed by 2x2 max
    pooling) feed a single-layer bidirectional LSTM that runs along the time
    axis; the LSTM outputs are averaged over time and classified by a small MLP.

    Args:
        n_mels: Number of mel bins of the input (frequency axis size).
        n_class: Number of output classes.
        rnn_hidden: Hidden size of each LSTM direction.
    """

    def __init__(self, n_mels=128, n_class=10, rnn_hidden=128):
        super().__init__()

        # Conv stack on (B, 1, n_mels, time); pooling follows every stage but the last.
        widths = (1, 32, 64, 128)
        n_stages = len(widths) - 1
        stack = []
        for stage in range(n_stages):
            c_in, c_out = widths[stage], widths[stage + 1]
            stack.append(nn.Conv2d(c_in, c_out, 3, padding=1))
            stack.append(nn.BatchNorm2d(c_out))
            stack.append(nn.ReLU())
            if stage < n_stages - 1:
                stack.append(nn.MaxPool2d((2, 2)))
        self.cnn = nn.Sequential(*stack)

        # The CNN yields (B, C, F, T); the LSTM consumes (B, T, C*F).
        # Two (2,2) poolings shrink the frequency axis to n_mels // 4.
        pooled_freq = n_mels // 4
        self.bi_lstm = nn.LSTM(
            input_size=widths[-1] * pooled_freq,
            hidden_size=rnn_hidden,
            num_layers=1,
            batch_first=True,
            bidirectional=True,
        )

        head_in = 2 * rnn_hidden  # forward + backward directions concatenated
        self.classifier = nn.Sequential(
            nn.Linear(head_in, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, n_class),
        )

    def forward(self, x):
        """Map a batch ``x`` of shape (B, n_mels, T) to logits of shape (B, n_class)."""
        feats = self.cnn(x.unsqueeze(1))          # (B, 1, M, T) -> (B, 128, M/4, T/4)
        batch, chans, freq, steps = feats.shape
        # Put time second and fold channels x frequency into one feature axis.
        seq = feats.permute(0, 3, 1, 2).reshape(batch, steps, chans * freq)
        rnn_out, _ = self.bi_lstm(seq)            # (B, T, 2*hidden)
        pooled = rnn_out.mean(dim=1)              # average pooling over time
        return self.classifier(pooled)


In [19]:
from torch.optim import Adam
from torch.utils.data import DataLoader

def run_fold(val_fold=1, epochs=25, bs=16, lr=1e-3):
    """Train a CRNN on every fold except ``val_fold`` and validate on ``val_fold``.

    After each epoch the validation accuracy is printed, and a snapshot of the
    weights from the best-scoring epoch is kept. That snapshot is written to
    ``crnn_esc10_fold{val_fold}.pth`` when training finishes.

    Note: the best epoch is picked on the same fold whose accuracy is returned,
    so the returned number is an optimistic estimate.

    Args:
        val_fold: Value of the ``fold`` column held out for validation.
        epochs: Number of training epochs.
        bs: Batch size for both loaders.
        lr: Adam learning rate.

    Returns:
        Best validation accuracy observed over the epochs (0.0 if ``epochs`` is 0).

    Raises:
        ValueError: If the training or validation split is empty.
    """
    train_df = df[df["fold"] != val_fold]
    val_df   = df[df["fold"] == val_fold]
    if len(train_df) == 0 or len(val_df) == 0:
        raise ValueError(
            f"fold {val_fold!r} yields an empty split "
            f"(train={len(train_df)}, val={len(val_df)})"
        )

    train_ds = ESC10CRNNDataset(train_df, augment=True)
    val_ds   = ESC10CRNNDataset(val_df, augment=False)
    # Pinned memory only helps host->GPU copies; on CPU it just emits a warning.
    pin = (DEVICE == "cuda")
    train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True, num_workers=0, pin_memory=pin)
    val_dl   = DataLoader(val_ds, batch_size=bs, shuffle=False, num_workers=0, pin_memory=pin)

    model = CRNN(n_mels=128, n_class=10, rnn_hidden=128).to(DEVICE)
    opt = Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    crit = nn.CrossEntropyLoss()

    best, best_state = 0.0, None
    for ep in range(1, epochs+1):
        model.train()
        for mel, y in train_dl:
            mel, y = mel.to(DEVICE), y.to(DEVICE)
            opt.zero_grad()
            loss = crit(model(mel), y)
            loss.backward()
            opt.step()

        # validate
        model.eval()
        correct = tot = 0
        with torch.no_grad():
            for mel, y in val_dl:
                mel, y = mel.to(DEVICE), y.to(DEVICE)
                pred = model(mel).argmax(1)
                correct += (pred == y).sum().item()
                tot += y.numel()
        acc = correct/tot
        print(f"[fold{val_fold}] epoch {ep}  val_acc={acc:.3f}")

        # Always take the first epoch so a snapshot exists even at 0.0 accuracy.
        if best_state is None or acc > best:
            best = acc
            # .cpu() returns the very same tensor when the model already lives on
            # the CPU, which would make the "best" snapshot track later updates.
            # clone() guarantees an independent copy on every device.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    # Nothing to save when no epoch ran (epochs <= 0).
    if best_state is not None:
        torch.save(best_state, f"crnn_esc10_fold{val_fold}.pth")
    print(f"Best(fold{val_fold})={best:.3f}")
    return best

In [20]:
# Example run: evaluate each of the five folds in turn (averaging over all
# 5 folds is recommended) and report the mean of the per-fold best accuracies.
scores = list(map(run_fold, range(1, 6)))
mean_acc = sum(scores)/len(scores)
print("mean acc:", mean_acc)

[fold1] epoch 1  val_acc=0.400
[fold1] epoch 2  val_acc=0.388
[fold1] epoch 3  val_acc=0.588
[fold1] epoch 4  val_acc=0.588
[fold1] epoch 5  val_acc=0.713
[fold1] epoch 6  val_acc=0.613
[fold1] epoch 7  val_acc=0.388
[fold1] epoch 8  val_acc=0.688
[fold1] epoch 9  val_acc=0.775
[fold1] epoch 10  val_acc=0.700
[fold1] epoch 11  val_acc=0.650
[fold1] epoch 12  val_acc=0.725
[fold1] epoch 13  val_acc=0.662
[fold1] epoch 14  val_acc=0.675
[fold1] epoch 15  val_acc=0.750
[fold1] epoch 16  val_acc=0.725
[fold1] epoch 17  val_acc=0.738
[fold1] epoch 18  val_acc=0.738
[fold1] epoch 19  val_acc=0.838
[fold1] epoch 20  val_acc=0.738
[fold1] epoch 21  val_acc=0.750
[fold1] epoch 22  val_acc=0.725
[fold1] epoch 23  val_acc=0.750
[fold1] epoch 24  val_acc=0.750
[fold1] epoch 25  val_acc=0.775
Best(fold1)=0.838
[fold2] epoch 1  val_acc=0.200
[fold2] epoch 2  val_acc=0.287
[fold2] epoch 3  val_acc=0.450
[fold2] epoch 4  val_acc=0.512
[fold2] epoch 5  val_acc=0.625
[fold2] epoch 6  val_acc=0.688
[fold