In [53]:
import os
import json
import random
import subprocess
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import librosa

from kws_dataloader import KWSLazyDataset, load_opus_ffmpeg, logmel, SEG_SEC, SR


In [40]:
TRAIN_DIR = Path("train_opus")
TEST_DIR  = Path("test_opus")

TRAIN_AUDIO = TRAIN_DIR / "audio"
TEST_AUDIO  = TEST_DIR  / "audio"


In [41]:
with open(TRAIN_DIR / "word_bounds.json", "r", encoding="utf-8") as f:
    bounds = json.load(f)

train_files = sorted(TRAIN_AUDIO.glob("*.opus"))
print("train_files:", len(train_files), "bounds:", len(bounds))


train_files: 90000 bounds: 45000


In [42]:
pos_ids = set(bounds.keys())

pos = []
neg = []

for p in tqdm(train_files):
    audio_id = p.stem
    if audio_id in pos_ids:
        s, e = bounds[audio_id]
        pos.append((str(p), 1, float(s), float(e)))
    else:
        neg.append((str(p), 0, None, None))

print("pos:", len(pos), "neg:", len(neg))


100%|██████████| 90000/90000 [00:00<00:00, 663114.76it/s]

pos: 45000 neg: 45000





In [43]:
NEG_RATIO = 1   # 1 позитив : 1 негатив
random.seed(42)

neg_sample = random.sample(neg, k=min(len(neg), len(pos) * NEG_RATIO))
examples = pos + neg_sample
random.shuffle(examples)

print("examples:", len(examples))


examples: 90000


In [44]:
print("train_files:", len(train_files))
print("pos from json:", len(pos))
print("neg total:", len(neg))
print("examples:", len(examples))
print("examples pos/neg:", sum(1 for _,lab,_,_ in examples if lab==1),
      sum(1 for _,lab,_,_ in examples if lab==0))


train_files: 90000
pos from json: 45000
neg total: 45000
examples: 90000
examples pos/neg: 45000 45000


In [45]:
from sklearn.model_selection import train_test_split

labels = np.array([lab for _, lab, _, _ in examples], dtype=np.int64)
idx = np.arange(len(examples))

tr_idx, va_idx = train_test_split(idx, test_size=0.2, random_state=42, stratify=labels)

train_ex = [examples[i] for i in tr_idx]
val_ex   = [examples[i] for i in va_idx]

print("train:", len(train_ex), "val:", len(val_ex))


train: 72000 val: 18000


In [46]:
BATCH_TRAIN = 64
BATCH_VAL = 128
NUM_WORKERS = 4

train_dl = DataLoader(KWSLazyDataset(train_ex), batch_size=BATCH_TRAIN, shuffle=True,
                      num_workers=NUM_WORKERS, pin_memory=True)
val_dl   = DataLoader(KWSLazyDataset(val_ex), batch_size=BATCH_VAL, shuffle=False,
                      num_workers=NUM_WORKERS, pin_memory=True)


In [None]:
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.feat = nn.Sequential(
            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Conv2d(16, 32, 3, padding=1), nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.head = nn.Linear(64, 1)

    def forward(self, x):
        z = self.feat(x).flatten(1)
        return self.head(z).squeeze(1)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CNN().to(device)

opt = torch.optim.Adam(model.parameters(), lr=1e-3)
crit = nn.BCEWithLogitsLoss()

device


'cpu'

In [48]:
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    else "cpu"
)
print("device:", device)
model = model.to(device)


device: mps


In [49]:
def score_metric(y_true, y_pred):
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    pos = y_true == 1
    neg = y_true == 0
    NUM_POS = pos.sum()
    NUM_NEG = neg.sum()

    FN = np.sum((y_true == 1) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))

    FRR = FN / max(1, NUM_POS)
    FAR = FP / max(1, NUM_NEG)

    a = 1.0 - FRR
    b = 1.0 - FAR
    return 2*a*b / max(1e-12, (a + b))

@torch.no_grad()
def eval_probs(dl):
    model.eval()
    probs, ys = [], []
    for x, yb in dl:
        x = x.to(device)
        p = torch.sigmoid(model(x)).cpu().numpy()
        probs.append(p)
        ys.append(yb.numpy())
    return np.concatenate(probs), np.concatenate(ys)

def find_best_threshold(y_true, probs):
    ts = np.linspace(0.05, 0.95, 91)
    best_s, best_t = -1.0, 0.5
    for t in ts:
        pred = (probs >= t).astype(int)
        s = score_metric(y_true, pred)
        if s > best_s:
            best_s, best_t = s, t
    return best_s, best_t


In [52]:
EPOCHS = 5

for epoch in range(1, EPOCHS + 1):
    model.train()
    running = 0.0
    n = 0

    for x, yb in tqdm(train_dl, desc=f"epoch {epoch}"):
        x = x.to(device)
        yb = yb.to(device)

        opt.zero_grad()
        logits = model(x)
        loss = crit(logits, yb)
        loss.backward()
        opt.step()

        running += float(loss.item()) * x.size(0)
        n += x.size(0)

    val_probs, val_y = eval_probs(val_dl)
    best_s, best_t = find_best_threshold(val_y, val_probs)
    print(f"epoch {epoch}: train_loss={running/n:.4f} val_score={best_s:.4f} best_thr={best_t:.2f}")


epoch 1: 100%|██████████| 1125/1125 [14:43<00:00,  1.27it/s]


epoch 1: train_loss=0.6255 val_score=0.7375 best_thr=0.54


epoch 2: 100%|██████████| 1125/1125 [14:45<00:00,  1.27it/s]


epoch 2: train_loss=0.5183 val_score=0.7861 best_thr=0.42


epoch 3: 100%|██████████| 1125/1125 [14:52<00:00,  1.26it/s]


epoch 3: train_loss=0.4603 val_score=0.8082 best_thr=0.47


epoch 4: 100%|██████████| 1125/1125 [14:45<00:00,  1.27it/s]


epoch 4: train_loss=0.4368 val_score=0.8184 best_thr=0.41


epoch 5: 100%|██████████| 1125/1125 [14:43<00:00,  1.27it/s]


epoch 5: train_loss=0.4146 val_score=0.8255 best_thr=0.54


In [54]:
def sliding_windows(y: np.ndarray, sr: int, seg_sec=SEG_SEC, hop_sec=0.3):
    seg = int(seg_sec * sr)
    hop = int(hop_sec * sr)
    if len(y) <= seg:
        yield np.pad(y, (0, seg - len(y)))
        return
    for start in range(0, len(y) - seg + 1, hop):
        yield y[start:start+seg]

@torch.no_grad()
def predict_record(path: str, thr: float, hop_sec=0.3):
    y = load_opus_ffmpeg(path, sr=SR)
    feats = []
    for seg in sliding_windows(y, SR, seg_sec=SEG_SEC, hop_sec=hop_sec):
        feats.append(logmel(seg, sr=SR))
    Xw = torch.from_numpy(np.stack(feats)).unsqueeze(1).to(device)  # [K,1,80,T]
    probs = torch.sigmoid(model(Xw)).cpu().numpy()
    p = float(probs.max())
    return int(p >= thr), p


In [55]:
test_files = sorted(TEST_AUDIO.glob("*.opus"))
print("test_files:", len(test_files))

val_probs, val_y = eval_probs(val_dl)
best_s, best_thr = find_best_threshold(val_y, val_probs)
print("best_val_score:", best_s, "thr:", best_thr)

rows = []
for p in tqdm(test_files):
    label, prob = predict_record(str(p), thr=best_thr, hop_sec=0.3)
    rows.append((p.stem, label))

sub = pd.DataFrame(rows, columns=["id", "label"])
sub.to_csv("submission.csv", index=False)
sub.head()


test_files: 27000




best_val_score: 0.8267114033910438 thr: 0.49999999999999994


100%|██████████| 27000/27000 [25:48<00:00, 17.44it/s]


Unnamed: 0,id,label
0,0000219778122723066859323624505982384475,1
1,0000920560142346477464477964040846645823,1
2,0002106775361063830068199242310438122126,1
3,0002161736146841817059430282255903999813,0
4,0002303832386140303186933286284938192307,1
