In [None]:
import os
import re
import requests
import subprocess
from pathlib import Path
from tqdm import tqdm


def get_direct_file_link(mailru_file_url: str, timeout: float = 30.0) -> str:
    """
    Turn a public Mail.ru Cloud link of the form
        https://cloud.mail.ru/public/<key>/<subkey>/<filename>
    into a direct CDN link that can be downloaded with wget or requests.

    The public page is fetched and its HTML is searched for the ``weblink_get``
    dispatcher entry; the URL found there is used as the CDN base, and the last
    three path segments of the public link are appended to it.

    Args:
        mailru_file_url: public link to the file.
        timeout: seconds to wait for the server before giving up
            (``requests`` waits indefinitely when no timeout is given).

    Returns:
        The direct download URL.

    Raises:
        RuntimeError: if the page does not answer with HTTP 200 or the CDN
            base URL cannot be found in the returned HTML.
    """
    resp = requests.get(mailru_file_url, timeout=timeout)
    if resp.status_code != 200:
        raise RuntimeError(f"Ошибка {resp.status_code} при запросе {mailru_file_url}")

    match = re.search(r'dispatcher.*?weblink_get.*?url":"(.*?)"', resp.text)
    if not match:
        raise RuntimeError("Не удалось найти CDN ссылку в HTML Mail.ru")

    # Avoid a double slash if the dispatcher URL already ends with one.
    base_url = match.group(1).rstrip("/")

    # Drop any query string / fragment so it does not leak into the file name.
    clean_url = mailru_file_url.split("#", 1)[0].split("?", 1)[0]
    parts = clean_url.strip("/").split("/")[-3:]
    return f"{base_url}/{parts[0]}/{parts[1]}/{parts[2]}"


def download_from_mailru(file_url: str, local_name: str, force: bool = False,
                         show_progress: bool = True, timeout: float = 60.0):
    """
    Download a file from Mail.ru Cloud by its public link.

    The payload is streamed into a temporary ``<local_name>.part`` file and only
    renamed to ``local_name`` once the transfer has finished. An interrupted or
    failed download therefore never leaves a truncated file that a later call
    would mistake for a complete one and skip.

    Args:
        file_url: public link to the file in Mail.ru Cloud.
        local_name: name of the file to save to.
        force: if True, download again even when the file already exists.
        show_progress: whether to show a progress bar.
        timeout: seconds to wait for the CDN to connect / send data.

    Raises:
        RuntimeError: propagated from ``get_direct_file_link``.
        requests.HTTPError: if the CDN answers with an error status.
    """
    local_path = Path(local_name)
    if local_path.exists() and not force:
        print(f"Файл {local_name} уже существует, пропускаем скачивание.")
        return

    direct = get_direct_file_link(file_url)
    print(f"Скачиваем {file_url} → {local_name}")

    tmp_path = local_path.with_name(local_path.name + ".part")
    try:
        with requests.get(direct, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            total_size = int(r.headers.get("content-length", 0))
            block_size = 8192
            with open(tmp_path, "wb") as f, tqdm(
                total=total_size or None,  # None = unknown size
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=f"Downloading {local_name}",
                disable=not show_progress,
            ) as bar:
                for chunk in r.iter_content(block_size):
                    if not chunk:
                        continue
                    f.write(chunk)
                    bar.update(len(chunk))
        # Atomic on the same filesystem: the final name appears only when complete.
        os.replace(tmp_path, local_path)
    except BaseException:
        # Also covers KeyboardInterrupt (interrupting the cell): remove the partial file.
        if tmp_path.exists():
            tmp_path.unlink()
        raise

    print(f"Файл {local_name} успешно скачан ({os.path.getsize(local_name)/1e6:.1f} MB).")

In [None]:
# Links to the task data (public cloud.mail.ru shares)
train_link = "https://cloud.mail.ru/public/Gsyr/8VxmbhAaZ/train_data.tar"
test_link  = "https://cloud.mail.ru/public/Gsyr/8VxmbhAaZ/test_data.tar"

In [None]:
# If the download is slow, the CDN may be the cause.
# Try re-running the cell: a new connection may land on a different CDN node,
# and the download usually goes faster (2-3 minutes on a normal node).
download_from_mailru(train_link, "train_data.tar")
download_from_mailru(test_link, "test_data.tar")

In [None]:
# Unpack both archives into the current working directory.
# check=True makes a failing `tar` raise instead of being ignored.
for archive in ("train_data.tar", "test_data.tar"):
    subprocess.run(["tar", "xf", archive], check=True)
print("Готово.")

Готово.


In [None]:
folder_path = '/content/test_opus/audio'
files_before = len([f for f in Path(folder_path).rglob('*') if f.is_file()])
print(f"Файлов до удаления: {files_before}")

# Delete every regular file whose name starts with "._" (presumably macOS
# AppleDouble sidecars that came along in the archive — TODO confirm).
# Done in pure Python instead of `!find ... -delete`, so it needs no shell and
# no unquoted path interpolation. The matches are materialised first so that
# files are not removed while the directory tree is still being walked.
for sidecar in list(Path(folder_path).rglob('._*')):
    if sidecar.is_file() and not sidecar.is_symlink():
        sidecar.unlink()

files_after = len([f for f in Path(folder_path).rglob('*') if f.is_file()])
deleted = files_before - files_after

print(f"Удалено файлов: {deleted}")
print(f"Файлов после удаления: {files_after}")

In [None]:
folder_path = '/content/train_opus/audio'
files_before = len([f for f in Path(folder_path).rglob('*') if f.is_file()])
print(f"Файлов до удаления: {files_before}")

# Delete every regular file whose name starts with "._" (presumably macOS
# AppleDouble sidecars that came along in the archive — TODO confirm).
# Done in pure Python instead of `!find ... -delete`, so it needs no shell and
# no unquoted path interpolation. The matches are materialised first so that
# files are not removed while the directory tree is still being walked.
for sidecar in list(Path(folder_path).rglob('._*')):
    if sidecar.is_file() and not sidecar.is_symlink():
        sidecar.unlink()

files_after = len([f for f in Path(folder_path).rglob('*') if f.is_file()])
deleted = files_before - files_after

print(f"Удалено файлов: {deleted}")
print(f"Файлов после удаления: {files_after}")

## Imports & pipeline setup

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os
import gc
import json
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import hmean

import torchaudio
from transformers import AutoProcessor
from transformers import AutoModelForSpeechSeq2Seq

from tqdm.notebook import tqdm

!pip -q install torch-audiomentations
from torch_audiomentations import Compose, Gain, PolarityInversion, AddColoredNoise, Shift, HighPassFilter, LowPassFilter
import librosa

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.6/59.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for julius (setup.py) ... [?25l[?25hdone


In [None]:
def set_seed(seed):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (CPU and
    all CUDA devices), then switches cuDNN to deterministic mode with
    benchmarking turned off.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


SEED = 2008
set_seed(SEED)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

## Data preparation

In [None]:
# Dataset locations (absolute paths under /content): the two audio folders and
# the JSON file that is later used to decide which training ids are positive.
train_audio_path = '/content/train_opus/audio'
test_audio_path = '/content/test_opus/audio'
word_bounds_path = '/content/train_opus/word_bounds.json'

In [None]:
# Load the annotations. Below, an id that appears as a key in this mapping is
# labelled positive (1); every other recording is labelled negative (0).
with open(word_bounds_path, encoding='utf-8') as bounds_file:
    word_bounds = json.load(bounds_file)

# Keep only .opus files and strip just the extension, so names containing extra
# dots map back to the right path. Sorting makes the seeded split below
# reproducible, because os.listdir() returns entries in arbitrary order.
train_files = sorted(f for f in os.listdir(train_audio_path) if f.endswith('.opus'))

train_data = pd.DataFrame({'id': [os.path.splitext(f)[0] for f in train_files]})
train_data['label'] = train_data['id'].apply(lambda x: 1 if x in word_bounds else 0)
train_data['path'] = train_data['id'].apply(lambda x: f'{train_audio_path}/{x}.opus')

pos_items = train_data[train_data['label'] == 1][['id', 'path']].values.tolist()
neg_items = train_data[train_data['label'] == 0][['id', 'path']].values.tolist()

# Split each class separately so both subsets keep the same class ratio.
pos_train, pos_val = train_test_split(pos_items, test_size=0.12, random_state=SEED)
neg_train, neg_val = train_test_split(neg_items, test_size=0.12, random_state=SEED)

# Keep the "positives first, then negatives" order. The item lists are NOT
# shuffled here: a label list built by position (ones for the positives
# followed by zeros for the negatives) must stay aligned with the items, and an
# in-place shuffle of the items alone would pair audio with wrong labels.
# Per-epoch shuffling of training samples is the DataLoader's job.
train_items = pos_train + neg_train
val_items = pos_val + neg_val

print(f'Train: {len(train_items)} (pos: {len(pos_train)}, neg: {len(neg_train)})')
print(f'Val: {len(val_items)} (pos: {len(pos_val)}, neg: {len(neg_val)})')

## Augmentations

In [None]:
class AudioAugmentation:
    """Random waveform augmentation for a single 1-D audio signal.

    ``__call__`` first runs a ``torch_audiomentations`` chain (gain, polarity
    inversion, coloured noise, circular shift, high-/low-pass filtering). When
    ``use_heavy_aug`` is set it then applies, each with its own probability,
    librosa time stretching, librosa pitch shifting and additive Gaussian noise.
    The result is always a ``float32`` NumPy array.

    Args:
        sample_rate: sampling rate of the waveforms, in Hz.
        use_heavy_aug: whether to run the librosa / noise stage as well.
    """

    def __init__(self, sample_rate=16000, use_heavy_aug=True):
        self.sample_rate = sample_rate
        self.use_heavy_aug = use_heavy_aug

        self.augment = Compose(
            transforms=[
                Gain(min_gain_in_db=-12.0, max_gain_in_db=8.0, p=0.6),
                PolarityInversion(p=0.5),
                AddColoredNoise(min_snr_in_db=10.0, max_snr_in_db=35.0, min_f_decay=-2.0, max_f_decay=2.0, p=0.6),
                Shift(min_shift=-0.15, max_shift=0.15, shift_unit="fraction", rollover=True, p=0.5),
                HighPassFilter(min_cutoff_freq=20.0, max_cutoff_freq=400.0, p=0.35),
                LowPassFilter(min_cutoff_freq=2000.0, max_cutoff_freq=7500.0, p=0.35),
            ]
        )

    @staticmethod
    def _to_numpy(wav):
        """Return ``wav`` as a NumPy array; tensors are moved to the CPU first."""
        return wav.cpu().numpy() if isinstance(wav, torch.Tensor) else wav

    def apply_time_stretch(self, wav, rate=0.15):
        """With probability 0.6, time-stretch ``wav`` by a factor in ``[1-rate, 1+rate]``.

        The output is cut or zero-padded back to the input length. If the
        stretch fails, the unmodified signal is returned (best effort).
        """
        wav_np = self._to_numpy(wav)
        if random.random() > 0.4:
            try:
                stretch_factor = random.uniform(1.0 - rate, 1.0 + rate)
                stretched = librosa.effects.time_stretch(wav_np, rate=stretch_factor)
                if len(stretched) > len(wav_np):
                    stretched = stretched[:len(wav_np)]
                else:
                    stretched = np.pad(stretched, (0, max(0, len(wav_np) - len(stretched))))
                return stretched
            except Exception:
                # `Exception`, not a bare except: interrupting the kernel must still work.
                return wav_np
        return wav_np

    def apply_pitch_shift(self, wav, n_steps_range=(-2, 2)):
        """With probability 0.5, shift the pitch by a random number of semitones.

        Shifts smaller than 0.1 semitone are skipped. A NumPy array is returned
        on every path; if the shift fails, the unmodified signal is returned.
        """
        wav_np = self._to_numpy(wav)
        if random.random() > 0.5:
            try:
                n_steps = random.uniform(n_steps_range[0], n_steps_range[1])
                if abs(n_steps) < 0.1:
                    return wav_np
                return librosa.effects.pitch_shift(wav_np, sr=self.sample_rate, n_steps=n_steps)
            except Exception:
                return wav_np
        return wav_np

    def apply_background_noise(self, wav, noise_level_range=(0.005, 0.03)):
        """With probability 0.4, add Gaussian noise scaled relative to the signal's std.

        Near-silent signals (std below 1e-6) are returned unchanged.
        """
        wav_np = self._to_numpy(wav)
        if random.random() > 0.6:
            std = np.std(wav_np)
            if std < 1e-6:
                return wav_np
            noise_level = random.uniform(*noise_level_range)
            noise = np.random.randn(len(wav_np)) * noise_level * std
            return wav_np + noise
        return wav_np

    def __call__(self, wav):
        """Augment one 1-D waveform (NumPy array or tensor) and return float32 NumPy."""
        if isinstance(wav, np.ndarray):
            wav_torch = torch.from_numpy(wav).float()
        else:
            wav_torch = wav.float()

        # The Compose chain is called with two leading singleton dimensions
        # added in front of the sample axis, which are removed again afterwards.
        wav_torch = wav_torch.unsqueeze(0).unsqueeze(0)
        augmented = self.augment(wav_torch, sample_rate=self.sample_rate)
        augmented = augmented.squeeze(0).squeeze(0)
        augmented_np = augmented.cpu().numpy()

        if self.use_heavy_aug:
            augmented_np = self.apply_time_stretch(augmented_np, rate=0.15)
            augmented_np = self.apply_pitch_shift(augmented_np, n_steps_range=(-1.5, 1.5))
            augmented_np = self.apply_background_noise(augmented_np, noise_level_range=(0.01, 0.04))

        return augmented_np.astype(np.float32)

augmentation = AudioAugmentation(sample_rate=16000, use_heavy_aug=True)

## Dataset

In [None]:
# Checkpoint id passed to `from_pretrained` for both the processor and the model.
MODEL_NAME = 'openai/whisper-medium'

In [None]:
class KWSDataset(Dataset):
    """Dataset that turns audio files into Whisper input features.

    Args:
        items: sequence of ``[id, path]`` pairs; only the path is used here.
        labels: optional sequence of integer labels aligned with ``items``.
            When omitted (e.g. for the test set) every sample gets the
            placeholder label 0.
        sampling_rate: rate the audio is resampled to before feature extraction.
        augment: if True, each waveform goes through the module-level
            ``augmentation`` callable before feature extraction.

    Each sample is a dict with ``input_features`` (the processor output with
    the batch dimension removed) and ``labels`` (a scalar long tensor).
    """

    def __init__(self, items, labels=None, sampling_rate=16000, augment=False):
        self.processor = AutoProcessor.from_pretrained(MODEL_NAME)

        self.filepaths = [item[1] for item in items]
        if labels is None:
            self.labels = [0] * len(items)
        else:
            if len(labels) != len(items):
                raise ValueError(
                    f"labels has {len(labels)} entries but items has {len(items)}"
                )
            self.labels = labels

        self.sampling_rate = sampling_rate
        self.augment = augment

    def __getitem__(self, idx):
        waveform, sr = torchaudio.load(self.filepaths[idx])

        # torchaudio returns (channels, time). Averaging over channels yields a
        # 1-D mono signal; for single-channel files the samples are unchanged,
        # while multi-channel files no longer reach the processor as a 2-D array.
        waveform = waveform.mean(dim=0)

        if sr != self.sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, self.sampling_rate)

        samples = waveform.numpy()
        if self.augment:
            samples = augmentation(samples)

        inputs = self.processor(
            samples,
            sampling_rate=self.sampling_rate,
            return_tensors="pt"
        )

        return {
            "input_features": inputs.input_features.squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }

    def __len__(self):
        return len(self.filepaths)

In [None]:
# Derive every label from the item's own id instead of assuming the item lists
# are ordered "positives first, negatives last". The item lists may have been
# reordered after they were concatenated, and positional labels would then be
# paired with the wrong audio files.
train_labels = [int(item_id in word_bounds) for item_id, _ in train_items]
val_labels = [int(item_id in word_bounds) for item_id, _ in val_items]

assert sum(train_labels) == len(pos_train), "train labels do not match the positive split"
assert sum(val_labels) == len(pos_val), "val labels do not match the positive split"

train_dataset = KWSDataset(train_items, train_labels, augment=True)
val_dataset = KWSDataset(val_items, val_labels, augment=False)

# Small per-step batches; the training loop accumulates gradients over
# `gradient_accumulation_steps` batches before each optimizer step.
batch_size = 2
gradient_accumulation_steps = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False, pin_memory=True)

## Model & training utils

In [None]:
def calculate_metrics(preds, labels, num_pos, num_neg):
    """Compute binary-classification metrics from hard predictions.

    Args:
        preds: NumPy array of predicted classes (0 or 1).
        labels: NumPy array of true classes (0 or 1), same length as ``preds``.
        num_pos: denominator for the false-rejection rate (fn / num_pos).
        num_neg: denominator for the false-acceptance rate (fp / num_neg).

    Returns:
        Dict with ``accuracy``, ``f1``, ``competition_score`` (harmonic mean of
        ``1 - FRR`` and ``1 - FAR``, or 0.0 when either term is not positive)
        and the confusion counts ``tp``, ``fp``, ``fn``, ``tn``.
    """
    def _ratio(numerator, denominator):
        # Every rate falls back to 0 when its denominator is not positive.
        return numerator / denominator if denominator > 0 else 0

    hits = (preds == labels).sum()
    accuracy = _ratio(hits, len(labels))

    said_yes, said_no = preds == 1, preds == 0
    is_yes, is_no = labels == 1, labels == 0
    tp = (said_yes & is_yes).sum()
    fp = (said_yes & is_no).sum()
    fn = (said_no & is_yes).sum()
    tn = (said_no & is_no).sum()

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * precision * recall, precision + recall)

    keep_rate = 1 - _ratio(fn, num_pos)    # 1 - FRR
    reject_rate = 1 - _ratio(fp, num_neg)  # 1 - FAR

    competition_score = 0.0
    if keep_rate > 0 and reject_rate > 0:
        competition_score = hmean([keep_rate, reject_rate])

    return {
        'accuracy': accuracy,
        'f1': f1,
        'competition_score': competition_score,
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'tn': tn
    }

In [None]:
def validate(model, dataloader, criterion, device, num_pos, num_neg):
    """Evaluate ``model`` on ``dataloader`` and return its metrics.

    The model is put in eval mode and run without gradient tracking. The loss
    is taken from the model's own output; ``criterion`` is accepted for
    interface compatibility but is not used here.

    Returns:
        The dict produced by ``calculate_metrics`` plus ``loss``, the mean of
        the per-batch losses.
    """
    model.eval()
    predicted, expected = [], []
    running_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            features = batch['input_features'].to(device)
            targets = batch['labels'].to(device)

            result = model(features, labels=targets)
            running_loss += result['loss'].item()

            predicted.extend(result['logits'].argmax(dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    metrics = calculate_metrics(np.array(predicted), np.array(expected), num_pos, num_neg)
    metrics['loss'] = running_loss / len(dataloader)
    return metrics

In [None]:
# class AttentionPooling(nn.Module):
#     def __init__(self, hidden_size):
#         super().__init__()
#         self.attention = nn.Sequential(
#             nn.Linear(hidden_size, hidden_size // 2),
#             nn.Tanh(),
#             nn.Linear(hidden_size // 2, 1)
#         )

#     def forward(self, hidden_states):
#         attention_weights = self.attention(hidden_states)
#         attention_weights = F.softmax(attention_weights, dim=1)
#         pooled = torch.sum(hidden_states * attention_weights, dim=1)
#         return pooled

In [None]:
class WhisperForKWS(nn.Module):
    """Whisper encoder with a small classification head on top.

    ``forward`` runs only the encoder of the loaded speech-seq2seq model,
    averages its last hidden state over the sequence axis and feeds the result
    to an MLP head. It returns a dict with ``logits`` and ``loss`` (the
    cross-entropy loss, or None when no labels are given).
    """

    def __init__(self, model_id, num_labels=2):
        super().__init__()
        from transformers import AutoModelForSpeechSeq2Seq
        self.whisper = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)

        # The encoder's final LayerNorm tells us the width of the hidden states.
        encoder_norm = self.whisper.model.encoder.layer_norm
        hidden_size = encoder_norm.normalized_shape[0]

        head_layers = [
            nn.LayerNorm(hidden_size),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 192),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(192, num_labels),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_features, labels=None):
        hidden_states = self.whisper.model.encoder(input_features).last_hidden_state
        logits = self.classifier(hidden_states.mean(dim=1))

        if labels is None:
            loss = None
        else:
            loss = nn.functional.cross_entropy(logits, labels)

        return {"loss": loss, "logits": logits}

## Training

In [None]:
# Run Python garbage collection and release PyTorch's cached CUDA memory
# before the model is created in the next cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Number of full passes over the training loader.
num_epochs = 3

model = WhisperForKWS(MODEL_NAME)
model.to(device=device, dtype=torch.float32)

# Two parameter groups: a small learning rate for the pretrained Whisper
# weights and a larger one for the newly created classifier head.
whisper_lr = 1e-5
classifier_lr = 25e-5

optimizer = torch.optim.Adam([
    {'params': model.whisper.parameters(), 'lr': whisper_lr},
    {'params': model.classifier.parameters(), 'lr': classifier_lr}
])

# NOTE(review): `criterion` is only passed on to validate(); the training loss
# below comes from the model's own forward pass (outputs['loss']).
criterion = nn.CrossEntropyLoss()

checkpoint_dir = 'whisper_checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

# Per-epoch metric history for the train and validation sets.
history = {
    'train_loss': [], 'train_acc': [], 'train_f1': [], 'train_score': [],
    'val_loss': [], 'val_acc': [], 'val_f1': [], 'val_score': []
}

# Best validation competition score seen so far and where its weights are stored.
best_score = 0
best_model_path = os.path.join(checkpoint_dir, 'best_model.pth')

for epoch in range(1, num_epochs + 1):
    print(f'\nEpoch {epoch}/{num_epochs}')

    model.train()
    # NOTE: `train_labels` is rebound here to collect the labels seen in this
    # epoch; it no longer refers to the list used to build the dataset.
    train_preds = []
    train_labels = []
    train_losses = []

    optimizer.zero_grad()

    progress_bar = tqdm(train_loader, desc=f"Training")
    for step, batch in enumerate(progress_bar):
        input_features = batch['input_features'].to(device=device, dtype=torch.float32)
        labels = batch['labels'].to(device=device)

        outputs = model(input_features, labels=labels)
        # Scale the loss so the gradients accumulated over
        # `gradient_accumulation_steps` batches average instead of summing.
        loss = outputs['loss'] / gradient_accumulation_steps
        loss.backward()

        # Update the weights only once every `gradient_accumulation_steps` batches.
        if (step + 1) % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()

        preds = outputs['logits'].argmax(dim=1).detach().cpu().numpy()
        train_preds.extend(preds)
        train_labels.extend(labels.cpu().numpy())
        # Undo the scaling so the logged value is the plain per-batch loss.
        train_losses.append(loss.item() * gradient_accumulation_steps)

        progress_bar.set_postfix(loss=loss.item() * gradient_accumulation_steps)

    # Flush gradients left over from an incomplete final accumulation group.
    if len(train_loader) % gradient_accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()

    train_metrics = calculate_metrics(np.array(train_preds), np.array(train_labels), len(pos_train), len(neg_train))
    train_metrics['loss'] = np.mean(train_losses)

    val_metrics = validate(model, val_loader, criterion, device, len(pos_val), len(neg_val))

    history['train_loss'].append(train_metrics['loss'])
    history['train_acc'].append(train_metrics['accuracy'])
    history['train_f1'].append(train_metrics['f1'])
    history['train_score'].append(train_metrics['competition_score'])
    history['val_loss'].append(val_metrics['loss'])
    history['val_acc'].append(val_metrics['accuracy'])
    history['val_f1'].append(val_metrics['f1'])
    history['val_score'].append(val_metrics['competition_score'])

    print(f"Train Loss: {train_metrics['loss']:.4f} | Acc: {train_metrics['accuracy']:.4f} | F1: {train_metrics['f1']:.4f} | Score: {train_metrics['competition_score']:.4f}")
    print(f"Val Loss: {val_metrics['loss']:.4f} | Acc: {val_metrics['accuracy']:.4f} | F1: {val_metrics['f1']:.4f} | Score: {val_metrics['competition_score']:.4f}")

    # A checkpoint is written after every epoch ...
    checkpoint_path = os.path.join(checkpoint_dir, f'epoch_{epoch}.pth')
    torch.save(model.state_dict(), checkpoint_path)

    # ... and the weights with the best validation score are kept separately.
    if val_metrics['competition_score'] > best_score:
        best_score = val_metrics['competition_score']
        torch.save(model.state_dict(), best_model_path)
        print(f'Best model saved with Score: {best_score:.4f}')

print(f'Best validation Score: {best_score:.4f}')

## Making submission

In [None]:
# Restore the best weights. `map_location` remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads on a CPU-only runtime.
model.load_state_dict(torch.load(best_model_path, map_location=device))

In [None]:
# Collect the predicted probability of class 1 and the true label for every
# validation sample; these feed the threshold search.
model.eval()
prob_values, label_values = [], []

with torch.no_grad():
    for batch in tqdm(val_loader):
        features = batch['input_features'].to(device=device, dtype=torch.float32)
        targets = batch['labels'].to(device=device)

        logits = model(features)['logits']
        positive_prob = logits.softmax(dim=-1)[:, 1]

        prob_values.extend(positive_prob.cpu().numpy())
        label_values.extend(targets.cpu().numpy())

val_probs = np.array(prob_values)
val_labels = np.array(label_values)

In [None]:
# Candidate decision thresholds 0.100, 0.125, ..., 0.875. linspace is used
# instead of arange with a float step, so the number of points and the last
# value do not depend on floating-point rounding.
thresholds = np.linspace(0.1, 0.875, 32)
best_score = 0
best_threshold = 0.5

# Keep the threshold with the highest validation competition score
# (the lowest such threshold wins ties, because the comparison is strict).
for thresh in thresholds:
    preds = (val_probs >= thresh).astype(int)
    metrics = calculate_metrics(preds, val_labels, len(pos_val), len(neg_val))
    if metrics['competition_score'] > best_score:
        best_score = metrics['competition_score']
        best_threshold = float(thresh)

# Three decimals: the grid step is 0.025, so two decimals would misreport
# values such as 0.125 or 0.475.
print(f'Best threshold: {best_threshold:.3f} with score: {best_score:.4f}')

In [None]:
# Only .opus recordings, in sorted order: stray files in the folder would make
# audio loading fail, and os.listdir() order is arbitrary, so sorting keeps the
# submission row order stable between runs.
test_filepaths = sorted(f for f in os.listdir(test_audio_path) if f.endswith('.opus'))
# Strip only the extension; replace('.opus', '') would also remove the text
# from the middle of a name.
test_ids = [os.path.splitext(f)[0] for f in test_filepaths]
test_audio_paths = [f'{test_audio_path}/{f}' for f in test_filepaths]
test_items = [[file_id, path] for file_id, path in zip(test_ids, test_audio_paths)]

# No labels are passed, so the dataset fills in placeholder labels.
test_dataset = KWSDataset(test_items)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=False, pin_memory=True)

probs = []
with torch.no_grad():
    model.eval()
    for batch in tqdm(test_loader):
        input_features = batch['input_features'].to(device=device, dtype=torch.float32)
        outputs = model(input_features)
        probs.append(outputs['logits'].softmax(dim=-1).cpu().numpy())

# One row per test file, one column per class, in the same order as `test_ids`.
probs = np.concatenate(probs, axis=0)

In [None]:
# Apply the tuned threshold to the class-1 probability.
test_labels = (probs[:, 1] >= best_threshold).astype(int)
submission = pd.DataFrame({'id': test_ids, 'label': test_labels})

# Show the predicted class balance. A bare expression in the middle of a cell
# is evaluated and thrown away, so it has to be printed explicitly.
print(submission['label'].value_counts(normalize=True))

submission.to_csv(f"sub-mean-seed-{SEED}.csv", index=False)
submission.head()