```yaml
jupyter:
  jupytext:
    text_representation:
      extension: .py
      format_name: light
      format_version: '1.5'
      jupytext_version: 1.16.4
  kernelspec:
    display_name: Python 3
    name: python3
```
Ниже приведены основные компоненты:
- Загрузка и предобработка аудио
- Модели: CNN (реализована ниже) и Whisper (для транскрибации); Transformer и другие архитектуры можно подставить вместо CNN
- Аугментации
- Метрики
- Тренировка и инференс

**Как использовать:**
- Замените пути к данным на свои
- Подставьте свои метки и классы
- Выберите нужную задачу и модель
- Настройте гиперпараметры под себя

In [None]:
# pip install torch torchaudio librosa audiomentations transformers datasets scikit-learn wandb tqdm pandas numpy
import os
import sys
import random
import numpy as np
import pandas as pd
import torch
import torchaudio
import librosa
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import torch.nn as nn
import torch.nn.functional as F
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import warnings
warnings.filterwarnings("ignore")

In [None]:
# --- SETTINGS ---
# Point this at the root of your dataset.
DATA_PATH = "/path/to/your/audio/data"
MODEL_NAME = "openai/whisper-base"  # Whisper checkpoint (tiny, base, small, medium, large)
TASK_TYPE = "vad"  # "vad", "classification", "sed", "transcription"
NUM_CLASSES = 2  # For the "I can't hear you" task: 0 - no speech, 1 - speech
SAMPLE_RATE = 16000  # Whisper works at 16 kHz
DURATION = 5.0  # clip length in seconds
BATCH_SIZE = 16
EPOCHS = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every RNG the notebook touches so that Restart & Run All reproduces the
# same weight initialisation and shuffling order.
# NOTE(review): audiomentations presumably draws from `random` / `np.random`,
# so seeding them should also fix the augmentation draws - confirm.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
def load_audio(path, sr=16000, duration=5.0):
    """
    Load an audio file as a mono waveform at the requested sample rate.

    Args:
        path: Path to the audio file (anything ``torchaudio.load`` accepts).
        sr: Target sample rate in Hz; the audio is resampled when the file's
            own rate differs.
        duration: Target length in seconds. Shorter audio is right-padded with
            zeros, longer audio is truncated. Pass ``None`` to keep the
            original length untouched.

    Returns:
        Tensor with a single channel row and the samples along dim 1. Its
        length is ``int(sr * duration)`` unless ``duration`` is ``None``.
    """
    waveform, orig_sr = torchaudio.load(path)

    # Resample to the target rate
    if orig_sr != sr:
        waveform = torchaudio.transforms.Resample(orig_sr, sr)(waveform)

    # Downmix every multi-channel layout to mono, not only plain stereo
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # duration=None means "keep whatever length the file has"
    if duration is None:
        return waveform

    expected_length = int(sr * duration)
    current_length = waveform.shape[1]
    if current_length < expected_length:
        # Right-pad with zeros up to the requested length
        waveform = F.pad(waveform, (0, expected_length - current_length))
    elif current_length > expected_length:
        # Keep only the leading samples
        waveform = waveform[:, :expected_length]
    return waveform

In [None]:
class AudioDataset(Dataset):
    """
    Dataset that loads fixed-length waveforms listed in a DataFrame.

    The frame must provide a ``path`` column. A ``label`` column is required
    for every task except "transcription", where an optional ``text`` column
    is read instead.

    Args:
        df: DataFrame describing the samples (one row per audio file).
        sr: Sample rate passed to ``load_audio`` and to the transform.
        duration: Clip length in seconds passed to ``load_audio``.
        transform: Optional augmentation called the audiomentations way,
            ``transform(samples=<numpy array>, sample_rate=<int>)``.
        task_type: "transcription" yields ``(waveform, text)``; any other value
            yields ``(waveform, label)``.
    """
    def __init__(self, df, sr=16000, duration=5.0, transform=None, task_type="vad"):
        self.df = df
        self.sr = sr
        self.duration = duration
        self.transform = transform
        self.task_type = task_type

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        waveform = load_audio(row['path'], sr=self.sr, duration=self.duration)

        if self.transform is not None:
            # audiomentations transforms need the sample rate next to the
            # samples; calling them with the array alone raises a TypeError.
            augmented = self.transform(samples=waveform.numpy(), sample_rate=self.sr)
            waveform = torch.tensor(augmented)

        if self.task_type == "transcription":
            # Whisper consumes raw audio; the label column is not touched here,
            # so transcription-only frames without it still work.
            return waveform, row.get('text', "")

        # Classification-style tasks: integer class id as a long tensor
        return waveform, torch.tensor(row['label'], dtype=torch.long)

In [None]:
# Waveform-level augmentation pipeline built from audiomentations transforms.
# Every transform is configured with p=0.5. The pipeline is passed as the
# `transform` of the training dataset only.
# NOTE(review): recent audiomentations releases appear to have renamed Shift's
# min_fraction/max_fraction to min_shift/max_shift - confirm against the
# installed version, since the install line does not pin one.
augmentations = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.9, max_rate=1.1, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
    Shift(min_fraction=-0.1, max_fraction=0.1, p=0.5),
])

In [None]:
# --- Baseline CNN for VAD / classification ---
class SimpleAudioCNN(nn.Module):
    """
    Four-stage 1-D convolutional classifier operating on raw waveforms.

    Each stage is Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(4). The time axis
    is then averaged away and a linear layer produces ``n_classes`` logits.
    The first convolution has a single input channel, so the input is expected
    as (batch, 1, time).
    """

    # (in_channels, out_channels, kernel_size, stride, padding) for each stage
    _STAGES = (
        (1, 32, 80, 4, 38),
        (32, 64, 3, 1, 1),
        (64, 128, 3, 1, 1),
        (128, 256, 3, 1, 1),
    )

    def __init__(self, n_classes=2):
        super().__init__()
        # Layers are registered as conv1/bn1/pool1 ... conv4/bn4/pool4 so the
        # state_dict keys stay the same as with hand-written attributes.
        for stage, (c_in, c_out, kernel, stride, pad) in enumerate(self._STAGES, start=1):
            setattr(self, f"conv{stage}",
                    nn.Conv1d(c_in, c_out, kernel_size=kernel, stride=stride, padding=pad))
            setattr(self, f"bn{stage}", nn.BatchNorm1d(c_out))
            setattr(self, f"pool{stage}", nn.MaxPool1d(kernel_size=4, stride=4))
        self.fc = nn.Linear(256, n_classes)

    def forward(self, x):
        for stage in range(1, len(self._STAGES) + 1):
            conv = getattr(self, f"conv{stage}")
            norm = getattr(self, f"bn{stage}")
            pool = getattr(self, f"pool{stage}")
            x = pool(F.relu(norm(conv(x))))
        # Global average pooling over the time axis
        pooled = x.mean(dim=2)
        return self.fc(pooled)

In [None]:
# Load the Whisper processor and model for transcription.
# Only executed when TASK_TYPE == "transcription"; for the other task types
# `processor` and `whisper_model` are never defined.
if TASK_TYPE == "transcription":
    processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    whisper_model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
    whisper_model.to(DEVICE)
    # Switch to eval mode: the model is only used for generation in this notebook
    whisper_model.eval()

In [None]:
# Example DataFrame (replace with your own)
# df = pd.read_csv("your_data.csv")  # columns: path, label, text (for transcription)
# Example:
df = pd.DataFrame({
    'path': ['/path/to/audio1.wav', '/path/to/audio2.wav'],
    'label': [1, 0],  # 1 - speech, 0 - no speech
    'text': ['hello world', '']  # only used for transcription
})

# Train/validation split (fixed random_state so the split is repeatable)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Training set: augmentations are applied on the fly to every loaded clip.
train_dataset = AudioDataset(
    train_df,
    sr=SAMPLE_RATE,
    duration=DURATION,
    transform=augmentations,
    task_type=TASK_TYPE
)

# Validation set: no augmentation.
val_dataset = AudioDataset(
    val_df,
    sr=SAMPLE_RATE,
    duration=DURATION,
    transform=None,
    task_type=TASK_TYPE
)

# Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Build the classifier, loss and optimizer for the classification-style tasks.
# Note that "sed" takes the same path as "vad"/"classification": one
# cross-entropy prediction per clip, not per time frame.
if TASK_TYPE in ["vad", "classification", "sed"]:
    model = SimpleAudioCNN(n_classes=NUM_CLASSES)
    model.to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """
    Run one optimisation pass over ``dataloader``.

    Args:
        model: Module mapping an input batch to logits.
        dataloader: Iterable of ``(inputs, targets)`` batches. It does not need
            to implement ``__len__``; batches are counted while iterating.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss callable taking ``(logits, targets)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Mean of the per-batch losses as a float, or NaN when the loader yields
        no batches at all.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(x), y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    # Counting batches avoids len(dataloader), which is undefined for
    # iterable-style loaders and zero (division error) for empty ones.
    if num_batches == 0:
        return float("nan")
    return total_loss / num_batches

In [None]:
def validate(model, dataloader, criterion, device):
    """
    Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module mapping an input batch to logits; switched to eval mode.
        dataloader: Iterable of ``(inputs, targets)`` batches. It does not need
            to implement ``__len__``; batches are counted while iterating.
        criterion: Loss callable taking ``(logits, targets)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(mean_batch_loss, accuracy, macro_f1)``. All three are NaN when
        the loader yields no batches.
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            total_loss += criterion(logits, y).item()
            num_batches += 1
            # Predicted class = index of the largest logit
            all_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
            all_labels.extend(y.cpu().tolist())

    # Nothing to score: skip the metrics instead of dividing by zero or
    # passing empty lists to scikit-learn.
    if num_batches == 0:
        nan = float("nan")
        return nan, nan, nan

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return total_loss / num_batches, acc, f1

In [None]:
# Training loop for the classification-style tasks: one training pass and one
# validation pass per epoch, with the metrics printed after each epoch.
if TASK_TYPE in ["vad", "classification", "sed"]:
    for epoch in range(EPOCHS):
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, DEVICE)
        val_loss, val_acc, val_f1 = validate(model, val_loader, criterion, DEVICE)
        print(f"Epoch {epoch+1}/{EPOCHS}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val F1: {val_f1:.4f}")


In [None]:
def transcribe_audio_with_whisper(audio_path, processor, model, device):
    """
    Transcribe a single audio file with Whisper.

    Args:
        audio_path: Path to the audio file.
        processor: Whisper processor; called on the raw samples to build the
            input features and used to decode the generated token ids.
        model: Whisper generation model exposing ``generate``.
        device: Device the input features are moved to.

    Returns:
        The decoded transcription string of the first (only) sequence.
    """
    whisper_sr = 16000
    # Whisper's feature extractor pads/truncates every input to a 30-second
    # window, so loading exactly that window feeds it the same samples while
    # avoiding duration=None, which makes load_audio fail on int(sr * None).
    whisper_window_s = 30.0

    waveform = load_audio(audio_path, sr=whisper_sr, duration=whisper_window_s)
    # Collapse the channel axis to get the flat sample array the processor expects
    samples = waveform.mean(dim=0).numpy()

    input_features = processor(
        samples, sampling_rate=whisper_sr, return_tensors="pt"
    ).input_features
    input_features = input_features.to(device)

    with torch.no_grad():
        predicted_ids = model.generate(input_features)

    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]


In [None]:
if TASK_TYPE == "transcription":
    # Example: transcribe one file with the processor/model loaded earlier
    # (replace the placeholder path with a real file).
    sample_path = "/path/to/sample.wav"
    result = transcribe_audio_with_whisper(sample_path, processor, whisper_model, DEVICE)
    print("Transcription:", result)


In [None]:
# Persist the trained classifier weights (state_dict only) to the working
# directory; the file name embeds the task type.
if TASK_TYPE in ["vad", "classification", "sed"]:
    torch.save(model.state_dict(), f"model_{TASK_TYPE}.pth")
    print(f"Модель {TASK_TYPE} сохранена как model_{TASK_TYPE}.pth")

## Как подпиливать под задачу:

1. **Замените `df`** на ваш датафрейм с `path`, `label` (и `text` для транскрибации).
2. **Измените `TASK_TYPE`** на `"vad"`, `"classification"`, `"sed"`, `"transcription"`.
3. **Подставьте `NUM_CLASSES`** для вашей задачи.
4. **Настройте `augmentations`**, если нужно.
5. **Замените `SimpleAudioCNN`** на другую архитектуру (ResNet, PANN, Transformer).
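6. **Для Whisper** — используйте `transcribe_audio_with_whisper` для транскрибации готовой моделью; дообучение (fine-tuning) в приведённом коде не реализовано и потребует отдельного цикла обучения.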


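**Поддерживаемые значения `TASK_TYPE`:**

1. VAD — Voice Activity Detection (детекция активности речи)
2. Classification — классификация аудио
3. SED — Sound Event Detection (детекция звуковых событий во времени); в этом ноутбуке решается тем же CNN как классификация клипа целиком, без разметки по времени
4. Transcription — транскрибация (ASR: Automatic Speech Recognition)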