#### Dataset tesing

In [1]:
import math

# Заданные пропорции
train_percent = 0.8
valid_percent = 0.2
test_percent = 0.0  # Если нужно включить тестовые данные, измените это значение

assert math.isclose(train_percent + valid_percent + test_percent, 1.0, rel_tol=1e-9), "Sum doesnt equal to 1" 

# Ваши данные
data = [i for i in range(100)]
N = len(data)

# Расчёт количества данных для каждой части
train_size = int(N * train_percent)
valid_size = int(N * valid_percent)
test_size = int(N * valid_percent)

# Разделение данных
train_data = data[:train_size]
valid_data = data[train_size:train_size + valid_size]
test_data = data[train_size + valid_size:]

# Проверка
print(f"Train data ({len(train_data)}), last: {train_data[len(train_data)-1:]}")
print(f"Valid data ({len(valid_data)}), last: {valid_data[len(valid_data)-1:]}")
print(f"Test data ({len(test_data)}), last: {test_data[len(test_data)-1:]}")

Train data (80), last: [79]
Valid data (20), last: [99]
Test data (0), last: []


In [None]:
from data import MelVADDataset
from glob import glob
from tqdm import tqdm

In [1]:
class Dog:
    def __init__(self, *params):
        # Здесь *params может быть использовано для передачи произвольного количества аргументов
        self.params = params

    def bark(self, word):
        print(f"The dog says: {word}")


# Пример использования
params = ['Rex', 5, 'black']
Dog(*params).bark(word='gaw gaw')


The dog says: gaw gaw


In [None]:
speeches = [t.replace('\\', '/') for t in glob('F:/ISSAI_KSC2_unpacked/vad_data_augmented/*.flac')]

In [None]:
dataset = MelVADDataset(
    path_list = speeches, 
    n_frames=32, 
    nfft=1024, 
    hop_length=512, 
    n_mels=128, 
    sr=16000, 
    norm=False
)

for i in tqdm(range(len(dataset))):
    try:
        data = dataset[i]
    except Exception as e:
        print(f"[CRITICAL ERROR] idx={i}, error={str(e)}")

In [None]:
import torch as th
import torchaudio

EPS = 1e-8

class GetMinNmels(th.utils.data.Dataset):  # Удален лишний символ "__" в названии класса
    def __init__(self, speeches, n_fft=1024, hop_length=512, n_mels=128):  # Исправлено объявление параметров
        self.speeches = speeches  # Список аудиофайлов
        self.mel_spec = torchaudio.transforms.MelSpectrogram(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    
    def _get_min(self):
        __min = float('inf')
        for t in tqdm(self.speeches):  # Итерация по списку аудиофайлов
            audio, _ = torchaudio.load(t)
            spec = th.log(self.mel_spec(audio) + EPS)  # Добавление маленькой константы для предотвращения логарифма от 0
            __min = min(__min, spec.shape[-1])  # Поиск минимальной длины спектрограммы
        return __min

Получаем минимальный n_frames

In [None]:
print(GetMinNmels(speeches, n_fft=1048, hop_length=512, n_mels=128)._get_min())

In [None]:
len(speeches)

In [None]:
len(speeches[:int(len(speeches) * 0.1)])

In [None]:
glob(os.path.join(str(speeches), '*.flac'))[:int(len(files) * 0.1)]

##### Testing data shapes 

In [None]:
import torch as th
from omegaconf import OmegaConf
from trainer import VAD
import yaml
import argparse
import sys
from dataset import VADMelDataModule

In [None]:
# Убираем Jupyter аргументы, которые начинаются с -f
sys.argv = sys.argv[:1]

# Создаем парсер и добавляем аргумент для конфигурации
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, default='./configs/128_mels.yml', help='Path to config file')
args = parser.parse_args()

# Открываем YAML файл и загружаем конфигурацию
cfg = yaml.load(open(args.config), Loader=yaml.FullLoader)

# Или используя OmegaConf для чтения конфигурации
# cfg = OmegaConf.load(args.config)

In [None]:
dummy_batch = {
    "spectro": th.rand(8, 1, 128, 128),  # Размерность входных данных
    "targets": th.randint(0, 2, (8, 128)).float()  # Бинарные метки размерности (8, 128)
}

In [None]:
data_module = VADMelDataModule(**cfg['data'])
data_module.setup()
train_loader = data_module.train_dataloader()

# Size of training set: 555203
# Size of validation set: 61689

In [None]:
# Извлечение одного батча
batch = next(iter(train_loader))

# Проверка содержимого батча
spectro = batch["spectro"]
targets = batch["targets"]

print(f"Размер спектрограммы: {spectro.shape}")
print(f"Размер меток: {targets.shape}")

# Size of training set: 555203
# Size of validation set: 61689
# Размер спектрограммы: torch.Size([512, 1, 128, 32])
# Размер меток: torch.Size([512, 1, 32])

In [None]:
data_module = VADMelDataModule(**cfg['data'])
data_module.setup()
train_loader = data_module.train_dataloader()

#### Init

In [1]:
from trainer import VAD

In [2]:
import torch as th
from omegaconf import OmegaConf
from trainer import VAD
import yaml
import argparse
import sys
from dataset import VADMelDataModule

In [3]:
# Убираем Jupyter аргументы, которые начинаются с -f
sys.argv = sys.argv[:1]

# Создаем парсер и добавляем аргумент для конфигурации
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, default='./configs/128_mels.yml', help='Path to config file')
args = parser.parse_args()

# Открываем YAML файл и загружаем конфигурацию
cfg = yaml.load(open(args.config), Loader=yaml.FullLoader)

# Или используя OmegaConf для чтения конфигурации
# cfg = OmegaConf.load(args.config)

#### Testing config file

In [5]:
model = VAD(cfg)

In [7]:
data_module = VADMelDataModule(**cfg['data'])
data_module.setup()
train_loader = data_module.train_dataloader()

Size of training set: 18349
Size of validation set: 2039


In [None]:
batch_num = 0
for batch_idx, batch in enumerate(train_loader):
    model.train()  # Переводим модель в режим тренировки
    training_loss = model.training_step(batch, batch_idx=batch_idx)  # Передаем реальный батч
    print(f'BATCH NUM: {batch_num}')
    print("Training Loss:", training_loss.item(), th.mean(training_loss))
    batch_num += 1
    if batch_num > 10:
        break

In [4]:
cfg

{'xp_config': {'model_type': 'VAD', 'dataset': 'ISSAI_KSC2'},
 'data': {'data_dir': 'F:/ISSAI_KSC2_unpacked/temp_vad',
  'batch_size': 512,
  'valid_percent': 0.9,
  'n_frames': 32,
  'nfft': 1048,
  'hop_length': 512,
  'n_mels': 128,
  'sr': 16000,
  'norm': False,
  'n_workers': 16,
  'pin_memory': True,
  'seed': 42},
 'model': {'n_feat': 128,
  'cnn_channels': 32,
  'embed_dim': 256,
  'dff': 512,
  'num_heads': 16},
 'training': {'optim': 'Adam', 'lr': 0.01, 'weight_decay': 1e-05},
 'trainer': {'fast_dev_run': False,
  'accelerator': 'gpu',
  'devices': 1,
  'precision': 32,
  'accumulate_grad_batches': 1,
  'profiler': False,
  'val_check_interval': 1.0,
  'max_epochs': 100},
 'model_checkpoint': {'monitor': 'val_loss',
  'filename': 'VAD-{epoch:02d}',
  'save_last': True}}

**Проверяем Vad** 

In [59]:
import pytorch_lightning as pl
from models import VADNet
import omegaconf as om
import torch.nn as nn
import torchmetrics.classification as tm # Note

In [60]:
from models import ConvBlock, CNNEmbedder

class VADNetTEST(nn.Module):
    def __init__(self, n_feat=256, cnn_channels=32, embed_dim=256, dff=512, num_heads=16):
        print(n_feat, cnn_channels, embed_dim, dff, num_heads)
        super().__init__()
        self.cnn_embedder = CNNEmbedder(ch_in=1, ch_out=cnn_channels) 
        # after the framewise flattening operation we have F'xC = (n_feat/16)*cnn_channels
        self.fc1 = nn.Linear(in_features=int((n_feat/16)*cnn_channels), out_features=embed_dim)
        self.self_attention = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads, batch_first=True) # changed
        self.layer_norm1 = nn.LayerNorm(embed_dim)
        self.layer_norm2 = nn.LayerNorm(embed_dim)
        # Changed
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, dff),
            nn.Linear(dff, embed_dim),
        )
        self.fc2 = nn.Linear(embed_dim, 1) # changed

In [65]:
class VADTESTINIT(pl.LightningModule):
    def __init__(self, hparams: om.DictConfig):
        super().__init__()
        self.hparams.update(hparams)
        if not isinstance(hparams, om.DictConfig):
            hparams = om.DictConfig(hparams)
        self.hparams.update(om.OmegaConf.to_container(hparams, resolve=True))
        print('after', self.hparams['model'])
        # self.hparams['model']['n_feat'] = 1000 # поменял и все работает 
        self.model = VADNetTEST(**self.hparams['model'])

In [66]:
new_hparams = VADTESTINIT(cfg) 

after {'n_feat': 128, 'cnn_channels': 32, 'embed_dim': 256, 'dff': 512, 'num_heads': 16}
128 32 256 512 16


**Проверяем VADNET**

#### Torch ligtgning training test

In [20]:
import torch as th
import omegaconf as om
import torchmetrics as tm
import pytorch_lightning as pl

from models import *
from dataset import *


class VADTESTING(pl.LightningModule):
    def __init__(self, hparams: om.DictConfig):
        super().__init__()
        self.hparams.update(hparams)
        if not isinstance(hparams, om.DictConfig):
            hparams = om.DictConfig(hparams)
        self.hparams.update(om.OmegaConf.to_container(hparams, resolve=True))
        
        self.model = VADNet(**self.hparams['model'])
        self.loss = nn.BCEWithLogitsLoss()
        self.auroc = tm.AUROC(task="binary", num_classes=1)
        self.acc = tm.Accuracy(task="binary", threshold=0.5)
        self.f1 = tm.F1Score(task="binary", threshold=0.5)

    def forward(self, x):
        probs = self.model(x)
        return probs

    def configure_optimizers(self):
        
        optim_type = self.hparams.training["optim"]
        assert  optim_type in ['Adam', 'SDG']
        
        if self.hparams.training["optim"] == 'Adam':
            return th.optim.Adam(self.model.parameters() ,lr=self.hparams.training["lr"], weight_decay=self.hparams.training["weight_decay"])
        else: 
            return th.optim.SGD(self.model.parameters() ,lr=self.hparams.training["lr"], weight_decay=self.hparams.training["weight_decay"])

    def training_step(self, batch, batch_idx):

        x, t = batch['spectro'], batch['targets'].squeeze(1)
        probs = self.forward(x).squeeze(-1)
        loss = self.loss(probs, t)
        self.log_dict({'train_loss':th.mean(loss)}, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):

        x, t = batch['spectro'], batch['targets'].squeeze(1)
        probs = self.forward(x).squeeze(-1)
        val_loss = self.loss(probs, t)

        probs = probs.squeeze(0)
        t = t.int().squeeze(0)

        # Compute metrics
        eval_metrics = {
            "val_loss": th.mean(val_loss),
            "auroc": self.auroc(probs, t),
            "accuracy": self.acc(probs, t),
            "F1": self.f1(probs, t)
        }

        self.log_dict(eval_metrics, on_step=False, on_epoch=True)

        return th.mean(val_loss)

In [21]:
vad_model = VADTESTING(cfg)
test_data_module = VADMelDataModule(**cfg['data'])
test_data_module.setup()
test_train_loader = test_data_module.train_dataloader()

Size of training set: 555203
Size of validation set: 61689


In [22]:
batch_num = 0

for batch_idx, batch in enumerate(test_train_loader):
    vad_model.train()  # Переводим модель в режим тренировки
    training_loss = vad_model.training_step(batch, batch_idx=batch_idx)  # Передаем реальный батч
    print(f'BATCH NUM: {batch_num}')
    print("Training Loss:", training_loss.item())
    vad_model.eval()  # Переводим модель в режим валидации
    with th.no_grad():  # Отключаем градиенты для ускорения
        val_loss = vad_model.validation_step(batch, batch_idx=batch_idx)  # Передаем реальный батч
    print("Validation Loss:", val_loss.item())
    batch_num += 1

BATCH NUM: 0
Training Loss: 0.7284665703773499
Validation Loss: 0.7013393640518188
BATCH NUM: 1
Training Loss: 0.7289002537727356
Validation Loss: 0.7090765833854675
BATCH NUM: 2
Training Loss: 0.723651647567749
Validation Loss: 0.7133157253265381
BATCH NUM: 3
Training Loss: 0.7187821865081787
Validation Loss: 0.7178352475166321
BATCH NUM: 4
Training Loss: 0.7126844525337219
Validation Loss: 0.7127168774604797
BATCH NUM: 5
Training Loss: 0.7315097451210022
Validation Loss: 0.736426055431366
BATCH NUM: 6
Training Loss: 0.7398598790168762
Validation Loss: 0.7419386506080627
BATCH NUM: 7
Training Loss: 0.7305679321289062
Validation Loss: 0.7371056079864502
BATCH NUM: 8
Training Loss: 0.7095872759819031
Validation Loss: 0.7240737080574036
BATCH NUM: 9
Training Loss: 0.7044501900672913
Validation Loss: 0.7249741554260254
BATCH NUM: 10
Training Loss: 0.7326357364654541
Validation Loss: 0.7413102388381958
BATCH NUM: 11
Training Loss: 0.7328454256057739
Validation Loss: 0.7372945547103882
BATC

KeyboardInterrupt: 

In [None]:
# ----------------------------------------------------------

In [15]:
from torch import nn
loss = nn.BCEWithLogitsLoss()
input = torch.randn(3, requires_grad=True)
target = torch.empty(3).random_(2)
output_i = loss(input, target)
print(output_i)
output_i.backward()
print(output_i)

tensor(0.8688, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.8688, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)


In [16]:
import pytorch_lightning as pl
f1 = pl.metrics.classification.F1()

AttributeError: module 'pytorch_lightning' has no attribute 'metrics'

#### Torch training test

In [31]:
import torch

def p_output_log(epoch, num_epochs, phase, epoch_loss, epoch_metrics, metrics):
    if phase == 'train':
        print(f'Epoch {epoch+1}/{num_epochs}')
    print(f"{phase.upper()}, Loss: {epoch_loss:.4f}, ", end="")
    for m in metrics.keys():
        print(f"{m}: {epoch_metrics[m]:.4f} ", end="")
    print() 
    if phase == 'valid':
        print('-' * 108, '\n')

def __train_model(model, dataloaders, criterion, optimizer, metrics, num_epochs=25, device='cuda'):
    model.to(device)
    min_acc = 0.0
    for epoch in range(num_epochs):
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
                dataloader = dataloaders['train']
            else:
                model.eval()
                dataloader = dataloaders['valid']
            
            running_loss = 0.0
            [metrics[m].reset() for m in metrics.keys()]
            total_samples = len(dataloader.dataset)
            for inputs, labels in dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs).transpose(1, 2)
                    loss = criterion(outputs, labels)
                    if phase == 'train':
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()
                        
                # Так как значение loss.item() уже усреднено внутри батча, 
                # чтобы получить общую сумму потерь (а не среднюю) для этого батча, 
                # нужно домножить её на количество объектов в батче, то есть на inputs.size(0).
                running_loss += loss.item() * inputs.size(0)
                [metrics[m].update(outputs, labels) for m in metrics.keys()]
            
            epoch_loss = running_loss / total_samples
            epoch_metrics = {m: metrics[m].compute().item() for m in metrics.keys()}
            
            p_output_log(epoch, num_epochs, phase, epoch_loss, epoch_metrics, metrics)

    return model

In [None]:
import torch
from torch import nn, optim
import os
from tqdm import tqdm
import random
import numpy as np
import torchmetrics
# from torch_trainer import train_model
import yaml
import argparse
from pathlib import Path
import omegaconf as om

from models import VADNet 
from torching_datasets import *

torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision('medium')

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

def main():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    cfg = {
        'data': {
            'data_dir': 'F:/ISSAI_KSC2_unpacked/temp_vad',
            'batch_size': 512,
            'valid_percent': 0.9,
            'n_frames': 32,
            'nfft': 1048,
            'hop_length': 512,
            'n_mels': 128,
            'sr': 16000,
            'norm': False,
            'n_workers': 2,
            'pin_memory': True,
            'seed': 42
        },
        'model': {
            'n_feat': 128,
            'cnn_channels': 32,
            'embed_dim': 256,
            'dff': 512,
            'num_heads': 16
        },
        'training': {
            'optim': 'Adam',
            'lr': 0.01,
            'weight_decay': 1e-05
        }
    }

    # Load data 
    datamodule = VADMelDataModule(**cfg['data'])
    datamodule.setup()
    dataloaders = {'train': datamodule.train_dataloader(), 'valid': datamodule.val_dataloader()}

    model = VADNet(**cfg['model'])
    
    # Meta-data
    print(f"Trainable parametrs: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

    # Optimizer
    optim_type = cfg['training']["optim"]
    assert  optim_type in ['Adam', 'SGD']
    if optim_type == 'Adam':
        optimizer = th.optim.Adam(model.parameters(), lr=cfg['training']["lr"], weight_decay=cfg['training']["weight_decay"])
    else: 
        optimizer = th.optim.SGD(model.parameters(), lr=cfg['training']["lr"], weight_decay=cfg['training']["weight_decay"])

    # Metrics: accuracy. Changes macro changed to micro
    metrics = {m: getattr(torchmetrics, m)(task='binary', average='micro').to(device) for m in ['Accuracy']}

    # Start training. Hardcode: num_epochs = 100
    trained_model = __train_model(model, dataloaders, nn.BCEWithLogitsLoss(), optimizer, metrics, num_epochs=100, 
                                device=device)

if __name__ == '__main__':
    main()

Size of training set: 18349
Size of validation set: 2039
Trainable parametrs: 621477
Epoch 1/100
TRAIN, Loss: 0.9608, Accuracy: 0.5072 
VALID, Loss: 0.7164, Accuracy: 0.5277 
------------------------------------------------------------------------------------------------------------ 

Epoch 2/100
TRAIN, Loss: 0.6949, Accuracy: 0.5225 
VALID, Loss: 0.6855, Accuracy: 0.5065 
------------------------------------------------------------------------------------------------------------ 

Epoch 3/100
TRAIN, Loss: 0.5706, Accuracy: 0.6956 
VALID, Loss: 0.6047, Accuracy: 0.6911 
------------------------------------------------------------------------------------------------------------ 

Epoch 4/100
TRAIN, Loss: 0.2384, Accuracy: 0.9092 
VALID, Loss: 0.3443, Accuracy: 0.8599 
------------------------------------------------------------------------------------------------------------ 

Epoch 5/100
TRAIN, Loss: 0.1986, Accuracy: 0.9252 
VALID, Loss: 0.5876, Accuracy: 0.7743 
---------------------

In [None]:
# import torch
# # Пример входного тензора (input size)
# input_tensor = torch.randn(512, 32)  # Тензор размера [512, 32]
# # Пример целевого тензора (target size)
# target_tensor = torch.randn(512, 1, 32)  # Тензор размера [512, 1, 32]
# # Печать форм тензоров
# print("Input tensor size:", input_tensor.size())
# print("Target tensor size:", target_tensor.size())

# a_target_tensor = target_tensor.squeeze(1) # squeeze -1 transpose(1, 2)

In [2]:
a = 0.00001
b = 1e-05
print(a == b )

True


#### ROC

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve

# Функция для построения ROC и PR-кривых
def plot_roc_pr_curves(y_true, y_scores):
    # ROC-кривая
    fpr, tpr, thresholds_roc = roc_curve(y_true, y_scores)
    roc_auc = auc(fpr, tpr)

    # PR-кривая
    precision, recall, thresholds_pr = precision_recall_curve(y_true, y_scores)
    pr_auc = auc(recall, precision)

    # Построение графиков
    plt.figure(figsize=(14, 6))

    # ROC-кривая
    plt.subplot(1, 2, 1)
    plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})", color="blue")
    plt.plot([0, 1], [0, 1], "k--", label="Random classifier")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    plt.legend(loc="lower right")

    # PR-кривая
    plt.subplot(1, 2, 2)
    plt.plot(recall, precision, label=f"PR curve (AUC = {pr_auc:.2f})", color="green")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curve")
    plt.legend(loc="lower left")

    plt.tight_layout()
    plt.show()

# Пример использования
# y_true - истинные метки классов (0 или 1)
# y_scores - вероятности от модели
y_true = np.array([0, 1, 1, 0, 1, 0, 1, 1, 0, 0])  # Истинные метки
y_scores = np.array([0.1, 0.9, 0.8, 0.3, 0.7, 0.2, 0.95, 0.85, 0.4, 0.05])  # Предсказанные вероятности

plot_roc_pr_curves(y_true, y_scores)


In [None]:
# Предположим, что у вас есть список аудиофайлов и их истинные метки
audio_files = ["file1.wav", "file2.wav", "file3.wav", ...]
true_labels = [0, 1, 0, 1, ...]  # Истинные метки

# Собираем предсказания
predicted_probs = []
for audio_path in audio_files:
    probs = your_model.predict(audio_path)  # Используем вашу функцию predict
    predicted_probs.append(probs.mean())   # Например, усредняем вероятности по всем временным меткам

# Преобразуем списки в массивы
y_true = np.array(true_labels)
y_scores = np.array(predicted_probs)

# Строим ROC и PR кривые
plot_roc_pr_curves(y_true, y_scores)


#### Metrics