#### Testing Dataset

In [1]:
import os
import random
import math
from glob import glob
import torch as th
import torchaudio
import pytorch_lightning as pl
from typing import Optional, List
import torch.nn.functional as F
import pandas as pd 
from tqdm.notebook import tqdm 
import numpy as np

from utils.measure_time import measure_time 

In [3]:
import argparse
import sys
from utils.load_config import load_config  

# Parse the path to the hyper-parameter YAML file and load it into `cfg`.
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--hparams", type=str, default="./configs/train_rnn.yml", help="hparams config file")
# parse_known_args() ignores unrecognized arguments instead of exiting on them.
args, unknown = parser.parse_known_args()
cfg = load_config(args.hparams)

In [4]:
# NOTE(review): AudioDataModule is not imported or defined in the cells visible
# here — confirm where it comes from before relying on Restart & Run All.
# The value kept in `datamodule` is whatever setup() returns.
datamodule = AudioDataModule(**cfg['data']).setup(stage = 'train')

Size of training set: 420
Size of validation set: 63
Elapsed time 'setup': 00:00:02.20


In [5]:
# Keep the train and validation loaders in a dict keyed by phase name.
dataloaders = {'train': datamodule.train_dataloader(), 'valid': datamodule.val_dataloader()}

In [6]:
# Fetch the first batch from the training DataLoader.
dataloader = dataloaders['train'] 
sample_mix, sample_refs = next(iter(dataloader))  # iter() + next() pull a single batch

In [7]:
# Inspect the structure of the batch: first the mixture, then the per-speaker references.
print(sample_mix, '\n')
print('chunks_num', len(sample_mix), '\n')
print(sample_mix[0], '\n')
print(sample_mix[0].shape, '\n')
print('----------------------------------------------', '\n')
# sample_refs is indexed by speaker first, then by chunk.
print('spekears num', len(sample_refs), '\n')
print('firs_speaker list:', sample_refs[0], '\n')
print('chunks_nums', len(sample_refs[0]), '\n')
print(sample_refs[0][0].shape, '\n')

[tensor([[0.0016, 0.0045, 0.0016,  ..., 0.0505, 0.1465, 0.1519]])] 

chunks_num 1 

tensor([[0.0016, 0.0045, 0.0016,  ..., 0.0505, 0.1465, 0.1519]]) 

torch.Size([1, 32000]) 

---------------------------------------------- 

spekears num 2 

firs_speaker list: [tensor([[-0.0042, -0.0083, -0.0139,  ..., -0.0009, -0.0040, -0.0038]])] 

chunks_nums 1 

torch.Size([1, 32000]) 



#### Testing dataloaders LAST UPDATE 1.

In [1]:
import argparse
import sys
from utils.load_config import load_config  

# Parse the path to the hyper-parameter YAML file and load it into `cfg`.
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--hparams", type=str, default="./configs/train_rnn_percent_08.yml", help="hparams config file")
# parse_known_args() ignores unrecognized arguments instead of exiting on them.
args, unknown = parser.parse_known_args()
cfg = load_config(args.hparams)

In [2]:
from data.DiarizationDataset import DiarizationDataset
# Build the data module from the 'datasets' section of the config, then keep
# the train and validation loaders in a dict keyed by phase name.
datamodule = DiarizationDataset(**cfg['datasets']).setup(stage = 'train')
dataloaders = {'train': datamodule.train_dataloader(), 'valid': datamodule.val_dataloader()}

Size of training set: 3140
Size of validation set: 641
Elapsed time 'setup': 00:00:02.00


In [3]:
# # Fetch the first batch from the DataLoader
# dataloader = dataloaders['train'] 
# sample_mix, sample_refs = next(iter(dataloader))  
# print(sample_mix)
# print('chunks_num', len(sample_mix))
# print(sample_mix[0])
# print(sample_mix[0].shape)
# print('----------------------------------------------')
# print('spekears num', len(sample_refs))
# print('firs_speaker list:', sample_refs[0])
# print('chunks_nums', len(sample_refs[0]))
# print(sample_refs[0][0].shape)

tensor([[0.0209, 0.0117, 0.0137,  ..., 0.0167, 0.0109, 0.0182]])
chunks_num 1
tensor([0.0209, 0.0117, 0.0137,  ..., 0.0167, 0.0109, 0.0182])
torch.Size([32000])
----------------------------------------------
spekears num 2
firs_speaker list: tensor([[ 0.0179,  0.0152,  0.0104,  ...,  0.0005, -0.0008, -0.0029]])
chunks_nums 1
torch.Size([32000])


#### Testing dataloaders LAST UPDATE. 2.

In [1]:
import argparse
import sys
from utils.load_config import load_config  

# Parse the path to the hyper-parameter YAML file and load it into `cfg`.
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--hparams", type=str, default="./configs/train_rnn.yml", help="hparams config file")
# parse_known_args() ignores unrecognized arguments instead of exiting on them.
args, unknown = parser.parse_known_args()
cfg = load_config(args.hparams)

In [2]:
from data.DiarizationDataset import DiarizationDataset

# Build the data module from the 'data' section of the config, then keep
# the train and validation loaders in a dict keyed by phase name.
datamodule = DiarizationDataset(**cfg['data']).setup(stage = 'train')
dataloaders = {'train': datamodule.train_dataloader(), 'valid': datamodule.val_dataloader()}

Size of training set: 3140
Size of validation set: 641
Elapsed time 'setup': 00:00:02.50


In [2]:
import torch.nn.functional as F
import torch
import torchaudio

from typing import List, Tuple
import os.path as ospth

def get_file_name(file_path: str):
    """Return the last component of ``file_path`` with its final extension removed."""
    base_name = ospth.basename(file_path)
    stem, _extension = ospth.splitext(base_name)
    return stem

def handle_df(audios: List[Tuple[int, str]]) -> dict:
    """Index a list of ``(common_len, path)`` pairs by audio file name.

    Args:
        audios: sequence of two-item entries ``(common_len, path)``.

    Returns:
        dict mapping ``"<file stem>.flac"`` to
        ``{'common_len': common_len, 'name': path}``.

    Raises:
        RuntimeError: if an entry does not contain exactly two items.
        ValueError: if two entries produce the same key.
    """
    scp_dict = dict()
    for audio in audios:
        # Validate the entry *before* unpacking it; otherwise a malformed entry
        # fails with an unrelated unpacking ValueError and this check is dead.
        if len(audio) != 2:
            raise RuntimeError(
                "Format error in entry {0!r}: expected (common_len, path)".format(audio))
        common_len, path = audio
        key = f"{get_file_name(path)}.flac"
        if key in scp_dict:
            raise ValueError("Duplicated key \'{0}\' exists in {1}".format(
                key, path))
        scp_dict[key] = {'common_len': common_len, 'name': path}
    return scp_dict
        
    
def read_wav(fname, return_rate=False):
    '''
         Load an audio file with torchaudio.
         input:
               fname: audio file path
               return_rate: whether to also return the sampling rate
         output:
                src: the tensor returned by torchaudio.load (channels first)
                     after squeeze(), i.e. with every size-1 dimension removed
                sr: sample rate (only returned when return_rate is True)
    '''
    waveform, sample_rate = torchaudio.load(fname, channels_first=True)
    squeezed = waveform.squeeze()
    return (squeezed, sample_rate) if return_rate else squeezed


def write_wav(fname, src, sample_rate):
    '''
         Save audio to a file by forwarding the arguments to torchaudio.save.
         input:
               fname: destination file path
               src: audio data to write
               sample_rate: an integer, the sample rate of the audio
         output:
               None
    '''
    torchaudio.save(fname, src, sample_rate)


class CustomAudioReader(object):
    '''
        Reads audio files and cuts them into fixed-size chunks.
        Input:
            scp_path: list of (common_len, path) pairs; it is passed straight
                      to handle_df (despite the name it is not a file path)
            sample_rate (int, optional): stored on the instance only (default: 8000)
            chunk_size (int, optional): length of every produced chunk (default: 32000)
            least_size (int, optional): minimum utterance length that is kept,
                      and the hop between consecutive chunks (default: 16000)
        Output:
            self.audio (list): the produced chunks, in file order
    '''

    def __init__(self, scp_path, sample_rate=8000, chunk_size=32000, least_size=16000):
        super(CustomAudioReader, self).__init__()
        self.sample_rate = sample_rate
        self.index_dict = handle_df(scp_path)
        self.keys = list(self.index_dict.keys())
        self.audio = []
        self.chunk_size = chunk_size
        self.least_size = least_size
        self.split()

    def _chunk_utterance(self, utt):
        '''
            Turn one utterance into a list of chunks of length chunk_size.
            - shorter than least_size: dropped (empty list)
            - from least_size up to chunk_size - 1: zero-padded to chunk_size
            - chunk_size or longer: sliding windows with hop least_size;
              a tail that does not fill a whole window is discarded
        '''
        length = utt.shape[0]
        if length < self.least_size:
            return []
        if length < self.chunk_size:
            # Includes length == least_size, which previously matched no branch
            # and was silently dropped.
            gap = self.chunk_size - length
            return [F.pad(utt, (0, gap), mode='constant')]
        last_start = length - self.chunk_size
        return [utt[start:start + self.chunk_size]
                for start in range(0, last_start + 1, self.least_size)]

    def split(self):
        '''
            Read every indexed file, truncate it to its common_len and append
            the resulting chunks to self.audio.
        '''
        for key in self.keys:
            entry = self.index_dict[key]
            common_len, name = entry['common_len'], entry['name']
            utt = read_wav(name)
            # NOTE(review): slicing and shape[0] act on dimension 0, which is the
            # sample axis only for single-channel files — confirm inputs are mono.
            utt = utt[:common_len]
            self.audio.extend(self._chunk_utterance(utt))

    def get_num_after_splitting(self):
        '''Print the number of chunks produced by split().'''
        print(len(self.audio))

In [3]:
import torch

import numpy as np

class CustomDatasets(torch.utils.data.Dataset):
    '''
       Dataset of (mixture chunk, [reference chunk per speaker]) items.
       df: DataFrame with the columns 'common_len_idx' and 'mixed_audio';
           every column from the third one onward is read as one reference
           (speaker) audio path
       sample_rate (int, optional): forwarded to CustomAudioReader (default: 16000)
       chunk_size (int, optional): split audio size (default: 32000)
       least_size (int, optional): minimum split size (default: 16000)
    '''

    def __init__(self, df=None, sample_rate=16000, chunk_size=32000, least_size=16000):
        # super() without arguments starts the MRO lookup at this class;
        # super(torch.utils.data.Dataset, self) skipped Dataset itself.
        super().__init__()
        if df is None:
            raise ValueError("df is required: pass a DataFrame with 'common_len_idx', "
                             "'mixed_audio' and one column per reference speaker")

        # Derive the reference columns from the header rather than from the
        # first row, so an empty frame yields an empty dataset.
        # NOTE(review): assumes the first two columns are 'common_len_idx' and
        # 'mixed_audio' — confirm against the CSV layout.
        ref_columns = list(df.columns[2:])
        mix_scp = []
        ref_scp = [[] for _ in ref_columns]

        for _, row in df.iterrows():
            common_len_idx = row['common_len_idx']
            mix_scp.append([common_len_idx, row['mixed_audio']])
            for entries, col in zip(ref_scp, ref_columns):
                entries.append([common_len_idx, row[col]])

        reader_kwargs = dict(sample_rate=sample_rate, chunk_size=chunk_size, least_size=least_size)
        self.mix_audio = CustomAudioReader(mix_scp, **reader_kwargs).audio
        self.ref_audio = [CustomAudioReader(r, **reader_kwargs).audio for r in ref_scp]

    def __len__(self):
        '''Number of mixture chunks.'''
        return len(self.mix_audio)

    def __getitem__(self, index):
        '''Return the mixture chunk and the chunk at the same index for every reference.'''
        return self.mix_audio[index], [ref[index] for ref in self.ref_audio]

In [4]:
from utils.measure_time import measure_time 
import torch as th
import random
import numpy as np
import math
import pandas as pd

class TestingDiarizationDataset:
    """Splits a CSV index into train/validation/test frames and serves DataLoaders.

    Args:
        data_root: path handed to ``pd.read_csv``.
        train_percent, valid_percent, test_percent: fractions of the rows for
            each split; they must sum to 1. NOTE: the defaults sum to 0.90, so
            at least one of them has to be overridden by the caller.
        shuffle: whether the training loader shuffles.
        num_workers, batch_size, pin_memory: forwarded to the DataLoaders.
        sample_rate, chunk_size, least_size: forwarded to ``CustomDatasets``.
        seed: seeds ``random``, NumPy, torch and the DataLoader generator.
    """

    def __init__(self, data_root = './', train_percent = 0.75, valid_percent = 0.15, test_percent = 0.0, shuffle=False, 
                 num_workers=0, batch_size=1, pin_memory = False, sample_rate=8000, chunk_size=32000, least_size=16000, seed = 42):
        self.shuffle = shuffle
        self.num_workers = num_workers
        self.batch_size = batch_size
        self.pin_memory = pin_memory
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        self.least_size = least_size
        self.seed = seed
        self._set_seed(seed)
        self.g = th.Generator()
        self.g.manual_seed(seed)
        full_data_df = pd.read_csv(data_root)
        assert math.isclose(train_percent + valid_percent + test_percent, 1.0, rel_tol=1e-9), "Sum doesnt equal to 1"
        # Rows are split in file order; the test split takes whatever remains.
        train_size = int(train_percent * len(full_data_df))
        val_size = int(valid_percent * len(full_data_df))
        self.train_df = full_data_df.iloc[:train_size]
        self.val_df = full_data_df.iloc[train_size:train_size + val_size]
        self.test_df = full_data_df.iloc[train_size + val_size:]

    @measure_time
    def setup(self, stage = 'train'):
        """Build the datasets for ``stage`` and return ``self`` so calls can be chained.

        Only ``stage == 'train'`` builds anything (train and validation sets);
        ``'eval'`` is accepted but currently a no-op.
        """
        assert stage in ['train', 'eval'], "Invalid stage"
        if stage == 'train':
            self.train_dataset = self._make_dataset(self.train_df)
            print(f"Size of training set: {len(self.train_dataset)}")
            self.val_dataset = self._make_dataset(self.val_df)
            print(f"Size of validation set: {len(self.val_dataset)}")
        # To Do
        # self.test_dataset

        return self # warning!

    def _make_dataset(self, df):
        """Wrap one split frame in a ``CustomDatasets`` with the configured audio settings."""
        return CustomDatasets(df,
                              sample_rate = self.sample_rate,
                              chunk_size = self.chunk_size,
                              least_size = self.least_size)

    def _make_loader(self, dataset, shuffle):
        """Create a DataLoader over ``dataset`` with the shared loader settings."""
        return th.utils.data.DataLoader(dataset,
                                        batch_size = self.batch_size,
                                        pin_memory = self.pin_memory,
                                        shuffle = shuffle,
                                        num_workers = self.num_workers,
                                        worker_init_fn = self.seed_worker,
                                        generator = self.g)

    def train_dataloader(self):
        """DataLoader over the training set (shuffling follows ``self.shuffle``)."""
        return self._make_loader(self.train_dataset, self.shuffle)

    def val_dataloader(self):
        """DataLoader over the validation set, never shuffled."""
        # Previously built over self.train_dataset, so validation ran on training data.
        return self._make_loader(self.val_dataset, False)

    def _set_seed(self, seed: int):
        """Seed Python's ``random``, NumPy and torch (CPU and all CUDA devices)."""
        random.seed(seed)
        np.random.seed(seed)
        th.manual_seed(seed)
        th.cuda.manual_seed_all(seed)

    def seed_worker(self, worker_id):
        """``worker_init_fn``: seed NumPy and ``random`` from torch's initial seed."""
        worker_seed = th.initial_seed() % 2**32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    # ToDo
    # def test_dataloader(self):

In [5]:
# Same wiring as the earlier cells, but using the notebook-local
# TestingDiarizationDataset defined in this notebook.
datamodule = TestingDiarizationDataset(**cfg['data']).setup(stage = 'train')
dataloaders = {'train': datamodule.train_dataloader(), 'valid': datamodule.val_dataloader()}

Size of training set: 3140
Size of validation set: 641
Elapsed time 'setup': 00:00:02.54


In [6]:
# Fetch the first batch from the training DataLoader and inspect its structure.
dataloader = dataloaders['train'] 
sample_mix, sample_refs = next(iter(dataloader))  
print(sample_mix)
print('chunks_num', len(sample_mix))
print(sample_mix[0])
print(sample_mix[0].shape)
print('----------------------------------------------')
# sample_refs holds one entry per speaker.
print('spekears num', len(sample_refs))
print('firs_speaker list:', sample_refs[0])
print('chunks_nums', len(sample_refs[0]))
print(sample_refs[0][0].shape)

tensor([[0.0209, 0.0117, 0.0137,  ..., 0.0167, 0.0109, 0.0182]])
chunks_num 1
tensor([0.0209, 0.0117, 0.0137,  ..., 0.0167, 0.0109, 0.0182])
torch.Size([32000])
----------------------------------------------
spekears num 2
firs_speaker list: tensor([[ 0.0179,  0.0152,  0.0104,  ...,  0.0005, -0.0008, -0.0029]])
chunks_nums 1
torch.Size([32000])


In [None]:
tensor([[0.0209, 0.0117, 0.0137,  ..., 0.0167, 0.0109, 0.0182]])
chunks_num 1
tensor([0.0209, 0.0117, 0.0137,  ..., 0.0167, 0.0109, 0.0182])
torch.Size([32000])
----------------------------------------------
spekears num 2
firs_speaker list: tensor([[ 0.0179,  0.0152,  0.0104,  ...,  0.0005, -0.0008, -0.0029]])
chunks_nums 1
torch.Size([32000])

In [None]:
# Size of training set: 3140
# Size of validation set: 641
# Elapsed time 'setup': 00:00:03.57

#### Developed | Loss Functions

In [4]:
import torch
from losses import sdr_loss
from torchmetrics.audio import PermutationInvariantTraining
from torchmetrics.functional.audio import signal_distortion_ratio
from torchmetrics.audio import SignalDistortionRatio
from torchmetrics.functional.audio import permutation_invariant_training

from losses import sisnr_pit
from torchmetrics.audio import ScaleInvariantSignalNoiseRatio

# Compare the project's loss functions against the torchmetrics implementations
# on random data. The order of the torch.randn calls determines the printed values.
seed = 42
torch.manual_seed(seed)

batch = 2
spk = 2
time = 32000
sample_mix = [torch.randn(batch, time) for _ in range(spk)]
sample_refs = [torch.randn(batch, time) for _ in range(spk)]

# print(sample_mix[0].shape)

# Blend two tensors; `unsimilarity` is the weight given to tensor1.
def mix_tensors_with_similarity(tensor1, tensor2, unsimilarity=0.05):
    # Linear interpolation between tensor1 and tensor2
    return unsimilarity * tensor1 + (1 - unsimilarity) * tensor2

# Apply the blending to every speaker so the "estimates" correlate with the references.
sample_mix_similar = [
    mix_tensors_with_similarity(m, r, unsimilarity=0.5)
    for m, r in zip(sample_mix, sample_refs)
]

sample_mix = sample_mix_similar
r_sample_mix = sample_mix[::-1]  # same estimates with the speaker order reversed

# Stack along dim 1 (spk): with the values above the shape is torch.Size([2, 2, 32000]).
sample_mix_tensor = torch.stack(sample_mix, dim=1)
sample_refs_tensor = torch.stack(sample_refs, dim=1)

# print(sample_mix_tensor.shape)
# print(sample_refs_tensor.shape)

pit = PermutationInvariantTraining(signal_distortion_ratio, mode="speaker-wise", eval_func="max")
print('taudio:', - pit(sample_mix_tensor, sample_refs_tensor)) # Warning: note the leading "-" (sign flipped)
sdr = SignalDistortionRatio()
print('another taudio:', - sdr(sample_mix_tensor, sample_refs_tensor)) # Warning: note the leading "-" (sign flipped)
print('my sdr:', sdr_loss(sample_mix, sample_refs), '\n')
# print('')

print('my sisnr:', sisnr_pit(sample_mix, sample_refs).item())
print('reversed:', sisnr_pit(r_sample_mix, sample_refs).item())
# NOTE(review): the device is hard-coded to 'cuda'; this part needs a GPU to run.
sisnr = ScaleInvariantSignalNoiseRatio().to('cuda')
sample_mix_tensor = sample_mix_tensor.to('cuda')
sample_refs_tensor = sample_refs_tensor.to('cuda')
sisnr.update(sample_mix_tensor, sample_refs_tensor)
TEMP = sisnr.compute()
print('sisnr taudio:', - TEMP.item()) # Warning: note the leading "-" (sign flipped)

taudio: tensor(-0.1013)
another taudio: tensor(-0.1013)
my sdr: tensor(-2.9903) 

my sisnr: 0.04009392857551575
reversed: 0.04009392857551575
sisnr taudio: 0.04009474068880081


In [3]:
# NOTE(review): SDR_LOSS is not defined or imported in the cells visible here
# (the loss cell imports `sdr_loss`) — confirm the intended name.
print('my sdr:', SDR_LOSS(sample_mix, sample_refs), '\n')

worked
torch.Size([1, 32000]) torch.Size([1, 32000])
worked
torch.Size([1, 32000]) torch.Size([1, 32000])
worked
torch.Size([1, 32000]) torch.Size([1, 32000])
worked
torch.Size([1, 32000]) torch.Size([1, 32000])
my sdr: tensor(-0.1466, device='cuda:0') 



In [5]:
from itertools import permutations

# Show only the first permutation of the two speaker indices, then stop.
for i in permutations(range(2)):
    print(i)
    break

(0, 1)


In [4]:
# NOTE(review): uPIT_Loss is not defined or imported in the cells visible here —
# confirm where it comes from.
print('my uPIT_Loss:', uPIT_Loss(sample_mix, sample_refs).item())

my uPIT_Loss: -0.014378014951944351


In [None]:
from utils.measure_time import measure_time

# Blend two tensors with a given degree of dissimilarity.
def mix_tensors_with_similarity(tensor1, tensor2, unsimilarity=0.05):
    """Return the linear interpolation of the two inputs.

    ``unsimilarity`` is the weight of ``tensor1``; ``tensor2`` receives the
    complementary weight ``1 - unsimilarity``.
    """
    weight_first = unsimilarity
    weight_second = 1 - unsimilarity
    return weight_first * tensor1 + weight_second * tensor2

@measure_time
def check_elapsed_time():
    """Run ``sisnr_pit`` once on small random data and print its value.

    The function is wrapped in ``measure_time``; it builds random estimates
    and references for 2 speakers (batch 1, 3200 samples) and blends each
    estimate half-and-half with its reference before scoring.
    """
    batch_size, num_speakers, num_samples = 1, 2, 3200
    # Estimates are drawn before references, keeping the RNG draw order fixed.
    estimates = [torch.randn(batch_size, num_samples) for _ in range(num_speakers)]
    references = [torch.randn(batch_size, num_samples) for _ in range(num_speakers)]

    # Blend every speaker's estimate with its reference.
    blended = [
        mix_tensors_with_similarity(estimate, reference, unsimilarity=0.5)
        for estimate, reference in zip(estimates, references)
    ]

    print('my sisnr:', sisnr_pit(blended, references).item())

In [None]:
# Run the timing check defined in the previous cell.
check_elapsed_time()

Вердикт: использовать мою функцию для sisnr, а для sdr — реализацию из torchmetrics.

Интересный факт: функция sdr ведёт себя так же, как sisnr, — обе нужно минимизировать: sdr(x) -> min, sisnr(x) -> min.

#### Training.

In [1]:
import argparse
import sys
from utils.load_config import load_config  

# Parse the path to the hyper-parameter YAML file; the config itself is loaded in the next cell.
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--hparams", type=str, default="./configs/dev_dualpathrnn.yml", help="hparams config file")
# parse_known_args() ignores unrecognized arguments instead of exiting on them.
args, unknown = parser.parse_known_args()  

In [2]:
import os
import argparse
from pathlib import Path

import torch
import torchmetrics
from torch.utils.tensorboard import SummaryWriter as TensorBoard
from tqdm.notebook import tqdm

from losses import sisnr_pit, sdr_loss
from utils.load_config import load_config 
from utils.training import metadata_info, configure_optimizer, p_output_log
from utils.measure_time import measure_time
from models import MODELS
from data.DiarizationDataset import DiarizationDataset


# Release cached CUDA memory and allow TF32 / reduced-precision float32 matmuls.
torch.cuda.empty_cache()
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.set_float32_matmul_precision('medium')

# Load the config and build everything the Trainer needs: data loaders, model,
# TensorBoard writer and optimizer.
cfg = load_config(args.hparams)
datamodule = DiarizationDataset(**cfg['data']).setup(stage = 'train')
dataloaders = {'train': datamodule.train_dataloader(), 'valid': datamodule.val_dataloader()}
# The model class is looked up in the MODELS registry by the configured model type.
model_class = MODELS[cfg['xp_config']['model_type']]
model = model_class(**cfg['model'])
metadata_info(model)
# The TensorBoard log directory is named after the config file's stem.
writer = TensorBoard(f'tb_logs/{Path(args.hparams).stem}', comment = f"{cfg['trainer']['ckpt_folder']}")
optimizer = configure_optimizer (cfg, model)

Size of training set: 6847
Size of validation set: 865
Elapsed time 'setup': 00:00:07.26
Trainable parametrs: 2633729
Size of model: 10.05 MB, in float32


In [3]:
from utils.checkpointer import Checkpointer
from utils.training import *


class Trainer:
    """Minimal train/validation loop with optional checkpointing.

    Args:
        num_epochs: index of the last epoch (exclusive) passed to ``range``.
        device: device the model and batches are moved to.
        best_weights: save weights whenever the validation loss improves.
        checkpointing: save a checkpoint every ``checkpoint_interval`` epochs.
        checkpoint_interval: number of epochs between checkpoints.
        model_name, path_to_weights, ckpt_folder: forwarded to ``Checkpointer``;
            ``path_to_weights`` is created if it does not exist.
        trained_model: checkpoint file to resume from. An empty value or a
            directory (such as the default ``'./'``) means "start from scratch".
        speaker_num, resume: stored on the instance; not used by this class.
        max_batches_per_epoch: cap on the batches processed per phase and
            epoch. The default of 6 reproduces the previous hard-coded debug
            limit; pass ``None`` to iterate over the whole dataloader.
    """

    def __init__(self, num_epochs = 100, device='cuda', best_weights = False, checkpointing = False, 
                 checkpoint_interval = 10, model_name = '', trained_model = './', path_to_weights= './weights', 
                 ckpt_folder = '', speaker_num = 2, resume = False, max_batches_per_epoch = 6) -> None:
        self.num_epochs = num_epochs
        self.device = device
        self.best_weights = best_weights
        self.ckpointer = Checkpointer(model_name, path_to_weights, ckpt_folder, metrics = False)
        self.checkpointing = checkpointing
        self.checkpoint_interval = checkpoint_interval
        self.model_name = model_name
        os.makedirs(path_to_weights, exist_ok=True)
        self.path_to_weights = path_to_weights
        self.ckpt_folder = ckpt_folder
        self.speaker_num = speaker_num
        self.resume = resume
        self.trained_model = trained_model
        self.max_batches_per_epoch = max_batches_per_epoch

    @measure_time
    def fit(self, model, dataloaders, criterion, optimizer, writer) -> None:
        """Train and validate ``model`` for the configured number of epochs.

        ``dataloaders`` must provide the keys ``'train'`` and ``'valid'``;
        ``criterion(outputs, labels)`` must return a scalar loss tensor.
        """
        model.to(self.device)
        start_epoch, min_val_loss, model, optimizer = self.load_pretrained_model(model, optimizer)
        epoch_state = EpochState(metrics = None)
        for epoch in tqdm(range(start_epoch, self.num_epochs)):
            for phase in ['train', 'valid']:
                if phase == 'train':
                    model.train()
                else:
                    model.eval()
                epoch_loss = self._run_phase(model, dataloaders[phase], criterion, optimizer, phase)
                epoch_state.update_state(epoch_loss, phase)
                p_output_log(self.num_epochs, epoch, phase, epoch_state)

                if phase == 'valid' and self.best_weights and epoch_loss < min_val_loss:
                    min_val_loss = epoch_loss
                    self.ckpointer.save_best_weight(model, optimizer, epoch, epoch_state)

            torch_logger(writer, epoch, epoch_state)

            if self.checkpointing and (epoch + 1) % self.checkpoint_interval == 0:
                self.ckpointer.save_checkpoint(model, optimizer, epoch, epoch_state)

    def _run_phase(self, model, dataloader, criterion, optimizer, phase):
        """Run one pass over ``dataloader`` and return the mean batch loss."""
        is_train = phase == 'train'
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in dataloader:
            if self.max_batches_per_epoch is not None and num_batches >= self.max_batches_per_epoch:
                break
            inputs, labels = inputs.to(self.device), [l.to(self.device) for l in labels]
            # Gradients are only tracked (and applied) in the training phase.
            with torch.set_grad_enabled(is_train):
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                if is_train:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            # Fail with a clear message instead of a ZeroDivisionError below.
            raise RuntimeError(f"No batches were processed in phase '{phase}': "
                               "the dataloader is empty or max_batches_per_epoch is 0")
        return running_loss / num_batches

    def load_pretrained_model(self, model, optimizer):
        """Restore model/optimizer state from ``self.trained_model`` if it names a file.

        Returns:
            ``(start_epoch, min_val_loss, model, optimizer)``; ``(0, inf, ...)``
            when there is nothing to load.
        """
        # A directory cannot be a checkpoint file; the default './' used to be
        # treated as a checkpoint path and handed to torch.load.
        if not self.trained_model or os.path.isdir(self.trained_model):
            return 0, float('inf'), model, optimizer

        print(f"Load pretrained mode: {self.trained_model}", '\n')
        # SECURITY: weights_only=False unpickles arbitrary objects; only load
        # checkpoints from a trusted source.
        checkpoint = torch.load(self.trained_model, map_location=self.device, weights_only=False)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        return checkpoint['epoch'] + 1, checkpoint['val_loss'] , model, optimizer

In [4]:
# Build the Trainer from the 'trainer' config section and train with sisnr_pit as the criterion.
Trainer(**cfg['trainer']).fit(model, 
                              dataloaders, 
                              sisnr_pit, 
                              optimizer, 
                              writer)

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch 1/200
TRAIN, Loss: 0.9106
VALID, Loss: 0.0176
------------------------------------------------------------------------------------------------------------ 

Epoch 2/200
TRAIN, Loss: -0.3866
VALID, Loss: 0.0454
------------------------------------------------------------------------------------------------------------ 

Epoch 3/200
TRAIN, Loss: -0.6582
VALID, Loss: 0.3061
------------------------------------------------------------------------------------------------------------ 

Epoch 4/200
TRAIN, Loss: -0.7400
VALID, Loss: -0.0281
------------------------------------------------------------------------------------------------------------ 

Epoch 5/200
TRAIN, Loss: -1.0677
VALID, Loss: -0.0290
------------------------------------------------------------------------------------------------------------ 

Epoch 6/200
TRAIN, Loss: -1.2396
VALID, Loss: 0.0302
------------------------------------------------------------------------------------------------------------ 

Epoch 7/200
TRA

In [None]:
# Epoch 1/200
# TRAIN, Loss: -0.4523
# VALID, Loss: -0.8374

In [None]:
Epoch 1/200
TRAIN, Loss: -0.4071
VALID, Loss: -0.6765

In [None]:
# -12