In [1]:
%%capture

import json
import speechbrain as sb
import os, sys
from speechbrain.utils.data_utils import get_all_files
import torch
from speechbrain.dataio.dataio import read_audio
import random
import torchaudio

import json
from tqdm import tqdm
import torchaudio
from joblib import Parallel, delayed

def process_file(path):
    """Build the annotation entry for a single segment wav file.

    The file name (directories stripped) is split on "_": every token except
    the last one forms the utterance id, and the fourth token is stored as the
    number of speakers.

    Args:
        path: Path to the wav file. Both "/" and "\\" separators are accepted.

    Returns:
        A ``(utt_id, entry)`` tuple. ``entry`` is a dict with ``wav_path``
        (the path with forward slashes), ``num_speakers`` (string taken from
        the file name) and ``length`` (duration in seconds).

    Raises:
        ValueError: If the file name has fewer than four "_"-separated tokens.
    """
    wav_path = path.replace("\\", "/")
    tokens = wav_path.rsplit("/", 1)[-1].split("_")
    if len(tokens) < 4:
        raise ValueError(
            f"Cannot read the number of speakers from file name: {path!r}"
        )

    utt_id = "_".join(tokens[:-1])
    num_speakers = tokens[3]

    info = torchaudio.info(path)
    # Divide by the file's own sample rate rather than assuming 16 kHz, so the
    # duration is correct for recordings stored at any rate.
    length = info.num_frames / info.sample_rate

    return utt_id, {
        "wav_path": wav_path,
        "num_speakers": num_speakers,
        "length": length
    }

def load_json(json_paths, save_file="train", output_dir="../data", n_jobs=-1):
    """Create a JSON annotation file for a list of wav files.

    Despite its name, this function *writes* JSON: every path is processed
    with ``process_file`` (in parallel through joblib) and the resulting
    ``{utterance_id: entry}`` mapping is saved to
    ``<output_dir>/<save_file>_data.json``.

    Args:
        json_paths: Iterable of wav file paths to annotate.
        save_file: Prefix of the output file name.
        output_dir: Directory receiving the JSON file; created if missing.
        n_jobs: Number of joblib workers (-1 is forwarded unchanged to joblib).

    Returns:
        None.
    """
    import warnings

    results = Parallel(n_jobs=n_jobs, verbose=10)(
        delayed(process_file)(path) for path in json_paths
    )

    data = {}
    n_duplicates = 0
    for utt_id, entry in results:
        if utt_id in data:
            n_duplicates += 1
        data[utt_id] = entry

    # Two files with the same name in different folders share an id; the later
    # one wins, so make that loss visible instead of silent.
    if n_duplicates:
        warnings.warn(
            f"{n_duplicates} duplicate utterance id(s) were overwritten "
            f"while building '{save_file}' annotations"
        )

    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/{save_file}_data.json", 'w', encoding="utf-8") as json_file:
        json.dump(data, json_file, indent=4)


# Gather the segment wav files of each split, then write one annotation JSON per split.
# NOTE(review): the "test" annotations are built from ../data/dev and the
# "valid" annotations from ../data/eval -- confirm this mapping is intended.
_split_dirs = ("../data/train", "../data/dev", "../data/eval")
train_files, test_files, valid_files = [
    get_all_files(_folder, match_and=['_segment.wav']) for _folder in _split_dirs
]

for _split_name, _split_files in (
    ("train", train_files),
    ("test", test_files),
    ("valid", valid_files),
):
    load_json(_split_files, save_file=_split_name)


In [2]:
# Split the test files by the speaker-count tag found in each path and write
# one annotation JSON per class.
_spk_tags = {
    "test_files_no_spk": 'spk_0',
    "test_files_1_spk": 'spk_1',
    "test_files_2_spk": 'spk_2',
    "test_files_3_spk": 'spk_3',
    "test_files_4_spk": 'spk_4',
}
_spk_subsets = {
    _subset: [file for file in test_files if _tag in file]
    for _subset, _tag in _spk_tags.items()
}

# Keep the individual list names available to later cells.
test_files_no_spk = _spk_subsets["test_files_no_spk"]
test_files_1_spk = _spk_subsets["test_files_1_spk"]
test_files_2_spk = _spk_subsets["test_files_2_spk"]
test_files_3_spk = _spk_subsets["test_files_3_spk"]
test_files_4_spk = _spk_subsets["test_files_4_spk"]

for _subset, _subset_files in _spk_subsets.items():
    load_json(_subset_files, save_file=_subset)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.009006738662719727s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  32 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.037862539291381836s.) Setting batch_size=4.
[Parallel(n_jobs=-1)]: Done  50 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 106 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.027352571487426758s.) Setting batch_size=8.
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 220 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Batch computation too fast (0.02952289581298828s.) 

In [3]:
%%file hparams_ecapa_tdnn_augmentation.yaml
# Basic configuration for reproducibility
seed: 1986
# Applies torch.manual_seed(<seed>) when this file is loaded.
__set_seed: !!python/object/apply:torch.manual_seed [!ref <seed>]

# Output directories for saving the results, model checkpoints, and logs
output_folder: !ref ../results/ECAPA/Augmented/<seed>
save_folder: !ref <output_folder>/save
train_log: !ref <output_folder>/train_log.txt

# Paths to the dataset annotation files and to the noise/RIR lists used for augmentation
data_folder: ../data
train_annotation: !ref <data_folder>/train_data.json
valid_annotation: !ref <data_folder>/valid_data.json
test_annotation: !ref <data_folder>/test_data.json
# Subsets of the test annotations, one file per speaker-count tag (spk_0 ... spk_4)
test_0_spk_annotation: !ref <data_folder>/test_files_no_spk_data.json
test_1_spk_annotation: !ref <data_folder>/test_files_1_spk_data.json
test_2_spk_annotation: !ref <data_folder>/test_files_2_spk_data.json
test_3_spk_annotation: !ref <data_folder>/test_files_3_spk_data.json
test_4_spk_annotation: !ref <data_folder>/test_files_4_spk_data.json

noise_annotation: !ref <data_folder>/noises.csv
# NOTE(review): rir_annotation is only referenced by the commented-out add_reverb block.
rir_annotation: !ref <data_folder>/simulated_rirs.csv

# Training hyperparameters
sample_rate: 16000
number_of_epochs: 20
batch_size: 64
# lr_start is the learning rate passed to the optimizer (opt_class).
# NOTE(review): lr_final is not referenced anywhere else in this file.
lr_start: 0.001
lr_final: 0.0001
weight_decay: 0.00002
num_workers: 0 # 0 for compatibility with Windows; 4 can be used on Linux
# n_classes is the classifier output size; dim is the embedding size.
n_classes: 5
dim: 192
num_attention_channels: 128
shuffle: True

# Learning-rate schedule settings; base_lr, max_lr and step_size are passed to lr_annealing.
lr: 0.0001
mode: exp_range
gamma: 0.9998
base_lr: 0.000001
max_lr: !ref <lr>
step_size: 396 # 4 times number of iterations/epoch (2 to 10 is suggested)

# Options shared by the train, validation and test data loaders
dataloader_options:
    batch_size: !ref <batch_size>
    num_workers: !ref <num_workers>
    shuffle: !ref <shuffle>

# Checkpointing configuration to enable and set intervals
# NOTE(review): these two keys are not referenced anywhere else in this file.
ckpt_enable: True
ckpt_interval_minutes: 15

# Feature extraction parameters
n_mels: 40

# Data augmentation settings: additive noise, speed perturbation, and frequency dropping
# (reverberation is currently disabled, see the commented-out add_reverb block).
skip_prep: True
snr_low: 0  # Min SNR for noise augmentation
snr_high: 15  # Max SNR for noise augmentation

# Additive noise taken from the files listed in <noise_annotation>
add_noise: !new:speechbrain.augment.time_domain.AddNoise
    csv_file: !ref <noise_annotation>
    snr_low: !ref <snr_low>
    snr_high: !ref <snr_high>
    noise_sample_rate: !ref <sample_rate>
    clean_sample_rate: !ref <sample_rate>
    num_workers: !ref <num_workers>

# Speed perturbation
speed_changes: [95, 100, 105]  # Speeds as a percentage of the original speed

speed_perturb: !new:speechbrain.augment.time_domain.SpeedPerturb
    orig_freq: !ref <sample_rate>
    speeds: !ref <speed_changes>

# add_reverb: !new:speechbrain.augment.time_domain.AddReverb
#     csv_file: !ref <rir_annotation>
#     reverb_sample_rate: !ref <sample_rate>
#     clean_sample_rate: !ref <sample_rate>

# Frequency drop: randomly drops a number of frequency bands to zero.
drop_freq_low: 0  # Lowest frequency that can be dropped, as a fraction of sample_rate / 2
drop_freq_high: 1  # Highest frequency that can be dropped, as a fraction of sample_rate / 2
drop_freq_count_low: 1  # Min number of frequency bands to drop
drop_freq_count_high: 3  # Max number of frequency bands to drop
drop_freq_width: 0.05  # Width of each dropped band, as a fraction of sample_rate / 2

drop_freq: !new:speechbrain.augment.time_domain.DropFreq
    drop_freq_low: !ref <drop_freq_low>
    drop_freq_high: !ref <drop_freq_high>
    drop_freq_count_low: !ref <drop_freq_count_low>
    drop_freq_count_high: !ref <drop_freq_count_high>
    drop_freq_width: !ref <drop_freq_width>

# Waveform augmentation pipeline used by the training script on TRAIN batches.
# The augmentations listed below are chained in the given order
# (parallel_augment: False, shuffle_augmentations: False) with probability 1,
# and the augmented batch is concatenated with the original one
# (concat_original: True).
# min/max_augmentations are set to 3 so they match the number of
# augmentations actually listed.
wav_augment: !new:speechbrain.augment.augmenter.Augmenter
    parallel_augment: False
    concat_original: True
    repeat_augment: 1
    shuffle_augmentations: False
    min_augmentations: 3
    max_augmentations: 3
    augment_prob: 1.0
    augmentations: [
        !ref <add_noise>,
        !ref <speed_perturb>,
        !ref <drop_freq>]

# Feature extraction: filterbank features with <n_mels> mel bins
compute_features: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels>

# Input normalization: per-sentence mean normalization (std_norm is disabled)
mean_var_norm: !new:speechbrain.processing.features.InputNormalization
    norm_type: sentence
    std_norm: False

##################################
###### Model Configuration #######
########## ECAPA-TDNN ############
##################################

# Embedding network: takes <n_mels>-dimensional features, lin_neurons is set to <dim>
embedding_model: !new:speechbrain.lobes.models.ECAPA_TDNN.ECAPA_TDNN
    input_size: !ref <n_mels>
    channels: [256, 256, 256, 256, 768]
    kernel_sizes: [5, 3, 3, 3, 1]
    dilations: [1, 2, 3, 4, 1]
    attention_channels: !ref <num_attention_channels>
    lin_neurons: !ref <dim>

# Classifier on top of the embeddings with <n_classes> outputs
classifier: !new:speechbrain.lobes.models.ECAPA_TDNN.Classifier
    input_size: !ref <dim>
    out_neurons: !ref <n_classes>

# Epoch counter limited to <number_of_epochs>
epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
    limit: !ref <number_of_epochs> 

# Modules handed to the Brain class by the training script (hparams["modules"])
modules:
    compute_features: !ref <compute_features>
    embedding_model: !ref <embedding_model>
    classifier: !ref <classifier>
    mean_var_norm: !ref <mean_var_norm>

# Loss configuration with margin and scale parameters
compute_cost: !new:speechbrain.nnet.losses.LogSoftmaxWrapper
    loss_fn: !new:speechbrain.nnet.losses.AdditiveAngularMargin
        margin: 0.2
        scale: 30

# Optimizer settings (Adam, constructed by the Brain class from this !name reference)
opt_class: !name:torch.optim.Adam
    lr: !ref <lr_start>
    weight_decay: !ref <weight_decay>

# Cyclic learning-rate scheduler. `mode` and `gamma` are declared in the
# hyperparameter section of this file and are forwarded here so that the
# configured policy (instead of the scheduler's defaults) is actually used.
lr_annealing: !new:speechbrain.nnet.schedulers.CyclicLRScheduler
    base_lr: !ref <base_lr>
    max_lr: !ref <max_lr>
    step_size: !ref <step_size> # 4 times number of iterations/epoch (2 to 10 is suggested)
    mode: !ref <mode>
    gamma: !ref <gamma>

# Logging and metric evaluation settings
# Training/validation statistics are written to <train_log>.
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
    save_file: !ref <train_log>

# Factory for the classification-error tracker; the training script calls it
# at the start of every non-training stage.
error_stats: !name:speechbrain.utils.metric_stats.MetricStats
    metric: !name:speechbrain.nnet.losses.classification_error
        reduction: batch


# Checkpoint management for model saving and recovery
# Checkpoints are stored in <save_folder> and cover the objects listed under recoverables.
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
    checkpoints_dir: !ref <save_folder>
    recoverables:
        embedding_model: !ref <embedding_model>
        classifier: !ref <classifier>
        normalizer: !ref <mean_var_norm>
        counter: !ref <epoch_counter>


Overwriting hparams_ecapa_tdnn_augmentation.yaml


In [4]:
%%file train_ecapa_tdnn.py
import os
import sys
import torch
import torchaudio
import speechbrain as sb
from hyperpyyaml import load_hyperpyyaml

class ECAPABrain(sb.Brain):
    """Class that manages the training loop. See speechbrain.core.Brain."""

    def compute_forward(self, batch, stage):
        """Run features -> normalization -> embedding model -> classifier.

        Waveform augmentation is applied only in the TRAIN stage and only when
        the hyperparameters define ``wav_augment``.

        Arguments
        ---------
        batch : PaddedBatch
            Batch providing ``sig`` (waveforms and relative lengths).
        stage : sb.Stage
            Current stage (TRAIN, VALID or TEST).

        Returns
        -------
        outputs, lens
            Classifier outputs and the (possibly augmented) relative lengths.
        """
        batch = batch.to(self.device)
        wavs, lens = batch.sig

        # Add waveform augmentation if specified.
        if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
            wavs, lens = self.hparams.wav_augment(wavs, lens)

        # Feature extraction and normalization
        feats = self.modules.compute_features(wavs)
        feats = self.modules.mean_var_norm(feats, lens)

        # Embeddings + classifier
        embeddings = self.modules.embedding_model(feats)
        outputs = self.modules.classifier(embeddings)

        return outputs, lens

    def compute_objectives(self, predictions, batch, stage):
        """Compute the loss using the encoded number of speakers as label.

        Arguments
        ---------
        predictions : tuple
            ``(outputs, lens)`` as returned by ``compute_forward``.
        batch : PaddedBatch
            Batch providing ``num_speakers_encoded`` and ``id``.
        stage : sb.Stage
            Current stage (TRAIN, VALID or TEST).

        Returns
        -------
        loss
            The value returned by ``hparams.compute_cost``.
        """
        predictions, lens = predictions
        spkenc, _ = batch.num_speakers_encoded

        # Replicate labels so they line up with the augmented batch.
        if stage == sb.Stage.TRAIN and hasattr(self.hparams, "wav_augment"):
            spkenc = self.hparams.wav_augment.replicate_labels(spkenc)

        loss = self.hparams.compute_cost(predictions, spkenc, lens)

        # Schedulers exposing ``on_batch_end`` are stepped once per training batch.
        if stage == sb.Stage.TRAIN and hasattr(
            self.hparams.lr_annealing, "on_batch_end"
        ):
            self.hparams.lr_annealing.on_batch_end(self.optimizer)

        if stage != sb.Stage.TRAIN:
            self.error_metrics.append(batch.id, predictions, spkenc, lens)

        return loss

    def on_stage_start(self, stage, epoch=None):
        """Create a fresh error tracker at the start of VALID/TEST stages."""
        if stage != sb.Stage.TRAIN:
            self.error_metrics = self.hparams.error_stats()

    def on_stage_end(self, stage, stage_loss, epoch=None):
        """Store, log and checkpoint statistics at the end of a stage.

        Arguments
        ---------
        stage : sb.Stage
            Stage that just finished.
        stage_loss : float
            Average loss of the stage.
        epoch : int
            Current epoch (passed by ``fit``; None during evaluation).
        """
        # Compute/store important stats
        stage_stats = {"loss": stage_loss}
        if stage == sb.Stage.TRAIN:
            self.train_stats = stage_stats
        else:
            stage_stats["ErrorRate"] = self.error_metrics.summarize("average")

        # Perform end-of-iteration things, like annealing, logging, etc.
        if stage == sb.Stage.VALID:
            old_lr, new_lr = self.hparams.lr_annealing(epoch)
            sb.nnet.schedulers.update_learning_rate(self.optimizer, new_lr)

            self.hparams.train_logger.log_stats(
                stats_meta={"epoch": epoch, "lr": old_lr},
                train_stats=self.train_stats,
                valid_stats=stage_stats,
            )
            self.checkpointer.save_and_keep_only(
                meta={"ErrorRate": stage_stats["ErrorRate"]},
                min_keys=["ErrorRate"],
            )
        elif stage == sb.Stage.TEST:
            # Report the test statistics; without this the loss and error rate
            # computed during evaluate() are discarded.
            self.hparams.train_logger.log_stats(
                stats_meta={
                    "Epoch loaded": self.hparams.epoch_counter.current
                },
                test_stats=stage_stats,
            )

def dataio_prep(hparams):
    """Prepare the data IO (datasets and their processing pipelines).

    Arguments
    ---------
    hparams : dict
        Loaded hyperparameters. Reads ``<name>_annotation`` for every dataset
        name below, ``save_folder`` and, when present, ``sample_rate``.

    Returns
    -------
    datasets : dict
        Maps each dataset name ("train", "valid", "test" and the per-class
        "test_<n>_spk" subsets) to a DynamicItemDataset yielding ``id``,
        ``sig`` and ``num_speakers_encoded``.
    """

    # Initialize the label encoder
    label_encoder = sb.dataio.encoder.CategoricalEncoder()

    # Resample to the configured rate; 16 kHz is kept as the fallback so the
    # behavior is unchanged when the key is absent.
    target_sample_rate = hparams.get("sample_rate", 16000)

    # Define audio pipeline
    @sb.utils.data_pipeline.takes("wav_path")
    @sb.utils.data_pipeline.provides("sig")
    def audio_pipeline(wav_path):
        """Load a wav file and return it as a 1-D waveform at the target rate."""
        sig, fs = torchaudio.load(wav_path)

        # Down-mix multi-channel audio: squeeze(0) alone would leave a
        # (channels, time) tensor for anything that is not mono.
        if sig.shape[0] > 1:
            sig = sig.mean(dim=0, keepdim=True)

        sig = torchaudio.functional.resample(
            sig, fs, target_sample_rate
        ).squeeze(0)
        return sig

    # Define label pipeline
    @sb.utils.data_pipeline.takes("num_speakers")
    @sb.utils.data_pipeline.provides("num_speakers_encoded")
    def label_pipeline(num_speakers):
        """Encode the ``num_speakers`` annotation as a tensor label."""
        num_speakers_encoded = label_encoder.encode_label_torch(num_speakers)
        yield num_speakers_encoded

    # Create datasets
    dataset_names = [
        "train",
        "valid",
        "test",
        "test_0_spk",
        "test_1_spk",
        "test_2_spk",
        "test_3_spk",
        "test_4_spk",
    ]
    datasets = {}
    for dataset_name in dataset_names:
        datasets[dataset_name] = sb.dataio.dataset.DynamicItemDataset.from_json(
            json_path=hparams[f"{dataset_name}_annotation"],
            dynamic_items=[audio_pipeline, label_pipeline],
            output_keys=["id", "sig", "num_speakers_encoded"],
        )

    # Load the label encoder from disk or create it from the training labels.
    lab_enc_file = os.path.join(hparams["save_folder"], "label_encoder.txt")
    label_encoder.load_or_create(
        path=lab_enc_file,
        from_didatasets=[datasets["train"]],
        output_key="num_speakers",
    )

    return datasets

if __name__ == "__main__":
    # Loading the hyperparameters file and command line arguments
    hparams_file, run_opts, overrides = sb.parse_arguments(sys.argv[1:])

    # Load hyperparameter configuration file
    with open(hparams_file) as fin:
        hparams = load_hyperpyyaml(fin, overrides)

    # Create experiment directory
    sb.core.create_experiment_directory(
        experiment_directory=hparams["output_folder"],
        hyperparams_to_save=hparams_file,
        overrides=overrides,
    )

    # Prepare data IO
    datasets = dataio_prep(hparams)

    # Initialize the Brain object for training the ECAPA-TDNN model
    ecapa_brain = ECAPABrain(
        modules=hparams["modules"],
        opt_class=hparams["opt_class"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )

    # Train the model
    ecapa_brain.fit(
        epoch_counter=ecapa_brain.hparams.epoch_counter,
        train_set=datasets["train"],
        valid_set=datasets["valid"],
        train_loader_kwargs=hparams["dataloader_options"],
        valid_loader_kwargs=hparams["dataloader_options"],
    )

    # Evaluate on the full test set, then on each per-class subset (each one
    # exactly once). min_key must match the key stored in the checkpoint
    # metadata by ECAPABrain.on_stage_end ("ErrorRate"); with a different key
    # no checkpoint matches and the best model is never reloaded.
    evaluation_plan = [
        ("test", None),
        ("test_0_spk", "Error of No spk class"),
        ("test_1_spk", "Error of 1 spk class"),
        ("test_2_spk", "Error of 2 spk class"),
        ("test_3_spk", "Error of 3 spk class"),
        ("test_4_spk", "Error of 4 spk class"),
    ]
    for dataset_name, banner in evaluation_plan:
        if banner is not None:
            print(banner)
        ecapa_brain.evaluate(
            test_set=datasets[dataset_name],
            min_key="ErrorRate",
            test_loader_kwargs=hparams["dataloader_options"],
        )

Overwriting train_ecapa_tdnn.py


In [5]:
import torch

# Select the first GPU only when CUDA is present, so this cell also runs on a
# CPU-only machine instead of raising.
if torch.cuda.is_available():
    torch.cuda.set_device("cuda:0")

In [2]:
# Launch training in a subprocess, using the script and the hyperparameter file written by the %%file cells above.
!python train_ecapa_tdnn.py hparams_ecapa_tdnn_augmentation.yaml

^C
