# NVIDIA NeMo
In this notebook we run NVIDIA NeMo speaker diarization (with oracle VAD) on every audio file in the test set.

In [2]:
import IPython
import matplotlib.pyplot as plt
import numpy as np
import librosa
from nemo.collections.asr.parts.utils.speaker_utils import (
    rttm_to_labels,
    labels_to_pyannote_object,
)
import os
import IPython
import matplotlib.pyplot as plt
import numpy as np
import librosa
from tqdm import tqdm

from omegaconf import OmegaConf
from nemo.collections.asr.models import ClusteringDiarizer


import wget
import json
import nemo.utils
import logging

# Raise NeMo's logger threshold to ERROR so that lower-severity messages do
# not flood the notebook output while the dataset is processed.
nemo_logger = nemo.utils.logging
nemo_logger.setLevel(logging.ERROR)

In [3]:
# Directories
data_directory = "../Dataset/Audio/Test"  # input WAV files
rttm_directory = "../Dataset/RTTMs/Test"  # reference RTTM annotations
sr = 16000
results_directory = "../Results/Oracle_vad"  # diarizer output directory
model_config = "../Config/"  # where the inference YAML is stored
Metadata_test = "../Metadata_test/"  # where the per-file manifests are written

# Load the diarization inference config, downloading it on first use.
MODEL_CONFIG = os.path.join(model_config, "diar_infer_telephonic.yaml")
if not os.path.exists(MODEL_CONFIG):
    print("Did not find the model config file, downloading it now")
    config_url = "https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml"
    # wget.download only treats `model_config` as a target directory when it
    # already exists, so create it before downloading.
    os.makedirs(model_config, exist_ok=True)
    MODEL_CONFIG = wget.download(config_url, model_config)
config = OmegaConf.load(MODEL_CONFIG)

# Speaker-embedding model and its multi-scale segmentation: five window
# lengths, each with its own shift, all weighted equally.
pretrained_speaker_model = "titanet_large"
config.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [
    1.5,
    1.25,
    1.0,
    0.75,
    0.5,
]
config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [
    0.75,
    0.625,
    0.5,
    0.375,
    0.1,
]
config.diarizer.speaker_embeddings.parameters.multiscale_weights = [1, 1, 1, 1, 1]

config.diarizer.oracle_vad = True  # ----> ORACLE VAD
# The speaker count is not supplied as an oracle value.
config.diarizer.clustering.parameters.oracle_num_speakers = False
config.device = "cuda"
config.diarizer.out_dir = results_directory  # Set out_dir correctly
# NOTE(review): `output_dir` is set in addition to `out_dir`; confirm that the
# diarizer actually reads this key, otherwise it can be dropped.
config.diarizer.output_dir = results_directory


# The manifest directory must exist before the per-file manifests are written.
os.makedirs(Metadata_test, exist_ok=True)

# Diarize every WAV file in the test set: write a one-line manifest for the
# file, run the clustering diarizer on it, then read back the predicted RTTM.
# Sorting the listing makes the processing order reproducible across runs.
# NOTE(review): the diarizer is rebuilt for every file; a single manifest with
# one JSON line per file may allow one model instance to handle the whole set
# -- confirm against the NeMo documentation before changing.
# NOTE(review): `labels`, `reference` and `pred_labels_neural` are overwritten
# on each iteration, so only the last file's values survive the loop.
for filename in tqdm(sorted(os.listdir(data_directory))):
    if not filename.endswith(".wav"):
        continue

    # Strip only the extension; str.replace would rewrite every ".wav"
    # occurrence inside the name.
    stem = os.path.splitext(filename)[0]

    # Audio file and its reference RTTM (stored in `rttm_directory`).
    # The waveform is not loaded here: the audio path is handed to the
    # diarizer through the manifest below.
    audio_path = os.path.join(data_directory, filename)
    rttm_path = os.path.join(rttm_directory, stem + ".rttm")

    # Get the labels from the reference RTTM file
    labels = rttm_to_labels(rttm_path)
    reference = labels_to_pyannote_object(labels)

    # Create the metadata dictionary
    meta = {
        "audio_filepath": audio_path,
        "offset": 0,
        "duration": None,
        "label": "infer",
        "text": "-",
        "num_speakers": None,
        "rttm_filepath": rttm_path,
        "uem_filepath": None,
    }

    # Write the metadata to a JSON manifest (one JSON object per line)
    manifest_filename = stem + ".json"
    manifest_path = os.path.join(Metadata_test, manifest_filename)
    with open(manifest_path, "w", encoding="utf-8") as fp:
        json.dump(meta, fp)
        fp.write("\n")

    # Point the shared config at this file's manifest and run the diarizer
    config.diarizer.manifest_filepath = manifest_path
    oracle_vad_clusdiar_model = ClusteringDiarizer(cfg=config)
    oracle_vad_clusdiar_model.diarize()

    # Read the predicted RTTM from <results_directory>/pred_rttms
    rttm_pred_path = os.path.join(results_directory, "pred_rttms", stem + ".rttm")
    pred_labels_neural = rttm_to_labels(rttm_pred_path)

  0%|          | 0/232 [00:00<?, ?it/s]

Clustering Sub-Windows: 100%|██████████| 2/2 [00:05<00:00,  2.90s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:05<00:00,  2.77s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:05<00:00,  2.90s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:06<00:00,  3.15s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:06<00:00,  3.03s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:05<00:00,  2.81s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:05<00:00,  2.91s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:05<00:00,  2.80s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:05<00:00,  2.85s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:05<00:00,  2.71s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:06<00:00,  3.09s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:06<00:00,  3.00s/window]
Clustering Sub-Windows: 100%|██████████| 2/2 [00:06<00:00,  3.01s/window]
Clustering Sub-Windows: 100%|█████████