In [14]:
import IPython
import matplotlib.pyplot as plt
import numpy as np
import librosa
from nemo.collections.asr.parts.utils.speaker_utils import (
    rttm_to_labels,
    labels_to_pyannote_object,
)




In [None]:
import os
import IPython
import matplotlib.pyplot as plt
import numpy as np
import librosa

data_directory = "../Dataset/Audio/Dev"
rttm_directory = "../Dataset/RTTMs/Dev"

# Specify the file names
an4_audio = os.path.join(data_directory, "afjiv.wav")
an4_rttm = os.path.join(rttm_directory, "afjiv.rttm")

# Ensure the files exist
if not os.path.exists(an4_audio):
    raise FileNotFoundError(f"{an4_audio} not found.")
if not os.path.exists(an4_rttm):
    raise FileNotFoundError(f"{an4_rttm} not found.")

# Load and plot the audio
sr = 16000
signal, sr = librosa.load(an4_audio, sr=sr)

fig, ax = plt.subplots(1, 1)
fig.set_figwidth(20)
fig.set_figheight(2)
plt.plot(np.arange(len(signal)), signal, "gray")
fig.suptitle("Reference merged an4 audio", fontsize=16)
plt.xlabel("time (secs)", fontsize=18)
ax.margins(x=0)
plt.ylabel("signal strength", fontsize=16)
a, _ = plt.xticks()
plt.xticks(a, a / sr)

# Play the audio
IPython.display.Audio(an4_audio)

In [None]:
from nemo.collections.asr.parts.utils.speaker_utils import (
    rttm_to_labels,
    labels_to_pyannote_object,
)
labels = rttm_to_labels(an4_rttm)
reference = labels_to_pyannote_object(labels)
print(labels)
print(reference)

In [None]:
# Create a manifest for input with below format.
# {'audio_filepath': /path/to/audio_file, 'offset': 0, 'duration':None, 'label': 'infer', 'text': '-',
# 'num_speakers': None, 'rttm_filepath': /path/to/rttm/file, 'uem_filepath'='/path/to/uem/filepath'}
import json

# Define the root and data directory
ROOT = os.getcwd()
data_dir = os.path.join(ROOT, 'data')

# Ensure the data directory exists
os.makedirs(data_dir, exist_ok=True)


# Create the metadata dictionary
meta = {
    "audio_filepath": an4_audio,
    "offset": 0,
    "duration": None,
    "label": "infer",
    "text": "-",
    "num_speakers": None,
    "rttm_filepath": an4_rttm,
    "uem_filepath": None,
}

# Write the metadata to a JSON file
manifest_path = os.path.join(data_dir, "input_manifest.json")
with open(manifest_path, "w") as fp:
    json.dump(meta, fp)
    fp.write("\n")

# Display the content of the JSON file
with open(manifest_path, "r") as fp:
    content = fp.read()
    print(content)

# Create the output directory
output_dir = os.path.join(ROOT, "oracle_vad")
os.makedirs(output_dir, exist_ok=True)

In [None]:
from omegaconf import OmegaConf
import wget

MODEL_CONFIG = os.path.join(data_dir, "diar_infer_telephonic.yaml")
if not os.path.exists(MODEL_CONFIG):
    config_url = "https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_telephonic.yaml"
    MODEL_CONFIG = wget.download(config_url, data_dir)

config = OmegaConf.load(MODEL_CONFIG)
print(OmegaConf.to_yaml(config))

In [None]:
config.diarizer.manifest_filepath = "data/input_manifest.json"
config.diarizer.out_dir = (
    output_dir  # Directory to store intermediate files and prediction outputs
)
pretrained_speaker_model = "titanet_large"
config.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
config.diarizer.speaker_embeddings.parameters.window_length_in_sec = [
    1.5,
    1.25,
    1.0,
    0.75,
    0.5,
]
config.diarizer.speaker_embeddings.parameters.shift_length_in_sec = [
    0.75,
    0.625,
    0.5,
    0.375,
    0.1,
]
config.diarizer.speaker_embeddings.parameters.multiscale_weights = [1, 1, 1, 1, 1]
config.diarizer.oracle_vad = True  # ----> ORACLE VAD
config.diarizer.clustering.parameters.oracle_num_speakers = False
print("Config: \n", config)

In [None]:
from nemo.collections.asr.models import ClusteringDiarizer
oracle_vad_clusdiar_model = ClusteringDiarizer(cfg=config)


In [21]:
# And lets diarize
oracle_vad_clusdiar_model.diarize()

[NeMo W 2024-05-29 20:13:24 clustering_diarizer:411] Deleting previous clustering diarizer outputs.


[NeMo I 2024-05-29 20:13:24 speaker_utils:93] Number of files to diarize: 1
[NeMo I 2024-05-29 20:13:24 clustering_diarizer:287] Subsegmentation for embedding extraction: scale0, c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\subsegments_scale0.json
[NeMo I 2024-05-29 20:13:24 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-05-29 20:13:24 collections:732] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-29 20:13:24 collections:733] Dataset loaded with 154 items, total duration of  0.06 hours.
[NeMo I 2024-05-29 20:13:24 collections:735] # 154 files loaded accounting to # 1 labels


[1/5] extract embeddings: 100%|██████████| 3/3 [00:00<00:00, 10.58it/s]

[NeMo I 2024-05-29 20:13:24 clustering_diarizer:389] Saved embedding files to c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\embeddings
[NeMo I 2024-05-29 20:13:24 clustering_diarizer:287] Subsegmentation for embedding extraction: scale1, c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\subsegments_scale1.json
[NeMo I 2024-05-29 20:13:24 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-05-29 20:13:24 collections:732] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-29 20:13:24 collections:733] Dataset loaded with 184 items, total duration of  0.06 hours.
[NeMo I 2024-05-29 20:13:24 collections:735] # 184 files loaded accounting to # 1 labels



[2/5] extract embeddings: 100%|██████████| 3/3 [00:00<00:00, 30.87it/s]

[NeMo I 2024-05-29 20:13:25 clustering_diarizer:389] Saved embedding files to c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\embeddings
[NeMo I 2024-05-29 20:13:25 clustering_diarizer:287] Subsegmentation for embedding extraction: scale2, c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\subsegments_scale2.json
[NeMo I 2024-05-29 20:13:25 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-05-29 20:13:25 collections:732] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-29 20:13:25 collections:733] Dataset loaded with 233 items, total duration of  0.06 hours.
[NeMo I 2024-05-29 20:13:25 collections:735] # 233 files loaded accounting to # 1 labels



[3/5] extract embeddings: 100%|██████████| 4/4 [00:00<00:00, 32.33it/s]

[NeMo I 2024-05-29 20:13:25 clustering_diarizer:389] Saved embedding files to c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\embeddings
[NeMo I 2024-05-29 20:13:25 clustering_diarizer:287] Subsegmentation for embedding extraction: scale3, c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\subsegments_scale3.json
[NeMo I 2024-05-29 20:13:25 clustering_diarizer:343] Extracting embeddings for Diarization
[NeMo I 2024-05-29 20:13:25 collections:732] Filtered duration for loading collection is  0.00 hours.





[NeMo I 2024-05-29 20:13:25 collections:733] Dataset loaded with 318 items, total duration of  0.06 hours.
[NeMo I 2024-05-29 20:13:25 collections:735] # 318 files loaded accounting to # 1 labels


[4/5] extract embeddings: 100%|██████████| 5/5 [00:00<00:00, 33.47it/s]

[NeMo I 2024-05-29 20:13:25 clustering_diarizer:389] Saved embedding files to c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\embeddings
[NeMo I 2024-05-29 20:13:25 clustering_diarizer:287] Subsegmentation for embedding extraction: scale4, c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\subsegments_scale4.json
[NeMo I 2024-05-29 20:13:25 clustering_diarizer:343] Extracting embeddings for Diarization





[NeMo I 2024-05-29 20:13:25 collections:732] Filtered duration for loading collection is  0.00 hours.
[NeMo I 2024-05-29 20:13:25 collections:733] Dataset loaded with 1136 items, total duration of  0.16 hours.
[NeMo I 2024-05-29 20:13:25 collections:735] # 1136 files loaded accounting to # 1 labels


[5/5] extract embeddings: 100%|██████████| 18/18 [00:00<00:00, 38.46it/s]

[NeMo I 2024-05-29 20:13:25 clustering_diarizer:389] Saved embedding files to c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad\speaker_outputs\embeddings



clustering: 100%|██████████| 1/1 [00:00<00:00,  1.66it/s]

[NeMo I 2024-05-29 20:13:26 clustering_diarizer:464] Outputs are saved in c:\Users\rakin\Documents\GitHub\Speaker-Diarization\Main code\oracle_vad directory



    


[NeMo I 2024-05-29 20:13:26 der:176] Cumulative Results for collar 0.25 sec and ignore_overlap True: 
     FA: 0.0000	 MISS 0.0000	                 Diarization ER: 0.0000	, Confusion ER:0.0000


(<pyannote.metrics.diarization.DiarizationErrorRate at 0x1a812e945e0>,
 {'afjiv': {'speaker_0': 'spk02',
   'speaker_1': 'spk00',
   'speaker_2': 'spk03',
   'speaker_3': 'spk01',
   'speaker_4': 'spk04'}},
 (0.0, 0.0, 0.0, 0.0))

In [29]:
print("Clustering Diarizer Result (RTTM format)")
pred_labels_neural = rttm_to_labels(f"{output_dir}/pred_rttms/afjiv.rttm")
hypothesis_neural = labels_to_pyannote_object(pred_labels_neural)
print(pred_labels_neural)
print("Ground-truth Speaker Labels") 


Clustering Diarizer Result (RTTM format)
['5.28 34.519999999999996 speaker_2', '34.68 40.08 speaker_2', '40.12 40.96 speaker_2', '41.12 80.47999999999999 speaker_1', '80.92 83.72 speaker_0', '84.84 86.08 speaker_0', '87.08 88.0 speaker_0', '88.72 93.03999999999999 speaker_0', '93.08 94.36 speaker_0', '95.12 98.0 speaker_4', '98.72 100.44 speaker_4', '101.12 104.60000000000001 speaker_4', '105.56 107.08 speaker_4', '107.16 107.6 speaker_4', '108.16 110.03999999999999 speaker_4', '111.0 113.28 speaker_4', '113.84 118.12 speaker_4', '119.44 122.32 speaker_3', '122.96 124.55999999999999 speaker_3', '125.2 128.4 speaker_3', '128.48 129.64 speaker_3', '130.88 133.51999999999998 speaker_3', '133.68 134.12 speaker_3', '135.72 138.16 speaker_3', '138.92 140.48 speaker_3', '140.64 141.64 speaker_3', '142.2 144.32 speaker_3', '144.6 145.32 speaker_3']
Ground-truth Speaker Labels
