<a href="https://colab.research.google.com/github/Nukaraju2003/jyothimam/blob/main/Evaluating_the_accuracy_of_speaker_diarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from datetime import datetime

def time_to_seconds(time_str):
    """
    Convert a timestamp such as '00:00:05.695' to a number of seconds.

    The expected layout is 'H:M:S' with an optional fractional part on the
    seconds field ('H:M:S.f'). The hour field is not limited to 0-23, so
    offsets into recordings longer than a day can be expressed, and the
    fractional part may be left out entirely ('00:00:05').

    Args:
    - time_str (str): Timestamp to convert.

    Returns:
    - float: The offset in seconds described by the timestamp.

    Raises:
    - ValueError: If the text is not three ':'-separated fields made of
      digits, or if the minutes or seconds fall outside 0-59.
    """
    fields = time_str.split(":")
    if len(fields) != 3:
        raise ValueError(f"time data {time_str!r} does not match 'H:M:S[.f]'")

    hours_text, minutes_text, seconds_text = fields
    whole_text, dot, fraction_text = seconds_text.partition(".")

    digit_runs = [hours_text, minutes_text, whole_text]
    if dot:
        # A trailing '.' with nothing after it is malformed, so the (possibly
        # empty) fraction is validated together with the other fields.
        digit_runs.append(fraction_text)
    # isdecimal() is False for empty text and for signs, which keeps int()
    # from silently accepting inputs such as '-5', '+5' or ' 5'.
    if not all(run.isdecimal() for run in digit_runs):
        raise ValueError(f"time data {time_str!r} does not match 'H:M:S[.f]'")

    hours = int(hours_text)
    minutes = int(minutes_text)
    seconds = int(whole_text)
    if minutes > 59 or seconds > 59:
        raise ValueError(f"minutes and seconds must be in 0..59: {time_str!r}")

    # Right-pad the fraction to microsecond resolution ('5' -> 500000);
    # digits beyond the sixth are dropped.
    microseconds = int(fraction_text.ljust(6, "0")[:6]) if dot else 0

    return hours * 3600 + minutes * 60 + seconds + microseconds / 1e6

def _segment_to_seconds(segment):
    """Return (start, end, speaker) with both times expressed in seconds."""
    start, end, speaker = segment
    # Timestamps written as text go through time_to_seconds(); anything else
    # is taken to already be a number of seconds.
    start = time_to_seconds(start) if isinstance(start, str) else float(start)
    end = time_to_seconds(end) if isinstance(end, str) else float(end)
    if end < start:
        raise ValueError(f"segment ends before it starts: {segment!r}")
    return start, end, speaker


def _timeline_events(segments, source):
    """Return (time, source, speaker, delta) open/close events for segments."""
    events = []
    for start, end, speaker in map(_segment_to_seconds, segments):
        if end > start:  # zero-length segments contain no speech
            events.append((start, source, speaker, 1))
            events.append((end, source, speaker, -1))
    return events


def calculate_der(diarization_output, ground_truth):
    """
    Calculate the Diarization Error Rate (DER) for multiple speakers.

    The score is time-weighted: the timeline is cut at every segment boundary
    and, for each resulting interval, the speakers active in the ground truth
    are compared with the speakers active in the diarization output.

        DER = (missed speech + false alarm + speaker confusion)
              / total ground-truth speech time

    Speaker ids are compared as given; no mapping between hypothesis and
    reference labels is searched for and no collar is applied around
    boundaries. Splitting one reference segment into several hypothesis
    segments of the same speaker is therefore not an error.

    Args:
    - diarization_output (list): List of tuples representing diarization output.
      Each tuple contains (start_time, end_time, predicted_speaker_id).
    - ground_truth (list): List of tuples representing ground truth diarization.
      Each tuple contains (start_time, end_time, true_speaker_id).
      Times may be 'HH:MM:SS.fff' strings or numbers of seconds.

    Returns:
    - der (float): The Diarization Error Rate (DER) as a percentage.
    - der_speakers (dict): DER for each speaker as a dictionary. For a speaker
      this is the time during which exactly one of ground truth / output has
      that speaker active, as a percentage of the speaker's ground-truth
      time. Speakers scored without any error are omitted (a perfect output
      yields an empty dict), as are speakers that never occur in the ground
      truth because there is no reference time to normalise by.

    Raises:
    - ValueError: If a segment ends before it starts, or if the ground truth
      contains no speech (the rate is undefined in that case).
    """
    events = _timeline_events(ground_truth, "ref")
    events += _timeline_events(diarization_output, "hyp")
    # Order by time only: speaker ids are never compared with each other, and
    # all events sharing a timestamp are applied before the next interval is
    # scored, so their relative order does not matter.
    events.sort(key=lambda event: event[0])

    # Speaker -> number of currently open segments, per source.
    active = {"ref": {}, "hyp": {}}

    error_time = 0.0
    reference_time = 0.0
    speaker_error_time = {}
    speaker_reference_time = {}

    previous_time = None
    for moment, source, speaker, delta in events:
        if previous_time is not None and moment > previous_time:
            span = moment - previous_time
            ref_speakers = set(active["ref"])
            hyp_speakers = set(active["hyp"])

            # Every speaker slot not matched by an identical label on the
            # other side is missed speech, a false alarm or a confusion.
            matched = len(ref_speakers & hyp_speakers)
            error_time += span * (max(len(ref_speakers), len(hyp_speakers)) - matched)
            reference_time += span * len(ref_speakers)

            for name in ref_speakers:
                speaker_reference_time[name] = speaker_reference_time.get(name, 0.0) + span
            for name in ref_speakers ^ hyp_speakers:
                speaker_error_time[name] = speaker_error_time.get(name, 0.0) + span

        open_segments = active[source]
        open_segments[speaker] = open_segments.get(speaker, 0) + delta
        if open_segments[speaker] == 0:
            del open_segments[speaker]
        previous_time = moment

    if reference_time == 0:
        raise ValueError("ground_truth contains no speech; DER is undefined")

    der = error_time / reference_time * 100
    der_speakers = {
        name: wrong_time / speaker_reference_time[name] * 100
        for name, wrong_time in speaker_error_time.items()
        if name in speaker_reference_time
    }

    return der, der_speakers

# Example usage: the hypothesis below repeats the reference segment for
# segment, so it demonstrates the scores reported for a perfect diarization.
ground_truth = [
    ("00:00:05.695", "00:00:10.678", "Speaker_A"),
    ("00:00:10.678", "00:00:20.456", "Speaker_A"),
    ("00:00:20.456", "00:00:30.123", "Speaker_B"),
    ("00:00:30.123", "00:00:40.987", "Speaker_B"),
    ("00:00:40.987", "00:00:50.789", "Speaker_C"),
]

diarization_output = [
    ("00:00:05.695", "00:00:10.678", "Speaker_A"),
    ("00:00:10.678", "00:00:20.456", "Speaker_A"),
    ("00:00:20.456", "00:00:30.123", "Speaker_B"),
    ("00:00:30.123", "00:00:40.987", "Speaker_B"),
    ("00:00:40.987", "00:00:50.789", "Speaker_C"),
]

# Overall rate first, then one line per entry of the per-speaker breakdown.
der, der_speakers = calculate_der(diarization_output, ground_truth)
print(f"DER: {der:.2f}%")
print("DER for each speaker:")
for speaker, der_speaker in der_speakers.items():
    print(f"Speaker {speaker}: {der_speaker:.2f}%")


DER: 0.00%
DER for each speaker:
