In [1]:
import pandas as pd

# === STEP 1: Load and preprocess the segment files ===

def load_segments(file_path):
    """
    Load a segment CSV and return its intervals plus the latest end time.

    The CSV must provide the columns 'start', 'end' and 'language_tag';
    a missing column raises KeyError. Any other columns are ignored.

    Returns a tuple (intervals, last_end):
      - intervals: list of (start, end, label) tuples in file order
      - last_end: largest 'end' value seen, or 0 when the file has no rows
    """
    df = pd.read_csv(file_path)

    # Read each column on its own instead of going through iterrows():
    # iterrows() builds a Series per row and may upcast values (e.g. integer
    # times to float), while per-column access keeps each column's own type.
    starts = df['start'].tolist()
    ends = df['end'].tolist()
    labels = df['language_tag'].tolist()

    intervals = list(zip(starts, ends, labels))

    # Seeding with 0 keeps the result at 0 for a file without rows.
    last_end = max([0, *ends])
    return intervals, last_end

# === STEP 2: Compare segments and compute matching times ===

def compute_label_match_time(true_intervals, pred_intervals):
    """
    Compare two label sequences and compute overlapping match time.

    Both arguments are sequences of (start, end, label) tuples. Each
    sequence is ordered by (start, end) on a copy before the comparison, so
    the caller's lists are left untouched. Intervals inside one sequence are
    assumed not to overlap each other.

    Returns a tuple (match_time, total_overlap_time, match_by_class):
      - match_time: total duration where a true and a predicted interval
        overlap and carry the same label
      - total_overlap_time: total duration where a true and a predicted
        interval overlap, regardless of label
      - match_by_class: dict of label -> matched duration. It always holds
        'English', 'Mandarin' and 'NON_SPEECH'; any other label that
        matches is added as it is encountered.
    """
    match_time = 0.0
    total_overlap_time = 0.0
    match_by_class = {'English': 0.0, 'Mandarin': 0.0, 'NON_SPEECH': 0.0}

    def _by_time(interval):
        # Order by time only, so labels are never compared with each other.
        return interval[0], interval[1]

    # The sweep below only finds every overlap when both sides are in
    # chronological order.
    true_sorted = sorted(true_intervals, key=_by_time)
    pred_sorted = sorted(pred_intervals, key=_by_time)

    i = j = 0
    while i < len(true_sorted) and j < len(pred_sorted):
        s1, e1, l1 = true_sorted[i]
        s2, e2, l2 = pred_sorted[j]

        # Overlap calculation
        overlap_start = max(s1, s2)
        overlap_end = min(e1, e2)

        if overlap_start < overlap_end:
            duration = overlap_end - overlap_start
            total_overlap_time += duration
            if l1 == l2:
                match_time += duration
                # .get() lets labels outside the three defaults accumulate
                # instead of raising KeyError.
                match_by_class[l1] = match_by_class.get(l1, 0.0) + duration

        # Move pointer with earlier end time
        if e1 <= e2:
            i += 1
        else:
            j += 1

    return match_time, total_overlap_time, match_by_class

# === STEP 3: Paths to the label files ===

# Ground truth with NON_SPEECH, English, Mandarin
true_path = "/home/teaching/Desktop/priyam/labels/_MERLIon-CCS-Challenge_Development-Set_Language-Labels_v001.csv"
# Language classifier output
pred_path = "/home/teaching/Desktop/priyam/labels/classified_segments_opp.csv"

# === STEP 4: Load intervals from files ===

true_segments, true_duration = load_segments(true_path)
pred_segments, pred_duration = load_segments(pred_path)

# === STEP 5: Compute matching times and error ===

match_time, overlap_time, match_by_class = compute_label_match_time(
    true_segments,
    pred_segments,
)

# === STEP 6: Print results ===

# NOTE(review): values are printed with an "ms" suffix; confirm that the CSV
# times really are in milliseconds.
print(
    "=== File Durations ===",
    f"Ground Truth Duration: {true_duration:.2f} ms",
    f"Prediction Duration:   {pred_duration:.2f} ms",
    f"Overlap Duration Used: {overlap_time:.2f} ms",
    sep="\n",
)

print(
    "\n=== Matching Durations ===",
    f"Total Matching Time:   {match_time:.2f} ms",
    sep="\n",
)
for label, t in match_by_class.items():
    print(f"{label} Match: {t:.2f} ms")

# === STEP 7: LDER Calculation ===

# The ratio is taken over the overlapping time only: time covered by just one
# of the two files does not enter the denominator.
if overlap_time > 0:
    match_ratio = 100 * match_time / overlap_time
else:
    match_ratio = 0

print(
    f"\nMatching Ratio (Overlap Only): {match_ratio:.2f}%",
    f"LDER (Overlap Based): {100 - match_ratio:.2f}%",
    sep="\n",
)


=== File Durations ===
Ground Truth Duration: 1859291.00 ms
Prediction Duration:   611221.00 ms
Overlap Duration Used: 559396.00 ms

=== Matching Durations ===
Total Matching Time:   263663.00 ms
English Match: 203768.00 ms
Mandarin Match: 0.00 ms
NON_SPEECH Match: 59895.00 ms

Matching Ratio (Overlap Only): 47.13%
LDER (Overlap Based): 52.87%
