# Alignment Logic

In [1]:
from typing import List, Tuple, Dict
import numpy as np
# from transformers import AutoModel, AutoTokenizer
from dataclasses import dataclass
from Levenshtein import distance as levenshtein_distance
from phonetics import metaphone
import pysrt
from pathlib import Path
import json
import pandas as pd
from copy import deepcopy

In [2]:
@dataclass
class WordTimestamp:
    """A single word together with the time span it occupies in the audio."""
    # NOTE(review): WordTimestamp is also imported from scripts.utils in a later
    # cell, which rebinds this name once that cell has run.
    word: str     # the word text
    start: float  # start time of the word (presumably seconds - confirm with producer)
    end: float    # end time of the word (same unit as `start`)

@dataclass
class TimeAlignedSentence:
    """A short run of words with its overall time span and per-word timestamps."""
    sentence: str               # the words of the sentence as one string
    start_time: float           # load_segments_from_json sets this to the first word's start
    end_time: float             # load_segments_from_json sets this to the last word's end
    words: List[WordTimestamp]  # per-word timestamps, in sentence order


@dataclass
class AlignmentResult:
    """Pairing of a ground-truth span with the predicted sentence it was aligned to."""
    # NOTE(review): annotated as list[str], but EnhancedTranscriptAligner passes a
    # single space-joined string here and the reporting cell formats it as a
    # string - confirm the intended type.
    ground_truth_sentence: list[str]
    # The predicted sentence this span was matched against, or None.
    predicted_sentence: TimeAlignedSentence | None
    # Alignment score; the aligner caps it at 1.0.
    similarity_score: float


In [3]:
import sys
sys.path.append('..')

from scripts.utils import (
    read_srt_file,
    load_segments_from_json,
    read_ground_truth_file,
    srt_to_text,
    seconds_to_srt_time,
    fix_json_file,
    generate_srt_file_output,
    combine_repeated_words,
    WordTimestamp,
    TimeAlignedSentence,
    AlignmentResult
)

In [4]:
class EnhancedTranscriptAligner:
    """Align ground-truth transcript text to time-stamped predicted sentences.

    Each predicted sentence is compared against same-length windows of the
    ground-truth word list near a running position estimate. Word pairs are
    scored with an equal-weight blend of phonetic equality (metaphone) and
    normalised Levenshtein similarity. A window is accepted when its mean
    score reaches ``similarity_threshold``.
    """

    # Number of candidate start positions examined on each side of the
    # running position estimate.
    SEARCH_RADIUS = 5

    def __init__(self, similarity_threshold=0.75):
        """Store the minimum mean word similarity required to accept a window."""
        # self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        # self.model = AutoModel.from_pretrained('bert-base-uncased')
        self.similarity_threshold = similarity_threshold

    def preprocess_gt_text(self, gt_text: str) -> str:
        """Collapse every run of whitespace in ``gt_text`` to a single space."""
        return " ".join(gt_text.split())

    def get_word_embeddings(self, words: List[str]) -> np.ndarray:
        """Return one embedding per word (mean of the model's last hidden state).

        NOTE: ``self.tokenizer`` and ``self.model`` are not assigned by
        ``__init__`` (those lines are commented out), so this raises
        ``AttributeError`` unless the caller sets them first.
        """
        inputs = self.tokenizer(words, return_tensors='pt', padding=True, truncation=True)
        outputs = self.model(**inputs)
        # Use the mean of the last hidden state as the word embeddings
        embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
        return embeddings

    def cosine_similarity(self, embedding1: np.ndarray, embedding2: np.ndarray) -> float:
        """Return the cosine of the angle between two embedding vectors."""
        return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

    def phonetic_similarity(self, word1: str, word2: str) -> float:
        """Return 1.0 when both words share a metaphone code, else 0.0."""
        return 1.0 if metaphone(word1) == metaphone(word2) else 0.0

    def combined_similarity(self, gt_word: str, pred_word: str) -> float:
        """Return the equal-weight mix of phonetic and edit-distance similarity."""
        # cosine_sim = self.cosine_similarity(gt_embedding, pred_embedding)
        lev_dist = levenshtein_distance(gt_word, pred_word)
        phonetic_sim = self.phonetic_similarity(gt_word, pred_word)

        # Normalize Levenshtein distance to a similarity score
        max_len = max(len(gt_word), len(pred_word))
        lev_sim = 1 - (lev_dist / max_len) if max_len > 0 else 0

        # Weighted combination of similarities
        return 0.5 * phonetic_sim + 0.5 * lev_sim

    def align_predicted_sentences(
        self,
        ground_truth: str,
        predicted_sentences: List[TimeAlignedSentence]
    ) -> Tuple[List[AlignmentResult], set, List[TimeAlignedSentence]]:
        """
        Align ground truth text with time-aligned predicted sentences.

        Args:
            ground_truth: Ground truth text
            predicted_sentences: List of predicted sentences with timestamps

        Returns:
            Tuple containing:
            - List of alignment results
            - Set of matched ground-truth word indices
            - List of updated predicted sentences whose words are the aligned
              ground-truth words carrying the predicted timestamps

            The first and third lists always have the same length and order:
            a sentence is recorded in both or in neither.
        """
        gt_words = self.preprocess_gt_text(ground_truth).split()
        radius = self.SEARCH_RADIUS

        best_alignment = []
        matched_word_indices = set()
        updated_pred_sentences = []
        start_search_position = 0

        # Memo of word-pair similarity scores, keyed by (gt_word, pred_word).
        similarity_cache = {}

        for p, pred_sentence in enumerate(predicted_sentences):
            print(f"Processing predicted sentence {p + 1} of {len(predicted_sentences)}: {pred_sentence.sentence}")
            pred_words = pred_sentence.sentence.lower().split()
            if not pred_words:
                continue

            window_size = len(pred_words)
            best_score = -np.inf
            best_position = None

            # The position estimate is the running count of predicted words,
            # including the current sentence; the backward range below is what
            # reaches the sentence's expected start.
            start_search_position += window_size

            # Candidates ahead of the estimate are tried first, then the ones
            # behind it. Ties keep the earliest candidate tried.
            search_positions = list(range(
                start_search_position,
                min(start_search_position + radius, len(gt_words) - window_size + 1),
            ))
            search_positions.extend(range(max(0, start_search_position - radius), start_search_position))

            for k in search_positions:
                # Skip if window extends beyond ground truth text
                if k + window_size > len(gt_words):
                    continue
                # Skip if any word of the window is already matched
                if any(idx in matched_word_indices for idx in range(k, k + window_size)):
                    continue

                score = 0
                for m in range(window_size):
                    pair_key = (gt_words[k + m], pred_words[m])
                    if pair_key not in similarity_cache:
                        similarity_cache[pair_key] = self.combined_similarity(*pair_key)
                    score += similarity_cache[pair_key]
                score /= window_size

                if score > best_score:
                    best_score = score
                    best_position = k

            if best_position is not None and best_score >= self.similarity_threshold:
                print(f"Best alignment found at position {best_position} with score {best_score}")
                aligned_indices = range(best_position, best_position + window_size)
                aligned_words = [gt_words[idx] for idx in aligned_indices]
                gt_sentence = ' '.join(aligned_words)
                print(f"Aligned GT sentence: {gt_sentence}\n")

                # Small bonus when this window continues, or lands close to,
                # the highest ground-truth index matched so far.
                context_bonus = 0
                if best_alignment:
                    last_end = max(matched_word_indices)
                    if best_position == last_end + 1:  # Perfect continuation
                        context_bonus = 0.1
                    elif abs(best_position - last_end) <= 3:  # Near match
                        context_bonus = 0.05

                final_score = min(1.0, best_score + context_bonus)

                # Build the pieces that can fail BEFORE recording anything, so
                # a failure never leaves the result lists or the matched-index
                # set out of step with each other.
                try:
                    aligned_word_timestamps = [
                        WordTimestamp(
                            word=gt_word,
                            start=pred_sentence.words[m].start,
                            end=pred_sentence.words[m].end
                        )
                        for m, gt_word in enumerate(aligned_words)
                    ]
                except (IndexError, AttributeError, TypeError) as e:
                    # e.g. fewer word timestamps than whitespace-separated tokens
                    print(f"Error creating updated predicted sentence: {e}")
                    continue

                matched_word_indices.update(aligned_indices)
                best_alignment.append(
                    AlignmentResult(
                        ground_truth_sentence=gt_sentence,
                        predicted_sentence=pred_sentence,
                        similarity_score=final_score
                    )
                )
                updated_pred_sentences.append(
                    TimeAlignedSentence(
                        sentence=gt_sentence,
                        start_time=pred_sentence.start_time,
                        end_time=pred_sentence.end_time,
                        words=aligned_word_timestamps
                    )
                )
        print(f"Aligned {len(best_alignment)} out of {len(predicted_sentences)} sentences\n")
        return best_alignment, matched_word_indices, updated_pred_sentences

In [5]:
# Dataset root and the sub-folders the cells below read from or write to.
data_path = Path("../data/Punjabi")
audio_path = data_path / "Audio"             # globbed for per-folder *.wav files
transcript_path = data_path / "Text"         # base for the derived .txt transcript paths
srt_path = data_path / "Ground Truth SRT"    # base for the derived ground-truth .srt paths
results_path = data_path / "Results"         # base for segment JSON inputs and alignment outputs

In [None]:
# Load the benchmark list. The first CSV column is read as the index and then
# discarded by reset_index(drop=True), leaving a fresh 0..n-1 index.
benchmark_df = pd.read_csv(data_path / "benchmark_list.csv", index_col=0).reset_index(drop=True)
benchmark_df.head()

In [None]:
# Distinct story names from the benchmark list. A .wav file whose name contains
# one of these is assigned to the "val" split when the dataset is built.
benchmark_paths = benchmark_df["Story Name"].unique().tolist()
benchmark_paths

In [None]:
# Per-split column lists for the dataset. deepcopy gives "train" and "val"
# their own independent lists instead of sharing the ones in base_lists.
base_lists = {"audio_path": [], "transcript_path": [], "srt_path": [], "transcript": [], "sampling_rate": [], "array": []}
dataset_dict = {"train": deepcopy(base_lists), "val": deepcopy(base_lists)}
dataset_dict

In [None]:
# Walk every audio folder, decide the split of each .wav file, and record the
# matching audio / transcript / SRT paths in dataset_dict.
for dir_path in audio_path.glob("*"):
    # The transcript and SRT folders mirror the audio folder name with
    # "Videos" swapped for "Text" / "SRT".
    text_dir = transcript_path / dir_path.name.replace("Videos", "Text")
    srt_dir = srt_path / dir_path.name.replace("Videos", "SRT")

    for file_path in dir_path.glob("*.wav"):
        wav_name = file_path.name
        print(wav_name)

        # A file belongs to the validation split when its name contains any
        # benchmark story name; everything else is training data.
        is_benchmark = any(benchmark_path in wav_name for benchmark_path in benchmark_paths)
        dataset_type = "val" if is_benchmark else "train"
        print(dataset_type)
        print("\n")

        # Derive the sibling transcript and SRT paths from the audio file name.
        file_audio_path = str(file_path)
        file_transcript_path = text_dir / wav_name.replace(".wav", ".txt")
        file_srt_path = srt_dir / wav_name.replace('.wav', '.srt')

        split = dataset_dict[dataset_type]
        split["audio_path"].append(file_audio_path)
        split["transcript_path"].append(file_transcript_path)
        split["srt_path"].append(file_srt_path)

        # Audio decoding and transcript loading are disabled, so the "array",
        # "sampling_rate" and "transcript" lists stay empty.
        # # Load and decode audio
        # decoded_audio, sampling_rate = sf.read(file_audio_path)
        # dataset_dict[dataset_type]["array"].append(decoded_audio)
        # dataset_dict[dataset_type]["sampling_rate"].append(sampling_rate)

        # # Read transcript and append to dataset
        # with open(file_transcript_path, 'r', encoding='utf-8') as file:
        #     transcript = file.read()
        #     dataset_dict[dataset_type]["transcript"].append(transcript)

In [10]:
def load_segments_from_json(json_path: str | Path, max_words: int = 5) -> List[TimeAlignedSentence]:
    """Load word-level segments from a JSON file and cut them into short sentences.

    The file must hold a top-level ``"segments"`` list. Each segment needs a
    ``"words"`` list whose entries have ``"word"``, ``"start"`` and ``"end"``
    keys. Words never cross a segment boundary. Inside a segment, a sentence
    is closed as soon as any of these holds:

    1. the current word contains one of ``.``, ``।``, ``?``, ``!``;
    2. the sentence has reached ``max_words`` words;
    3. the segment has no more words.

    Word entries that are empty after stripping whitespace are skipped. They
    would otherwise pad the sentence text and shift the per-word timestamps
    relative to the whitespace-separated tokens of ``sentence``.

    Args:
        json_path: Path of the JSON file to read (UTF-8).
        max_words: Maximum number of words per produced sentence.

    Returns:
        The sentences in file order. ``start_time`` / ``end_time`` are taken
        from the first / last word of each sentence.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
        KeyError: If a required key is missing from the JSON structure.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")

    with open(json_path, "r", encoding="utf-8") as file:
        result = json.load(file)

    sentence_end_marks = ('.', '।', '?', '!')
    time_aligned_sentences = []

    def close_sentence(sentence_words):
        # Turn the collected WordTimestamp objects into one TimeAlignedSentence.
        time_aligned_sentences.append(
            TimeAlignedSentence(
                sentence=" ".join(w.word for w in sentence_words),
                start_time=sentence_words[0].start,
                end_time=sentence_words[-1].end,
                words=sentence_words
            )
        )

    for segment in result["segments"]:
        pending = []
        for word_data in segment["words"]:
            token = word_data["word"].strip()
            if not token:
                continue

            pending.append(
                WordTimestamp(word=token, start=word_data["start"], end=word_data["end"])
            )

            ends_sentence = any(mark in token for mark in sentence_end_marks)
            if ends_sentence or len(pending) >= max_words:
                close_sentence(pending)
                pending = []

        # Whatever is left when the segment ends forms its own sentence.
        if pending:
            close_sentence(pending)

    return time_aligned_sentences

In [None]:
# For every validation transcript, load its predicted segments and its
# ground-truth text, keyed by the transcript's base file name.
inference_data = {}
segments_path = results_path / "Segments New"
# segments_path = results_path / "Segments"

val_split = dataset_dict["val"]
for file_transcript_path, file_srt_path in zip(val_split["transcript_path"], val_split["srt_path"]):
    transcript_name = Path(file_transcript_path).name
    base_name = transcript_name.replace(".txt", "")
    json_path = segments_path / transcript_name.replace(".txt", ".json")

    # Register the record first so it exists even if a loader below fails.
    record = {"srt_path": file_srt_path}
    inference_data[base_name] = record

    print(f"Loading input predicted json file {json_path} ...")
    pred_srt_sentences = load_segments_from_json(json_path)
    record["pred_srt_sentences"] = pred_srt_sentences

    print(f"Loading ground truth text file {file_transcript_path} ...")
    ground_truth_text = read_ground_truth_file(file_transcript_path)
    record["ground_truth_text"] = ground_truth_text

In [None]:
print("Loading the alignment logic ...\n")
# Aligner shared by the alignment cell below. The 0.65 threshold overrides the
# class default of 0.75, so windows with a lower mean word similarity are accepted.
aligner = EnhancedTranscriptAligner(similarity_threshold=0.65)

In [13]:
# Post processing the results

def post_process_results(results: List[AlignmentResult], ground_truth_text: str, matched_word_indices: set, updated_pred_sentences: List[TimeAlignedSentence]) -> Tuple[List[AlignmentResult], List[TimeAlignedSentence]]:
    """
    Post-process the alignment results to handle unaligned and overlapping words.

    Works in two passes over neighbouring pairs: first over ``results``
    (sentence text), then over ``updated_pred_sentences`` (word lists). Both
    inputs are modified in place and also returned.

    Args:
        results (List[AlignmentResult]): List of alignment results.
        ground_truth_text (str): Ground truth text.
        matched_word_indices (set): Set of matched word indices.
        updated_pred_sentences (List[TimeAlignedSentence]): Aligned sentences
            with word-level timestamps.

    Returns:
        Tuple[List[AlignmentResult], List[TimeAlignedSentence]]: The same
        ``results`` and ``updated_pred_sentences`` objects that were passed in.
    """
    gt_words_list = ground_truth_text.split()
    # Ground-truth word positions that no predicted sentence was aligned to.
    gt_words_unaligned_indices = set(range(len(gt_words_list))) - matched_word_indices

    # ---- Pass 1: sentence-level results -------------------------------------
    for i in range(1, len(results)):
        prev_result = results[i - 1]
        curr_result = results[i]

        if prev_result.predicted_sentence and curr_result.predicted_sentence:
            prev_end_time = prev_result.predicted_sentence.end_time
            curr_start_time = curr_result.predicted_sentence.start_time

            # Only pairs whose predicted time spans touch exactly are examined.
            if prev_end_time == curr_start_time:
                # NOTE(review): set() and ' '.join() below treat
                # ground_truth_sentence as a list of words, while the `+=` and
                # `.split()` further down treat it as a str. With a str (which
                # is what EnhancedTranscriptAligner produces) set() yields
                # characters and ' '.join() puts a space between every
                # character - confirm the intended type.
                prev_words = set(prev_result.ground_truth_sentence)
                curr_words = set(curr_result.ground_truth_sentence)

                # Form a string which is supposed to match with ground truth
                match_string = ' '.join(prev_result.ground_truth_sentence) + ' ' + ' '.join(curr_result.ground_truth_sentence)
                if match_string in ground_truth_text:
                    continue
                else:
                    # Identify unaligned words in between the segments
                    # NOTE(review): built by iterating a set, so the order of
                    # these words is not guaranteed to follow the ground truth.
                    unaligned_words_list = [gt_words_list[idx] for idx in gt_words_unaligned_indices]
                    max_n = min(5, len(unaligned_words_list))  # Handle case where unaligned words are less than 5
                    for n in range(1, max_n + 1):  # Form unaligned strings of 1 to max_n consecutive words
                        for j in range(len(unaligned_words_list) - n + 1):
                            unaligned_string = ' '.join(unaligned_words_list[j:j + n])
                            match_string = ' '.join(prev_result.ground_truth_sentence) + ' ' + unaligned_string + ' ' + ' '.join(curr_result.ground_truth_sentence)
                            if match_string in ground_truth_text:
                                # Add them to the previous segment
                                prev_result.ground_truth_sentence += ' ' + unaligned_string
                                for word in unaligned_words_list[j:j + n]:
                                    # NOTE(review): list.index returns the FIRST
                                    # occurrence of the word; if that position is
                                    # not in the unaligned set, remove() raises
                                    # KeyError.
                                    gt_words_unaligned_indices.remove(gt_words_list.index(word))
                                break
                        else:
                            # No window of this length matched: try the next length.
                            continue
                        # A bridging window was found: stop searching.
                        break

                    # Identify overlapping words
                    overlapping_words = prev_words & curr_words
                    # Remove overlapping words from the previous segment
                    if overlapping_words:
                        prev_result.ground_truth_sentence = ' '.join(
                            word for word in prev_result.ground_truth_sentence.split() if word not in overlapping_words
                        )

    
    # ---- Pass 2: word-level sentences ---------------------------------------
    # Ground-truth positions already claimed by a "previous" sentence.
    used_indices = set()
    for i in range(1, len(updated_pred_sentences)):
        prev_result = updated_pred_sentences[i - 1]
        curr_result = updated_pred_sentences[i]

        # NOTE(review): when either word list is empty, prev_words / curr_words
        # are NOT reassigned here, so the code below runs with whatever values
        # they held before (from an earlier iteration or from pass 1).
        if prev_result.words and curr_result.words:
            prev_words = prev_result.words
            curr_words = curr_result.words


        # Identify unaligned words between the segments
        prev_words_list = [w.word for w in prev_words]
        curr_words_list = [w.word for w in curr_words]

        # Find the indices of the previous and current words in the ground truth
        prev_index = -1  # position of the LAST word of the previous sentence
        curr_index = -1  # position of the FIRST word of the current sentence

        # First exact occurrence whose start position is not yet used.
        for j in range(len(gt_words_list) - len(prev_words_list) + 1):
            if j not in used_indices and gt_words_list[j:j + len(prev_words_list)] == prev_words_list:
                prev_index = j + len(prev_words_list) - 1
                used_indices.update(range(j, j + len(prev_words_list)))
                break

        # Same lookup for the current sentence; its positions are not marked as
        # used here (it is looked up again as "previous" in the next iteration).
        for j in range(len(gt_words_list) - len(curr_words_list) + 1):
            if j not in used_indices and gt_words_list[j:j + len(curr_words_list)] == curr_words_list:
                curr_index = j
                # used_indices.update(range(j, j + len(curr_words_list)))
                break
       
        # If both indices are valid, handle the alignment cases
        if prev_index != -1 and curr_index != -1:
            if prev_index == curr_index - 1:
                # Ideal continuation, do nothing
                pass
            elif prev_index >= curr_index:
                # Remove overlapping words from prev_words using indices
                overlap_count = prev_index - curr_index + 1
                # NOTE(review): this only rebinds the local name; the slice is a
                # new list, so prev_result.words itself is left unchanged.
                prev_words = prev_words[:-overlap_count]
            elif prev_index < curr_index - 1:
                # There are unaligned words in between, add these words to curr_words
                unaligned_words = gt_words_list[prev_index + 1:curr_index]
                # insert() mutates curr_result.words in place (same list object).
                # Each inserted word copies the timestamps of the current first
                # word; curr_result.sentence is not updated.
                for word in reversed(unaligned_words):
                    curr_words.insert(0, WordTimestamp(word=word, start=curr_words[0].start, end=curr_words[0].end))

    return results, updated_pred_sentences

In [None]:
# Run the aligner on every loaded story and keep its three outputs per story.
inference_results = {}

for base_name, sample in inference_data.items():
    ground_truth_text = sample["ground_truth_text"]
    pred_srt_sentences = sample["pred_srt_sentences"]

    print("-" * 100)
    print(f"Aligning for {base_name} ...")
    results, matched_word_indices, updated_pred_sentences = aligner.align_predicted_sentences(
        ground_truth_text, pred_srt_sentences
    )
    inference_results[base_name] = {
        "results": results,
        "updated_pred_sentences": updated_pred_sentences,
        "matched_word_indices": matched_word_indices,
    }

    # Optional post-processing step, currently disabled:
    # print("Post processing the results ...\n")
    # # Post processing
    # post_processed_results, post_processed_pred_sentences = post_process_results(results, ground_truth_text, matched_word_indices, updated_pred_sentences)
    # inference_results[base_name]["post_processed_results"] = post_processed_results
    # inference_results[base_name]["post_processed_updated_pred_sentences"] = post_processed_pred_sentences

    # break


In [None]:
# Output folders for the word-level and sentence-level aligned SRT files.
alignment_word_level_path = results_path / "Alignment" / "Word_level"
alignment_word_level_path.mkdir(parents=True, exist_ok=True)

alignment_sentence_level_path = results_path / "Alignment" / "Sentence_level"
alignment_sentence_level_path.mkdir(parents=True, exist_ok=True)

for base_name in inference_results:
    # Only this one story is printed and exported; every other story is skipped
    # and the loop stops after it has been handled.
    if base_name != "Abdul_Kalam,_Missile_Man_Punjabi":
        continue

    story_results = inference_results[base_name]["results"]
    story_pred_sentences = inference_results[base_name]["updated_pred_sentences"]

    print(f"Alignment Results for {base_name} ...")
    print(f"Matched word indices: {inference_results[base_name]['matched_word_indices']}")
    print("-" * 60)
    print("   Ground Truth  |  Predicted  |  Time  | Similarity   ")
    print("-" * 60)

    # One table row per aligned sentence, followed by its word-level timestamps.
    for result, updated_pred in zip(story_results, story_pred_sentences):
        if result.predicted_sentence:
            print(f"{result.ground_truth_sentence:<50} | "
                f"{result.predicted_sentence.sentence:<50} | "
                f"{result.predicted_sentence.start_time:.2f}-{result.predicted_sentence.end_time:.2f} | "
                f"{result.similarity_score:.3f}")
            print("Word Level Timestamps for this predicted segment:")
            for word in updated_pred.words:
                print(f"   {word.word:<15} | Start: {word.start:.2f} | End: {word.end:.2f}")
            print("-" * 60)
        else:
            print(f"{result.ground_truth_sentence:<50} | {'':50} | {'':8} | {result.similarity_score:.3f}")

    word_level_srt = alignment_word_level_path / f"{base_name}.srt"
    sentence_level_srt = alignment_sentence_level_path / f"{base_name}.srt"
    generate_srt_file_output(
        # inference_results[base_name]["post_processed_results"],
        # inference_results[base_name]["post_processed_pred_sentences"],
        story_results,
        story_pred_sentences,
        word_level_srt,
        sentence_level_srt
    )
    # Report the path that was actually handed to the writer; the previous
    # message named a "<base_name>_Word_level.srt" file that is not one of
    # the paths passed above.
    print(f"Aligned GT transcript saved as {word_level_srt}")
    break

In [16]:

def combine_repeated_words(srt_file_path, output_file_path):
    """Merge runs of consecutive subtitles that carry the same text.

    Subtitle texts are compared after stripping surrounding whitespace. Each
    run of equal neighbours becomes one subtitle spanning from the start of
    the first item to the end of the last. Output items are renumbered from 1.

    An input file without subtitles produces an output file without
    subtitles. The earlier loop-based version always appended one final item,
    even when nothing had been read and its fields were still ``None``.

    Args:
        srt_file_path: Path of the SRT file to read.
        output_file_path: Path the merged SRT file is written to.
    """
    from itertools import groupby

    subs = pysrt.open(srt_file_path)
    combined_subs = pysrt.SubRipFile()

    # groupby only groups ADJACENT equal keys, which is exactly the
    # "repeated consecutive word" rule.
    for word, run in groupby(subs, key=lambda sub: sub.text.strip()):
        run = list(run)
        combined_subs.append(
            pysrt.SubRipItem(
                index=len(combined_subs) + 1,
                start=run[0].start,
                end=run[-1].end,
                text=word,
            )
        )

    combined_subs.save(output_file_path)
    print(f"Combined SRT saved at {output_file_path}")


In [17]:
# for base_name in inference_results:
#     # Example usage
#     srt_file_path = alignment_word_level_path / f"{base_name}.srt"  # Replace with your input SRT file path
#     output_file_path = alignment_word_level_path / f"{base_name}_final.srt"  # Replace with your desired output SRT file path
#     combine_repeated_words(srt_file_path, output_file_path)

In [20]:
# prompt: there is a word level srt file and a sentence level srt file, compare the sentence with the combined words and match them.
# And now calculate the deviation between timestamps of combined words (first word start ts and last word end ts) and sentence level timestamps.
# Give a metric with subtitle error rate from 0 to 100
def calculate_subtitle_error_rate(word_srt_path, sentence_srt_path, tolerance_threshold=0.1, max_deviation=5):
    """
    Calculates Subtitle Error Rate (SER) by comparing word-level and sentence-level SRT files.

    For every sentence subtitle, the word-level file is scanned for a run of
    not-yet-matched word subtitles whose combined text equals the sentence
    text. When one is found, the absolute difference between the run's start
    time and the sentence's start time is the deviation (end times are not
    compared). Only deviations in ``(tolerance_threshold, max_deviation]``
    are kept. SER is the sum of kept deviations divided by the time span
    covered by the sentence file, as a percentage.

    Args:
        word_srt_path (str|Path): Path to the SRT file with word-level timestamps
        sentence_srt_path (str|Path): Path to the SRT file with sentence-level timestamps
        tolerance_threshold (float): Acceptable timestamp deviation threshold in seconds (default: 0.1)
        max_deviation (float): Maximum allowed deviation before excluding from calculation (default: 5)

    Returns:
        dict: Dictionary containing:
            - ser: Subtitle Error Rate as percentage
            - deviations: List of kept timestamp deviations (seconds)
            - matches: Number of successful matches
            - total_sentences: Total number of sentences processed
            - match_rate: Percentage of sentences that were matched

        An empty sentence file yields zeros instead of raising.

    Raises:
        RuntimeError: If either SRT file cannot be opened (the original error
            is attached as the cause).
    """
    from collections import deque

    try:
        word_subs = pysrt.open(str(word_srt_path))
        sentence_subs = pysrt.open(str(sentence_srt_path))
    except Exception as e:
        raise RuntimeError(f"Error opening SRT files: {e}") from e

    # Initialize tracking variables
    deviations = []
    matches = 0
    total_sentences = len(sentence_subs)
    matched_word_indices = set()

    def clean_text(text):
        """Normalize text for comparison"""
        return " ".join(text.split())

    def find_matching_words(sentence_text, matched_indices):
        """Find the first run of unmatched word subtitles spelling the sentence."""
        sentence_length = len(sentence_text.split())
        if sentence_length == 0:
            # An empty sentence has no words to locate.
            return None

        # Sliding window of (index, subtitle) pairs; the bounded deque drops
        # the oldest entry automatically once it is full.
        window = deque(maxlen=sentence_length)
        for idx, word_sub in enumerate(word_subs):
            if idx in matched_indices:
                continue

            window.append((idx, word_sub))
            if len(window) < sentence_length:
                continue

            window_text = clean_text(" ".join(sub.text for _, sub in window))
            if window_text == sentence_text:
                return {
                    'start': window[0][1].start.ordinal / 1000,
                    'end': window[-1][1].end.ordinal / 1000,
                    'matches': {i for i, _ in window}
                }

        return None

    # Process each sentence
    for sentence_sub in sentence_subs:
        sentence_text = clean_text(sentence_sub.text)
        sentence_start = sentence_sub.start.ordinal / 1000

        match_result = find_matching_words(sentence_text, matched_word_indices)
        if match_result is None:
            continue

        matches += 1
        matched_word_indices.update(match_result['matches'])

        # Keep only deviations above the tolerance and within the cap.
        deviation = abs(match_result['start'] - sentence_start)
        if deviation > tolerance_threshold and deviation <= max_deviation:
            deviations.append(deviation)

    # Time span covered by the sentence file, in seconds (ordinals are ms).
    if total_sentences:
        timeline_start = min(sub.start.ordinal for sub in sentence_subs)
        timeline_end = max(sub.end.ordinal for sub in sentence_subs)
        total_duration = (timeline_end - timeline_start) / 1000
    else:
        total_duration = 0

    ser = (sum(deviations) / total_duration * 100) if total_duration > 0 else 0

    results = {
        'ser': round(ser, 3),
        'deviations': deviations,
        'matches': matches,
        'total_sentences': total_sentences,
        'match_rate': round(matches / total_sentences * 100, 2) if total_sentences > 0 else 0
    }

    # Print summary
    print(f"Subtitle Error Rate (SER): {results['ser']}%")
    print(f"Match Rate: {results['match_rate']}% ({matches}/{total_sentences} sentences)\n")
    print(f"Average Deviation: {round(sum(deviations) / len(deviations), 3) if deviations else 0} seconds \n")

    return results

In [None]:
# Score every aligned story: compare the word-level SRT path used for the
# alignment export against the ground-truth SRT recorded for that story.
# NOTE(review): the export cell only handles one story, so the word-level file
# may not exist for the others - confirm before running on all stories.
for base_name in inference_results:
    # if not base_name == "Abdul_Kalam,_Missile_Man_Punjabi":
    #     continue
    print(f"Calculating SER for {base_name} ...")
    pred_aligned_srt_path = alignment_word_level_path / f"{base_name}.srt"
    gt_aligned_srt_path = inference_data[base_name]["srt_path"]
    calculate_subtitle_error_rate(pred_aligned_srt_path, gt_aligned_srt_path)
    # break