In [8]:
import whisper
import os
from jiwer import wer
import pandas as pd
from typing import Literal 
import numpy as np
from unidecode import unidecode
from abc import ABC, abstractmethod
from typing import List, Optional, Dict, Any

# Assuming OpenAI client library is already installed
from openai import OpenAI

import sys
sys.modules.pop('agents_pipeline', None)
from utils import normalize_text
from agents_pipeline import BasePipeline, GenerateWhisperPromptPipeline, GenerateNamesPipeline, GenerateTopicPipeline, FixTranscriptByLLMPipeline

import warnings
warnings.filterwarnings(
    "ignore",
    message="FP16 is not supported on CPU; using FP32 instead",
    category=UserWarning,
)

In [None]:
#API_KEY
API_KEY = "Put your OpenAI API key here"

# Parameters:
GPT_MODEL_NAME = "gpt-4o"

In [12]:
full_prompt_pipeline = GenerateWhisperPromptPipeline(api_key=API_KEY)
names_pipeline = GenerateNamesPipeline(api_key=API_KEY)

In [10]:
whisper_model = whisper.load_model("medium.en", "cpu")

  checkpoint = torch.load(fp, map_location=device)


In [13]:
transcript = """
                They got a two-on-one facing Gordon and Caruso lays it in man He's been so valuable off the bench this season for the Thunder inside home grip And he barely draws iron Wallace with a hustle and the stick back
                Gilders Alexander with a rebound plenty of time. He can walk it across those tween draws the foul He goes to the right hand and he gets fouled by Peyton watch. It just runs right into the old SGA flock
                25 points tied for the fewest combined total in any first quarter in post-season history. Runner to begin the second quarter is up and in for the Thunder. They have a double-digit lead 19-8. And they turn them over.
                With great hands by Kason Wallace. What a pest. Hardinstein catching a body. Another pick from Gore. Murray through the window to A.G. 10-footer. Yes! Finally, Denver's third bucket of the basketball game.
            """
is_generated, initial_prompt = await names_pipeline.process(transcript)
print(f"is_generated: {is_generated}, Initial Prompt for Whisper: {initial_prompt}")

is_generated: True, Initial Prompt for Whisper: Aaron Gordon, Alex Caruso, Chet Holmgren, Wallace, Shai Gilgeous-Alexander, Payton Watson, Kason Wallace, Isaiah Hartenstein, Jamal Murray


In [None]:
# Utils
def load_csv(file_path: str, header_col = 1) -> pd.DataFrame:
    """Load a CSV file into a DataFrame."""
    return pd.read_csv(file_path, header=header_col)

In [None]:
async def run_flow(df: pd.DataFrame, path_prefix: str, output_file: str = "output", Pipelines: List[BasePipeline] = None,
                   print_results: bool = False, save_results: bool = False) -> None:

    for index in range(df.shape[0]):
        ground_truth = df.iloc[index]["transcript"]
        file_path = os.path.join(path_prefix, df.iloc[index]["segment_filename"])
        transcription_raw = whisper.transcribe(whisper_model, file_path)["text"]
        transcription_raw_norm = normalize_text(transcription_raw).strip()
        ground_truth_norm = normalize_text(ground_truth).strip()
        wer_before = wer(truth=ground_truth_norm, hypothesis=transcription_raw_norm)
        print(f"start {index} from {df.shape[0]}")
        if save_results:
            df.at[index, f"stt_raw_norm"]  = transcription_raw_norm
            df.at[index, f"stt_raw_wer"]  = wer_before

        for pipeline in Pipelines: 
            runs = []          
            for run_idx in range(pipeline.get_num_iterations_allowed()):
                try:
                    is_generated_initial_prompt, pipeline_response = await pipeline.process(transcription_raw)
                    output_generated = pipeline_response
                    if is_generated_initial_prompt:
                        if pipeline_response is None or len(pipeline_response) == 0: 
                            print(f"Pipeline {pipeline.get_pipeline_name()} returned an empty response.")
                            output_generated = transcription_raw_norm
                        else:     
                            try:
                                initial_prompt = unidecode(pipeline_response).strip()
                            except Exception as e:
                                print(f"Error decoding pipeline response: {e}")
                                initial_prompt = pipeline_response
                            transcription_with_prompt = whisper.transcribe(whisper_model, file_path, initial_prompt=initial_prompt)["text"]
                            output_generated = transcription_with_prompt

                            # Check if the transcription with prompt is significantly different from the raw transcription, which mean it is corrupted.
                            if abs(len(transcription_raw_norm) - len(transcription_with_prompt)) > len(transcription_raw_norm) * 0.2:
                                print(f"{pipeline.get_pipeline_name()} is corrupted")
                                flow_output_norm = transcription_raw_norm
                                if save_results:
                                    df.at[index, f"{pipeline.get_pipeline_name()}_is_corrupted"] = True
                    else:
                        initial_prompt = None


                    flow_output_norm = normalize_text(output_generated).strip()
                    print(f"flow_output_norm: {pipeline_response}")
                    
                    wer_after = wer(truth=ground_truth_norm, hypothesis=flow_output_norm)

                        
                    runs.append({
                        "id": run_idx,
                        "wer": wer_after,
                        "flow_output_norm": flow_output_norm,
                        "initial_prompt": initial_prompt
                    })

                    break

                except Exception as e:
                    print(f"Error processing index {index} with pipeline {pipeline.get_pipeline_name()}: {e}")
                    continue

            if len(runs) == 0:
                print(f"No runs were successful for index {index} with pipeline {pipeline.get_pipeline_name()}.")
                continue

            key_fn = (lambda x: x["wer"])
            selected = min(runs, key=key_fn)
                
            if print_results:
                print(f"index: {index}, Pipeline: {pipeline.get_pipeline_name()}")
                print(f"Target:------------------->{ground_truth_norm}")
                print(f"STT raw:------------------>{transcription_raw_norm}")
                print(f"After Flow:--------------->{selected['flow_output_norm']}")

                result = "*Improved*" if selected['wer'] < wer_before else "*Not Improved*" if  selected['wer'] > wer_before else "*Same Result*"
                print(f"{result}, WER before LLM correction: {wer_before:.4f}, WER after LLM correction: {selected['wer']:.4f}")
                print()


            if save_results:
                if is_generated_initial_prompt and initial_prompt is not None:
                    df.at[index, f"{pipeline.get_pipeline_name()}_initial_prompt"] = selected["initial_prompt"]
                df.at[index, f"{pipeline.get_pipeline_name()}_norm"] = selected["flow_output_norm"]
                df.at[index, f"{pipeline.get_pipeline_name()}_wer"] = selected["wer"]


    
    if save_results:
        df.to_csv(f"{output_file}.csv", index=False)
        print(f"Results saved to {output_file}")

In [None]:
num_iterations = 3 # Number of iterations allowed until first success

Pipelines: List[BasePipeline] = [
    FixTranscriptByLLMPipeline(api_key=API_KEY, pipline_name="FixTranscriptByLLMPipeline", num_iterations_allowed=num_iterations, verbose=False),
    GenerateWhisperPromptPipeline(api_key=API_KEY, pipline_name="GenerateWhisperPromptPipeline", num_iterations_allowed=num_iterations, verbose=False),
    GenerateNamesPipeline(api_key=API_KEY, pipline_name="GenerateNamesPipeline", num_iterations_allowed=num_iterations, verbose=False),
    GenerateTopicPipeline(api_key=API_KEY, pipline_name="GenerateTopicPipeline", num_iterations_allowed=num_iterations, verbose=False),
]   

# Run the flow with the defined pipelines
file_name = "fs8ryPzUgW0"
date_str = "1.8.25"
csv_file_path = f"../notebooks/downloads/segments_metadata_{file_name}.csv"
df = load_csv(csv_file_path)
path_prefix=f"../notebooks/downloads/segments/"
await run_flow(df, path_prefix, f"full_report_{file_name}_{date_str}", Pipelines, print_results=True, save_results=True)

start 0 from 24
flow_output_norm: Three games of playoff action coming at you with a lot of defense. The boost is the NBA.com Top 10. But first, at number 10, Franz Wagner.
index: 0, Pipeline: FixTranscriptByLLMPipeline
Target:------------------->three games of playoff action coming at you with a lot of defense the boot is the nba com top ten but first at number ten franz wagner
STT raw:------------------>three games of playoff action coming at you with a lot of defense the boot is the nba com top ten but first at number ten franz wagner
After Flow:--------------->three games of playoff action coming at you with a lot of defense the boost is the nba com top ten but first at number ten franz wagner
*Not Improved*, WER before LLM correction: 0.0000, WER after LLM correction: 0.0357

Best Names: ['Franz Wagner']
sentence build: Franz Wagner is impressive on the court, showing remarkable skill in basketball.
flow_output_norm: Franz Wagner is impressive on the court, showing remarkable skil

In [16]:
file_eval_paths = [
    "full_report_-71SWHoWQJI_31.1.7.25.csv",
    "full_report_-hL_qIOiQAc_31.7.25.csv",
    "full_report_8IVObmY-QTg_31.1.7.25.csv",
    "full_report_df-0dAkQImY_31.1.7.25.csv",
    "full_report_n1e18JbhUXs_31.1.7.25.csv",
    "full_report_Tt8Cepx7HqY_31.1.7.25.csv",
    "full_report_xwJwMgEkcs4_31.1.7.25.csv",
    "full_report_pFfteuNJenw_31.1.7.25.csv",
    # "full_report_xwJwMgEkcs4_1.8.25.csv", # Not good results, not used in the evaluation
    "full_report_8IVObmY-QTg_1.8.25.csv", 
]

In [17]:
file_eval_path = file_eval_paths[0]

df = pd.read_csv(file_eval_path)  # adjust path if needed

# Identify WER columns
wer_cols = [c for c in df.columns if c.endswith("_wer")]

# Compute improvement percentages vs stt_raw_wer
improvement_df = pd.DataFrame()
for col in wer_cols:
    if col == "stt_raw_wer":
        continue
    improvement_df[col] = (df["stt_raw_wer"] - df[col]) / df["stt_raw_wer"] * 100

# Summary stats
summary = pd.DataFrame({
    "mean_wer": {col: df[col].mean() for col in wer_cols},
    "median_wer": {col: df[col].median() for col in wer_cols},
    "mean_improvement_%": {col: improvement_df[col].mean() if col in improvement_df else np.nan for col in wer_cols},
    "median_improvement_%": {col: improvement_df[col].median() if col in improvement_df else np.nan for col in wer_cols},
})

print("=== Summary ===")
print(summary)

=== Summary ===
                                   mean_wer  median_wer  mean_improvement_%  \
stt_raw_wer                        0.121777    0.082553                 NaN   
FixTranscriptByLLMPipeline_wer     0.117443    0.078947                -inf   
GenerateWhisperPromptPipeline_wer  0.159563    0.084524                -inf   
GenerateNamesPipeline_wer          0.092798    0.075236                -inf   
GenerateTopicPipeline_wer          0.146418    0.084040                -inf   

                                   median_improvement_%  
stt_raw_wer                                         NaN  
FixTranscriptByLLMPipeline_wer                      0.0  
GenerateWhisperPromptPipeline_wer                   0.0  
GenerateNamesPipeline_wer                           0.0  
GenerateTopicPipeline_wer                           0.0  


In [None]:
from typing import List
import numpy as np
import pandas as pd
from collections import defaultdict

def evaluate_wer_pipeline(
    file_eval_paths: List[str],
    header_columns: int = 1,
    aggregate: bool = True,              # <— new: also print an "all files" rollup
    print_per_file: bool = True,         # <— keep per-file printouts (set False to suppress)
    tol: float = 1e-8                    # <— tolerance for "Same" (np.isclose)
) -> None:
    """
    Evaluate WER improvements across multiple pipeline reports.

    Args:
        file_eval_paths: List of CSV paths, each with 'stt_raw_wer' and one or more '*_wer' columns.
        header_columns:  Passed through to your load_csv.
        aggregate:       If True, also show a single summary accumulated over all files.
        print_per_file:  If True, print each file's summary as before.
        tol:             Absolute tolerance for treating WER as "Same".
    """
    # Global accumulator keyed by pipeline name
    agg = defaultdict(lambda: {"total_segments": 0, "Improved": 0, "Same": 0, "Not Improved": 0})

    for file_eval_path in file_eval_paths:
        if print_per_file:
            print(f"\nProcessing file: {file_eval_path}")

        df = load_csv(file_eval_path, header_columns)

        # Identify WER pipelines excluding the raw baseline
        wer_cols = [c for c in df.columns if c.endswith("_wer")]
        if "stt_raw_wer" not in wer_cols:
            # Skip files that don't have the baseline column
            if print_per_file:
                print("  Skipped: missing 'stt_raw_wer'")
            continue

        other_pipelines = [c for c in wer_cols if c != "stt_raw_wer"]

        summary_rows = []
        raw = df["stt_raw_wer"]

        for col in other_pipelines:
            other = df[col]

            # masks: improved/same/not improved (treat NaNs as not comparable)
            valid_mask = (~raw.isna()) & (~other.isna())
            improved_mask = (other < raw) & valid_mask
            same_mask = np.isclose(other, raw, atol=tol, rtol=0, equal_nan=False) & valid_mask
            not_improved_mask = (other > raw) & valid_mask

            improved_count = int(improved_mask.sum())
            same_count = int(same_mask.sum())
            not_improved_count = int(not_improved_mask.sum())
            total_segments = int(valid_mask.sum())  # only count rows where both WERs exist

            # Per-file summary row
            summary_rows.append({
                "pipeline": col,
                "total_segments": total_segments,
                "Improved": improved_count,
                "Same": same_count,
                "Not Improved": not_improved_count,
                "Improved_%": (improved_count / total_segments * 100) if total_segments else 0.0,
                "Same_%": (same_count / total_segments * 100) if total_segments else 0.0,
                "Not_Improved_%": (not_improved_count / total_segments * 100) if total_segments else 0.0,
            })

            # Accumulate into global rollup
            agg[col]["total_segments"] += total_segments
            agg[col]["Improved"] += improved_count
            agg[col]["Same"] += same_count
            agg[col]["Not Improved"] += not_improved_count

        if print_per_file:
            summary_df = pd.DataFrame(summary_rows)
            if not summary_df.empty:
                print(summary_df.to_string(index=False))
            else:
                print("  No pipeline columns found to compare.")

    # Print a single accumulated summary across all files
    if aggregate and agg:
        rows = []
        for pipeline, counts in agg.items():
            total = counts["total_segments"]
            imp = counts["Improved"]
            same = counts["Same"]
            not_imp = counts["Not Improved"]
            rows.append({
                "pipeline": pipeline,
                "total_segments": total,
                "Improved": imp,
                "Same": same,
                "Not Improved": not_imp,
                "Improved_%": (imp / total * 100) if total else 0.0,
                "Same_%": (same / total * 100) if total else 0.0,
                "Not_Improved_%": (not_imp / total * 100) if total else 0.0,
            })

        agg_df = pd.DataFrame(rows).sort_values(["Improved_%", "Improved"], ascending=[False, False])
        print("\n=== Accumulated summary across ALL files ===")
        print(agg_df.to_string(index=False))

### Make adjusments:

In [7]:
# Put all the csv files from the after_evaluation folder in file_eval_paths
file_eval_paths = os.listdir("after_evaluation")
file_eval_paths = [f"after_evaluation/{f}" for f in file_eval_paths if f.endswith(".csv")]

evaluate_wer_pipeline(file_eval_paths, 1)
#evaluate_wer_pipeline(["after_evaluation/full_report_fs8ryPzUgW0_2.8.25.csv"])
#evaluate_wer_pipeline(file_eval_paths)


Processing file: after_evaluation/full_report_8IVObmY-QTg_2.8.25.csv
                         pipeline  total_segments  Improved  Same  Not Improved  Improved_%    Same_%  Not_Improved_%
   FixTranscriptByLLMPipeline_wer              76        10    53            13   13.157895 69.736842       17.105263
GenerateWhisperPromptPipeline_wer              76        22    51             3   28.947368 67.105263        3.947368
        GenerateNamesPipeline_wer              76        20    34            22   26.315789 44.736842       28.947368
        GenerateTopicPipeline_wer              76        18    30            28   23.684211 39.473684       36.842105

Processing file: after_evaluation/full_report_xwJwMgEkcs4_2.8.25.csv
                         pipeline  total_segments  Improved  Same  Not Improved  Improved_%  Same_%  Not_Improved_%
   FixTranscriptByLLMPipeline_wer              40         7    27             6        17.5    67.5            15.0
GenerateWhisperPromptPipeline_wer     