# Overlapping speech 

In [None]:
from notebook_utils import (
    load_files_to_df,
    expand_abbreviations,
    make_plot,
)
from asr_eval import utils
from pathlib import Path
import pandas as pd


pd.options.mode.copy_on_write = True

In [None]:
# Root folder holding the 2024 evaluation output files.
p_current = Path("../data/output/2024")

# Load every result file into one frame, then pass it through
# expand_abbreviations from notebook_utils.
df = expand_abbreviations(load_files_to_df(p_current))

In [None]:
def recreate_audiofilepath(segmented_audio_path: str) -> str:
    """Return the path with its last two underscore-separated fields removed.

    Segment paths presumably look like ``<audiofile>_<start>_<end>``, so
    dropping the two trailing fields yields the path of the source recording
    (TODO confirm the naming scheme against the segmentation step).

    Args:
        segmented_audio_path: Path of a single segmented audio clip.

    Returns:
        Everything before the second-to-last underscore. An empty string is
        returned when the input contains fewer than two underscores.
    """
    # First strip the last field, then strip the field before it.
    without_last_field = segmented_audio_path.rpartition("_")[0]
    return without_last_field.rpartition("_")[0]


# Map every segment path back to the recording it was cut from, so that
# segments can later be grouped per source audio file.
df["audiofile"] = df["segmented_audio"].apply(recreate_audiofilepath)

In [None]:
# Flag every segment that overlaps in time with another segment. Overlap is
# evaluated separately for each (audio file, model, language, predicted
# language) combination, so only rows belonging to the same group are compared
# with each other.
dfs = []
for (filename, model, lang, lang_pred), df_ in df.groupby(
    ["audiofile", "model_name", "language_code", "prediction_langcode"]
):
    # Report groups where the predicted language differs from the labelled one.
    if lang != lang_pred:
        print(
            f"""filename:\t{filename}\nmodel\t\t{model}\nlang\t\t{lang}\npred lang:\t{lang_pred}\n"""
        )
    # Order the segments chronologically; the neighbour comparisons below rely
    # on this ordering. sort_values returns a new frame, so the grouped source
    # rows are left untouched.
    df_ = df_.sort_values(["start_ms"]).reset_index(drop=True)

    # Compare at whole-second resolution: overlaps that vanish after rounding
    # both boundaries to the nearest second are ignored.
    df_["start_s"] = round(df_["start_ms"] / 1000, 0)
    df_["end_s"] = round(df_["end_ms"] / 1000, 0)

    # A segment overlaps an earlier one if it starts before the latest end seen
    # so far. Using the running maximum (rather than only the previous row's
    # end) also catches a long segment that spans several later segments.
    latest_previous_end = df_["end_s"].cummax().shift(1)
    # Starts are sorted, so the next row has the smallest later start; checking
    # against it is enough to detect an overlap with any later segment.
    next_start = df_["start_s"].shift(-1)
    # Comparisons against the NaN produced by shift() at the group boundaries
    # evaluate to False, so the first/last rows are only flagged by the other
    # condition.
    df_["overlapping"] = (df_["start_s"] < latest_previous_end) | (
        df_["end_s"] > next_start
    )
    dfs.append(df_)


df = pd.concat(dfs, ignore_index=True)

# Mean scores split by the overlapping flag are cached on disk.
# NOTE: an existing CSV is reused as-is; delete it to recompute the scores
# after changing the data or the overlap logic above.
mean_score_file = Path("../data/output/2024/mean_scores_overlapping.csv")
if not mean_score_file.exists():
    mean_score_df = utils.calculate_mean_scores(df, "overlapping")
    mean_score_df.to_csv(mean_score_file, index=False)
else:
    mean_score_df = pd.read_csv(mean_score_file)

# Show how many segments are overlapping vs. not overlapping.
df.overlapping.value_counts()

# Ytelse på overlappende vs ikke-overlappende tale 
Som forventet er ytelsen dårligere på overlappende tale.

In [None]:
# Folder passed to make_plot as the save location; created if missing.
imagedir = Path("images")
imagedir.mkdir(exist_ok=True)

# Arguments shared by every bar chart: WER split by the overlapping flag.
barchart_options = dict(
    plot_type="barchart",
    feature="overlapping",
    metric="WER",
    save_to_dir=imagedir,
)

# One chart per language code present in the data.
for lang in df["language_code"].unique():
    make_plot(mean_score_df, language=lang, **barchart_options)

# Ytelse per dialekt på overlappende vs ikke-overlappende tale

In [None]:
# Split the segments by the overlapping flag.
is_overlapping = df["overlapping"]
ovelapping_df = df[is_overlapping]
not_overlapping_df = df[~is_overlapping]

# Cache files for the per-dialect mean scores of each subset.
overlapping_dialect_file = Path(
    "../data/output/2024/mean_scores_dialect_overlapping.csv"
)
not_overlapping_dialect_file = Path(
    "../data/output/2024/mean_scores_dialect_not_overlapping.csv"
)

if overlapping_dialect_file.exists() and not_overlapping_dialect_file.exists():
    # Both caches are present: reuse them instead of recomputing.
    overlapping_dialect_df = pd.read_csv(overlapping_dialect_file)
    not_overlapping_dialect_df = pd.read_csv(not_overlapping_dialect_file)
else:
    # At least one cache is missing: recompute both subsets and store them.
    overlapping_dialect_df = utils.calculate_mean_scores(ovelapping_df, "dialect")
    not_overlapping_dialect_df = utils.calculate_mean_scores(
        not_overlapping_df, "dialect"
    )
    overlapping_dialect_df.to_csv(overlapping_dialect_file, index=False)
    not_overlapping_dialect_df.to_csv(not_overlapping_dialect_file, index=False)

In [None]:
def add_average_scores_across_langs(df: pd.DataFrame) -> pd.DataFrame:
    """Append per-(model, dialect) score averages taken across languages.

    For every ``("modell", "dialect")`` group that contains more than one
    distinct ``"språk"`` value, one extra row is added whose ``"språk"`` is
    ``"both"`` and whose score columns hold the unweighted mean of the group's
    rows.

    Args:
        df: Frame with the columns ``"modell"``, ``"dialect"``, ``"språk"``
            and the score columns listed in the function body.

    Returns:
        A new frame: the input rows followed by the added ``"both"`` rows.
        The original index labels are kept, so labels may repeat.
    """
    metric_names = [
        "CER",
        "WER",
        "aligned semantic distance",
        "semantic distance",
        "semantic distance (sBERT)",
    ]

    # Only groups covering several languages get a combined row.
    combined_rows = [
        {
            "modell": model_name,
            "dialect": dialect_name,
            "språk": "both",
            **rows[metric_names].mean(),
        }
        for (model_name, dialect_name), rows in df.groupby(["modell", "dialect"])
        if rows["språk"].nunique() > 1
    ]
    return pd.concat((df, pd.DataFrame(combined_rows)))


# Add the cross-language "both" rows to each subset.
# NOTE: the result replaces the input frame, so re-running this cell without
# re-running the cell that loads the frames appends another set of "both" rows.
overlapping_dialect_df = add_average_scores_across_langs(overlapping_dialect_df)
not_overlapping_dialect_df = add_average_scores_across_langs(not_overlapping_dialect_df)

## Dialect heatmaps: non-overlapping speech only

In [None]:
# WER heatmap by dialect for language code "nob", non-overlapping segments only.
# The title suffix is Norwegian for "(without overlapping speech)".
make_plot(
    not_overlapping_dialect_df,
    plot_type="heatmap",
    feature="dialect",
    metric="WER",
    language="nob",
    save_to_dir=imagedir,
    title_text=" (uten overlappende tale)",
)

In [None]:
# WER heatmap by dialect for language code "nno", non-overlapping segments only.
# The title suffix is Norwegian for "(without overlapping speech)".
make_plot(
    not_overlapping_dialect_df,
    plot_type="heatmap",
    feature="dialect",
    metric="WER",
    language="nno",
    save_to_dir=imagedir,
    title_text=" (uten overlappende tale)",
)

In [None]:
# WER heatmap by dialect for the cross-language "both" rows added by
# add_average_scores_across_langs, non-overlapping segments only.
make_plot(
    not_overlapping_dialect_df,
    plot_type="heatmap",
    feature="dialect",
    metric="WER",
    language="both",
    save_to_dir=imagedir,
    title_text=" (uten overlappende tale)",
)

## Dialect heatmaps: overlapping speech only

In [None]:
# WER heatmap by dialect for language code "nob", overlapping segments only.
# The title suffix is Norwegian for "(only overlapping speech)".
make_plot(
    overlapping_dialect_df,
    plot_type="heatmap",
    feature="dialect",
    metric="WER",
    language="nob",
    save_to_dir=imagedir,
    title_text=" (bare overlappende tale)",
)

In [None]:
# WER heatmap by dialect for language code "nno", overlapping segments only.
# The title suffix is Norwegian for "(only overlapping speech)".
make_plot(
    overlapping_dialect_df,
    plot_type="heatmap",
    feature="dialect",
    metric="WER",
    language="nno",
    save_to_dir=imagedir,
    title_text=" (bare overlappende tale)",
)

In [None]:
# WER heatmap by dialect for the cross-language "both" rows added by
# add_average_scores_across_langs, overlapping segments only.
make_plot(
    overlapping_dialect_df,
    plot_type="heatmap",
    feature="dialect",
    metric="WER",
    language="both",
    save_to_dir=imagedir,
    title_text=" (bare overlappende tale)",
)