# Compare results for different years

In [None]:
from pathlib import Path
import pandas as pd
from notebook_utils import load_files_to_df, expand_abbreviations
from asr_eval import utils

## Calculate mean scores for each model, language and year

In [None]:
# Cache file for the per-year mean scores. The scores are only recomputed when
# this file does not exist, so delete it to force a fresh calculation.
mean_score_file = Path("../data/output/2024/mean_scores_year.csv")

if not mean_score_file.exists():
    # Cache miss: build the mean scores from the per-year output directories.
    p_previous = Path("../data/output/2023")
    p_current = Path("../data/output/2024")

    prev_year_df = load_files_to_df(p_previous)
    current_year_df = load_files_to_df(p_current)

    # Stack both years into a single frame before post-processing.
    data_df = pd.concat([prev_year_df, current_year_df])

    data_df = expand_abbreviations(data_df)

    # Filter columns
    columns_to_keep = [
        "cer",
        "wer",
        "sbert_semdist",
        "semdist",
        "aligned_semdist",
        "date",
        "model_name",
        "language_code",
        "prediction_langcode",
        "year",
        "dialect",
        "gender",
        "standardized_text",
        "standardized_text_nn",
        "standardized_prediction",
    ]

    data_df = data_df[columns_to_keep]
    mean_score_df = utils.calculate_mean_scores(data_df, "year")
    # Keep one row per (modell, språk, year) combination. Assigning the result
    # instead of using inplace=True avoids mutating a frame that may be a view.
    mean_score_df = mean_score_df.drop_duplicates(
        subset=["modell", "språk", "year"]
    )
    mean_score_df.to_csv(mean_score_file, index=False)

# Always load the result from the cache file, even right after computing it.
# This guarantees that the first run and every later run hand the same frame
# (same dtypes, "year" as str) to the cells below, instead of an in-memory
# frame on the first run and a CSV-parsed one afterwards.
mean_score_df = pd.read_csv(mean_score_file, dtype={"year": str})