# TIMIT Evaluation 

Runs evaluation scripts on the TIMIT corpus to get phone error rates and edit distances for TIMIT (unseen data) for the following models:
- Our models that were fine-tuned on the Buckeye corpus
- C. Taguchi Model
- Allosaurus Model
- Whisper to Epitran

### Additional installation step for Epitran

```bash
$ git clone http://github.com/festvox/flite
$ cd flite
$ ./configure && make
$ sudo make install
$ cd testsuite
$ make lex_lookup
$ sudo cp lex_lookup /usr/local/bin
```

In [None]:
from pathlib import Path

import datasets
import numpy as np
import pandas as pd
import transformers

import multipa.evaluation
import multipa.evaluation_extras

# Device string passed to the HuggingFace pipelines below.
# NOTE(review): "mps" targets Apple hardware; adjust for your machine if needed.
DEVICE = "mps"

# Locations of the TIMIT audio and of the CSV holding its IPA transcriptions.
# Example of absolute paths used on one development machine:
#   /Users/parthbhangla/Desktop/Multipa_Datasets/TIMIT/COMPLETE
#   /Users/parthbhangla/Desktop/Multipa_Datasets/TIMIT/complete_ipa.csv
timit_root = Path("../../data/TIMIT Dataset")
timit_data_dir = timit_root / "COMPLETE"
transcriptions_path = timit_root / "complete_ipa.csv"

# HuggingFace models under evaluation
our_model = "ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa"
taguchi_1k = "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns"

# Layout of the results tree; every output of this notebook configured here
# lives somewhere under RESULTS_DIR
RESULTS_DIR = Path("../../data/timit_results")
VERBOSE_RESULTS_DIR = RESULTS_DIR / "detailed_predictions"
AGGREGATE_METRICS_CSV = RESULTS_DIR / "aggregate_metrics" / "all_models_eval.csv"
EDIT_DIST_DIR = RESULTS_DIR / "edit_distances"

# Make sure every output directory exists before any model is evaluated
for results_subdir in (VERBOSE_RESULTS_DIR, AGGREGATE_METRICS_CSV.parent, EDIT_DIST_DIR):
    results_subdir.mkdir(parents=True, exist_ok=True)

# Post-processing options applied to transcriptions
IS_REMOVE_SPACES = True
# Make common substitutions for IPA compliance using ipatok.tokenise
IS_NORMALIZE_IPA = True

# Number of processes for HuggingFace dataset map and filter
NUM_PROC = 8

# Computes and stores by-model performance metrics
model_evaluator = multipa.evaluation.ModelEvaluator()

# Names of the models evaluated so far; each evaluation cell appends to this list
evaluated_models = []

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def read_timit_gold_standard_transcriptions(transcriptions_path):
    """Load the gold standard transcription CSV as a lookup keyed by filename.

    The CSV must contain an "audio_filename" column. Each row is keyed by the
    lower-cased value of that column, and the row's remaining columns
    (including the original, un-lowered "audio_filename") form the value.

    Args:
        transcriptions_path: Path to the transcriptions CSV file.

    Returns:
        A dictionary of the form
        {lower_cased_filename: {"audio_filename": original_filename, <other column>: value, ...}}.
    """
    transcriptions = pd.read_csv(transcriptions_path)
    lowered_names = transcriptions["audio_filename"].str.lower()
    keyed_by_filename = transcriptions.assign(filename=lowered_names).set_index("filename")
    return keyed_by_filename.to_dict(orient="index")


In [None]:
# Load TIMIT audio as a HuggingFace dataset with audio and gold standard transcriptions together
# This loads TIMIT as a Dataset with the same columns as the Buckeye corpus we've been working with
gold_standard_transcriptions = read_timit_gold_standard_transcriptions(transcriptions_path)

# Sorted so that the dataset row order (and therefore any `take(n)` subset) is
# reproducible; the order in which rglob yields files depends on the filesystem.
timit_wavs = sorted(p for p in timit_data_dir.rglob("*") if p.suffix.lower() == ".wav")
print("Total WAV files found:", len(timit_wavs))
data = []

for p in timit_wavs:
    # Lookup key: "/" + the lower-cased path relative to the parent of the TIMIT
    # data directory. as_posix() keeps forward slashes on every platform, so the
    # key is built the same way on Windows as on POSIX systems.
    clean_filename = "/" + p.relative_to(timit_data_dir.parent).as_posix().lower()
    try:
        ipa_transcription = gold_standard_transcriptions[clean_filename]["ipa_transcription"]
    except KeyError as err:
        # Same exception type as before, but say which file is missing and where we looked
        raise KeyError(
            f"No 'ipa_transcription' entry for audio file {p} "
            f"(looked up {clean_filename!r} in {transcriptions_path})"
        ) from err

    entry = {
        "audio": {"path": str(p)},
        "filename": clean_filename,
        "ipa": ipa_transcription,
    }
    data.append(entry)

audio_dataset = datasets.Dataset.from_list(data)
print(audio_dataset)
print(audio_dataset[0])

# The whole dataset is evaluated by default.
# For a quick test run, use a small subset instead:
# audio_subset = audio_dataset.take(10)
audio_subset = audio_dataset

In [None]:
# Prepare the evaluation data with the shared preprocessing helper, applying the
# space-removal and IPA-normalization options configured above.
# The helper hands back two datasets: utterances that have a speech
# transcription and utterances that do not.
audio_subset, audio_without_speech = multipa.evaluation.preprocess_test_data(
    audio_subset,
    is_remove_space=IS_REMOVE_SPACES,
    is_normalize_ipa=IS_NORMALIZE_IPA,
    num_proc=NUM_PROC,
)

print("Audio with speech transcriptions")
print(audio_subset)
print(audio_subset[0])

# Sanity check: inspect this output to confirm no audio is missing a transcription
print("Audio without speech transcriptions")
print(audio_without_speech)


In [None]:
# Allosaurus: run inference, compute metrics and record the results
allosaurus_model = "eng2102"
phone_inventory = "eng"
allosaurus_model_name = "_".join(["allosaurus", allosaurus_model, phone_inventory])

# Predict with the chosen Allosaurus model and phone inventory, using the same
# post-processing options as the gold standard transcriptions
allosaurus_predictions = multipa.evaluation_extras.allosaurus_predict(
    audio_subset,
    model=allosaurus_model,
    phone_inventory=phone_inventory,
    is_remove_spaces=IS_REMOVE_SPACES,
    is_normalize_ipa=IS_NORMALIZE_IPA,
    num_proc=NUM_PROC,
)
allosaurus_ipa = allosaurus_predictions[multipa.evaluation.PREDICTION_KEY]

# Score the predictions against the gold standard IPA
allosaurus_metrics = model_evaluator.eval_non_empty_transcriptions(
    allosaurus_model_name,
    allosaurus_ipa,
    audio_subset["ipa"],
)

# Persist edit distances and the detailed per-utterance predictions
model_evaluator.write_edit_distance_results(allosaurus_model_name, EDIT_DIST_DIR)
multipa.evaluation.write_detailed_prediction_results(
    VERBOSE_RESULTS_DIR,
    allosaurus_model_name,
    audio_subset,
    allosaurus_predictions,
    allosaurus_metrics,
)

# Start the combined analysis dataset: the evaluation data plus one prediction
# column per model, named after the model
print("Done evaluating Allosaurus")
evaluated_models.append(allosaurus_model_name)
full_analysis_dataset = audio_subset.add_column(name=allosaurus_model_name, column=allosaurus_ipa)
print(full_analysis_dataset)

In [None]:
# HuggingFace model inference and evaluation
# These models work directly with the multipa.evaluation code.
# Each entry pairs a model id with the IPA-normalization flag to use for its predictions.
models = [
    (our_model, IS_NORMALIZE_IPA),
    (taguchi_1k, False),
]
for model_name, ipa_norm_flag in models:
    print(f"Running ASR for model: {model_name}")
    clean_model_name = multipa.evaluation.clean_model_name(model_name)

    asr_pipe = transformers.pipeline(
        "automatic-speech-recognition",
        model=model_name,
        device=DEVICE,
    )
    predictions_dataset = multipa.evaluation.get_clean_predictions(
        audio_subset,
        asr_pipe,
        num_proc=NUM_PROC,
        is_remove_space=IS_REMOVE_SPACES,
        is_normalize_ipa=ipa_norm_flag,
    )
    predicted_ipa = predictions_dataset[multipa.evaluation.PREDICTION_KEY]

    # Score the predictions against the gold standard IPA
    model_metrics = model_evaluator.eval_non_empty_transcriptions(
        model_name,
        predicted_ipa,
        audio_subset["ipa"],
    )

    # Persist edit distances and the detailed per-utterance predictions
    model_evaluator.write_edit_distance_results(model_name, EDIT_DIST_DIR)
    multipa.evaluation.write_detailed_prediction_results(
        VERBOSE_RESULTS_DIR,
        clean_model_name,
        audio_subset,
        predictions_dataset,
        model_metrics,
    )

    # Record the model and add its predictions as a new column of the analysis dataset
    print("Done evaluating", model_name)
    evaluated_models.append(model_name)
    full_analysis_dataset = full_analysis_dataset.add_column(name=model_name, column=predicted_ipa)
    print(full_analysis_dataset)

In [None]:
# Orthographic ASR models whose output is converted to IPA with Epitran
models = [
    "openai/whisper-large-v3-turbo",
    # "openai/whisper-large-v3",
    "openai/whisper-medium.en",
]
for m in models:
    # Replace "/" from the HuggingFace model id in the name used for this run
    model_name = m.replace("/", "_") + "_to_epitran"
    print("Evaulating", model_name)

    # Download model and predict
    epitran_predictions = multipa.evaluation_extras.hf_model_to_epitran_predict(
        m,
        audio_subset,
        device=DEVICE,
        num_proc=NUM_PROC,
        is_remove_spaces=IS_REMOVE_SPACES,
        is_normalize_ipa=IS_NORMALIZE_IPA,
    )
    epitran_ipa = epitran_predictions[multipa.evaluation.PREDICTION_KEY]

    # Score the predictions against the gold standard IPA
    metrics = model_evaluator.eval_non_empty_transcriptions(model_name, epitran_ipa, audio_subset["ipa"])

    # Persist the detailed per-utterance predictions and the edit distances
    multipa.evaluation.write_detailed_prediction_results(
        VERBOSE_RESULTS_DIR,
        model_name,
        audio_subset,
        epitran_predictions,
        metrics,
    )
    model_evaluator.write_edit_distance_results(model_name, EDIT_DIST_DIR)

    # Record the model and add its predictions as a new column of the analysis dataset
    print("Done evaluating", model_name)
    evaluated_models.append(model_name)
    full_analysis_dataset = full_analysis_dataset.add_column(name=model_name, column=epitran_ipa)
    print(full_analysis_dataset)

In [None]:
# Models fine-tuned on TIMIT
# Each entry is (HuggingFace model id, input phone code, output phone code)
hf_to_phonecodes_models = [
    ("excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k", "timit", "ipa"),
    ("excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k_simplified", "timit", "ipa"),
]

for model_name, in_code, out_code in hf_to_phonecodes_models:
    model_predictions = multipa.evaluation_extras.hf_to_phonecodes(audio_subset, model_name, in_code, out_code)
    predicted_ipa = model_predictions[multipa.evaluation.PREDICTION_KEY]

    # Score the predictions against the gold standard IPA
    metrics = model_evaluator.eval_non_empty_transcriptions(model_name, predicted_ipa, audio_subset["ipa"])

    # Persist the detailed per-utterance predictions and the edit distances
    multipa.evaluation.write_detailed_prediction_results(
        VERBOSE_RESULTS_DIR,
        multipa.evaluation.clean_model_name(model_name),
        audio_subset,
        model_predictions,
        metrics,
    )
    model_evaluator.write_edit_distance_results(model_name, EDIT_DIST_DIR)

    # Record the model and add its predictions as a new column of the analysis dataset
    evaluated_models.append(model_name)
    full_analysis_dataset = full_analysis_dataset.add_column(name=model_name, column=predicted_ipa)



In [None]:
# Write all results to file for comparison.
# `model_evaluator` was used for every model evaluated above; its contents are
# written to the AGGREGATE_METRICS_CSV path configured in the setup cell.
model_evaluator.to_csv(AGGREGATE_METRICS_CSV)

In [None]:
# Summarize the run: list the evaluated models, then show the combined dataset
# (gold standard plus one prediction column per model) and its first row
print("These models were evaluated:", evaluated_models)
print("Dataset snippet for full anslysis:")
print(full_analysis_dataset)
print(full_analysis_dataset[0])

In [None]:
# Move the combined predictions into pandas for the remaining analysis
predictions_df = full_analysis_dataset.to_pandas()
print("predictions_df snippet")
print(predictions_df.head())

# The comparison table only needs filenames and transcriptions, so the audio column is dropped
full_comparison_df = predictions_df.drop(columns=["audio"])
print("full_comparison_df snippet")
print(full_comparison_df.head())

In [None]:
# Display the column names of the comparison table as the cell's output
full_comparison_df.columns

In [None]:
# Finalize aggregate and by-dialect results
# Column of the comparison table holding the gold standard IPA transcriptions
gold_col = "ipa"
# Every model evaluated above; each one has a prediction column named after it
model_names = evaluated_models
# A new evaluator instance, separate from `model_evaluator` used in the cells above
model_eval = multipa.evaluation.ModelEvaluator()

def extract_dialect(path_str):
    """Return the dialect-region directory name contained in a file path.

    A path component counts as a dialect region when it is "dr" (in any case)
    followed by one or more digits, e.g. "dr1" or "DR8". The first such
    component is returned upper-cased.

    Args:
        path_str: File path as a string, e.g. "/complete/dr1/fcjf0/sa1.wav".

    Returns:
        The upper-cased dialect component (e.g. "DR1"), or "UNKNOWN" when the
        path contains no such component.
    """
    for part in Path(path_str).parts:
        # Require digits after the "dr" prefix so that unrelated components that
        # merely start with "dr" (e.g. "drive", "drafts") are not mistaken for a
        # dialect region.
        if part[:2].lower() == "dr" and part[2:].isdecimal():
            return part.upper()
    return "UNKNOWN"

# Tag every utterance with the dialect region derived from its filename
full_comparison_df["dialect"] = full_comparison_df["filename"].apply(extract_dialect)
print("Dialect groups found:", full_comparison_df["dialect"].unique())

# Per-utterance metrics reported for every model
METRIC_NAMES = ["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]

summary_data = {}
dialect_results = []

for model_name in model_names:
    print(f"Evaluating model: {model_name}")

    predictions = full_comparison_df[model_name].tolist()
    references = full_comparison_df[gold_col].tolist()
    metrics = model_eval.eval_non_empty_transcriptions(model_name, predictions, references)

    # Store each per-utterance metric in its own "<metric> VS <model>" column
    metric_columns = {name: f"{name} VS {model_name}" for name in METRIC_NAMES}
    for name, column in metric_columns.items():
        full_comparison_df[column] = metrics[name]

    # Overall mean of each metric for this model
    summary_data[model_name] = {name: float(np.mean(metrics[name])) for name in METRIC_NAMES}

    # Mean of each metric within every dialect group for this model
    for dialect, dialect_rows in full_comparison_df.groupby("dialect"):
        dialect_results.append(
            {
                "dialect": dialect,
                "model": model_name,
                **{name: dialect_rows[column].mean() for name, column in metric_columns.items()},
            }
        )


# One row per model, one column per metric, with the model name as a regular column
summary_df = (
    pd.DataFrame(summary_data)
    .T[METRIC_NAMES]
    .reset_index()
    .rename(columns={"index": "model"})
)
summary_df.to_csv("timit_model_evaluation_summary.csv", index=False)
print("Average evaluation metrics per model saved to timit_model_evaluation_summary.csv")


# One row per (dialect, model) pair
dialect_summary_df = pd.DataFrame(dialect_results)
dialect_summary_df.to_csv("timit_dialect_model_comparison.csv", index=False)
print("Dialect evaluation complete. Results saved to timit_dialect_model_comparison.csv")