# TIMIT Evaluation 

Runs evaluation scripts on the TIMIT corpus to get phone error rates and edit distances for TIMIT (unseen data) for the following models:
- Our models that were fine-tuned on the Buckeye corpus
- C. Taguchi Model
- Allosaraus Model
- Whisper to Epitran

### Additional installation step for Epitran

```bash
$ git clone http://github.com/festvox/flite
$ cd flite
$ ./configure && make
$ sudo make install
$ cd testsuite
$ make lex_lookup
$ sudo cp lex_lookup /usr/local/bin
```

In [1]:
from pathlib import Path

import datasets
import numpy as np
import pandas as pd
import transformers

import multipa.evaluation
import multipa.evaluation_extras

DEVICE = "mps"

# Paths For TIMIT Database and TIMIT IPA
# timit_data_dir = Path("/Users/parthbhangla/Desktop/Multipa_Datasets/TIMIT/COMPLETE")
# transcriptions_path = Path("/Users/parthbhangla/Desktop/Multipa_Datasets/TIMIT/complete_ipa.csv")
timit_data_dir = Path("../../data/TIMIT Dataset/COMPLETE")
transcriptions_path = Path("../../data/TIMIT Dataset/complete_ipa.csv")

# HuggingFace Models Evaluating
our_model = "ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa"
taguchi_1k = "ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns"

# Set up results directories
RESULTS_DIR =Path("../../data/timit_results")
VERBOSE_RESULTS_DIR = RESULTS_DIR / "detailed_predictions"
AGGREGATE_METRICS_CSV = RESULTS_DIR / "aggregate_metrics/all_models_eval.csv"
EDIT_DIST_DIR = RESULTS_DIR / "edit_distances"
VERBOSE_RESULTS_DIR.mkdir(parents=True, exist_ok=True)
AGGREGATE_METRICS_CSV.parent.mkdir(parents=True, exist_ok=True)
EDIT_DIST_DIR.mkdir(parents=True, exist_ok=True)

# Post-pocessing options
IS_REMOVE_SPACES = True
IS_NORMALIZE_IPA = True # make common substitutions for IPA compliance using ipatok.tokenise

NUM_PROC = 8 # Number of processes for HuggingFace dataset map and filter

# Computes and stores by-model performance metrics
model_evaluator = multipa.evaluation.ModelEvaluator()

evaluated_models = []

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def read_timit_gold_standard_transcriptions(transcriptions_path):
    """Returns a dictionary of {"audio_filename" -> {"ipa_transcription": transcription, "filename": original_filename}}"""
    gold_standard_df = pd.read_csv(transcriptions_path)
    gold_standard_df["filename"] = gold_standard_df["audio_filename"].str.lower()
    gold_standard_df.set_index("filename", inplace=True)
    return gold_standard_df.to_dict("index")


In [3]:
# Load TIMIT audio as a HuggingFace dataset with audio and gold standard transcriptions together
# This loads TIMIT as a Dataset with the same columns as the Buckeye corpus we've been working with
gold_standard_transcriptions = read_timit_gold_standard_transcriptions(transcriptions_path)

timit_wavs = [p for p in timit_data_dir.rglob("*") if p.suffix.lower() == ".wav"]
print("Total WAV files found:", len(timit_wavs))
data = []

for p in timit_wavs:
    clean_filename = "/" + str(p.relative_to(timit_data_dir.parent)).lower()
    ipa_transcription = gold_standard_transcriptions[clean_filename]["ipa_transcription"]

    entry = {
        "audio": {"path": str(p)},
        "filename": clean_filename,
        "ipa":ipa_transcription
    }
    data.append(entry)

audio_dataset = datasets.Dataset.from_list(data)
print(audio_dataset)
print(audio_dataset[0])

# TODO: Evaluate on the whole dataset
# Test with a small subset if wanted
#audio_subset = audio_dataset.take(10)
audio_subset = audio_dataset

Total WAV files found: 6300
Dataset({
    features: ['audio', 'filename', 'ipa'],
    num_rows: 6300
})
{'audio': {'path': '../../data/TIMIT Dataset/COMPLETE/DR4/MMDM0/SI681.WAV'}, 'filename': '/complete/dr4/mmdm0/si681.wav', 'ipa': ' w ɨ d s ʌ tʃ ɨ n æ k t ɨ v ɹ ɨ f j ʉ ʒ l̩  b i j ʉ s f l̩  '}


In [4]:
# Sample audio correctly and preprocess transcriptions to remove whitepsace
audio_subset, audio_without_speech = multipa.evaluation.preprocess_test_data(audio_subset,
    is_remove_space=IS_REMOVE_SPACES, is_normalize_ipa = IS_NORMALIZE_IPA, num_proc=NUM_PROC)
print("Audio with speech transcriptions")
print(audio_subset)
print(audio_subset[0])

# Sanity check that there's no audio without transcriptions
print("Audio without speech transcriptions")
print(audio_without_speech)


Map (num_proc=8): 100%|██████████| 6300/6300 [00:00<00:00, 15068.13 examples/s]
Filter (num_proc=8): 100%|██████████| 6300/6300 [00:03<00:00, 1943.79 examples/s]
Filter (num_proc=8): 100%|██████████| 6300/6300 [00:01<00:00, 4819.14 examples/s]


Audio with speech transcriptions
Dataset({
    features: ['audio', 'filename', 'ipa'],
    num_rows: 6300
})
{'audio': {'path': '../../data/TIMIT Dataset/COMPLETE/DR4/MMDM0/SI681.WAV', 'array': array([-2.13623047e-04,  6.10351562e-05,  3.05175781e-05, ...,
       -3.05175781e-05, -9.15527344e-05, -6.10351562e-05]), 'sampling_rate': 16000}, 'filename': '/complete/dr4/mmdm0/si681.wav', 'ipa': 'wɨdsʌtʃɨnæktɨvɹɨfjʉʒl̩bijʉsfl̩'}
Audio without speech transcriptions
Dataset({
    features: ['audio', 'filename', 'ipa'],
    num_rows: 0
})


In [5]:
# Allosaurus inference and metrics compute
allosaurus_model = "eng2102"
phone_inventory = "eng"
allosaurus_model_name = f"allosaurus_{allosaurus_model}_{phone_inventory}"

# Download model and predict
allosaurus_predictions = multipa.evaluation_extras.allosaurus_predict(audio_subset, model=allosaurus_model, phone_inventory=phone_inventory, is_remove_spaces=IS_REMOVE_SPACES, is_normalize_ipa=IS_NORMALIZE_IPA, num_proc=NUM_PROC)

# Get evaluation results for raw model output
allosaurus_metrics = model_evaluator.eval_non_empty_transcriptions(allosaurus_model_name,
    allosaurus_predictions[multipa.evaluation.PREDICTION_KEY], audio_subset["ipa"])

# Write prediction details and edit distances
model_evaluator.write_edit_distance_results(allosaurus_model_name, EDIT_DIST_DIR)
multipa.evaluation.write_detailed_prediction_results(VERBOSE_RESULTS_DIR, allosaurus_model_name, audio_subset, allosaurus_predictions, allosaurus_metrics)

# Save model results for later
print("Done evaluating Allosaurus")
evaluated_models.append(allosaurus_model_name)
full_analysis_dataset = audio_subset.add_column(allosaurus_model_name, allosaurus_predictions[multipa.evaluation.PREDICTION_KEY])
print(full_analysis_dataset)

Evaluating allosaurus. Model: eng2102 Phone inventory: eng


  model_state_dict = torch.load(str(path), map_location=torch.device('cpu'))
100%|██████████| 6300/6300 [36:33<00:00,  2.87it/s]
Map (num_proc=8): 100%|██████████| 6300/6300 [00:00<00:00, 20833.67 examples/s]
Flattening the indices: 100%|██████████| 6300/6300 [00:00<00:00, 38894.17 examples/s]
Creating CSV from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 115.29ba/s]


Done evaluating Allosaurus


Flattening the indices: 100%|██████████| 6300/6300 [00:00<00:00, 45634.59 examples/s]

Dataset({
    features: ['audio', 'filename', 'ipa', 'allosaurus_eng2102_eng'],
    num_rows: 6300
})





In [6]:
# HuggingFace model inference and evaluation
# These work with the multipa.evaluation code
models = [(our_model, IS_NORMALIZE_IPA), (taguchi_1k, False)]
for model_name, ipa_norm_flag in models:
    clean_model_name = multipa.evaluation.clean_model_name(model_name)
    print(f"Running ASR for model: {model_name}")
    asr_pipe = transformers.pipeline("automatic-speech-recognition", model=model_name, device=DEVICE)
    predictions_dataset = multipa.evaluation.get_clean_predictions(audio_subset, asr_pipe,
        num_proc=NUM_PROC, is_remove_space=IS_REMOVE_SPACES, is_normalize_ipa=ipa_norm_flag)

    # Compute all metrics
    model_metrics = model_evaluator.eval_non_empty_transcriptions(model_name,
        predictions_dataset[multipa.evaluation.PREDICTION_KEY], audio_subset["ipa"])

    # Write prediction details and edit distances
    model_evaluator.write_edit_distance_results(model_name, EDIT_DIST_DIR)
    multipa.evaluation.write_detailed_prediction_results(VERBOSE_RESULTS_DIR, clean_model_name, audio_subset, predictions_dataset, model_metrics)

    print("Done evaluating", model_name)
    evaluated_models.append(model_name)
    full_analysis_dataset = full_analysis_dataset.add_column(name=model_name, column=predictions_dataset[multipa.evaluation.PREDICTION_KEY])
    print(full_analysis_dataset)

Running ASR for model: ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa


Map (num_proc=8): 100%|██████████| 6300/6300 [00:00<00:00, 21280.99 examples/s]
Flattening the indices: 100%|██████████| 6300/6300 [00:00<00:00, 35771.35 examples/s]
Creating CSV from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 105.62ba/s]


Done evaluating ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa
Dataset({
    features: ['audio', 'filename', 'ipa', 'allosaurus_eng2102_eng', 'ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa'],
    num_rows: 6300
})
Running ASR for model: ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns


Some weights of the model checkpoint at ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.we

Done evaluating ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns
Dataset({
    features: ['audio', 'filename', 'ipa', 'allosaurus_eng2102_eng', 'ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa', 'ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns'],
    num_rows: 6300
})


In [7]:
# Orthographic to epitran models
models = [
    "openai/whisper-large-v3-turbo",
    # "openai/whisper-large-v3",
    "openai/whisper-medium.en",
]
for m in models:
    model_name = f"{m}_to_epitran".replace("/", "_")
    print("Evaulating", model_name)

    # Download model and predict
    epitran_predictions = multipa.evaluation_extras.hf_model_to_epitran_predict(m, audio_subset, device=DEVICE, num_proc=NUM_PROC, is_remove_spaces=IS_REMOVE_SPACES, is_normalize_ipa=IS_NORMALIZE_IPA)
    metrics = model_evaluator.eval_non_empty_transcriptions(
        model_name, epitran_predictions[multipa.evaluation.PREDICTION_KEY], audio_subset["ipa"]
    )
    multipa.evaluation.write_detailed_prediction_results(VERBOSE_RESULTS_DIR, model_name, audio_subset, epitran_predictions, metrics)
    model_evaluator.write_edit_distance_results(model_name, EDIT_DIST_DIR)
    print("Done evaluating", model_name)
    evaluated_models.append(model_name)
    full_analysis_dataset = full_analysis_dataset.add_column(name=model_name, column=epitran_predictions[multipa.evaluation.PREDICTION_KEY])
    print(full_analysis_dataset)

Evaulating openai_whisper-large-v3-turbo_to_epitran
Building pipeline and downloading model
Predicting with openai/whisper-large-v3-turbo


You have passed language=english, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=english.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transliterating with Epitran


100%|██████████| 6300/6300 [21:45<00:00,  4.83it/s]
Map (num_proc=8): 100%|██████████| 6300/6300 [00:00<00:00, 15489.47 examples/s]
Flattening the indices: 100%|██████████| 6300/6300 [00:00<00:00, 16141.89 examples/s]
Creating CSV from Arrow format: 100%|██████████| 7/7 [00:00<00:00, 72.00ba/s]


Done evaluating openai_whisper-large-v3-turbo_to_epitran
Dataset({
    features: ['audio', 'filename', 'ipa', 'allosaurus_eng2102_eng', 'ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa', 'ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns', 'openai_whisper-large-v3-turbo_to_epitran'],
    num_rows: 6300
})
Evaulating openai_whisper-medium.en_to_epitran
Building pipeline and downloading model
Predicting with openai/whisper-medium.en




Transliterating with Epitran


  0%|          | 0/6300 [00:00<?, ?it/s]python(75343) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(75344) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(75345) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(75346) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(75347) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(75348) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(75349) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(75350) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
  0%|          | 1/6300 [00:01<1:55:05,  1.10s/it]python(75351) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(75352) MallocStackLog

Done evaluating openai_whisper-medium.en_to_epitran
Dataset({
    features: ['audio', 'filename', 'ipa', 'allosaurus_eng2102_eng', 'ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa', 'ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns', 'openai_whisper-large-v3-turbo_to_epitran', 'openai_whisper-medium.en_to_epitran'],
    num_rows: 6300
})


In [8]:
# Models fine-tuned on TIMIT
hf_to_phonecodes_models = [("excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k", "timit", "ipa"), ("excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k_simplified", "timit", "ipa")]

for model_name, in_code, out_code in hf_to_phonecodes_models:
    model_predictions = multipa.evaluation_extras.hf_to_phonecodes(audio_subset, model_name, in_code, out_code)

    metrics = model_evaluator.eval_non_empty_transcriptions(
        model_name,
        model_predictions[multipa.evaluation.PREDICTION_KEY],
        audio_subset["ipa"])
    multipa.evaluation.write_detailed_prediction_results(
        VERBOSE_RESULTS_DIR, multipa.evaluation.clean_model_name(model_name), audio_subset, model_predictions, metrics
    )
    model_evaluator.write_edit_distance_results(model_name, EDIT_DIST_DIR)
    evaluated_models.append(model_name)
    full_analysis_dataset = full_analysis_dataset.add_column(name=model_name, column=model_predictions[multipa.evaluation.PREDICTION_KEY])



Some weights of the model checkpoint at excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.par

In [9]:
# Write all results to file for comparison
model_evaluator.to_csv(AGGREGATE_METRICS_CSV)

In [10]:
print("These models were evaluated:", evaluated_models)
print("Dataset snippet for full anslysis:")
print(full_analysis_dataset)
print(full_analysis_dataset[0])

These models were evaluated: ['allosaurus_eng2102_eng', 'ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa', 'ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns', 'openai_whisper-large-v3-turbo_to_epitran', 'openai_whisper-medium.en_to_epitran', 'excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k', 'excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k_simplified']
Dataset snippet for full anslysis:
Dataset({
    features: ['audio', 'filename', 'ipa', 'allosaurus_eng2102_eng', 'ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa', 'ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns', 'openai_whisper-large-v3-turbo_to_epitran', 'openai_whisper-medium.en_to_epitran', 'excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k', 'excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k_simplified'],
    num_rows: 6300
})
{'audio': {'path': '../../data/TIMIT Dataset/COMPLETE/DR4/MMDM0/SI681.WAV', 'array': array([-2.13623047e-04, 

In [11]:
predictions_df = full_analysis_dataset.to_pandas()
print("predictions_df snippet")
print(predictions_df.head())

full_comparison_df = predictions_df.drop(
    columns=["audio"]
    )

print("full_comparison_df snippet")
print(full_comparison_df.head())

predictions_df snippet
                                               audio  \
0  {'bytes': None, 'path': '../../data/TIMIT Data...   
1  {'bytes': None, 'path': '../../data/TIMIT Data...   
2  {'bytes': None, 'path': '../../data/TIMIT Data...   
3  {'bytes': None, 'path': '../../data/TIMIT Data...   
4  {'bytes': None, 'path': '../../data/TIMIT Data...   

                        filename                                     ipa  \
0  /complete/dr4/mmdm0/si681.wav          wɨdsʌtʃɨnæktɨvɹɨfjʉʒl̩bijʉsfl̩   
1    /complete/dr4/mmdm0/sa2.wav          doʊɾ̃æsmiɾɨkɪɹiɛɾ̃ɔliɹæɡlʌkðæt   
2  /complete/dr4/mmdm0/sx411.wav  bʌɾə˞skɑtʃfʌdʒɡoʊzwɛlwəðvəɾ̃ɪləaɪskɹim   
3    /complete/dr4/mmdm0/sa1.wav      ʃiædjə˞dɑɹksʉɾɨnɡɹiziwɔʃwɑɾə˞ɔljɪɹ   
4  /complete/dr4/mmdm0/sx231.wav                           ʔɑʔɑɾ̃ə˞mɑmɑm   

                   allosaurus_eng2102_eng  \
0         wɪðsʌt͡ʃænæktʌvɹəfjuzəlbijusfəl   
1          dɑnæsktmitəkæɹiɪnowliɹɛɡlɛkðæt   
2  bɛtɹ̩skɑt͡ʃfɹ̩d͡ʒɡowzwɛlwɑzvənilæskɹjn   
3  

In [12]:
full_comparison_df.columns

Index(['filename', 'ipa', 'allosaurus_eng2102_eng',
       'ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa',
       'ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns',
       'openai_whisper-large-v3-turbo_to_epitran',
       'openai_whisper-medium.en_to_epitran',
       'excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k',
       'excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k_simplified'],
      dtype='object')

In [13]:
# Finalize aggregate and by-dialect results
gold_col = "ipa"
model_names = evaluated_models
model_eval = multipa.evaluation.ModelEvaluator()

def extract_dialect(path_str):
    path = Path(path_str)
    parts = [p for p in path.parts if p.lower().startswith("dr")]
    return parts[0].upper() if parts else "UNKNOWN"

full_comparison_df["dialect"] = full_comparison_df["filename"].apply(extract_dialect)
print("Dialect groups found:", full_comparison_df["dialect"].unique())

summary_data = {}
dialect_results = []

for model_name in model_names:
    print(f"Evaluating model: {model_name}")

    predictions = full_comparison_df[model_name].tolist()
    references = full_comparison_df[gold_col].tolist()

    metrics = model_eval.eval_non_empty_transcriptions(model_name, predictions, references)

    for metric_name in ["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]:
        col_name = f"{metric_name} VS {model_name}"
        full_comparison_df[col_name] = metrics[metric_name]

    summary_data[model_name] = {
        metric_name: float(np.mean(metrics[metric_name]))
        for metric_name in ["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]
    }

    for dialect, df_group in full_comparison_df.groupby("dialect"):
        result_row = {
            "dialect": dialect,
            "model": model_name,
        }
        for metric_name in ["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]:
            col_name = f"{metric_name} VS {model_name}"
            result_row[metric_name] = df_group[col_name].mean()
        dialect_results.append(result_row)


summary_df = pd.DataFrame(summary_data).T
summary_df = summary_df[["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]]
summary_df = summary_df.reset_index()
summary_df = summary_df.rename(columns={"index": "model"})
summary_df.to_csv("timit_model_evaluation_summary.csv", index=False)
print("Average evaluation metrics per model saved to timit_model_evaluation_summary.csv")


dialect_summary_df = pd.DataFrame(dialect_results)
dialect_summary_df.to_csv("timit_dialect_model_comparison.csv", index=False)
print("Dialect evaluation complete. Results saved to timit_dialect_model_comparison.csv")

Dialect groups found: ['DR4' 'DR3' 'DR2' 'DR5' 'DR7' 'DR6' 'DR1' 'DR8']
Evaluating model: allosaurus_eng2102_eng
Evaluating model: ginic/full_dataset_train_3_wav2vec2-large-xlsr-53-buckeye-ipa
Evaluating model: ctaguchi/wav2vec2-large-xlsr-japlmthufielta-ipa1000-ns
Evaluating model: openai_whisper-large-v3-turbo_to_epitran
Evaluating model: openai_whisper-medium.en_to_epitran
Evaluating model: excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k
Evaluating model: excalibur12/wav2vec2-large-lv60_phoneme-timit_english_timit-4k_simplified
Average evaluation metrics per model saved to timit_model_evaluation_summary.csv
Dialect evaluation complete. Results saved to timit_dialect_model_comparison.csv
