# TIMIT Evaluation 

Runs evaluation scripts on the TIMIT corpus (unseen data) to get phone error rates and edit distances for the following models:
- Facebook's facebook/wav2vec2-lv-60-espeak-cv-ft
- Facebook's facebook/wav2vec2-xlsr-53-espeak-cv-ft

For convenience, these models have been separated from the others because they require espeak-ng (https://github.com/espeak-ng/espeak-ng) and need special modules to run on the Unity cluster.

On Unity, you need to use `--constraint=avx512`. In addition to the standard Python modules, make sure you load the following:
```bash
module load conda/latest uri/main all/eSpeak-NG/1.50-gompi-2020a
```

The easiest way to run this on Unity is to start an interactive job and use `jupyter nbconvert` to execute the notebook from the terminal:
```bash
$ salloc -p gpu -G 1 -c 12 --mem 12GB --constraint=avx512 --time=8:00:0
$ module load conda/latest uri/main all/eSpeak-NG/1.50-gompi-2020a
$ conda activate ./env_cuda124
$ pip install jupyter
$ cd notebooks/timit_evaluation
$ jupyter nbconvert --execute --to notebook --inplace model_evaluation_espeak_only.ipynb
```

Alternatively, you can install espeak locally and run this notebook on your machine.  

In [1]:
from pathlib import Path

import datasets
import numpy as np
import pandas as pd
import transformers

import multipa.evaluation


# Device handed to the transformers pipeline: -1 runs on CPU, a non-negative value selects that GPU index.
# NOTE: the Unity instructions at the top of this notebook allocate a GPU (`-G 1`);
# set DEVICE = 0 to actually use it.
DEVICE = -1

# Locations of the TIMIT audio and of the CSV with its gold standard IPA transcriptions.
# Point DATA_DIR somewhere else if your copy of the data lives outside the repository.
DATA_DIR = Path("../../data")
timit_data_dir = DATA_DIR / "TIMIT Dataset" / "COMPLETE"
transcriptions_path = DATA_DIR / "TIMIT Dataset" / "complete_ipa.csv"

# HuggingFace models to evaluate (both need espeak-ng installed)
HF_MODEL_LIST = [
    "facebook/wav2vec2-lv-60-espeak-cv-ft",
    "facebook/wav2vec2-xlsr-53-espeak-cv-ft",
]

# Where results are written
RESULTS_DIR = DATA_DIR / "timit_results"
VERBOSE_RESULTS_DIR = RESULTS_DIR / "detailed_predictions"
AGGREGATE_METRICS_CSV = RESULTS_DIR / "aggregate_metrics" / "facebook_wav2vec2-espeak.csv"
EDIT_DIST_DIR = RESULTS_DIR / "edit_distances"

# Create every output directory up front so later cells can write without checking
for results_subdir in (VERBOSE_RESULTS_DIR, AGGREGATE_METRICS_CSV.parent, EDIT_DIST_DIR):
    results_subdir.mkdir(parents=True, exist_ok=True)

# Processing options
IS_REMOVE_SPACES = True
IS_NORMALIZE_IPA = True
NUM_PROC = 8  # Number of processes for HuggingFace dataset map and filter

# Computes and stores by-model performance metrics
model_evaluator = multipa.evaluation.ModelEvaluator()

# Names of the models that finished evaluation, filled in by the inference loop
evaluated_models = []

  import pynvml  # type: ignore[import]


  import pkg_resources


In [2]:
def read_timit_gold_standard_transcriptions(transcriptions_path):
    """Read the TIMIT gold standard transcription CSV into a lookup dictionary.

    The CSV must contain an "audio_filename" column. Rows are keyed by the
    lowercased audio filename so lookups are case-insensitive with respect to
    the filename casing used on disk.

    Args:
        transcriptions_path: Path (or anything `pandas.read_csv` accepts) of the CSV file.

    Returns:
        A dictionary of {lowercased_audio_filename -> {column_name: value}} where the
        inner dictionary holds every original CSV column for that row, e.g.
        {"audio_filename": original_filename, "ipa_transcription": transcription}.

    Raises:
        ValueError: If two rows share the same lowercased filename (pandas refuses to
            build an index-oriented dict from a non-unique index).
    """
    return (
        pd.read_csv(transcriptions_path)
        # Lowercased copy of the filename becomes the lookup key; the original column is kept
        .assign(filename=lambda df: df["audio_filename"].str.lower())
        .set_index("filename")
        .to_dict("index")
    )


In [3]:
# Load TIMIT audio as a HuggingFace dataset with audio and gold standard transcriptions together
# This loads TIMIT as a Dataset with the same columns as the Buckeye corpus we've been working with
gold_standard_transcriptions = read_timit_gold_standard_transcriptions(transcriptions_path)

# rglob yields files in arbitrary, filesystem-dependent order; sorting keeps the dataset
# row order (and therefore every results file derived from it) reproducible across runs.
timit_wavs = sorted(p for p in timit_data_dir.rglob("*") if p.suffix.lower() == ".wav")
print("Total WAV files found:", len(timit_wavs))

data = []
for wav_path in timit_wavs:
    # Transcriptions are keyed by the lowercased path relative to the parent of the TIMIT
    # directory, with a leading slash (e.g. "/complete/dr1/faks0/sa1.wav").
    # as_posix() keeps "/" separators on Windows so the key still matches.
    clean_filename = "/" + wav_path.relative_to(timit_data_dir.parent).as_posix().lower()
    if clean_filename not in gold_standard_transcriptions:
        # Same exception type as a plain dict lookup, but says which file and CSV are involved
        raise KeyError(
            f"No gold standard transcription for {wav_path} "
            f"(looked up {clean_filename!r} in {transcriptions_path})"
        )

    data.append(
        {
            "audio": {"path": str(wav_path)},
            "filename": clean_filename,
            "ipa": gold_standard_transcriptions[clean_filename]["ipa_transcription"],
        }
    )

audio_dataset = datasets.Dataset.from_list(data)
print(audio_dataset)
print(audio_dataset[0])

# The whole dataset is evaluated by default.
# For a quick smoke test, evaluate a small subset instead:
# audio_subset = audio_dataset.select(range(10))
audio_subset = audio_dataset

# Keeps the raw rows (audio paths and untouched transcriptions); model predictions are
# added to this dataset as extra columns by the inference loop.
full_analysis_dataset = audio_subset

Total WAV files found: 6300
Dataset({
    features: ['audio', 'filename', 'ipa'],
    num_rows: 6300
})
{'audio': {'path': '../../data/TIMIT Dataset/COMPLETE/DR1/FAKS0/SA1.WAV'}, 'filename': '/complete/dr1/faks0/sa1.wav', 'ipa': ' ʃ i ɦ æ d j ɝ d ɑ ɹ k s u ɾ ɪ ŋ g ɹ i s i w ɑ ʃ  w ɑ ɾ ɝ ʔ ɔ l j i ɚ '}


In [4]:
# Sample audio correctly and preprocess transcriptions to remove whitespace
# NOTE: audio_subset is rebound to the preprocessed result, so re-running only this cell
# would preprocess already processed data; re-run from the dataset loading cell instead.
preprocess_options = {
    "is_remove_space": IS_REMOVE_SPACES,
    "is_normalize_ipa": IS_NORMALIZE_IPA,
    "num_proc": NUM_PROC,
}
audio_subset, audio_without_speech = multipa.evaluation.preprocess_test_data(
    audio_subset, **preprocess_options
)

print("Audio with speech transcriptions")
print(audio_subset)
print(audio_subset[0])

# Sanity check: the second dataset should be empty, i.e. no audio lacks a transcription
print("Audio without speech transcriptions")
print(audio_without_speech)


Map (num_proc=8):   0%|          | 0/6300 [00:00<?, ? examples/s]

Filter (num_proc=8):   0%|          | 0/6300 [00:00<?, ? examples/s]

Filter (num_proc=8):   0%|          | 0/6300 [00:00<?, ? examples/s]

Audio with speech transcriptions
Dataset({
    features: ['audio', 'filename', 'ipa'],
    num_rows: 6300
})


{'audio': {'path': '../../data/TIMIT Dataset/COMPLETE/DR1/FAKS0/SA1.WAV', 'array': array([9.15527344e-05, 1.52587891e-04, 6.10351562e-05, ...,
       2.44140625e-04, 3.05175781e-04, 2.13623047e-04], shape=(63488,)), 'sampling_rate': 16000}, 'filename': '/complete/dr1/faks0/sa1.wav', 'ipa': 'ʃiɦædjɜ˞dɑɹksuɾɪŋɡɹisiwɑʃwɑɾɜ˞ʔɔljiə˞'}
Audio without speech transcriptions
Dataset({
    features: ['audio', 'filename', 'ipa'],
    num_rows: 0
})


In [5]:
# HuggingFace model inference and evaluation: one pass per model in HF_MODEL_LIST
for model_name in HF_MODEL_LIST:
    clean_model_name = multipa.evaluation.clean_model_name(model_name)
    print(f"Running ASR for model: {model_name}")

    # Transcribe every utterance, cleaning predictions with the same options as the references
    asr_pipe = transformers.pipeline(
        "automatic-speech-recognition",
        model=model_name,
        device=DEVICE,
    )
    predictions_dataset = multipa.evaluation.get_clean_predictions(
        audio_subset,
        asr_pipe,
        num_proc=NUM_PROC,
        is_remove_space=IS_REMOVE_SPACES,
        is_normalize_ipa=IS_NORMALIZE_IPA,
    )
    predicted_transcriptions = predictions_dataset[multipa.evaluation.PREDICTION_KEY]

    # Compute all metrics against the preprocessed gold standard transcriptions
    model_metrics = model_evaluator.eval_non_empty_transcriptions(
        model_name,
        predicted_transcriptions,
        audio_subset["ipa"],
    )

    # Write prediction details and edit distances
    model_evaluator.write_edit_distance_results(model_name, EDIT_DIST_DIR)
    multipa.evaluation.write_detailed_prediction_results(
        VERBOSE_RESULTS_DIR,
        clean_model_name,
        audio_subset,
        predictions_dataset,
        model_metrics,
    )

    print("Done evaluating", model_name)
    evaluated_models.append(model_name)

    # Keep this model's predictions next to the raw data, in a column named after the model
    full_analysis_dataset = full_analysis_dataset.add_column(
        name=model_name,
        column=predicted_transcriptions,
    )
    print(full_analysis_dataset)


Running ASR for model: facebook/wav2vec2-lv-60-espeak-cv-ft


Some weights of the model checkpoint at facebook/wav2vec2-lv-60-espeak-cv-ft were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-lv-60-espeak-cv-ft and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=8):   0%|          | 0/6300 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/6300 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Done evaluating facebook/wav2vec2-lv-60-espeak-cv-ft
Dataset({
    features: ['audio', 'filename', 'ipa', 'facebook/wav2vec2-lv-60-espeak-cv-ft'],
    num_rows: 6300
})
Running ASR for model: facebook/wav2vec2-xlsr-53-espeak-cv-ft


Some weights of the model checkpoint at facebook/wav2vec2-xlsr-53-espeak-cv-ft were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xlsr-53-espeak-cv-ft and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=8):   0%|          | 0/6300 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/6300 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Done evaluating facebook/wav2vec2-xlsr-53-espeak-cv-ft
Dataset({
    features: ['audio', 'filename', 'ipa', 'facebook/wav2vec2-lv-60-espeak-cv-ft', 'facebook/wav2vec2-xlsr-53-espeak-cv-ft'],
    num_rows: 6300
})


In [6]:
# Write the metrics held by model_evaluator to AGGREGATE_METRICS_CSV so models can be compared
# (presumably one entry per model passed to eval_non_empty_transcriptions - confirm in multipa.evaluation)
model_evaluator.to_csv(AGGREGATE_METRICS_CSV)

In [7]:
# Quick look at which models ran and at the combined dataset (raw columns plus one
# prediction column per evaluated model)
print("These models were evaluated:", evaluated_models)
print("Dataset snippet for full analysis:")
print(full_analysis_dataset)
print(full_analysis_dataset[0])

These models were evaluated: ['facebook/wav2vec2-lv-60-espeak-cv-ft', 'facebook/wav2vec2-xlsr-53-espeak-cv-ft']
Dataset snippet for full anslysis:
Dataset({
    features: ['audio', 'filename', 'ipa', 'facebook/wav2vec2-lv-60-espeak-cv-ft', 'facebook/wav2vec2-xlsr-53-espeak-cv-ft'],
    num_rows: 6300
})
{'audio': {'path': '../../data/TIMIT Dataset/COMPLETE/DR1/FAKS0/SA1.WAV'}, 'filename': '/complete/dr1/faks0/sa1.wav', 'ipa': ' ʃ i ɦ æ d j ɝ d ɑ ɹ k s u ɾ ɪ ŋ g ɹ i s i w ɑ ʃ  w ɑ ɾ ɝ ʔ ɔ l j i ɚ ', 'facebook/wav2vec2-lv-60-espeak-cv-ft': 'ʃiːhædjə˞dɑːɹksuːɾɪnɡɹiːsiwɑːʃwɑːɾə˞ɹɑːljiː', 'facebook/wav2vec2-xlsr-53-espeak-cv-ft': 'ʃiːhædjə˞dɑːksuːtɪnɡɹiːsiːwɑːʃwɑːɾə˞ɑːljɪ'}


In [8]:
# Convert the combined dataset to pandas for the per-dialect analysis
predictions_df = full_analysis_dataset.to_pandas()
print("predictions_df snippet")
print(predictions_df.head())

# The audio column is not needed for comparing transcriptions, so leave it out
full_comparison_df = predictions_df.drop(columns="audio")

print("full_comparison_df snippet")
print(full_comparison_df.head())

predictions_df snippet
                                               audio  \
0  {'path': '../../data/TIMIT Dataset/COMPLETE/DR...   
1  {'path': '../../data/TIMIT Dataset/COMPLETE/DR...   
2  {'path': '../../data/TIMIT Dataset/COMPLETE/DR...   
3  {'path': '../../data/TIMIT Dataset/COMPLETE/DR...   
4  {'path': '../../data/TIMIT Dataset/COMPLETE/DR...   

                         filename  \
0     /complete/dr1/faks0/sa1.wav   
1     /complete/dr1/faks0/sa2.wav   
2  /complete/dr1/faks0/si1573.wav   
3  /complete/dr1/faks0/si2203.wav   
4   /complete/dr1/faks0/si943.wav   

                                                 ipa  \
0   ʃ i ɦ æ d j ɝ d ɑ ɹ k s u ɾ ɪ ŋ g ɹ i s i w ɑ...   
1   d oʊ n æ s  m i t ɨ k ɛ ɹ i ɨ n ɔɪ l i ɹ æ g ...   
2   h ɪ z k æ p t ɨ n w ə s θ ɪ n æ n ɦ æ g ɝ d ɨ...   
3   ð ɨ ɹ i z ə n z f ɚ ð ɪ s d aɪ v s i m d f u ...   
4   p ɚ d ʌ k ʃ ɨ n m eɪ f ɔ l f ɑ ɹ b ə l oʊ ə k...   

                facebook/wav2vec2-lv-60-espeak-cv-ft  \
0        ʃiːhædjə˞dɑːɹks

In [9]:
# Column of full_comparison_df that holds the gold standard transcriptions
gold_col = "ipa"
# Models recorded in evaluated_models by the inference loop; each has a prediction column of the same name
model_names = evaluated_models
# Separate evaluator instance (distinct from model_evaluator) used for the per-dialect pass in this cell
model_eval = multipa.evaluation.ModelEvaluator()

def extract_dialect(path_str: str) -> str:
    """Return the TIMIT dialect region directory contained in a file path.

    The dialect region is the first path component made of "dr" followed only by
    digits, matched case-insensitively (e.g. "dr1" in "/complete/dr1/faks0/sa1.wav").
    Requiring the digits prevents unrelated components that merely start with "dr"
    (such as "drive" or "Dropbox") from being reported as a dialect.

    Args:
        path_str: File path, e.g. the "filename" value of a dataset row.

    Returns:
        The uppercased dialect component (e.g. "DR1"), or "UNKNOWN" if the path
        has no such component.
    """
    for part in Path(path_str).parts:
        lowered = part.lower()
        if lowered.startswith("dr") and lowered[2:].isdecimal():
            return part.upper()
    return "UNKNOWN"

# Metrics reported per utterance, per model and per dialect region
METRIC_NAMES = ["phone_error_rates", "phone_feature_error_rates", "feature_error_rates"]
# Output files (written to the current working directory)
SUMMARY_CSV = "timit_facebook_wav2vec2-lv-60-espeak-cv-ft_evaluation_summary.csv"
DIALECT_CSV = "timit_dialect_facebook_wav2vec2-lv-60-espeak-cv-ft_comparison.csv"

full_comparison_df["dialect"] = full_comparison_df["filename"].apply(extract_dialect)
print("Dialect groups found:", full_comparison_df["dialect"].unique())

# Score against the preprocessed gold transcriptions in audio_subset rather than the raw
# gold_col column of full_comparison_df: that column still holds the untouched,
# space-separated TIMIT transcriptions, whereas the predictions were cleaned using
# IS_REMOVE_SPACES / IS_NORMALIZE_IPA. This matches the references used in the inference loop.
# NOTE(review): relies on audio_subset rows being in the same order as full_comparison_df,
# which the add_column call in the inference loop already assumes - confirm.
references = list(audio_subset[gold_col])
if len(references) != len(full_comparison_df):
    raise ValueError(
        f"Expected {len(full_comparison_df)} preprocessed references, found {len(references)}"
    )

summary_data = {}
dialect_results = []

for model_name in model_names:
    print(f"Evaluating model: {model_name}")

    predictions = full_comparison_df[model_name].tolist()
    metrics = model_eval.eval_non_empty_transcriptions(model_name, predictions, references)

    # One per-utterance column per metric, named "<metric> VS <model>"
    metric_columns = {metric_name: f"{metric_name} VS {model_name}" for metric_name in METRIC_NAMES}
    for metric_name, col_name in metric_columns.items():
        full_comparison_df[col_name] = metrics[metric_name]

    # Corpus-level average of each metric for this model
    summary_data[model_name] = {
        metric_name: float(np.mean(metrics[metric_name])) for metric_name in METRIC_NAMES
    }

    # Average of each metric within every dialect region for this model
    for dialect, df_group in full_comparison_df.groupby("dialect"):
        result_row = {"dialect": dialect, "model": model_name}
        for metric_name, col_name in metric_columns.items():
            result_row[metric_name] = df_group[col_name].mean()
        dialect_results.append(result_row)


# One row per model, one column per metric
summary_df = (
    pd.DataFrame(summary_data)
    .T[METRIC_NAMES]
    .reset_index()
    .rename(columns={"index": "model"})
)
summary_df.to_csv(SUMMARY_CSV, index=False)
print(f"Average evaluation metrics per model saved to {SUMMARY_CSV}")


# One row per (dialect, model) pair
dialect_summary_df = pd.DataFrame(dialect_results)
dialect_summary_df.to_csv(DIALECT_CSV, index=False)
print(f"Dialect evaluation complete. Results saved to {DIALECT_CSV}")

Dialect groups found: ['DR1' 'DR2' 'DR3' 'DR4' 'DR5' 'DR6' 'DR7' 'DR8']
Evaluating model: facebook/wav2vec2-lv-60-espeak-cv-ft


Evaluating model: facebook/wav2vec2-xlsr-53-espeak-cv-ft


Average evaluation metrics per model saved to timit_facebook_wav2vec2-lv-60-espeak-cv-ft_evaluation_summary.csv
Dialect evaluation complete. Results saved to timit_dialect_facebook_wav2vec2-lv-60-espeak-cv-ft_comparison.csv
