# Evaluation Pipeline
---
## To download our best results, follow the instructions in the README

# Imports

In [None]:
import os
import random
import zipfile

import torch
import numpy as np
import pandas as pd
from tabulate import tabulate
from transformers import T5Tokenizer

from data_utils.dataset import prepare_data_basic, prepare_data_for_prompt_engineering
from data_utils.utils import read_json
from models.rec_ace import RecACEWrapModel, detokenize_and_clean
from evaluation import Evaluator, calculate_exact_match, calculate_wer, BERTS

# BERT Score model warm-up
BERTS()

## Seeds

In [2]:
SEED = 42

# NumPy's legacy global RNG
np.random.seed(SEED)

# Python's built-in `random` module
random.seed(SEED)

# PyTorch; kept as the cell's last expression so the returned Generator is still displayed
torch.manual_seed(SEED)

<torch._C.Generator at 0x182c47e4250>

# Data Paths

In [3]:
# Test-split JSON files keyed as "<Domain> Test <Split>" (Default entries first, then Video)
datasets_dict = {
    f'{domain} Test {split}': f'data/{domain.lower()}/test_{split.lower()}.json'
    for domain in ('Default', 'Video')
    for split in ('Clean', 'Other')
}

# Load essentials

In [4]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda'

In [5]:
# Identifier of the base architecture
t5_type = 't5-small'

# Quantization setting for the confidence vectors [only required for rec_ace]
bin_size = 10

# Root directory under which evaluation results are written
results_dir = './results'

### Tokenizer

In [6]:
# Load the tokenizer from the configured base architecture (`t5_type`)
# instead of repeating the checkpoint name as a separate literal.
tokenizer = T5Tokenizer.from_pretrained(t5_type)

## Read Data

In [7]:
# Read the Default-domain test splits (Clean first, then Other)
test_set, test_set_other = (
    read_json(json_path=datasets_dict[split_name])
    for split_name in ('Default Test Clean', 'Default Test Other')
)

## Prepare as DataLoader

In [8]:
batch_size = 8

# Build one unshuffled loader per test split (Clean first, then Other)
test_loader, test_loader_other = (
    prepare_data_basic(data=split_data, tokenizer=tokenizer, batch_size=batch_size, shuffle=False)
    for split_data in (test_set, test_set_other)
)

- Converting the input sentences into tokens
- Converting the GT sentences into tokens
- Converting the input sentences into tokens
- Converting the GT sentences into tokens


# Evaluation

Define metrics for evaluation:
1. WER - Word Error Rate
1. EM - Exact Match
1. BS - BERT Score

In [9]:
eval_metrics = ['wer', 'em', 'bs']

In [10]:
def evaluate_model_performance(model, metrics=None, data_loaders=None):
    """Run `model` over each test loader and return the resulting metric tables.

    Args:
        model: Called as `model(input_ids=..., labels=..., scores_ids=...)`; the
            returned object must expose `.logits`. It is switched to eval mode.
        metrics: Metric names handed to `Evaluator`. When omitted, the
            notebook-level `eval_metrics` is used.
        data_loaders: Mapping from a data-type label to an iterable of batches
            holding 'sentences', 'scores' and 'labels' entries (each is moved
            to `DEVICE`). When omitted,
            `{'Clean': test_loader, 'Other': test_loader_other}` is used.

    Returns:
        dict: `{label: evaluator.metrics_df}` with one entry per data loader.
    """
    # Resolve the defaults at call time. Default arguments are evaluated once,
    # when the `def` runs, so binding the loaders in the signature would keep
    # using stale loaders after the data cells are re-executed.
    if metrics is None:
        metrics = eval_metrics
    if data_loaders is None:
        data_loaders = {'Clean': test_loader, 'Other': test_loader_other}

    evaluators = {}

    ### Evaluate TEST set
    model.eval()

    for data_type, data_loader in data_loaders.items():
        evaluator = Evaluator(metrics=metrics, set_types=['test'])

        # No need for gradients when evaluating
        with torch.no_grad():
            for batch in data_loader:

                X = batch['sentences'].to(DEVICE)
                S = batch['scores'].to(DEVICE)
                y = batch['labels'].to(DEVICE)

                # NOTE(review): the labels are passed to the model, so the logits are
                # presumably teacher-forced - confirm this is the intended protocol.
                test_preds = model(input_ids=X, labels=y, scores_ids=S)
                test_logits = test_preds.logits

                test_reference = detokenize_and_clean(tokenizer, y)
                test_predicted = detokenize_and_clean(tokenizer, test_logits.argmax(dim=-1))

                evaluator.calculate_metrics(set_type='test', reference=test_reference, predicted=test_predicted)

        evaluator.end_epoch_routine(print_metrics=False)
        evaluators[data_type] = evaluator

    return {data_type: evaluator.metrics_df for data_type, evaluator in evaluators.items()}

In [11]:
def print_metrics_comp_table(evals_df_dict, title='Metrics Comparison'):
    """Print `title` followed by one table whose rows are labelled by the keys of `evals_df_dict`.

    Args:
        evals_df_dict: Mapping from a row label to a metrics frame.
        title: Heading printed above the table.
    """
    row_labels = list(evals_df_dict.keys())
    frames = list(evals_df_dict.values())

    # Stack the frames under their labels, then discard the frames' own (inner) index level
    # so that only the labels remain as the row index.
    stacked = pd.concat(frames, keys=row_labels)
    comparison = stacked.reset_index(level=1, drop=True)

    print(title)
    print(tabulate(comparison, headers='keys', tablefmt='psql', floatfmt='.4f'))
    print()

# Evaluating metrics for the ASR

In [12]:
asr_evaluators = {}

# Baseline: the ASR hypotheses (batch['sentences']) are scored directly against the labels,
# without running any correction model.
for data_type, asr_loader in {'Clean': test_loader, 'Other': test_loader_other}.items():

    print(f'Running ASR evaluation on {data_type} datasets')

    asr_evaluator = Evaluator(metrics=eval_metrics, set_types=['test'])

    for batch in asr_loader:
        reference = detokenize_and_clean(tokenizer, batch['labels'])
        predicted = detokenize_and_clean(tokenizer, batch['sentences'])
        asr_evaluator.calculate_metrics(set_type='test', reference=reference, predicted=predicted)

    # Close the single evaluation pass quietly, then show the final numbers
    asr_evaluator.end_epoch_routine(print_metrics=False)
    asr_evaluator.print_final_metrics()

    # Persist under <results_dir>/ASR/<data_type>
    dir_path = os.path.join(results_dir, 'ASR', data_type)
    os.makedirs(dir_path, exist_ok=True)
    asr_evaluator.store_df(dir_path)

    # Keep the evaluator for the comparison tables further down
    asr_evaluators[data_type] = asr_evaluator

    print()

Running ASR evaluation on Clean datasets
Test Metrics:
+----+-------+-------+-------+
|    |   wer |    em |    bs |
|----+-------+-------+-------|
|  1 | 0.124 | 0.288 | 0.914 |
+----+-------+-------+-------+


Running ASR evaluation on Other datasets
Test Metrics:
+----+-------+-------+-------+
|    |   wer |    em |    bs |
|----+-------+-------+-------|
|  1 | 0.272 | 0.136 | 0.812 |
+----+-------+-------+-------+




# Load Best Models, Evaluate on Test set

## Rec-ACE

In [13]:
# Results directory of the Rec-ACE run trained on the Clean dataset
model_res_dir = r'results/rec_ace_Clean/2023-08-23_01-46-03'
metric = 'wer'
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Use the notebook-level `t5_type` / `bin_size` settings rather than repeating their literal values
rec_ace_best_model = RecACEWrapModel.load_from_disk(
    os.path.join(model_res_dir, f'epoch_{epoch}.pt'),
    t5_type,
    'rec_ace',
    use_pretrained=True,
    bin_size=bin_size,
).to(DEVICE)

Best epoch by the wer is 21


In [14]:
rec_ace_results = evaluate_model_performance(rec_ace_best_model)

## T5

In [15]:
# Results directory of the plain T5 ('original') run trained on the Clean dataset
model_res_dir = r'results/original_f5_Clean/2023-08-27_01-50-22'
metric = 'wer'
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Use the notebook-level `t5_type` / `bin_size` settings rather than repeating their literal values
t5_best_model = RecACEWrapModel.load_from_disk(
    os.path.join(model_res_dir, f'epoch_{epoch}.pt'),
    t5_type,
    'original',
    use_pretrained=True,
    bin_size=bin_size,
).to(DEVICE)

Best epoch by the wer is 34


In [16]:
t5_results = evaluate_model_performance(t5_best_model)



## Rec-ACE (trained on Other dataset)

In [17]:
# Results directory of the Rec-ACE run trained on the Other dataset
model_res_dir = r'results/rec_ace_Other/2023-08-24_01-05-43'
metric = 'wer'
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Use the notebook-level `t5_type` / `bin_size` settings rather than repeating their literal values
rec_ace_other_best_model = RecACEWrapModel.load_from_disk(
    os.path.join(model_res_dir, f'epoch_{epoch}.pt'),
    t5_type,
    'rec_ace',
    use_pretrained=True,
    bin_size=bin_size,
).to(DEVICE)

Best epoch by the wer is 49


In [18]:
rec_ace_other_results = evaluate_model_performance(rec_ace_other_best_model)

## T5 (trained on Other dataset)

In [19]:
# Results directory of the plain T5 ('original') run trained on the Other dataset
model_res_dir = r'results/original_f5_Other/2023-08-24_17-27-55'
metric = 'wer'
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Use the notebook-level `t5_type` / `bin_size` settings rather than repeating their literal values
t5_other_best_model = RecACEWrapModel.load_from_disk(
    os.path.join(model_res_dir, f'epoch_{epoch}.pt'),
    t5_type,
    'original',
    use_pretrained=True,
    bin_size=bin_size,
).to(DEVICE)

Best epoch by the wer is 48


In [20]:
t5_other_results = evaluate_model_performance(t5_other_best_model)

## Model T5 trained on prompt-engineered data ("Clean" dataset)

In [21]:
# Prompt-engineered variants of the two test loaders (Clean first, then Other)
p_test_loader, p_test_loader_other = (
    prepare_data_for_prompt_engineering(data=split_data, tokenizer=tokenizer, batch_size=batch_size, shuffle=False)
    for split_data in (test_set, test_set_other)
)

- Converting the input sentences into tokens
- Converting the GT sentences into tokens
- Converting the input sentences into tokens
- Converting the GT sentences into tokens


In [22]:
# Results directory of the T5 ('original') run trained on prompt-engineered Clean data
model_res_dir = r'results/original_t5_prompt_Clean/2023-08-26_10-58-53'
metric = 'wer'
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Use the notebook-level `t5_type` / `bin_size` settings rather than repeating their literal values
t5_prompt_best_model = RecACEWrapModel.load_from_disk(
    os.path.join(model_res_dir, f'epoch_{epoch}.pt'),
    t5_type,
    'original',
    use_pretrained=True,
    bin_size=bin_size,
).to(DEVICE)

Best epoch by the wer is 19


In [23]:
t5_prompt_results = evaluate_model_performance(t5_prompt_best_model, data_loaders={'Clean': p_test_loader, 'Other': p_test_loader_other})

# Compare Results

In [24]:
# One table per test set: the ASR baseline row first, then every evaluated model in a fixed order
for set_type in ('Clean', 'Other'):
    print_metrics_comp_table(
        {
            'ASR': asr_evaluators[set_type].metrics_df['test'],
            **{
                row_label: model_results[set_type]['test']
                for row_label, model_results in (
                    ('RecAce(trained on Clean)', rec_ace_results),
                    ('T5(trained on Clean)', t5_results),
                    ('RecAce(trained on Other)', rec_ace_other_results),
                    ('T5(trained on Other)', t5_other_results),
                    ('T5P(trained on Clean)', t5_prompt_results),
                )
            },
        },
        title=f'{set_type} Test Set Results',
    )

Clean Test Set Results
+--------------------------+--------+--------+--------+
|                          |    wer |     em |     bs |
|--------------------------+--------+--------+--------|
| ASR                      | 0.1237 | 0.2880 | 0.9137 |
| RecAce(trained on Clean) | 0.1052 | 0.3545 | 0.9179 |
| T5(trained on Clean)     | 0.1057 | 0.3471 | 0.9189 |
| RecAce(trained on Other) | 0.0904 | 0.3778 | 0.9253 |
| T5(trained on Other)     | 0.0937 | 0.3591 | 0.9227 |
| T5P(trained on Clean)    | 0.1106 | 0.3361 | 0.9148 |
+--------------------------+--------+--------+--------+

Other Test Set Results
+--------------------------+--------+--------+--------+
|                          |    wer |     em |     bs |
|--------------------------+--------+--------+--------|
| ASR                      | 0.2722 | 0.1356 | 0.8123 |
| RecAce(trained on Clean) | 0.2512 | 0.1574 | 0.8095 |
| T5(trained on Clean)     | 0.2526 | 0.1506 | 0.8087 |
| RecAce(trained on Other) | 0.2256 | 0.1720 | 0.8239 |
|

## Examples:

In [51]:
how_much_to_print = 20
min_wer_diff = 0.4 # ASR WER - REC WER
rec_ace_max_wer = 0.4

In [26]:
# One sentence per batch, so every printed example maps to exactly one batch
exa_batch_size = 1
exa_loader, exa_loader_other = (
    prepare_data_basic(data=split_data, tokenizer=tokenizer, batch_size=exa_batch_size, shuffle=False)
    for split_data in (test_set, test_set_other)
)

- Converting the input sentences into tokens
- Converting the GT sentences into tokens
- Converting the input sentences into tokens
- Converting the GT sentences into tokens


In [27]:
examples_model = rec_ace_other_best_model

In [52]:
# Number of examples printed so far
ii = 0

# Inference only: set eval mode explicitly (so this cell does not rely on an earlier cell
# having done it) and disable autograd so no gradient graph is built per example.
examples_model.eval()
with torch.no_grad():
    for batch in exa_loader_other:
        X = batch['sentences'].to(DEVICE)
        S = batch['scores'].to(DEVICE)
        y = batch['labels'].to(DEVICE)

        test_preds = examples_model(input_ids=X, labels=y, scores_ids=S)
        test_logits = test_preds.logits

        ex_hypothesis = detokenize_and_clean(tokenizer, X)
        ex_reference = detokenize_and_clean(tokenizer, y)
        ex_predicted = detokenize_and_clean(tokenizer, test_logits.argmax(dim=-1))

        rec_ace_wer = calculate_wer(ex_reference, ex_predicted)
        asr_wer = calculate_wer(ex_reference, ex_hypothesis)

        # Keep only examples where the model improves on the ASR by more than `min_wer_diff`
        # and its own WER stays below `rec_ace_max_wer`.
        if asr_wer - rec_ace_wer > min_wer_diff and rec_ace_wer < rec_ace_max_wer:
            ii+=1
            print(f'Example #{ii}:')
            print(f'- Reference:\n\t"{ex_reference[0]}"')
            print(f'- ASR hypothesis:\n\t"{ex_hypothesis[0]}" (WER={asr_wer:.4f})')
            # NOTE(review): the label says "RED-ACE" while the rest of the notebook says "Rec-ACE" - confirm which is intended
            print(f'- RED-ACE:\n\t"{ex_predicted[0]}" (WER={rec_ace_wer:.4f})')
            print('-' * 20)

        if ii == how_much_to_print:
            break

Example #1:
- Reference:
	"the feverish colour came into her cheek and the feverish flame into her eye"
- ASR hypothesis:
	"favorite color came into a cheeks and the feverish flame into a r i" (WER=0.5714)
- RED-ACE:
	"the feverish colour came into her cheeks the feverish flame into her eye" (WER=0.1429)
--------------------
Example #2:
- Reference:
	"exactly here replied the brahman"
- ASR hypothesis:
	"exactly hair reply to carmen" (WER=0.8000)
- RED-ACE:
	"exactly here replied the carzenman" (WER=0.2000)
--------------------
Example #3:
- Reference:
	"from about two thousand b c"
- ASR hypothesis:
	"i'm about 2,000 bc" (WER=0.8333)
- RED-ACE:
	"and about two thousand b c" (WER=0.1667)
--------------------
Example #4:
- Reference:
	"fourteen ninety nine"
- ASR hypothesis:
	"1499" (WER=1.0000)
- RED-ACE:
	"fourteen ninety nine" (WER=0.0000)
--------------------
Example #5:
- Reference:
	"confectionary fifteen o eight"
- ASR hypothesis:
	"confectionery 1508" (WER=1.0000)
- RED-ACE:
	"c

# Pack models and results in a zip file

In [34]:
import os
import zipfile

def write_to_zip(zipf, folder_path, _file):
    """Add the file `_file` located in `folder_path` to the open archive `zipf`.

    The archive member is stored under the same joined path that is read from disk.
    """
    archived_path = os.path.join(folder_path, _file)
    zipf.write(archived_path, arcname=archived_path)

def zip_csv_files(directories, zip_filename, metric='wer'):
    """Pack every metrics CSV and the best checkpoint of each directory into one zip file.

    Args:
        directories: Directories to walk recursively.
        zip_filename: Path of the zip archive to create (overwritten if it exists).
        metric: Metric name passed to `Evaluator.get_best_epoch` to choose the
            checkpoint that is packed for each directory.

    For each directory, all `*.csv` files are added, plus the single checkpoint
    named `epoch_<best>.pt` - the same naming the notebook uses when loading
    checkpoints. Other files are skipped.
    """
    with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for directory in directories:
            # Resolved lazily on the first checkpoint file, so directories that hold
            # no checkpoints never query the Evaluator.
            best_checkpoint = None
            for folder_path, _, files in os.walk(directory):
                for _file in files:
                    # Write metrics files to zip
                    if _file.endswith('.csv'):
                        write_to_zip(zipf, folder_path, _file)
                        print(f'Wrote {_file} to zip @ {folder_path}')
                        continue

                    # Only checkpoint files are candidates from here on
                    if not _file.endswith('.pt'):
                        continue

                    # Get best model epoch (once per directory, even if that epoch is 0)
                    if best_checkpoint is None:
                        best_epoch = Evaluator.get_best_epoch(directory, metric)
                        best_checkpoint = f'epoch_{best_epoch}.pt'

                    # Exact-name match: a suffix test would also accept e.g. epoch_11.pt
                    # and epoch_21.pt when the best epoch is 1.
                    if _file == best_checkpoint:
                        write_to_zip(zipf, folder_path, _file)
                        print(f'Wrote {_file} to zip @ {folder_path}')

# Result directories to pack: the ASR baseline plus one directory per trained model
directories_to_zip = [
    r'results/ASR',
    r'results/rec_ace_Clean/2023-08-23_01-46-03',
    r'results/original_f5_Clean/2023-08-27_01-50-22',
    r'results/rec_ace_Other/2023-08-24_01-05-43',
    r'results/original_f5_Other/2023-08-24_17-27-55',
    r'results/original_t5_prompt_Clean/2023-08-26_10-58-53',
]

# Name of the archive that is created
output_zip_filename = 'res_and_models.zip'
zip_csv_files(directories=directories_to_zip, zip_filename=output_zip_filename)

Wrote dev_metrics.csv to zip @ results/ASR\Clean
Wrote test_metrics.csv to zip @ results/ASR\Clean
Wrote train_metrics.csv to zip @ results/ASR\Clean
Wrote dev_metrics.csv to zip @ results/ASR\Other
Wrote test_metrics.csv to zip @ results/ASR\Other
Wrote train_metrics.csv to zip @ results/ASR\Other
Wrote dev_metrics.csv to zip @ results/rec_ace_Clean/2023-08-23_01-46-03
Wrote epoch_21.pt to zip @ results/rec_ace_Clean/2023-08-23_01-46-03
Wrote train_metrics.csv to zip @ results/rec_ace_Clean/2023-08-23_01-46-03
Wrote dev_metrics.csv to zip @ results/original_f5_Clean/2023-08-27_01-50-22
Wrote epoch_34.pt to zip @ results/original_f5_Clean/2023-08-27_01-50-22
Wrote train_metrics.csv to zip @ results/original_f5_Clean/2023-08-27_01-50-22
Wrote dev_metrics.csv to zip @ results/rec_ace_Other/2023-08-24_01-05-43
Wrote epoch_49.pt to zip @ results/rec_ace_Other/2023-08-24_01-05-43
Wrote train_metrics.csv to zip @ results/rec_ace_Other/2023-08-24_01-05-43
Wrote dev_metrics.csv to zip @ result