# Imports

In [27]:
import os
import random

import torch
import numpy as np
import pandas as pd
from tabulate import tabulate
from transformers import T5Tokenizer

from evaluation import Evaluator
from data_utils.dataset import prepare_data_basic, prepare_data_for_prompt_engineering
from data_utils.utils import read_json
from models.rec_ace import RecACEWrapModel, detokenize_and_clean

## Seeds

In [2]:
SEED = 42

# Set the random seed for Python
random.seed(SEED)

# Set the random seed for numpy
np.random.seed(SEED)

# Set the random seed for PyTorch. As the last expression of the cell, the
# value returned by torch.manual_seed is what the notebook echoes as output.
torch.manual_seed(SEED)

<torch._C.Generator at 0x132d4f130>

# Data Paths

In [3]:
# Human-readable dataset name -> path of the matching test-split JSON file.
# Two domains ('default', 'video') times two splits ('clean', 'other').
datasets_dict = {
    f'{domain.title()} Test {split.title()}': f'data/{domain}/test_{split}.json'
    for domain in ('default', 'video')
    for split in ('clean', 'other')
}

# Load essentials

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Echo the selected device as the cell output.
DEVICE

'cpu'

In [5]:
# Base architecture (name of the T5 variant)
t5_type = 't5-small'

# How to quantize the confidence vectors [only required for rec_ace]
bin_size = 10

# Root directory for the results written and read by this notebook
results_dir = './results'

### Tokenizer

In [6]:
# Load the tokenizer for the base architecture chosen in `t5_type` rather than
# a second hard-coded copy of the name, so the two cannot drift apart.
tokenizer = T5Tokenizer.from_pretrained(t5_type)

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


## Read Data

In [7]:
# Read the two "Default" test splits (Clean first, then Other) from their JSON files.
test_set, test_set_other = (
    read_json(json_path=datasets_dict[split_name])
    for split_name in ('Default Test Clean', 'Default Test Other')
)

## Prepare as DataLoader

In [18]:
batch_size = 8

# Loader over the Clean test split; shuffling is disabled.
test_loader = prepare_data_basic(
    data=test_set,
    tokenizer=tokenizer,
    batch_size=batch_size,
    shuffle=False,
)

# Loader over the Other test split, built with the same settings.
test_loader_other = prepare_data_basic(
    data=test_set_other,
    tokenizer=tokenizer,
    batch_size=batch_size,
    shuffle=False,
)

- Converting the input sentences into tokens
- Converting the GT sentences into tokens
- Converting the input sentences into tokens
- Converting the GT sentences into tokens


# Evaluation

Define metrics for evaluation:
1. WER - Word Error Rate
2. EM - Exact Match

In [9]:
# Metric identifiers passed to every Evaluator in this notebook:
# 'wer' (Word Error Rate) and 'em' (Exact Match).
eval_metrics = ['wer', 'em']

In [10]:
def evaluate_model_performance(model, metrics=None, data_loaders=None):
    """Evaluate `model` on one or more test loaders and collect the metrics.

    Args:
        model: Model to evaluate. It is switched to eval mode and called as
            `model(input_ids=..., labels=..., scores_ids=...)`; the returned
            object must expose a `.logits` attribute.
        metrics: Metric identifiers handed to each `Evaluator`. When None,
            the notebook-level `eval_metrics` list is used.
        data_loaders: Mapping of data-type name -> iterable of batches. Each
            batch is indexed with the keys 'sentences', 'scores' and 'labels'.
            When None, the notebook-level `test_loader` ('Clean') and
            `test_loader_other` ('Other') are used.

    Returns:
        dict mapping each data-type name to that evaluator's `metrics_df`.

    Notes:
        The defaults are resolved at call time rather than at definition time,
        so re-creating the loaders after this cell has run is picked up, and
        the function can be defined before those globals exist.

        NOTE(review): predictions are the argmax of the logits from a forward
        pass that also receives the labels; this presumably means the decoder
        is teacher-forced rather than generating freely -- confirm that this
        is the intended evaluation protocol.
    """
    if metrics is None:
        metrics = eval_metrics
    if data_loaders is None:
        data_loaders = {'Clean': test_loader, 'Other': test_loader_other}

    ### Evaluate TEST set
    model.eval()

    evaluators = {}
    for data_type, loader in data_loaders.items():
        evaluator = Evaluator(metrics=metrics, set_types=['test'])

        # No need for gradients when evaluating
        with torch.no_grad():
            for batch in loader:
                X = batch['sentences'].to(DEVICE)
                S = batch['scores'].to(DEVICE)
                y = batch['labels'].to(DEVICE)

                test_preds = model(input_ids=X, labels=y, scores_ids=S)
                test_logits = test_preds.logits

                # Turn label ids and the most likely predicted ids back into text
                test_reference = detokenize_and_clean(tokenizer, y)
                test_predicted = detokenize_and_clean(tokenizer, test_logits.argmax(dim=-1))

                evaluator.calculate_metrics(set_type='test', reference=test_reference, predicted=test_predicted)

        evaluator.end_epoch_routine(print_metrics=False)
        evaluators[data_type] = evaluator

    return {data_type: evaluator.metrics_df for data_type, evaluator in evaluators.items()}

In [11]:
def print_metrics_comp_table(evals_df_dict, title='Metrics Comparison'):
    """Print one comparison table assembled from several metric frames.

    Args:
        evals_df_dict: Mapping of row label -> pandas object holding that
            entry's metrics (each is expected to hold a single line).
        title: Heading printed above the table.
    """
    row_labels = evals_df_dict.keys()
    metric_frames = evals_df_dict.values()

    # Stack the frames under their dict keys, then drop the frames' own index
    # (the inner level) so that only the keys remain as row labels.
    stacked = pd.concat(metric_frames, keys=row_labels)
    evals_df = stacked.reset_index(level=1, drop=True)

    rendered_table = tabulate(evals_df, headers='keys', tablefmt='psql', floatfmt='.4f')

    print(title)
    print(rendered_table)
    print()

# Evaluating metrics for the ASR

In [22]:
# Score the loaders' input sentences (used here as the ASR prediction) against
# the labels, once per test split, and keep each evaluator for later use.
asr_evaluators = {}

for data_type, asr_loader in (('Clean', test_loader), ('Other', test_loader_other)):

    # Print data type header
    print(f'Running ASR evaluation on {data_type} datasets')

    asr_evaluator = Evaluator(metrics=eval_metrics, set_types=['test'])

    for batch in asr_loader:
        asr_evaluator.calculate_metrics(
            set_type='test',
            reference=detokenize_and_clean(tokenizer, batch['labels']),
            predicted=detokenize_and_clean(tokenizer, batch['sentences']),
        )

    asr_evaluator.end_epoch_routine(print_metrics=False)

    # Print final metrics
    asr_evaluator.print_final_metrics()

    # Save results to disk
    dir_path = os.path.join(results_dir, 'ASR', data_type)
    os.makedirs(dir_path, exist_ok=True)
    asr_evaluator.store_df(dir_path)

    # Save evaluator for later use
    asr_evaluators[data_type] = asr_evaluator

    print()

Running ASR evaluation on Clean datasets
Test Metrics:
+----+-------+-------+
|    |   wer |    em |
|----+-------+-------|
|  1 | 0.124 | 0.288 |
+----+-------+-------+


Running ASR evaluation on Other datasets
Test Metrics:
+----+-------+-------+
|    |   wer |    em |
|----+-------+-------|
|  1 | 0.273 | 0.135 |
+----+-------+-------+




# Load Best Models, Evaluate on Test set

## Rec-ACE

In [24]:
model_res_dir = r'results/rec_ace_Clean/2023-08-23_01-46-03'
metric = 'wer'

# Epoch that Evaluator.get_best_epoch reports as best for `metric` in this run directory
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Restore that epoch's checkpoint. The base architecture and bin size come from
# the notebook configuration (`t5_type`, `bin_size`) instead of repeated literals.
checkpoint_path = os.path.join(model_res_dir, f'epoch_{epoch}.pt')
rec_ace_best_model = RecACEWrapModel.load_from_disk(
    checkpoint_path, t5_type, 'rec_ace', use_pretrained=True, bin_size=bin_size
).to(DEVICE)

AssertionError: File 'results\rec_ace_Clean\2023-08-23_01-46-03/dev_metrics.csv' does not exist.

In [None]:
# Evaluate the Rec-ACE model loaded above with the function's default metrics and test loaders.
rec_ace_results = evaluate_model_performance(rec_ace_best_model)

## T5

In [50]:
model_res_dir = r'results/original_f5_Clean/2023-08-27_01-50-22'
metric = 'wer'

# Epoch that Evaluator.get_best_epoch reports as best for `metric` in this run directory
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Restore that epoch's checkpoint. The base architecture and bin size come from
# the notebook configuration (`t5_type`, `bin_size`) instead of repeated literals.
checkpoint_path = os.path.join(model_res_dir, f'epoch_{epoch}.pt')
t5_best_model = RecACEWrapModel.load_from_disk(
    checkpoint_path, t5_type, 'original', use_pretrained=True, bin_size=bin_size
).to(DEVICE)

Best epoch by the wer is 34


In [51]:
# Evaluate the T5 model loaded above with the function's default metrics and test loaders.
t5_results = evaluate_model_performance(t5_best_model)

## Rec-ACE (trained on Other dataset)

In [12]:
model_res_dir = r'results/rec_ace_Other/2023-08-24_01-05-43'
metric = 'wer'

# Epoch that Evaluator.get_best_epoch reports as best for `metric` in this run directory
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Restore that epoch's checkpoint. The base architecture and bin size come from
# the notebook configuration (`t5_type`, `bin_size`) instead of repeated literals.
checkpoint_path = os.path.join(model_res_dir, f'epoch_{epoch}.pt')
rec_ace_other_best_model = RecACEWrapModel.load_from_disk(
    checkpoint_path, t5_type, 'rec_ace', use_pretrained=True, bin_size=bin_size
).to(DEVICE)

Best epoch by the wer is 49


In [30]:
# Evaluate the Rec-ACE model trained on the Other dataset with the function's default metrics and test loaders.
rec_ace_other_results = evaluate_model_performance(rec_ace_other_best_model)

NameError: name 'rec_ace_other_best_model' is not defined

## T5 (trained on Other dataset)

In [46]:
model_res_dir = r'results/original_f5_Other/2023-08-24_17-27-55'
metric = 'wer'

# Epoch that Evaluator.get_best_epoch reports as best for `metric` in this run directory
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Restore that epoch's checkpoint. The base architecture and bin size come from
# the notebook configuration (`t5_type`, `bin_size`) instead of repeated literals.
checkpoint_path = os.path.join(model_res_dir, f'epoch_{epoch}.pt')
t5_other_best_model = RecACEWrapModel.load_from_disk(
    checkpoint_path, t5_type, 'original', use_pretrained=True, bin_size=bin_size
).to(DEVICE)

AssertionError: File 'results\original_f5_Other\2023-08-24_17-27-55/dev_metrics.csv' does not exist.

In [34]:
# Evaluate the T5 model trained on the Other dataset with the function's default metrics and test loaders.
t5_other_results = evaluate_model_performance(t5_other_best_model)

## Model T5 trained on prompt-engineered data ("Clean" dataset)

In [28]:
# Prompt-engineered loader over the Clean test split; shuffling is disabled.
p_test_loader = prepare_data_for_prompt_engineering(
    data=test_set,
    tokenizer=tokenizer,
    batch_size=batch_size,
    shuffle=False,
)

# Prompt-engineered loader over the Other test split, built with the same settings.
p_test_loader_other = prepare_data_for_prompt_engineering(
    data=test_set_other,
    tokenizer=tokenizer,
    batch_size=batch_size,
    shuffle=False,
)

- Converting the input sentences into tokens
- Converting the GT sentences into tokens
- Converting the input sentences into tokens
- Converting the GT sentences into tokens


In [42]:
model_res_dir = r'results/original_t5_prompt_Clean/2023-08-26_10-58-53'
metric = 'wer'

# Epoch that Evaluator.get_best_epoch reports as best for `metric` in this run directory
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

# Restore that epoch's checkpoint. The base architecture and bin size come from
# the notebook configuration (`t5_type`, `bin_size`) instead of repeated literals.
checkpoint_path = os.path.join(model_res_dir, f'epoch_{epoch}.pt')
t5_prompt_best_model = RecACEWrapModel.load_from_disk(
    checkpoint_path, t5_type, 'original', use_pretrained=True, bin_size=bin_size
).to(DEVICE)

Best epoch by the wer is 19


In [43]:
# Evaluate the prompt-trained T5 model on the prompt-engineered loaders instead of the default ones.
t5_prompt_results = evaluate_model_performance(t5_prompt_best_model, data_loaders={'Clean': p_test_loader, 'Other': p_test_loader_other})

# Compare Results

In [52]:
# One comparison table per test split: the ASR row first, then one row per evaluated model.
for set_type in ['Clean', 'Other']:
    comparison_rows = {
        'ASR':                      asr_evaluators[set_type].metrics_df['test'],
        'RecAce(trained on Clean)': rec_ace_results[set_type]['test'],
        'T5(trained on Clean)':     t5_results[set_type]['test'],
        'RecAce(trained on Other)': rec_ace_other_results[set_type]['test'],
        'T5(trained on Other)':     t5_other_results[set_type]['test'],
        'T5P(trained on Clean)':    t5_prompt_results[set_type]['test'],
    }
    table_title = f'{set_type} Test Set Results'
    print_metrics_comp_table(comparison_rows, title=table_title)

Clean Test Set Results
+--------------------------+--------+--------+
|                          |    wer |     em |
|--------------------------+--------+--------|
| ASR                      | 0.1239 | 0.2875 |
| RecAce(trained on Clean) | 0.1054 | 0.3538 |
| T5(trained on Clean)     | 0.1060 | 0.3465 |
| RecAce(trained on Other) | 0.0907 | 0.3773 |
| T5(trained on Other)     | 0.0939 | 0.3586 |
| T5P(trained on Clean)    | 0.1108 | 0.3356 |
+--------------------------+--------+--------+

Other Test Set Results
+--------------------------+--------+--------+
|                          |    wer |     em |
|--------------------------+--------+--------|
| ASR                      | 0.2726 | 0.1350 |
| RecAce(trained on Clean) | 0.2515 | 0.1567 |
| T5(trained on Clean)     | 0.2527 | 0.1501 |
| RecAce(trained on Other) | 0.2257 | 0.1713 |
| T5(trained on Other)     | 0.2304 | 0.1640 |
| T5P(trained on Clean)    | 0.2557 | 0.1525 |
+--------------------------+--------+--------+



## Examples:

In [None]:
# Number of printed examples after which the examples loop below stops.
how_much_to_print = 10

In [36]:
# Single-sentence batches, so every batch corresponds to exactly one example.
exa_batch_size = 1

exa_loader = prepare_data_basic(
    data=test_set,
    tokenizer=tokenizer,
    batch_size=exa_batch_size,
    shuffle=False,
)
exa_loader_other = prepare_data_basic(
    data=test_set_other,
    tokenizer=tokenizer,
    batch_size=exa_batch_size,
    shuffle=False,
)

- Converting the input sentences into tokens
- Converting the GT sentences into tokens
- Converting the input sentences into tokens
- Converting the GT sentences into tokens


In [16]:
# Model whose outputs are shown in the examples loop below.
examples_model = rec_ace_other_best_model

In [44]:
# Print examples where the model output matches the reference exactly while the
# ASR hypothesis does not, stopping after `how_much_to_print` examples.
# Relies on single-sentence batches (exa_batch_size = 1): only element [0] of
# each detokenized batch is compared and printed.
examples_model.eval()

ii = 0
# No need for gradients when only running inference
with torch.no_grad():
    for batch in exa_loader_other:
        # Keep the inputs on the same device as the model
        X = batch['sentences'].to(DEVICE)
        S = batch['scores'].to(DEVICE)
        y = batch['labels'].to(DEVICE)

        test_preds = examples_model(input_ids=X, labels=y, scores_ids=S)
        test_logits = test_preds.logits

        ex_hypothesis = detokenize_and_clean(tokenizer, X)
        ex_reference = detokenize_and_clean(tokenizer, y)
        ex_predicted = detokenize_and_clean(tokenizer, test_logits.argmax(dim=-1))

        # Direct string comparison replaces `calculate_exact_match`, which is
        # neither imported nor defined in this notebook.
        # NOTE(review): assumes that helper was a plain exact-match check -- confirm.
        model_is_exact = ex_predicted[0] == ex_reference[0]
        asr_is_exact = ex_hypothesis[0] == ex_reference[0]

        if model_is_exact and not asr_is_exact:
            ii += 1
            print(f'Example #{ii}:')
            print(f'- Reference:\n\t"{ex_reference[0]}"')
            print(f'- ASR hypothesis:\n\t"{ex_hypothesis[0]}"')
            print(f'- RED-ACE:\n\t"{ex_predicted[0]}"')
            print('-' * 20)

        # `>=` also terminates the loop when the limit is 0
        if ii >= how_much_to_print:
            break

Example #1:
- Reference:
	"i really liked that account of himself better than anything else he said"
- ASR hypothesis:
	"i really like that account of himself better than anything else he said"
- RED-ACE:
	"i really liked that account of himself better than anything else he said"
--------------------
Example #2:
- Reference:
	"no mamma that anne buckley would never have done"
- ASR hypothesis:
	"no mama that and buckley would never have done"
- RED-ACE:
	"no mamma that anne buckley would never have done"
--------------------
Example #3:
- Reference:
	"in making a post mortem examination the alimentary canal should be removed and preserved for further investigation"
- ASR hypothesis:
	"in making a post-mortem examination the alimentary canal should be removed and preserved for further investigation"
- RED-ACE:
	"in making a post mortem examination the alimentary canal should be removed and preserved for further investigation"
--------------------


KeyboardInterrupt: 