# Imports

In [1]:
import os
import random

import torch
import numpy as np
import pandas as pd
from tabulate import tabulate
from transformers import T5Tokenizer

from evaluation import Evaluator
from data_utils.dataset import prepare_data_basic
from data_utils.utils import read_json
from models.rec_ace import RecACEWrapModel, detokenize_and_clean

## Seeds

In [2]:
SEED = 42

# Set the random seed for Python
random.seed(SEED)

# Set the random seed for numpy
np.random.seed(SEED)

torch.manual_seed(SEED)

<torch._C.Generator at 0x132d4f130>

# Data Paths

In [3]:
datasets_dict = {
    # Default
    'Default Test Clean': 'data/default/test_clean.json',
    'Default Test Other': 'data/default/test_other.json',
    # Video
    'Video Test Clean': 'data/video/test_clean.json',
    'Video Test Other': 'data/video/test_other.json', 
}

# Load essentials

In [4]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cpu'

In [5]:
# Base architecture
t5_type = 't5-small'

# How to quantize the confidence vectors [only required for rec_ac]
bin_size=10

results_dir = './results'

### Tokenizer

In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


## Read Data

In [7]:
test_set = read_json(json_path=datasets_dict['Default Test Clean'])
test_set_other = read_json(json_path=datasets_dict['Default Test Other'])

## Prepare as DataLoader

In [18]:
batch_size = 8
test_loader = prepare_data_basic(data=test_set , tokenizer=tokenizer, batch_size=batch_size, shuffle=False)
test_loader_other = prepare_data_basic(data=test_set_other , tokenizer=tokenizer, batch_size=batch_size, shuffle=False)

- Converting the input sentences into tokens
- Converting the GT sentences into tokens
- Converting the input sentences into tokens
- Converting the GT sentences into tokens


# Evaluation

Define metrics for evaluation:
1. WER - Word Error Rate
1. EM - Exact Match
1. BLEU - Bilingual Evaluation Understudy
1. GLEU
1. BERT - Bert Score

In [9]:
eval_metrics = ['wer', 'em']

In [10]:
def evaluate_model_performance(model, metrics=eval_metrics, data_loaders={'Clean': test_loader, 'Other': test_loader_other}):

    evaluators = {}
    data_types = list(data_loaders.keys())

    ### Evaluate TEST set
    model.eval()

    for data_type in data_types:
        evaluator = Evaluator(metrics=metrics, set_types=['test'])

        # No need for gradients when evaluating
        with torch.no_grad():
            for batch in data_loaders[data_type]:

                X = batch['sentences'].to(DEVICE)
                S = batch['scores'].to(DEVICE)
                y = batch['labels'].to(DEVICE)

                test_preds = model(input_ids=X, labels=y, scores_ids=S)
                test_logits = test_preds.logits

                test_reference = detokenize_and_clean(tokenizer, y)
                test_predicted = detokenize_and_clean(tokenizer, test_logits.argmax(dim=-1))
                
                evaluator.calculate_metrics(set_type='test', reference=test_reference, predicted=test_predicted)

        evaluator.end_epoch_routine(print_metrics=False)
        evaluators[data_type] = evaluator

    return {data_type: evaluator.metrics_df for data_type, evaluator in evaluators.items()}

In [11]:
def print_metrics_comp_table(evals_df_dict, title='Metrics Comparison'):
    # merge all dfs from evals_df_dict, use the keys as an index. Each df has only one line so it's ok
    # don't use the old index
    evals_df = pd.concat(evals_df_dict.values(), keys=evals_df_dict.keys()).reset_index(level=1, drop=True)

    print(title)
    print(tabulate(evals_df, headers='keys', tablefmt='psql', floatfmt='.4f'))
    print()

# Evaluating metrics for the ASR

In [23]:
asr_evaluators = {}

for data_type in ['Clean', 'Other']:

    # Print data type header
    print(f'Running ASR evaluation on {data_type} datasets')

    asr_evaluator = Evaluator(metrics=eval_metrics, set_types=['test'])

    for batch in (test_loader if data_type == 'Clean' else test_loader_other):
        reference = detokenize_and_clean(tokenizer, batch['sentences'])
        predicted = detokenize_and_clean(tokenizer, batch['labels'])
        asr_evaluator.calculate_metrics(set_type='test', reference=reference, predicted=predicted)

    asr_evaluator.end_epoch_routine(print_metrics=False)

    # Print final metrics
    asr_evaluator.print_final_metrics()

    # Save results to disk
    dir_path = os.path.join(results_dir, 'ASR', data_type)
    os.makedirs(dir_path, exist_ok=True)
    asr_evaluator.store_df(dir_path)

    # Save evaluator for later use
    asr_evaluators[data_type] = asr_evaluator

    print()

Running ASR evaluation on Clean datasets
Test Metrics:
+----+-------+-------+
|    |   wer |    em |
|----+-------+-------|
|  1 | 0.129 | 0.288 |
+----+-------+-------+


Running ASR evaluation on Other datasets
Test Metrics:
+----+-------+-------+
|    |   wer |    em |
|----+-------+-------|
|  1 | 0.315 | 0.135 |
+----+-------+-------+




# Load Best Models, Evaluate on Test set

## Rec-ACE

In [24]:
model_res_dir = r'results\rec_ace_Clean\2023-08-23_01-46-03'
metric = 'wer'
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

rec_ace_best_model = RecACEWrapModel.load_from_disk(os.path.join(model_res_dir, f'epoch_{epoch}.pt'), 't5-small', 'rec_ace', use_pretrained=True, bin_size=10).to(DEVICE)

AssertionError: File 'results\rec_ace_Clean\2023-08-23_01-46-03/dev_metrics.csv' does not exist.

In [None]:
rec_ace_results = evaluate_model_performance(rec_ace_best_model)

## T5

In [25]:
model_res_dir = r'results\original_f5_Clean\2023-08-23_13-31-43'
metric = 'wer'
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

t5_best_model = RecACEWrapModel.load_from_disk(os.path.join(model_res_dir, f'epoch_{epoch}.pt'), 't5-small', 'original', use_pretrained=True, bin_size=10).to(DEVICE)

AssertionError: File 'results\original_f5_Clean\2023-08-23_13-31-43/dev_metrics.csv' does not exist.

In [16]:
t5_results = evaluate_model_performance(t5_best_model)

## Rec-ACE (trained on Other dataset)

In [12]:
model_res_dir = r'results/rec_ace_Other/2023-08-24_01-05-43'
metric = 'wer'
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

rec_ace_other_best_model = RecACEWrapModel.load_from_disk(os.path.join(model_res_dir, f'epoch_{epoch}.pt'), 't5-small', 'rec_ace', use_pretrained=True, bin_size=10).to(DEVICE)

Best epoch by the wer is 49


In [30]:
rec_ace_other_results = evaluate_model_performance(rec_ace_other_best_model)

NameError: name 'rec_ace_other_best_model' is not defined

## T5 (trained on Other dataset)

In [26]:
model_res_dir = r'results\original_f5_Other\2023-08-24_17-27-55'
metric = 'wer'
epoch = Evaluator.get_best_epoch(model_res_dir, metric)
print(f'Best epoch by the {metric} is {epoch}')

t5_other_best_model = RecACEWrapModel.load_from_disk(os.path.join(model_res_dir, f'epoch_{epoch}.pt'), 't5-small', 'original', use_pretrained=True, bin_size=10).to(DEVICE)

AssertionError: File 'results\original_f5_Other\2023-08-24_17-27-55/dev_metrics.csv' does not exist.

In [27]:
t5_other_results = evaluate_model_performance(t5_other_best_model)

NameError: name 't5_other_best_model' is not defined

# Compare Results

In [21]:
for set_type in ['Clean', 'Other']:
    print_metrics_comp_table({
        'ASR':                      asr_evaluators[set_type].metrics_df['test'],
        'RecAce(trained on Clean)': rec_ace_results[set_type]['test'],
        'T5(trained on Clean)':     t5_results[set_type]['test'],
        'RecAce(trained on Other)': rec_ace_other_results[set_type]['test'],
        'T5(trained on Other)':     t5_other_results[set_type]['test'],
    }, title=f'{set_type} Test Set Results')

Clean Test Set Results
+--------------------------+--------+--------+
|                          |    wer |     em |
|--------------------------+--------+--------|
| ASR                      | 0.1289 | 0.2875 |
| RecAce(trained on Clean) | 0.1054 | 0.3538 |
| T5(trained on Clean)     | 0.1396 | 0.3117 |
| RecAce(trained on Other) | 0.0907 | 0.3773 |
| T5(trained on Other)     | 0.0939 | 0.3586 |
+--------------------------+--------+--------+

Other Test Set Results
+--------------------------+--------+--------+
|                          |    wer |     em |
|--------------------------+--------+--------|
| ASR                      | 0.3150 | 0.1350 |
| RecAce(trained on Clean) | 0.2515 | 0.1567 |
| T5(trained on Clean)     | 0.2846 | 0.1382 |
| RecAce(trained on Other) | 0.2257 | 0.1713 |
| T5(trained on Other)     | 0.2304 | 0.1640 |
+--------------------------+--------+--------+



## Examples:

In [29]:
# How many examples to print (must be < batch_size or else take batch size)
how_much_to_print = 10
N_examples = how_much_to_print if how_much_to_print < batch_size else batch_size
print(f'Printing {N_examples} examples')

Printing 8 examples


In [16]:
examples_model = rec_ace_other_best_model

In [14]:
examples_loader = test_loader_other
batch = next(iter(examples_loader))

In [15]:
X = batch['sentences']
S = batch['scores']
y = batch['labels']

In [19]:
test_preds = examples_model(input_ids=X, labels=y, scores_ids=S)
test_logits = test_preds.logits

ex_hypothesis = detokenize_and_clean(tokenizer, X)
ex_reference = detokenize_and_clean(tokenizer, y)
ex_predicted = detokenize_and_clean(tokenizer, test_logits.argmax(dim=-1))

In [35]:
for ii in range(len(ex_hypothesis)):
    print(f'Example #{ii}:')
    print(f'- Reference:\n\t"{ex_reference[ii]}"')
    print(f'- ASR hypothesis:\n\t"{ex_hypothesis[ii]}"')
    print(f'- RED-ACE:\n\t"{ex_predicted[ii]}"')
    print('-' * 20)

Example #0:
- Reference:
	"there's iron they say in all our blood and a grain or two perhaps is good but his he makes me harshly feel has got a little too much of steel anon"
- ASR hypothesis:
	"design they say you nola blood and agreeing to paps is good but he is he makes me harshly feel has got a little too much of steel"
- RED-ACE:
	"the's  or say   blood blood and agreea  to two  it good but  he makes me harshly feel has got a little too much of steel"
--------------------
Example #1:
- Reference:
	"margaret said mister hale as he returned from showing his guest downstairs i could not help watching your face with some anxiety when mister thornton made his confession of having been a shop boy"
- ASR hypothesis:
	"margaret said mr. hale as he returned from showing his guests downstairs but mr. thornton made his confession of having been a truck boy"
- RED-ACE:
	"margaret said mister hale as he returned from showing his guests downstairs buta suppose not believe but the  but the  but 