# Imports

In [1]:
import re
import os
import random
from datetime import datetime

from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
import numpy as np
import torch
from transformers import T5Tokenizer

from evaluation import Evaluator
from visualization import plot_graphs
from data_utils.dataset import prepare_data
from data_utils.utils import read_json
from models.rec_ace import RecACEWrapModel, detokenize_and_clean



## Seeds

In [2]:
SEED = 42

# Set the random seed for Python
random.seed(SEED)

# Set the random seed for numpy
np.random.seed(SEED)

torch.manual_seed(SEED)

<torch._C.Generator at 0x244372ccbf0>

# Data Paths

In [3]:
datasets_dict = {
    # Default
    'Default Test Clean': 'data/default/test_clean.json',
    'Default Test Other': 'data/default/test_other.json',
    # Video
    'Video Test Clean': 'data/video/test_clean.json',
    'Video Test Other': 'data/video/test_other.json', 
}

# Load essentials

In [4]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
DEVICE

'cuda'

In [6]:
debug = True

# Base architecture
t5_type = 't5-small'

# How to quantize the confidence vectors [only required for rec_ac]
bin_size=10

### Tokenizer

In [7]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

## Read Data

In [8]:
test_set = read_json(json_path=datasets_dict['Default Test Clean'])

## Prepare as DataLoader

In [9]:
batch_size = 64
test_loader = prepare_data(data=test_set , tokenizer=tokenizer, batch_size=batch_size, shuffle=False)

- Converting the input sentences into tokens
- Converting the GT sentences into tokens


# Evaluating metrics for the ASR

In [11]:
asr_evaluator = Evaluator(metrics=['wer', 'em', 'bleu', 'gleu'], set_types=types)

for batch in test_loader:
    reference = detokenize_and_clean(tokenizer, batch['sentences'])
    predicted = detokenize_and_clean(tokenizer, batch['labels'])
    asr_evaluator.calculate_metrics(set_type='test', reference=reference, predicted=predicted)

asr_evaluator.end_epoch_routine(print_metrics=False)

# Print final metrics
asr_evaluator.print_final_metrics()

# Save results to disk
dir_path = os.path.join('results', 'ASR')
os.makedirs(dir_path, exist_ok=True)
asr_evaluator.store_df(dir_path)

Test Metrics:
+----+-------+-------+--------+--------+
|    |   wer |    em |   bleu |   gleu |
|----+-------+-------+--------+--------|
|  1 | 0.129 | 0.288 |  0.760 |  0.793 |
+----+-------+-------+--------+--------+



# Load best Debug model

In [13]:
epoch = Evaluator.get_best_epoch(r'results\DebugRecAce\2023-08-22_21-15-34')
rec_ace_best_model = RecACEWrapModel.load_from_disk(fr'results\DebugRecAce\2023-08-22_21-15-34\epoch_{epoch}.pt', 't5-small', 'rec_ace', use_pretrained=True, bin_size=10).to(DEVICE)

In [15]:
### Evaluate TEST set
rec_ace_best_model.eval()

test_losses = []
evaluator = Evaluator(metrics=['wer', 'em'], set_types=['test'])

# No need for gradients when evaluating
with torch.no_grad():
    for batch in test_loader:

        X = batch['sentences'].to(DEVICE)
        S = batch['scores'].to(DEVICE)
        y = batch['labels'].to(DEVICE)

        test_preds = rec_ace_best_model(input_ids=X, labels=y, scores_ids=S)

        test_loss = test_preds.loss
        test_logits = test_preds.logits

        test_reference = detokenize_and_clean(tokenizer, y)
        test_predicted = detokenize_and_clean(tokenizer, test_logits.argmax(dim=-1))
        
        test_losses.append(test_loss.item())
        evaluator.calculate_metrics(set_type='test', reference=test_reference, predicted=test_predicted)
    
    evaluator.end_epoch_routine(print_metrics=False)

evaluator.print_final_metrics()

Test Metrics:
+----+-------+-------+
|    |   wer |    em |
|----+-------+-------|
|  1 | 0.279 | 0.096 |
+----+-------+-------+

