In [1]:
from TASD.translate_sequence_to_tuples import preprocess_for_metrics
from TASD.evaluation import calculate_metrics_for_examples
from transformers import AutoTokenizer
from datasets import load_metric
import numpy as np
import constants

# Read the stored prediction and label arrays back from the working directory,
# predictions first, then labels.
predictions_load, labels_load = (
    np.load(array_file) for array_file in ("predictions.npy", "labels.npy")
)

In [2]:
# Source: https://medium.com/nlplanet/a-full-guide-to-finetuning-t5-for-text2text-and-building-a-demo-with-streamlit-c72009631887

def _filter_by_aspect_category(examples, aspect_category):
    """Keep, per example, only the entries whose "aspect_category" equals `aspect_category`."""
    return [
        [entry for entry in example if entry["aspect_category"] == aspect_category]
        for example in examples
    ]


def compute_metrics(predictions, labels):
    """Compute ROUGE, generation-length and tuple-level metrics for predictions.

    Args:
        predictions: Iterable of predicted token-id sequences. Each sequence is
            compared element-wise against the tokenizer's pad token id
            (presumably NumPy arrays - confirm against the caller).
        labels: Reference token-id sequences; passed unchanged to
            ``preprocess_for_metrics`` together with ``predictions``.

    Returns:
        dict: A flat mapping containing
            * one entry per ROUGE score returned by the metric, holding the
              mid F-measure multiplied by 100 and rounded to 4 decimals,
            * ``"gen_len"``: mean number of non-pad tokens per prediction,
              rounded to 4 decimals,
            * ``"f1"``, ``"recall"``, ``"precision"``, ``"accuracy"`` as
              returned by ``calculate_metrics_for_examples`` over all tuples,
            * ``"<score>_<aspect_category>"`` for the same four scores, computed
              on the tuples of each category in ``constants.ASPECT_CATEGORIES``.
    """
    score_names = ("f1", "recall", "precision", "accuracy")

    tokenizer = AutoTokenizer.from_pretrained(constants.MODEL_NAME_TASD)
    # NOTE(review): the captured notebook output echoes this `load_metric` call in
    # what looks like a deprecation warning - consider migrating once confirmed.
    rouge = load_metric("rouge")

    # Decode the token ids into text and into structured tuples.
    decoded_preds, decoded_labels, pred_tuples, labels_tuples = preprocess_for_metrics(
        predictions, labels, tokenizer)

    # Text-based metrics: ROUGE mid F-measure expressed as a percentage.
    rouge_scores = rouge.compute(predictions=decoded_preds,
                                 references=decoded_labels, use_stemmer=True)
    results = {name: score.mid.fmeasure * 100 for name, score in rouge_scores.items()}

    # Average generated length, counting only tokens that differ from the pad id.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                       for pred in predictions]
    results["gen_len"] = np.mean(prediction_lens)

    # Only the text-based values are rounded; the tuple-level scores added
    # below are stored exactly as returned.
    results = {name: round(value, 4) for name, value in results.items()}

    # Tuple-level metrics over all examples.
    total_metrics = calculate_metrics_for_examples(labels_tuples, pred_tuples)
    for score_name in score_names:
        results[score_name] = total_metrics[score_name]

    # Tuple-level metrics restricted to each aspect category.
    for aspect_category in constants.ASPECT_CATEGORIES:
        pred_tuples_ac = _filter_by_aspect_category(pred_tuples, aspect_category)
        labels_tuples_ac = _filter_by_aspect_category(labels_tuples, aspect_category)
        ac_metrics = calculate_metrics_for_examples(labels_tuples_ac, pred_tuples_ac)
        for score_name in score_names:
            results[score_name + "_" + aspect_category] = ac_metrics[score_name]

    return results

In [3]:
# Evaluate the stored predictions against the stored labels; the returned
# dict is the cell's displayed output.
compute_metrics(predictions=predictions_load, labels=labels_load)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
  metric = load_metric("rouge")


{'rouge1': 100.0, 'rouge2': 100.0, 'rougeL': 100.0, 'rougeLsum': 100.0, 'gen_len': 25.964, 'f1': 1.0, 'recall': 1.0, 'precision': 1.0, 'accuracy': 1.0, 'f1_GENERAL-IMPRESSION': 1.0, 'recall_GENERAL-IMPRESSION': 1.0, 'precision_GENERAL-IMPRESSION': 1.0, 'accuracy_GENERAL-IMPRESSION': 1.0, 'f1_FOOD': 1.0, 'recall_FOOD': 1.0, 'precision_FOOD': 1.0, 'accuracy_FOOD': 1.0, 'f1_SERVICE': 1.0, 'recall_SERVICE': 1.0, 'precision_SERVICE': 1.0, 'accuracy_SERVICE': 1.0, 'f1_AMBIENCE': 1.0, 'recall_AMBIENCE': 1.0, 'precision_AMBIENCE': 1.0, 'accuracy_AMBIENCE': 1.0, 'f1_PRICE': 1.0, 'recall_PRICE': 1.0, 'precision_PRICE': 1.0, 'accuracy_PRICE': 1.0}


{'rouge1': 100.0,
 'rouge2': 100.0,
 'rougeL': 100.0,
 'rougeLsum': 100.0,
 'gen_len': 25.964,
 'f1': 1.0,
 'recall': 1.0,
 'precision': 1.0,
 'accuracy': 1.0,
 'f1_GENERAL-IMPRESSION': 1.0,
 'recall_GENERAL-IMPRESSION': 1.0,
 'precision_GENERAL-IMPRESSION': 1.0,
 'accuracy_GENERAL-IMPRESSION': 1.0,
 'f1_FOOD': 1.0,
 'recall_FOOD': 1.0,
 'precision_FOOD': 1.0,
 'accuracy_FOOD': 1.0,
 'f1_SERVICE': 1.0,
 'recall_SERVICE': 1.0,
 'precision_SERVICE': 1.0,
 'accuracy_SERVICE': 1.0,
 'f1_AMBIENCE': 1.0,
 'recall_AMBIENCE': 1.0,
 'precision_AMBIENCE': 1.0,
 'accuracy_AMBIENCE': 1.0,
 'f1_PRICE': 1.0,
 'recall_PRICE': 1.0,
 'precision_PRICE': 1.0,
 'accuracy_PRICE': 1.0}