In [30]:
import numpy as np

aspect_categories_array = np.load('aspect_categories.npy')
predictions_array = np.load('predictions.npy')
true_labels_array = np.load('true_labels.npy')

In [31]:
import constants
from OTE.evaluation import compute_metrics_for_subset, compute_popular_metrics


def calculate_f1_micro(metrics):
    tp_total = sum([metrics[f"tp_{ac}"] for ac in constants.ASPECT_CATEGORIES])
    fp_total = sum([metrics[f"fp_{ac}"] for ac in constants.ASPECT_CATEGORIES])
    fn_total = sum([metrics[f"fn_{ac}"] for ac in constants.ASPECT_CATEGORIES])
    precision_total = tp_total / (tp_total + fp_total)
    recall_total = tp_total / (tp_total + fn_total)

    return 2 * (precision_total * recall_total) / (precision_total + recall_total)

In [32]:
def compute_metrics():
    aspect_categories = aspect_categories_array
    predictions = predictions_array
    true_labels = true_labels_array

    metrics = {}

    for ac in constants.ASPECT_CATEGORIES:
        # Select all examples of a given aspect category
        ac_predictions = np.array([predictions[i] for i in range(
            len(predictions)) if aspect_categories[i] == ac])
        ac_predictions = (ac_predictions == ac_predictions.max(
            axis=2)[:, :, np.newaxis]).astype(int)

        ac_labels = np.array([true_labels[i] for i in range(
            len(true_labels)) if aspect_categories[i] == ac])

        tp, tn, fp, fn = compute_metrics_for_subset(
            ac_predictions, ac_labels)

        precision, recall, f1 = compute_popular_metrics(tp, tn, fp, fn)
        metrics[f"f1_{ac}"] = f1
        metrics[f"precision_{ac}"] = precision
        metrics[f"recall_{ac}"] = recall
        metrics[f"tp_{ac}"] = tp
        metrics[f"tn_{ac}"] = tn
        metrics[f"fp_{ac}"] = fp
        metrics[f"fn_{ac}"] = fn
        metrics[f"n_samples_{ac}"] = len(ac_predictions)

    # Calculate f1_micro
    metrics["f1_micro"] = calculate_f1_micro(metrics)

    print([metrics[key] for key in [f"f1_{ac}" for ac in constants.ASPECT_CATEGORIES]], (len(
        constants.ASPECT_CATEGORIES) - 1))

    # Calculate F1 macro score
    metrics["f1_macro"] = sum(metrics[key] for key in [
                              f"f1_{ac}" for ac in constants.ASPECT_CATEGORIES]) / len(constants.ASPECT_CATEGORIES)

    # Calculate toal scores
    tp, tn, fp, fn = compute_metrics_for_subset((predictions == predictions.max(
        axis=2)[:, :, np.newaxis]).astype(int), true_labels)
    metrics["precision"], metrics["recall"], metrics["f1"] = compute_popular_metrics(
        tp, tn, fp, fn)

    return metrics

In [33]:
compute_metrics()

[1.0, 0.9076086956521738, 1.0, 1.0, 1.0] 4


{'f1_GENERAL-IMPRESSION': 1.0,
 'precision_GENERAL-IMPRESSION': 1.0,
 'recall_GENERAL-IMPRESSION': 1.0,
 'tp_GENERAL-IMPRESSION': 33,
 'tn_GENERAL-IMPRESSION': 0,
 'fp_GENERAL-IMPRESSION': 0,
 'fn_GENERAL-IMPRESSION': 0,
 'n_samples_GENERAL-IMPRESSION': 33,
 'f1_FOOD': 0.9076086956521738,
 'precision_FOOD': 0.8308457711442786,
 'recall_FOOD': 1.0,
 'tp_FOOD': 167,
 'tn_FOOD': 0,
 'fp_FOOD': 34,
 'fn_FOOD': 0,
 'n_samples_FOOD': 167,
 'f1_SERVICE': 1.0,
 'precision_SERVICE': 1.0,
 'recall_SERVICE': 1.0,
 'tp_SERVICE': 166,
 'tn_SERVICE': 0,
 'fp_SERVICE': 0,
 'fn_SERVICE': 0,
 'n_samples_SERVICE': 166,
 'f1_AMBIENCE': 1.0,
 'precision_AMBIENCE': 1.0,
 'recall_AMBIENCE': 1.0,
 'tp_AMBIENCE': 131,
 'tn_AMBIENCE': 0,
 'fp_AMBIENCE': 0,
 'fn_AMBIENCE': 0,
 'n_samples_AMBIENCE': 100,
 'f1_PRICE': 1.0,
 'precision_PRICE': 1.0,
 'recall_PRICE': 1.0,
 'tp_PRICE': 33,
 'tn_PRICE': 0,
 'fp_PRICE': 0,
 'fn_PRICE': 0,
 'n_samples_PRICE': 33,
 'f1_micro': 0.9689213893967094,
 'f1_macro': 0.981521739