Fix bug where some metrics were aggregated above the label_key level (
ntlind committed May 7, 2024
1 parent 6c9685b commit fdc46cd
Showing 10 changed files with 504 additions and 181 deletions.
25 changes: 8 additions & 17 deletions api/tests/functional-tests/backend/metrics/test_detection.py
@@ -571,8 +571,8 @@ def _metric_to_dict(m) -> dict:
{"iou": 0.75, "value": 1.0, "label": {"key": "class", "value": "4"}},
]
expected_map_metrics = [
{"iou": 0.5, "value": 0.859},
{"iou": 0.75, "value": 0.761},
{"iou": 0.5, "value": 0.859, "label_key": "class"},
{"iou": 0.75, "value": 0.761, "label_key": "class"},
]
expected_ap_metrics_ave_over_ious = [
{
@@ -602,7 +602,7 @@ def _metric_to_dict(m) -> dict:
},
]
expected_map_metrics_ave_over_ious = [
{"ious": iou_thresholds, "value": 0.637}
{"ious": iou_thresholds, "value": 0.637, "label_key": "class"}
]
expected_ar_metrics = [
{
@@ -637,10 +637,7 @@ def _metric_to_dict(m) -> dict:
},
]
expected_mar_metrics = [
{
"ious": iou_thresholds,
"value": 0.652,
},
{"ious": iou_thresholds, "value": 0.652, "label_key": "class"},
]

for metric_type, actual_metrics, expected_metrics in [
@@ -809,13 +806,10 @@ def test__compute_detection_metrics_with_rasters(
"label": {"key": "class", "value": "label3"},
},
# mAP METRICS
{"iou": 0.5, "value": 0.667},
{"iou": 0.75, "value": 0.667},
{"iou": 0.5, "value": 0.667, "label_key": "class"},
{"iou": 0.75, "value": 0.667, "label_key": "class"},
# mAP METRICS AVERAGED OVER IOUS
{
"ious": iou_thresholds,
"value": 0.667,
},
{"ious": iou_thresholds, "value": 0.667, "label_key": "class"},
# AR METRICS
{
"ious": iou_thresholds,
@@ -833,10 +827,7 @@ def test__compute_detection_metrics_with_rasters(
"label": {"key": "class", "value": "label3"},
},
# mAR METRICS
{
"ious": iou_thresholds,
"value": 0.667,
},
{"ious": iou_thresholds, "value": 0.667, "label_key": "class"},
]

non_pr_metrics = metrics[:-1]
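
The updated expectations attach a label_key to every mAP and mAR entry, since those means are now computed per label key rather than across all label keys. A minimal sketch of how one such entry is derived; the AP values and grouping code below are illustrative, not taken from the fixtures above.

from collections import defaultdict

# Hypothetical per-class AP results; the real numbers come from the fixtures above.
ap_metrics = [
    {"iou": 0.5, "value": 1.0, "label": {"key": "class", "value": "4"}},
    {"iou": 0.5, "value": 0.5, "label": {"key": "class", "value": "2"}},
]

# Group AP values by (label_key, iou), then average within each group.
grouped = defaultdict(list)
for m in ap_metrics:
    grouped[(m["label"]["key"], m["iou"])].append(m["value"])

map_metrics = [
    {"iou": iou, "value": sum(vals) / len(vals), "label_key": key}
    for (key, iou), vals in grouped.items()
]
assert map_metrics == [{"iou": 0.5, "value": 0.75, "label_key": "class"}]
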
19 changes: 13 additions & 6 deletions api/tests/functional-tests/backend/metrics/test_segmentation.py
@@ -416,13 +416,13 @@ def test__compute_segmentation_metrics(
prediction_filter=prediction_filter,
groundtruth_filter=groundtruth_filter,
)
# should have five metrics (one IOU for each of the four labels, and one mIOU)
assert len(metrics) == 5
for metric in metrics[:-1]:
# should have seven metrics (one IOU for each of the four labels, and three mIOUs)
assert len(metrics) == 7
for metric in metrics[:-3]:
assert isinstance(metric, schemas.IOUMetric)
assert metric.value < 1.0
assert isinstance(metrics[-1], schemas.mIOUMetric)
assert metrics[-1].value < 1.0
assert all([isinstance(m, schemas.mIOUMetric) for m in metrics[-3:]])
assert all([m.value < 1.0 for m in metrics[-3:]])


def test_compute_semantic_segmentation_metrics(
@@ -477,11 +477,18 @@ def test_compute_semantic_segmentation_metrics(
schemas.Label(key="k1", value="v1", score=None): 0.33,
}

expected_mIOU_metrics = {"k1": (0.33 + 0) / 2, "k2": 0, "k3": 0}

assert metrics
for metric in metrics:
assert isinstance(metric.value, float)
if metric.type == "mIOU":
assert (metric.value - 0.084) <= 0.01
assert metric.parameters
assert metric.parameters["label_key"]
assert (
metric.value
- expected_mIOU_metrics[metric.parameters["label_key"]]
) <= 0.01
else:
# the IOU value for (k1, v1) is bound between .327 and .336
assert metric.label
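
With the fix, this test expects one mIOU per label key instead of a single global mIOU. A small sketch of that per-key aggregation; only the (k1, v1) score of 0.33 echoes the fixture above, and the remaining label/value pairs are assumed for illustration.

from collections import defaultdict

# (label_key, label_value) -> IOU; entries other than ("k1", "v1") are hypothetical.
iou_per_label = {
    ("k1", "v1"): 0.33,
    ("k1", "v2"): 0.0,
    ("k2", "v2"): 0.0,
    ("k3", "v3"): 0.0,
}

per_key = defaultdict(list)
for (label_key, _), score in iou_per_label.items():
    per_key[label_key].append(score)

# One mIOU per label key, matching expected_mIOU_metrics above.
miou_per_key = {key: sum(scores) / len(scores) for key, scores in per_key.items()}
assert miou_per_key == {"k1": (0.33 + 0) / 2, "k2": 0, "k3": 0}
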
33 changes: 22 additions & 11 deletions api/tests/unit-tests/schemas/test_metrics.py
@@ -84,16 +84,19 @@ def test_APMetricAveragedOverIOUs():


def test_mAPMetric():
map_metric = schemas.mAPMetric(iou=0.2, value=0.5)
map_metric = schemas.mAPMetric(iou=0.2, value=0.5, label_key="key")

with pytest.raises(ValidationError):
schemas.mAPMetric(iou=None, value=0.5) # type: ignore - purposefully throwing error
schemas.mAPMetric(iou=None, value=0.5, label_key="key") # type: ignore - purposefully throwing error

with pytest.raises(ValidationError):
schemas.mAPMetric(iou=0.1, value=None) # type: ignore - purposefully throwing error
schemas.mAPMetric(iou=0.1, value=None, label_key="key") # type: ignore - purposefully throwing error

with pytest.raises(ValidationError):
schemas.mAPMetric(iou=0.1, value="value") # type: ignore - purposefully throwing error
schemas.mAPMetric(iou=0.1, value="value", label_key="key") # type: ignore - purposefully throwing error

with pytest.raises(ValidationError):
schemas.mAPMetric(iou=0.1, value=0.5, label_key=None) # type: ignore - purposefully throwing error

assert all(
[
@@ -105,17 +108,22 @@ def test_mAPMetric():

def test_mAPMetricAveragedOverIOUs():
map_averaged_metric = schemas.mAPMetricAveragedOverIOUs(
ious=set([0.1, 0.2]), value=0.5
ious=set([0.1, 0.2]), value=0.5, label_key="key"
)

with pytest.raises(ValidationError):
schemas.mAPMetricAveragedOverIOUs(ious=None, value=0.5) # type: ignore - purposefully throwing error
schemas.mAPMetricAveragedOverIOUs(ious=None, value=0.5, label_key="key") # type: ignore - purposefully throwing error

with pytest.raises(ValidationError):
schemas.mAPMetricAveragedOverIOUs(ious=set([0.1, 0.2]), value=None, label_key="key") # type: ignore - purposefully throwing error

with pytest.raises(ValidationError):
schemas.mAPMetricAveragedOverIOUs(ious=set([0.1, 0.2]), value=None) # type: ignore - purposefully throwing error
schemas.mAPMetricAveragedOverIOUs(ious=set([0.1, 0.2]), value="value", label_key="key") # type: ignore - purposefully throwing error

with pytest.raises(ValidationError):
schemas.mAPMetricAveragedOverIOUs(ious=set([0.1, 0.2]), value="value") # type: ignore - purposefully throwing error
map_averaged_metric = schemas.mAPMetricAveragedOverIOUs(
ious=set([0.1, 0.2]), value=0.5, label_key=None # type: ignore - purposefully throwing error
)

assert all(
[
@@ -357,17 +365,20 @@ def test_IOUMetric():


def test_mIOUMetric():
iou_metric = schemas.mIOUMetric(value=0.2)
iou_metric = schemas.mIOUMetric(value=0.2, label_key="key")

with pytest.raises(ValidationError):
schemas.mIOUMetric(value=None, label_key="key") # type: ignore - purposefully throwing error

with pytest.raises(ValidationError):
schemas.mIOUMetric(value=None) # type: ignore - purposefully throwing error
schemas.mIOUMetric(value="not a value", label_key="key") # type: ignore - purposefully throwing error

with pytest.raises(ValidationError):
schemas.mIOUMetric(value="not a value") # type: ignore - purposefully throwing error

assert all(
[
key in ["value", "type", "evaluation_id"]
key in ["value", "type", "evaluation_id", "parameters"]
for key in iou_metric.db_mapping(evaluation_id=1)
]
)
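
These schema tests reflect that label_key is now a required field on the mean metrics and is carried through db_mapping. A hedged usage sketch; the import path and the concrete values are assumptions, mirroring the constructor calls exercised above.

from valor_api import schemas  # assumed import path, as used by these test modules

map_metric = schemas.mAPMetric(iou=0.5, value=0.72, label_key="class")
map_avg_metric = schemas.mAPMetricAveragedOverIOUs(
    ious=set([0.5, 0.75]), value=0.7, label_key="class"
)
miou_metric = schemas.mIOUMetric(value=0.4, label_key="k1")

# db_mapping rows are limited to the keys checked above: "value", "type",
# "evaluation_id", and "parameters" (where the label_key is expected to land).
row = miou_metric.db_mapping(evaluation_id=1)
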
60 changes: 36 additions & 24 deletions api/valor_api/backend/metrics/detection.py
@@ -785,18 +785,22 @@ def _compute_mean_ar_metrics(
if len(ar_metrics) == 0:
return []

ious_to_values = defaultdict(list)
value_dict = defaultdict(lambda: defaultdict(list))
for metric in ar_metrics:
ious_to_values[frozenset(metric.ious)].append(metric.value)
value_dict[metric.label.key][frozenset(metric.ious)].append(
metric.value
)

mean_metrics = []
for ious in ious_to_values.keys():
mean_metrics.append(
schemas.mARMetric(
ious=ious,
value=_average_ignore_minus_one(ious_to_values[ious]),
for label_key, nested_dict in value_dict.items():
for ious, values in nested_dict.items():
mean_metrics.append(
schemas.mARMetric(
ious=ious,
value=_average_ignore_minus_one(values),
label_key=label_key,
)
)
)

return mean_metrics

@@ -810,29 +814,37 @@ def _compute_mean_detection_metrics_from_aps(
return []

# dictionary for mapping an iou threshold to set of APs
vals = {}
vals = defaultdict(lambda: defaultdict(list))
for ap in ap_scores:
if hasattr(ap, "iou"):
iou = ap.iou # type: ignore - pyright doesn't consider hasattr checks
else:
iou = frozenset(ap.ious) # type: ignore - pyright doesn't consider hasattr checks
if iou not in vals:
vals[iou] = []
vals[iou].append(ap.value)
vals[ap.label.key][iou].append(ap.value)

# get mAP metrics at the individual IOUs
mean_detection_metrics = [
(
schemas.mAPMetric(
iou=iou, value=_average_ignore_minus_one(vals[iou])
)
if isinstance(iou, float)
else schemas.mAPMetricAveragedOverIOUs(
ious=iou, value=_average_ignore_minus_one(vals[iou])
)
)
for iou in vals.keys()
]
mean_detection_metrics = []

for label_key, nested_dict in vals.items():
for iou, values in nested_dict.items():
if isinstance(iou, float):
mean_detection_metrics.append(
schemas.mAPMetric(
iou=iou,
value=_average_ignore_minus_one(values),
label_key=label_key,
)
)
else:
mean_detection_metrics.append(
schemas.mAPMetricAveragedOverIOUs(
ious=iou,
value=_average_ignore_minus_one(
values,
),
label_key=label_key,
)
)

return mean_detection_metrics
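
Both _compute_mean_ar_metrics and _compute_mean_detection_metrics_from_aps now bucket scores in a nested defaultdict keyed first by label key and then by IOU threshold (or IOU set), so every mean is taken within a single label key. A self-contained sketch of that grouping pattern using invented values:

from collections import defaultdict
from dataclasses import dataclass

# Toy stand-ins for the AP metric objects aggregated above; field names mirror
# the real schemas, but every value here is invented for illustration.
@dataclass
class Label:
    key: str

@dataclass
class APMetric:
    label: Label
    iou: float
    value: float

ap_scores = [
    APMetric(Label("class"), 0.5, 1.0),
    APMetric(Label("class"), 0.5, 0.5),
    APMetric(Label("color"), 0.5, 0.25),
]

# Outer key: label key; inner key: IOU threshold.
vals = defaultdict(lambda: defaultdict(list))
for ap in ap_scores:
    vals[ap.label.key][ap.iou].append(ap.value)

# Means are computed within each label key rather than across all keys.
means = {
    (label_key, iou): sum(values) / len(values)
    for label_key, nested in vals.items()
    for iou, values in nested.items()
}
assert means == {("class", 0.5): 0.75, ("color", 0.5): 0.25}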

24 changes: 16 additions & 8 deletions api/valor_api/backend/metrics/segmentation.py
@@ -1,3 +1,5 @@
from collections import defaultdict

from geoalchemy2.functions import ST_Count, ST_MapAlgebra
from sqlalchemy.orm import Session, aliased
from sqlalchemy.sql import Select, func, select
@@ -178,21 +180,22 @@ def _compute_segmentation_metrics(
)

ret = []
ious_per_grouper_key = defaultdict(list)
for grouper_id, label_ids in grouper_mappings[
"grouper_id_to_label_ids_mapping"
].items():
# set filter
groundtruth_filter.label_ids = [label_id for label_id in label_ids]
prediction_filter.label_ids = [label_id for label_id in label_ids]

_compute_iou_score = _compute_iou(
computed_iou_score = _compute_iou(
db,
groundtruth_filter,
prediction_filter,
)

# only add an IOUMetric if the label ids associated with the grouper id have at least one gt raster
if _compute_iou_score is None:
if computed_iou_score is None:
continue

grouper_label = grouper_mappings[
Expand All @@ -202,19 +205,24 @@ def _compute_segmentation_metrics(
ret.append(
IOUMetric(
label=grouper_label,
value=_compute_iou_score,
value=computed_iou_score,
)
)

ret.append(
ious_per_grouper_key[grouper_label.key].append(computed_iou_score)

# aggregate IOUs by key
ret += [
mIOUMetric(
value=(
sum([metric.value for metric in ret]) / len(ret)
if len(ret) != 0
sum(iou_values) / len(iou_values)
if len(iou_values) != 0
else -1
)
),
label_key=grouper_key,
)
)
for grouper_key, iou_values in ious_per_grouper_key.items()
]

return ret
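
The segmentation change follows the same pattern: IOU scores are collected per grouper label key, and one mIOUMetric is emitted per key instead of a single mIOU across every label. A minimal sketch of the new aggregation, with invented label keys and scores:

from collections import defaultdict

# Hypothetical (label_key, label_value, IOU) triples; in the real code the
# scores come from _compute_iou over groundtruth and prediction rasters.
iou_scores = [
    ("animal", "dog", 0.75),
    ("animal", "cat", 0.25),
    ("color", "red", 0.5),
]

ious_per_grouper_key = defaultdict(list)
for label_key, _label_value, score in iou_scores:
    ious_per_grouper_key[label_key].append(score)

# One mIOU per label key; -1 mirrors the guard for keys with no IOU values.
miou_by_key = {
    key: sum(values) / len(values) if len(values) != 0 else -1
    for key, values in ious_per_grouper_key.items()
}
assert miou_by_key == {"animal": 0.5, "color": 0.5}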
