Commit d8d2a53

Add metrics_to_return to all evaluations (#595)

ntlind committed Jun 4, 2024
1 parent a03bb4e commit d8d2a53
Showing 17 changed files with 345 additions and 66 deletions.
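
The user-facing effect of this change (see the client/valor/coretypes.py diff below) is that evaluation calls now take an explicit metrics_to_return list instead of the boolean compute_pr_curves flag. A minimal usage sketch, assuming `model` and `dataset` are pre-existing, finalized valor client objects:

# Sketch only: the metric names mirror those used in the tests in this diff;
# listing "PrecisionRecallCurve" replaces the old compute_pr_curves=True flag.
evaluation = model.evaluate_classification(
    datasets=[dataset],
    metrics_to_return=[
        "Precision",
        "Recall",
        "F1",
        "Accuracy",
        "ROCAUC",
        "PrecisionRecallCurve",
    ],
)
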
20 changes: 18 additions & 2 deletions api/tests/functional-tests/backend/core/test_evaluation.py
@@ -829,15 +829,31 @@ def test__fetch_evaluations_and_mark_for_deletion(
db: Session, finalized_dataset: str, finalized_model: str
):
# create two evaluations
for pr_curves in [True, False]:
for metrics_to_return in [
[
"Precision",
"Recall",
"F1",
"Accuracy",
"ROCAUC",
],
[
"Precision",
"Recall",
"F1",
"Accuracy",
"ROCAUC",
"PrecisionRecallCurve",
],
]:
core.create_or_get_evaluations(
db,
schemas.EvaluationRequest(
model_names=[finalized_model],
datum_filter=schemas.Filter(dataset_names=[finalized_dataset]),
parameters=schemas.EvaluationParameters(
task_type=enums.TaskType.CLASSIFICATION,
compute_pr_curves=pr_curves,
metrics_to_return=metrics_to_return,
),
meta={},
),
@@ -704,7 +704,18 @@ def test_compute_classification(
)

confusion, metrics = _compute_clf_metrics(
db, model_filter, datum_filter, label_map=None, compute_pr_curves=True
db,
model_filter,
datum_filter,
label_map=None,
metrics_to_return=[
"Precision",
"Recall",
"F1",
"Accuracy",
"ROCAUC",
"PrecisionRecallCurve",
],
)

# Make matrices accessible by label_key
20 changes: 18 additions & 2 deletions api/tests/functional-tests/backend/metrics/test_detection.py
@@ -511,7 +511,15 @@ def test__compute_detection_metrics(
convert_annotations_to_type=enums.AnnotationType.BOX,
iou_thresholds_to_compute=list(iou_thresholds),
iou_thresholds_to_return=[0.5, 0.75],
compute_pr_curves=True,
metrics_to_return=[
"AP",
"AR",
"mAP",
"APAveragedOverIOUs",
"mAR",
"mAPAveragedOverIOUs",
"PrecisionRecallCurve",
],
),
prediction_filter=schemas.Filter(
model_names=["test_model"],
@@ -734,7 +742,15 @@ def test__compute_detection_metrics_with_rasters(
convert_annotations_to_type=enums.AnnotationType.RASTER,
iou_thresholds_to_compute=list(iou_thresholds),
iou_thresholds_to_return=[0.5, 0.75],
compute_pr_curves=True,
metrics_to_return=[
"AP",
"AR",
"mAP",
"APAveragedOverIOUs",
"mAR",
"mAPAveragedOverIOUs",
"PrecisionRecallCurve",
],
),
prediction_filter=schemas.Filter(
model_names=["test_model"],
9 changes: 8 additions & 1 deletion api/valor_api/backend/core/evaluation.py
@@ -271,7 +271,14 @@ def _create_response(
"""Converts a evaluation row into a response schema."""
metrics = db.query(
select(models.Metric)
.where(models.Metric.evaluation_id == evaluation.id)
.where(
and_(
models.Metric.evaluation_id == evaluation.id,
models.Metric.type.in_(
evaluation.parameters["metrics_to_return"]
),
)
)
.subquery()
).all()
confusion_matrices = db.query(
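
The _create_response change above means only metric rows whose type appears in the evaluation's stored metrics_to_return are returned to the caller. A self-contained sketch of the same SQLAlchemy filtering pattern, using a stand-in Metric table rather than the real valor models:

# Minimal sketch, not the valor schema: a toy Metric table filtered with
# and_() plus Column.in_(), mirroring the query in _create_response.
from sqlalchemy import Column, Integer, String, and_, create_engine, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Metric(Base):  # stand-in for models.Metric
    __tablename__ = "metric"
    id = Column(Integer, primary_key=True)
    evaluation_id = Column(Integer)
    type = Column(String)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as db:
    db.add_all(
        [
            Metric(evaluation_id=1, type="Precision"),
            Metric(evaluation_id=1, type="PrecisionRecallCurve"),
            Metric(evaluation_id=2, type="Precision"),
        ]
    )
    db.commit()

    metrics_to_return = ["Precision", "Recall", "F1"]
    rows = (
        db.execute(
            select(Metric).where(
                and_(
                    Metric.evaluation_id == 1,
                    Metric.type.in_(metrics_to_return),
                )
            )
        )
        .scalars()
        .all()
    )
    print([m.type for m in rows])  # ['Precision'] -- the curve row is filtered out
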
22 changes: 9 additions & 13 deletions api/valor_api/backend/metrics/classification.py
@@ -627,7 +627,7 @@ def _compute_confusion_matrix_and_metrics_at_grouper_key(
groundtruth_filter: schemas.Filter,
grouper_key: str,
grouper_mappings: dict[str, dict[str, dict]],
compute_pr_curves: bool,
metrics_to_return: list[str],
) -> (
tuple[
schemas.ConfusionMatrix,
@@ -654,8 +654,8 @@ _compute_confusion_matrix_and_metrics_at_grouper_key(
The filter to be used to query groundtruths.
grouper_mappings: dict[str, dict[str, dict]]
A dictionary of mappings that connect groupers to their related labels.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: list[str]
The list of metrics to compute, store, and return to the user.
Returns
-------
@@ -728,7 +728,7 @@ def _compute_confusion_matrix_and_metrics_at_grouper_key(
),
]

if compute_pr_curves:
if "PrecisionRecallCurve" in metrics_to_return:
# calculate the number of unique datums
# used to determine the number of true negatives
pd_datums = db.query(
@@ -795,7 +795,7 @@ def _compute_clf_metrics(
db: Session,
prediction_filter: schemas.Filter,
groundtruth_filter: schemas.Filter,
compute_pr_curves: bool,
metrics_to_return: list[str],
label_map: LabelMapType | None = None,
) -> tuple[
list[schemas.ConfusionMatrix],
@@ -819,8 +819,8 @@ def _compute_clf_metrics(
The filter to be used to query predictions.
groundtruth_filter : schemas.Filter
The filter to be used to query groundtruths.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: list[str]
The list of metrics to compute, store, and return to the user.
label_map: LabelMapType, optional
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
@@ -853,7 +853,7 @@ def _compute_clf_metrics(
groundtruth_filter=groundtruth_filter,
grouper_key=grouper_key,
grouper_mappings=grouper_mappings,
compute_pr_curves=compute_pr_curves,
metrics_to_return=metrics_to_return,
)
if cm_and_metrics is not None:
confusion_matrices.append(cm_and_metrics[0])
@@ -909,11 +909,7 @@ def compute_clf_metrics(
prediction_filter=prediction_filter,
groundtruth_filter=groundtruth_filter,
label_map=parameters.label_map,
compute_pr_curves=(
parameters.compute_pr_curves
if parameters.compute_pr_curves is not None
else False
),
metrics_to_return=parameters.metrics_to_return, # type: ignore - metrics_to_return is guaranteed not to be None
)

confusion_matrices_mappings = create_metric_mappings(
2 changes: 1 addition & 1 deletion api/valor_api/backend/metrics/detection.py
@@ -663,7 +663,7 @@ def _annotation_type_to_geojson(
number_of_groundtruths_per_grouper[grouper_id] += 1

# Optionally compute precision-recall curves
if parameters.compute_pr_curves:
if "PrecisionRecallCurve" in parameters.metrics_to_return: # type: ignore - metrics_to_return is guaranteed not to be None
false_positive_entries = db.query(
select(
joint.c.dataset_name,
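
Both the classification and detection paths now gate the precision-recall-curve computation on a membership check rather than a dedicated flag. A tiny sketch of that pattern, with an explicit None guard in place of the # type: ignore assumption used above:

# Sketch of the gating logic; the helper name is hypothetical.
def should_compute_pr_curves(metrics_to_return: list[str] | None) -> bool:
    """Return True only when the caller explicitly requested PR curves."""
    return (
        metrics_to_return is not None
        and "PrecisionRecallCurve" in metrics_to_return
    )


assert should_compute_pr_curves(["AP", "mAP", "PrecisionRecallCurve"])
assert not should_compute_pr_curves(["AP", "mAP"])
assert not should_compute_pr_curves(None)
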
31 changes: 27 additions & 4 deletions api/valor_api/schemas/evaluation.py
@@ -16,6 +16,8 @@ class EvaluationParameters(BaseModel):
Attributes
----------
metrics_to_return: list[str], optional
The list of metrics to compute, store, and return to the user.
convert_annotations_to_type: AnnotationType | None = None
The type to convert all annotations to.
iou_thresholds_to_compute: List[float], optional
@@ -26,20 +28,18 @@ class EvaluationParameters(BaseModel):
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
recall_score_threshold: float, default=0
The confidence score threshold for use when determining whether to count a prediction as a true positive or not while calculating Average Recall.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
pr_curve_iou_threshold: float, optional
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5. Does nothing when compute_pr_curves is set to False or None.
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5.
"""

task_type: TaskType

metrics_to_return: list[str] | None = None
convert_annotations_to_type: AnnotationType | None = None
iou_thresholds_to_compute: list[float] | None = None
iou_thresholds_to_return: list[float] | None = None
label_map: LabelMapType | None = None
recall_score_threshold: float | None = 0
compute_pr_curves: bool | None = None
pr_curve_iou_threshold: float | None = 0.5

# pydantic setting
@@ -50,6 +50,29 @@ class EvaluationParameters(BaseModel):
def _validate_by_task_type(cls, values):
"""Validate the IOU thresholds."""

# set default metrics for each task type
if values.metrics_to_return is None:
match values.task_type:
case TaskType.CLASSIFICATION:
values.metrics_to_return = [
"Accuracy",
"Precision",
"Recall",
"F1",
"ROCAUC",
]
case TaskType.OBJECT_DETECTION:
values.metrics_to_return = [
"AP",
"AR",
"mAP",
"APAveragedOverIOUs",
"mAR",
"mAPAveragedOverIOUs",
]
case TaskType.SEMANTIC_SEGMENTATION:
values.metrics_to_return = ["IOU", "mIOU"]

match values.task_type:
case TaskType.CLASSIFICATION | TaskType.SEMANTIC_SEGMENTATION:
if values.convert_annotations_to_type is not None:
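
The validator above back-fills per-task-type defaults whenever the caller omits metrics_to_return. A self-contained sketch of that behaviour, assuming pydantic v2 and a simplified TaskType enum (the real valor schema has additional fields and checks):

# Minimal sketch of the default-filling validator; not the full valor model.
from enum import Enum

from pydantic import BaseModel, model_validator


class TaskType(str, Enum):
    CLASSIFICATION = "classification"
    OBJECT_DETECTION = "object-detection"
    SEMANTIC_SEGMENTATION = "semantic-segmentation"


class EvaluationParameters(BaseModel):
    task_type: TaskType
    metrics_to_return: list[str] | None = None

    @model_validator(mode="after")
    def _fill_default_metrics(self):
        if self.metrics_to_return is None:
            match self.task_type:
                case TaskType.CLASSIFICATION:
                    self.metrics_to_return = [
                        "Accuracy", "Precision", "Recall", "F1", "ROCAUC",
                    ]
                case TaskType.OBJECT_DETECTION:
                    self.metrics_to_return = [
                        "AP", "AR", "mAP", "APAveragedOverIOUs",
                        "mAR", "mAPAveragedOverIOUs",
                    ]
                case TaskType.SEMANTIC_SEGMENTATION:
                    self.metrics_to_return = ["IOU", "mIOU"]
        return self


print(EvaluationParameters(task_type=TaskType.CLASSIFICATION).metrics_to_return)
# ['Accuracy', 'Precision', 'Recall', 'F1', 'ROCAUC']
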
2 changes: 1 addition & 1 deletion api/valor_api/schemas/metrics.py
@@ -429,7 +429,7 @@ class PrecisionRecallCurve(BaseModel):
value: dict
A nested dictionary where the first key is the class label, the second key is the confidence threshold (e.g., 0.05), the third key is the metric name (e.g., "precision"), and the final key is either the value itself (for precision, recall, etc.) or a list of tuples containing data for each observation.
pr_curve_iou_threshold: float, optional
The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5. Does nothing when compute_pr_curves is set to False or None.
The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5.
"""

label_key: str
22 changes: 13 additions & 9 deletions client/valor/coretypes.py
@@ -908,7 +908,7 @@ def evaluate_classification(
datasets: Optional[Union[Dataset, List[Dataset]]] = None,
filter_by: Optional[FilterType] = None,
label_map: Optional[Dict[Label, Label]] = None,
compute_pr_curves: bool = False,
metrics_to_return: Optional[List[str]] = None,
allow_retries: bool = False,
) -> Evaluation:
"""
@@ -922,8 +922,8 @@ def evaluate_classification(
Optional set of constraints to filter evaluation by.
label_map : Dict[Label, Label], optional
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: List[str], optional
The list of metrics to compute, store, and return to the user.
allow_retries : bool, default = False
Option to retry previously failed evaluations.
@@ -945,7 +945,7 @@
parameters=EvaluationParameters(
task_type=TaskType.CLASSIFICATION,
label_map=self._create_label_map(label_map=label_map),
compute_pr_curves=compute_pr_curves,
metrics_to_return=metrics_to_return,
),
meta={},
)
@@ -967,7 +967,7 @@ def evaluate_detection(
iou_thresholds_to_return: Optional[List[float]] = None,
label_map: Optional[Dict[Label, Label]] = None,
recall_score_threshold: float = 0,
compute_pr_curves: bool = False,
metrics_to_return: Optional[List[str]] = None,
pr_curve_iou_threshold: float = 0.5,
allow_retries: bool = False,
) -> Evaluation:
@@ -990,10 +990,10 @@
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
recall_score_threshold: float, default=0
The confidence score threshold for use when determining whether to count a prediction as a true positive or not while calculating Average Recall.
compute_pr_curves: bool, optional
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: List[str], optional
The list of metrics to compute, store, and return to the user.
pr_curve_iou_threshold: float, optional
The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5. Does nothing when compute_pr_curves is set to False or None.
The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5.
allow_retries : bool, default = False
Option to retry previously failed evaluations.
@@ -1018,7 +1018,7 @@
iou_thresholds_to_return=iou_thresholds_to_return,
label_map=self._create_label_map(label_map=label_map),
recall_score_threshold=recall_score_threshold,
compute_pr_curves=compute_pr_curves,
metrics_to_return=metrics_to_return,
pr_curve_iou_threshold=pr_curve_iou_threshold,
)
datum_filter = self._format_constraints(datasets, filter_by)
@@ -1042,6 +1042,7 @@ def evaluate_segmentation(
datasets: Optional[Union[Dataset, List[Dataset]]] = None,
filter_by: Optional[FilterType] = None,
label_map: Optional[Dict[Label, Label]] = None,
metrics_to_return: Optional[List[str]] = None,
allow_retries: bool = False,
) -> Evaluation:
"""
@@ -1055,6 +1056,8 @@
Optional set of constraints to filter evaluation by.
label_map : Dict[Label, Label], optional
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
metrics_to_return: List[str], optional
The list of metrics to compute, store, and return to the user.
allow_retries : bool, default = False
Option to retry previously failed evaluations.
@@ -1071,6 +1074,7 @@
parameters=EvaluationParameters(
task_type=TaskType.SEMANTIC_SEGMENTATION,
label_map=self._create_label_map(label_map=label_map),
metrics_to_return=metrics_to_return,
),
meta={},
)
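
On the object-detection side, the client forwards metrics_to_return through EvaluationParameters in the same way. A usage sketch, again assuming `model` and `dataset` already exist and are finalized:

# Sketch only: metric names follow the object-detection defaults added in
# api/valor_api/schemas/evaluation.py; "PrecisionRecallCurve" is opt-in.
evaluation = model.evaluate_detection(
    datasets=[dataset],
    iou_thresholds_to_compute=[0.25, 0.5, 0.75],
    iou_thresholds_to_return=[0.5, 0.75],
    metrics_to_return=[
        "AP",
        "AR",
        "mAP",
        "APAveragedOverIOUs",
        "mAR",
        "mAPAveragedOverIOUs",
        "PrecisionRecallCurve",
    ],
    pr_curve_iou_threshold=0.5,
)

# evaluate_segmentation now accepts the argument too; when omitted, the
# server-side validator falls back to the segmentation defaults ("IOU", "mIOU").
model.evaluate_segmentation(datasets=[dataset])
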
8 changes: 4 additions & 4 deletions client/valor/schemas/evaluation.py
@@ -20,10 +20,10 @@ class EvaluationParameters:
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
recall_score_threshold: float, default=0
The confidence score threshold for use when determining whether to count a prediction as a true positive or not while calculating Average Recall.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: List[str], optional
The list of metrics to compute, store, and return to the user.
pr_curve_iou_threshold: float, optional
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5. Does nothing when compute_pr_curves is set to False or None.
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5.
"""

@@ -35,7 +35,7 @@ class EvaluationParameters:
iou_thresholds_to_return: Optional[List[float]] = None
label_map: Optional[List[List[List[str]]]] = None
recall_score_threshold: float = 0
compute_pr_curves: bool = False
metrics_to_return: Optional[List[str]] = None
pr_curve_iou_threshold: float = 0.5

