Implement quick follow-ups on precision-recall curve PR (#486)
ntlind committed Mar 18, 2024
1 parent 0c44dfe commit ea48d25
Showing 6 changed files with 56 additions and 11 deletions.
5 changes: 0 additions & 5 deletions api/tests/unit-tests/backend/metrics/test_detection.py
@@ -123,14 +123,11 @@ def test__calculate_ap_and_ar():
),
]

grouper_ids_associated_with_gts = set(["0", "1", "2"])

ap_metrics, ar_metrics = _calculate_ap_and_ar(
sorted_ranked_pairs=pairs,
number_of_groundtruths_per_grouper=number_of_groundtruths_per_grouper,
grouper_mappings=grouper_mappings,
iou_thresholds=iou_thresholds,
grouper_ids_associated_with_gts=grouper_ids_associated_with_gts,
recall_score_threshold=0.0,
)

@@ -153,7 +150,6 @@ def test__calculate_ap_and_ar():
number_of_groundtruths_per_grouper=number_of_groundtruths_per_grouper,
grouper_mappings=grouper_mappings,
iou_thresholds=iou_thresholds + [0],
grouper_ids_associated_with_gts=grouper_ids_associated_with_gts,
recall_score_threshold=0.0,
)

@@ -165,6 +161,5 @@ def test__calculate_ap_and_ar():
number_of_groundtruths_per_grouper=number_of_groundtruths_per_grouper,
grouper_mappings=grouper_mappings,
iou_thresholds=iou_thresholds,
grouper_ids_associated_with_gts=grouper_ids_associated_with_gts,
recall_score_threshold=illegal_thresh,
)
6 changes: 1 addition & 5 deletions api/valor_api/backend/metrics/detection.py
@@ -161,7 +161,6 @@ def _calculate_ap_and_ar(
number_of_groundtruths_per_grouper: dict[int, int],
grouper_mappings: dict[str, dict[str, schemas.Label]],
iou_thresholds: list[float],
grouper_ids_associated_with_gts: set[int],
recall_score_threshold: float,
) -> Tuple[list[schemas.APMetric], list[schemas.ARMetric]]:
"""
@@ -188,7 +187,7 @@ def _calculate_ap_and_ar(
recalls_across_thresholds = []

for iou_threshold in iou_thresholds:
if grouper_id not in grouper_ids_associated_with_gts:
if grouper_id not in number_of_groundtruths_per_grouper.keys():
continue

precisions = []
@@ -523,8 +522,6 @@ def _annotation_type_to_column(
.groundtruths() # type: ignore - SQLAlchemy type issue
).all() # type: ignore - SQLAlchemy type issue

grouper_ids_associated_with_gts = set([row[1] for row in groundtruths])

for gt_id, grouper_id in groundtruths:
number_of_groundtruths_per_grouper[grouper_id] += 1

@@ -562,7 +559,6 @@ def _annotation_type_to_column(
number_of_groundtruths_per_grouper=number_of_groundtruths_per_grouper,
iou_thresholds=parameters.iou_thresholds_to_compute,
grouper_mappings=grouper_mappings,
grouper_ids_associated_with_gts=grouper_ids_associated_with_gts,
recall_score_threshold=parameters.recall_score_threshold,
)

50 changes: 50 additions & 0 deletions docs/metrics.md
@@ -12,6 +12,7 @@ If we're missing an important metric for your particular use case, please [write
| F1 | A weighted average of precision and recall. | $\frac{2 * Precision * Recall}{Precision + Recall}$ |
| Accuracy | The number of true predictions divided by the total number of predictions. | $\dfrac{\|TP\|+\|TN\|}{\|TP\|+\|TN\|+\|FP\|+\|FN\|}$ |
| ROC AUC | The area under the Receiver Operating Characteristic (ROC) curve for the predictions generated by a given model. | See [ROCAUC methods](#binary-roc-auc). |
| Precision-Recall Curves | Outputs a nested dictionary containing the true positives, false positives, true negatives, false negatives, precision, recall, and F1 score for each (label key, label value, confidence threshold) combination. Computing this output requires setting the `compute_pr_curves` argument to `True` at evaluation time. | See [precision-recall curve methods](#precision-recall-curves). |

## Object Detection and Instance Segmentation Metrics**

@@ -23,6 +24,8 @@ If we're missing an important metric for your particular use case, please [write
| mAP Averaged Over IOUs | The average of several mAP metrics across class labels. | $\dfrac{1}{\text{number of thresholds}} \sum\limits_{iou \in thresholds} mAP_{iou}$ |
| Average Recall (AR) | The average of several recall metrics across IOU thresholds, grouped by class labels. | See [AR methods](#average-recall-ar). |
| Mean Average Recall (mAR) | The average of several AR metrics across class labels. | $\dfrac{1}{\text{number of classes}} \sum\limits_{class \in classes} AR_{class}$ |
| Precision-Recall Curves | Outputs a nested dictionary containing the true positives, false positives, true negatives, false negatives, precision, recall, and F1 score for each (label key, label value, confidence threshold) combination. Computing this output requires setting the `compute_pr_curves` argument to `True` at evaluation time. These curves are calculated using a default IOU threshold of 0.5; you can set your own threshold by passing a float between 0 and 1 to the `pr_curve_iou_threshold` parameter at evaluation time. | See [precision-recall curve methods](#precision-recall-curves). |


**When calculating IOUs for object detection metrics, Valor handles the necessary conversion between different types of geometric annotations. For example, if your model prediction is a polygon and your groundtruth is a raster, then the raster will be converted to a polygon prior to calculating the IOU.
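For intuition, the snippet below is a minimal, illustrative IOU calculation for two axis-aligned bounding boxes. It is not Valor's internal implementation, and the `(xmin, ymin, xmax, ymax)` box representation is an assumption made just for this example.

```python
def box_iou(box_a: tuple, box_b: tuple) -> float:
    """Compute IOU for two axis-aligned boxes given as (xmin, ymin, xmax, ymax)."""
    # Intersection rectangle (zero area if the boxes don't overlap).
    ix_min, iy_min = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    ix_max, iy_max = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    intersection = max(0.0, ix_max - ix_min) * max(0.0, iy_max - iy_min)

    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - intersection
    return intersection / union if union > 0 else 0.0


# Two partially overlapping boxes share 25 of their combined 175 units of area.
print(box_iou((0, 0, 10, 10), (5, 5, 15, 15)))  # ~0.143
```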

@@ -160,3 +163,50 @@ Note that this metric differs from COCO's calculation in two ways:

- COCO averages across classes when calculating AR, whereas we calculate AR separately for each class. Our AR calculations match the original FAIR definition of AR, while our mAR calculations match what COCO calls AR.
- COCO calculates three different AR metrics (AR@1, AR@5, AR@100) by considering only the top 1/5/100 most confident predictions during the matching process. Valor, on the other hand, allows users to pass a `recall_score_threshold` value that prevents low-confidence predictions from being counted as true positives when calculating AR (see the sketch below).
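
To make the `recall_score_threshold` behavior concrete, here is a minimal, hypothetical sketch (not Valor's internal code; the `(score, iou)` pair format is an assumption) that computes recall for a single class at one IOU threshold while discarding low-confidence matches:

```python
def recall_for_class(
    matched_pairs: list[tuple[float, float]],  # (prediction confidence, IOU) per matched ground truth
    n_groundtruths: int,
    iou_threshold: float = 0.5,
    recall_score_threshold: float = 0.0,
) -> float:
    """Count a match as a true positive only if it clears both thresholds."""
    true_positives = sum(
        1
        for score, iou in matched_pairs
        if iou >= iou_threshold and score >= recall_score_threshold
    )
    return true_positives / n_groundtruths if n_groundtruths else 0.0


# Raising recall_score_threshold stops low-confidence matches from counting.
pairs = [(0.9, 0.8), (0.4, 0.7), (0.2, 0.6)]
print(recall_for_class(pairs, n_groundtruths=3, recall_score_threshold=0.0))  # 1.0
print(recall_for_class(pairs, n_groundtruths=3, recall_score_threshold=0.5))  # ~0.33
```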

## Precision-Recall Curves

Precision-recall curves offer insight into which confidence threshold you should pick for your production pipeline. To compute these curves for your classification or object detection workflow, simply set the `compute_pr_curves` parameter to `True` when initiating your evaluation. Valor will then tabulate the true positives, false positives, true negatives, false negatives, precision, recall, and F1 score for each (label key, label value, confidence threshold) combination, and store them in a nested dictionary for your use. When using the Valor Python client, the output will be formatted as follows:

```python
{
"type": "PrecisionRecallCurve",
"parameters": {
"label_key": "class", # The key of the label.
"pr_curve_iou_threshold": 0.5, # Note that this value will be None for classification tasks. For detection tasks, we use 0.5 as the default threshold, but allow users to pass an optional `pr_curve_iou_threshold` parameter in their evaluation call.
},
"value": {
"cat": { # The value of the label.
"0.05": { # The confidence score threshold, ranging from 0.05 to 0.95 in increments of 0.05.
"fn": 0,
"fp": 1,
"tp": 0,
"recall": -1,
"f1_score": -1,
"precision": 0.0,
},
"0.1": {
"fn": 0,
"fp": 1,
"tp": 0,
"recall": -1,
"f1_score": -1,
"precision": 0.0,
},
...
```
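
Once an evaluation has been run with `compute_pr_curves=True`, the nested dictionary above can be walked directly. As a minimal, hypothetical sketch (the function and variable names below are ours, not part of the Valor client), here is one way to pick the confidence threshold with the best F1 score for a given label value:

```python
def best_f1_threshold(pr_curve_metric: dict, label_value: str):
    """Return the (confidence threshold, F1 score) pair with the highest F1 for one label value."""
    best_threshold, best_f1 = None, -1.0
    for threshold, counts in pr_curve_metric["value"][label_value].items():
        f1 = counts["f1_score"]
        # Skip entries reported as -1, which we treat here as undefined
        # (an assumption based on the example values above).
        if f1 != -1 and f1 > best_f1:
            best_threshold, best_f1 = float(threshold), f1
    return best_threshold, best_f1


# Assuming `pr_curve_metric` holds the dictionary shown above:
# threshold, f1 = best_f1_threshold(pr_curve_metric, "cat")
```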

It's important to note that these curves are computed slightly differently from our other aggregate metrics above:

### Classification Tasks

Valor calculates its aggregate precision, recall, and F1 metrics by matching the highest-confidence prediction with each groundtruth. One issue with this approach is that we may throw away useful information in cases where prediction labels all have similarly strong confidence scores. For example: if our top two predictions for a given ground truth are `{"label": "cat", "score": 0.91}` and `{"label": "dog", "score": 0.90}`, then our aggregated precision and recall metrics would penalize the `dog` label even though its confidence score was nearly equal to the `cat` label.

We think this approach makes sense when calculating aggregate precision and recall metrics, but when calculating the `PrecisionRecallCurve` value for each label, we consider all ground truth-prediction matches so that each label is treated as its own, separate binary classification problem.
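
As a rough sketch of that per-label, binary treatment (an illustration only, not Valor's internal code; the input format below is an assumption), tabulating counts for one label value at a single confidence threshold might look like this:

```python
def binary_counts(
    groundtruth_labels: list[str],        # one ground truth label value per datum
    predictions: list[dict[str, float]],  # per-datum mapping of label value -> confidence
    label: str,
    threshold: float,
) -> dict[str, int]:
    """Tabulate tp/fp/tn/fn for a single label value at a single confidence threshold."""
    tp = fp = tn = fn = 0
    for gt_label, scores in zip(groundtruth_labels, predictions):
        predicted = scores.get(label, 0.0) >= threshold
        actual = gt_label == label
        if predicted and actual:
            tp += 1
        elif predicted:
            fp += 1
        elif actual:
            fn += 1
        else:
            tn += 1
    return {"tp": tp, "fp": fp, "tn": tn, "fn": fn}


# The near-tie from the paragraph above: each label value is scored independently
# against every prediction, rather than only the single highest-confidence one.
gts = ["cat"]
preds = [{"cat": 0.91, "dog": 0.90}]
print(binary_counts(gts, preds, "cat", threshold=0.5))  # {'tp': 1, 'fp': 0, 'tn': 0, 'fn': 0}
print(binary_counts(gts, preds, "dog", threshold=0.5))  # {'tp': 0, 'fp': 1, 'tn': 0, 'fn': 0}
```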

### Detection Tasks

The `PrecisionRecallCurve` values differ from the precision-recall curves used to calculate [Average Precision](#average-precision-ap) in two subtle ways:

- The `PrecisionRecallCurve` values visualize how precision and recall change as confidence thresholds vary from 0.05 to 0.95 in increments of 0.05. In contrast, the precision-recall curves used to calculate Average Precision are non-uniform; they vary over the actual confidence scores for each ground truth-prediction match.
- If your pipeline predicts a label on an image, but that label doesn't exist on any ground truths in that particular image, then the `PrecisionRecallCurve` values will count that prediction as a false positive, whereas the other detection metrics will ignore it entirely (see the sketch below).
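
A small, hypothetical sketch of both differences (again an illustration, not Valor's implementation; the input format is assumed for the example): at each threshold in the fixed 0.05 to 0.95 sweep, matched predictions count as true positives only when the IOU check passes, and predictions whose label appears on no ground truth in the image count as false positives rather than being ignored.

```python
# Confidence thresholds swept by the PrecisionRecallCurve output: 0.05, 0.10, ..., 0.95.
THRESHOLDS = [round(0.05 * i, 2) for i in range(1, 20)]


def detection_counts(matched, unmatched_scores, n_groundtruths, threshold):
    """Tabulate tp/fp/fn at one confidence threshold for one label value.

    `matched` holds (confidence, iou_passed) pairs for predictions paired with a
    ground truth of the same label; `unmatched_scores` holds confidences of
    predictions whose label appears on no ground truth in the image.
    """
    tp = sum(1 for score, iou_ok in matched if score >= threshold and iou_ok)
    fp = sum(1 for score, iou_ok in matched if score >= threshold and not iou_ok)
    fp += sum(1 for score in unmatched_scores if score >= threshold)  # counted, not ignored
    fn = n_groundtruths - tp
    return {"tp": tp, "fp": fp, "fn": fn}


for t in THRESHOLDS[::9]:  # a few sample thresholds: 0.05, 0.5, 0.95
    print(t, detection_counts([(0.9, True), (0.3, False)], [0.7], n_groundtruths=2, threshold=t))
```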
2 changes: 1 addition & 1 deletion migrations/sql/00000001_initialize_schema.up.sql
@@ -98,7 +98,7 @@ create table metric
evaluation_id integer not null references evaluation,
label_id integer references label,
type varchar not null,
value jsonb,
value double precision,
parameters jsonb,
created_at timestamp not null
);
3 changes: 3 additions & 0 deletions migrations/sql/00000002_alter_metric_value_type.down.sql
@@ -0,0 +1,3 @@
-- note: if you've already created a PrecisionRecallCurve metric in your db, the line below will fail with ERROR: cannot cast jsonb object to type double precision
-- you'll have to delete all metrics with type = "PrecisionRecallCurve" before running this line
ALTER TABLE if exists metric ALTER COLUMN value TYPE double precision USING value::double precision
1 change: 1 addition & 0 deletions migrations/sql/00000002_alter_metric_value_type.up.sql
@@ -0,0 +1 @@
ALTER TABLE if exists metric ALTER COLUMN value TYPE JSONB USING (value)::text::jsonb;
