Implement quick follow-ups on precision-recall curve PR (#486)
ntlind committed Mar 18, 2024
1 parent 0c44dfe commit ea48d25
Showing 6 changed files with 56 additions and 11 deletions.
5 changes: 0 additions & 5 deletions api/tests/unit-tests/backend/metrics/test_detection.py
@@ -123,14 +123,11 @@ def test__calculate_ap_and_ar():
),
]

grouper_ids_associated_with_gts = set(["0", "1", "2"])

ap_metrics, ar_metrics = _calculate_ap_and_ar(
sorted_ranked_pairs=pairs,
number_of_groundtruths_per_grouper=number_of_groundtruths_per_grouper,
grouper_mappings=grouper_mappings,
iou_thresholds=iou_thresholds,
grouper_ids_associated_with_gts=grouper_ids_associated_with_gts,
recall_score_threshold=0.0,
)

@@ -153,7 +150,6 @@ def test__calculate_ap_and_ar():
number_of_groundtruths_per_grouper=number_of_groundtruths_per_grouper,
grouper_mappings=grouper_mappings,
iou_thresholds=iou_thresholds + [0],
grouper_ids_associated_with_gts=grouper_ids_associated_with_gts,
recall_score_threshold=0.0,
)

@@ -165,6 +161,5 @@ def test__calculate_ap_and_ar():
number_of_groundtruths_per_grouper=number_of_groundtruths_per_grouper,
grouper_mappings=grouper_mappings,
iou_thresholds=iou_thresholds,
grouper_ids_associated_with_gts=grouper_ids_associated_with_gts,
recall_score_threshold=illegal_thresh,
)
6 changes: 1 addition & 5 deletions api/valor_api/backend/metrics/detection.py
@@ -161,7 +161,6 @@ def _calculate_ap_and_ar(
number_of_groundtruths_per_grouper: dict[int, int],
grouper_mappings: dict[str, dict[str, schemas.Label]],
iou_thresholds: list[float],
grouper_ids_associated_with_gts: set[int],
recall_score_threshold: float,
) -> Tuple[list[schemas.APMetric], list[schemas.ARMetric]]:
"""
@@ -188,7 +187,7 @@ def _calculate_ap_and_ar(
recalls_across_thresholds = []

for iou_threshold in iou_thresholds:
if grouper_id not in grouper_ids_associated_with_gts:
if grouper_id not in number_of_groundtruths_per_grouper.keys():
continue

precisions = []
@@ -523,8 +522,6 @@ def _annotation_type_to_column(
.groundtruths() # type: ignore - SQLAlchemy type issue
).all() # type: ignore - SQLAlchemy type issue

grouper_ids_associated_with_gts = set([row[1] for row in groundtruths])

for gt_id, grouper_id in groundtruths:
number_of_groundtruths_per_grouper[grouper_id] += 1

@@ -562,7 +559,6 @@ def _annotation_type_to_column(
number_of_groundtruths_per_grouper=number_of_groundtruths_per_grouper,
iou_thresholds=parameters.iou_thresholds_to_compute,
grouper_mappings=grouper_mappings,
grouper_ids_associated_with_gts=grouper_ids_associated_with_gts,
recall_score_threshold=parameters.recall_score_threshold,
)

50 changes: 50 additions & 0 deletions docs/metrics.md
@@ -12,6 +12,7 @@ If we're missing an important metric for your particular use case, please [write
| F1 | A weighted average of precision and recall. | $\frac{2 * Precision * Recall}{Precision + Recall}$ |
| Accuracy | The number of true predictions divided by the total number of predictions. | $\dfrac{\|TP\|+\|TN\|}{\|TP\|+\|TN\|+\|FP\|+\|FN\|}$ |
| ROC AUC | The area under the Receiver Operating Characteristic (ROC) curve for the predictions generated by a given model. | See [ROCAUC methods](#binary-roc-auc). |
| Precision-Recall Curves | Outputs a nested dictionary containing the true positives, false positives, true negatives, false negatives, precision, recall, and F1 score for each (label key, label value, confidence threshold) combination. Computing this output requires setting the `compute_pr_curves` argument to `True` at evaluation time. | See [precision-recall curve methods](#precision-recall-curves). |

## Object Detection and Instance Segmentation Metrics**

@@ -23,6 +24,8 @@ If we're missing an important metric for your particular use case, please [write
| mAP Averaged Over IOUs | The average of several mAP metrics across class labels. | $\dfrac{1}{\text{number of thresholds}} \sum\limits_{iou \in thresholds} mAP_{iou}$ |
| Average Recall (AR) | The average of several recall metrics across IOU thresholds, grouped by class labels. | See [AR methods](#average-recall-ar). |
| Mean Average Recall (mAR) | The average of several AR metrics across class labels. | $\dfrac{1}{\text{number of classes}} \sum\limits_{class \in classes} AR_{class}$ |
| Precision-Recall Curves | Outputs a nested dictionary containing the true positives, false positives, true negatives, false negatives, precision, recall, and F1 score for each (label key, label value, confidence threshold) combination. Computing this output requires setting the `compute_pr_curves` argument to `True` at evaluation time. These curves are calculated using a default IOU threshold of 0.5; you can set your own threshold by passing a float between 0 and 1 to the `pr_curve_iou_threshold` parameter at evaluation time. | See [precision-recall curve methods](#precision-recall-curves). |


**When calculating IOUs for object detection metrics, Valor handles the necessary conversion between different types of geometric annotations. For example, if your model prediction is a polygon and your groundtruth is a raster, then the raster will be converted to a polygon prior to calculating the IOU.
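For intuition, the snippet below is a minimal, illustrative IOU calculation for two axis-aligned bounding boxes. It is not Valor's internal implementation, and the `(xmin, ymin, xmax, ymax)` box representation is an assumption made just for this example.

```python
def box_iou(box_a: tuple, box_b: tuple) -> float:
    """Compute IOU for two axis-aligned boxes given as (xmin, ymin, xmax, ymax)."""
    # Intersection rectangle (zero area if the boxes don't overlap).
    ix_min, iy_min = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    ix_max, iy_max = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    intersection = max(0.0, ix_max - ix_min) * max(0.0, iy_max - iy_min)

    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    union = area_a + area_b - intersection
    return intersection / union if union > 0 else 0.0


# Two partially overlapping boxes share 25 of their combined 175 units of area.
print(box_iou((0, 0, 10, 10), (5, 5, 15, 15)))  # ~0.143
```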

@@ -160,3 +163,50 @@ Note that this metric differs from COCO's calculation in two ways:

- COCO averages across classes when calculating AR, whereas we calculate AR separately for each class. Our AR calculations match the original FAIR definition of AR, while our mAR calculations match what COCO calls AR.
- COCO calculates three different AR metrics (AR@1, AR@5, AR@100) by considering only the top 1/5/100 most confident predictions during the matching process. Valor, on the other hand, allows users to pass a `recall_score_threshold` value that prevents low-confidence predictions from being counted as true positives when calculating AR (see the sketch below).
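
To make the `recall_score_threshold` behavior concrete, here is a minimal, hypothetical sketch (not Valor's internal code; the `(score, iou)` pair format is an assumption) that computes recall for a single class at one IOU threshold while discarding low-confidence matches:

```python
def recall_for_class(
    matched_pairs: list[tuple[float, float]],  # (prediction confidence, IOU) per matched ground truth
    n_groundtruths: int,
    iou_threshold: float = 0.5,
    recall_score_threshold: float = 0.0,
) -> float:
    """Count a match as a true positive only if it clears both thresholds."""
    true_positives = sum(
        1
        for score, iou in matched_pairs
        if iou >= iou_threshold and score >= recall_score_threshold
    )
    return true_positives / n_groundtruths if n_groundtruths else 0.0


# Raising recall_score_threshold stops low-confidence matches from counting.
pairs = [(0.9, 0.8), (0.4, 0.7), (0.2, 0.6)]
print(recall_for_class(pairs, n_groundtruths=3, recall_score_threshold=0.0))  # 1.0
print(recall_for_class(pairs, n_groundtruths=3, recall_score_threshold=0.5))  # ~0.33
```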

## Precision-Recall Curves

Precision-recall curves offer insight into which confidence threshold you should pick for your production pipeline. To compute these curves for your classification or object detection workflow, simply set the `compute_pr_curves` parameter to `True` when initiating your evaluation. Valor will then tabulate the true positives, false positives, true negatives, false negatives, precision, recall, and F1 score for each (label key, label value, confidence threshold) combination, and store them in a nested dictionary for your use. When using the Valor Python client, the output will be formatted as follows:

```python
{
"type": "PrecisionRecallCurve",
"parameters": {
"label_key": "class", # The key of the label.
"pr_curve_iou_threshold": 0.5, # Note that this value will be None for classification tasks. For detection tasks, we use 0.5 as the default threshold, but allow users to pass an optional `pr_curve_iou_threshold` parameter in their evaluation call.
},
"value": {
"cat": { # The value of the label.
"0.05": { # The confidence score threshold, ranging from 0.05 to 0.95 in increments of 0.05.
"fn": 0,
"fp": 1,
"tp": 0,
"recall": -1,
"f1_score": -1,
"precision": 0.0,
},
"0.1": {
"fn": 0,
"fp": 1,
"tp": 0,
"recall": -1,
"f1_score": -1,
"precision": 0.0,
},
...
```
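
Once an evaluation has been run with `compute_pr_curves=True`, the nested dictionary above can be walked directly. As a minimal, hypothetical sketch (the function and variable names below are ours, not part of the Valor client), here is one way to pick the confidence threshold with the best F1 score for a given label value:

```python
def best_f1_threshold(pr_curve_metric: dict, label_value: str):
    """Return the (confidence threshold, F1 score) pair with the highest F1 for one label value."""
    best_threshold, best_f1 = None, -1.0
    for threshold, counts in pr_curve_metric["value"][label_value].items():
        f1 = counts["f1_score"]
        # Skip entries reported as -1, which we treat here as undefined
        # (an assumption based on the example values above).
        if f1 != -1 and f1 > best_f1:
            best_threshold, best_f1 = float(threshold), f1
    return best_threshold, best_f1


# Assuming `pr_curve_metric` holds the dictionary shown above:
# threshold, f1 = best_f1_threshold(pr_curve_metric, "cat")
```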

It's important to note that these curves are computed slightly differently from our other aggregate metrics above:

### Classification Tasks

Valor calculates its aggregate precision, recall, and F1 metrics by matching the highest-confidence prediction with each groundtruth. One issue with this approach is that we may throw away useful information in cases where prediction labels all have similarly strong confidence scores. For example: if our top two predictions for a given ground truth are `{"label": "cat", "score": 0.91}` and `{"label": "dog", "score": 0.90}`, then our aggregated precision and recall metrics would penalize the `dog` label even though its confidence score was nearly equal to the `cat` label.

We think this approach makes sense when calculating aggregate precision and recall metrics, but when calculating the `PrecisionRecallCurve` value for each label, we consider all ground truth-prediction matches so that each label is treated as its own, separate binary classification problem.
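
As a rough sketch of that per-label, binary treatment (an illustration only, not Valor's internal code; the input format below is an assumption), tabulating counts for one label value at a single confidence threshold might look like this:

```python
def binary_counts(
    groundtruth_labels: list[str],        # one ground truth label value per datum
    predictions: list[dict[str, float]],  # per-datum mapping of label value -> confidence
    label: str,
    threshold: float,
) -> dict[str, int]:
    """Tabulate tp/fp/tn/fn for a single label value at a single confidence threshold."""
    tp = fp = tn = fn = 0
    for gt_label, scores in zip(groundtruth_labels, predictions):
        predicted = scores.get(label, 0.0) >= threshold
        actual = gt_label == label
        if predicted and actual:
            tp += 1
        elif predicted:
            fp += 1
        elif actual:
            fn += 1
        else:
            tn += 1
    return {"tp": tp, "fp": fp, "tn": tn, "fn": fn}


# The near-tie from the paragraph above: each label value is scored independently
# against every prediction, rather than only the single highest-confidence one.
gts = ["cat"]
preds = [{"cat": 0.91, "dog": 0.90}]
print(binary_counts(gts, preds, "cat", threshold=0.5))  # {'tp': 1, 'fp': 0, 'tn': 0, 'fn': 0}
print(binary_counts(gts, preds, "dog", threshold=0.5))  # {'tp': 0, 'fp': 1, 'tn': 0, 'fn': 0}
```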

### Detection Tasks

The `PrecisionRecallCurve` values differ from the precision-recall curves used to calculate [Average Precision](#average-precision-ap) in two subtle ways:

- The `PrecisionRecallCurve` values visualize how precision and recall change as confidence thresholds vary from 0.05 to 0.95 in increments of 0.05. In contrast, the precision-recall curves used to calculate Average Precision are non-uniform; they vary over the actual confidence scores for each ground truth-prediction match.
- If your pipeline predicts a label on an image, but that label doesn't exist on any ground truths in that particular image, then the `PrecisionRecallCurve` values will count that prediction as a false positive, whereas the other detection metrics will ignore it entirely (see the sketch below).
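
A small, hypothetical sketch of both differences (again an illustration, not Valor's implementation; the input format is assumed for the example): at each threshold in the fixed 0.05 to 0.95 sweep, matched predictions count as true positives only when the IOU check passes, and predictions whose label appears on no ground truth in the image count as false positives rather than being ignored.

```python
# Confidence thresholds swept by the PrecisionRecallCurve output: 0.05, 0.10, ..., 0.95.
THRESHOLDS = [round(0.05 * i, 2) for i in range(1, 20)]


def detection_counts(matched, unmatched_scores, n_groundtruths, threshold):
    """Tabulate tp/fp/fn at one confidence threshold for one label value.

    `matched` holds (confidence, iou_passed) pairs for predictions paired with a
    ground truth of the same label; `unmatched_scores` holds confidences of
    predictions whose label appears on no ground truth in the image.
    """
    tp = sum(1 for score, iou_ok in matched if score >= threshold and iou_ok)
    fp = sum(1 for score, iou_ok in matched if score >= threshold and not iou_ok)
    fp += sum(1 for score in unmatched_scores if score >= threshold)  # counted, not ignored
    fn = n_groundtruths - tp
    return {"tp": tp, "fp": fp, "fn": fn}


for t in THRESHOLDS[::9]:  # a few sample thresholds: 0.05, 0.5, 0.95
    print(t, detection_counts([(0.9, True), (0.3, False)], [0.7], n_groundtruths=2, threshold=t))
```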
2 changes: 1 addition & 1 deletion migrations/sql/00000001_initialize_schema.up.sql
@@ -98,7 +98,7 @@ create table metric
evaluation_id integer not null references evaluation,
label_id integer references label,
type varchar not null,
value jsonb,
value double precision,
parameters jsonb,
created_at timestamp not null
);
3 changes: 3 additions & 0 deletions migrations/sql/00000002_alter_metric_value_type.down.sql
@@ -0,0 +1,3 @@
-- note: if you've already created a PrecisionRecallCurve metric in your db, the line below will fail with ERROR: cannot cast jsonb object to type double precision
-- you'll have to delete all metrics with type = "PrecisionRecallCurve" before running this line
ALTER TABLE if exists metric ALTER COLUMN value TYPE double precision USING value::double precision
1 change: 1 addition & 0 deletions migrations/sql/00000002_alter_metric_value_type.up.sql
@@ -0,0 +1 @@
ALTER TABLE if exists metric ALTER COLUMN value TYPE JSONB USING (value)::text::jsonb;
