Commit d8d2a53

Add metrics_to_return to all evaluations (#595)

ntlind committed Jun 4, 2024
1 parent a03bb4e commit d8d2a53
Showing 17 changed files with 345 additions and 66 deletions.
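
The user-facing effect of this change (see the client/valor/coretypes.py diff below) is that evaluation calls now take an explicit metrics_to_return list instead of the boolean compute_pr_curves flag. A minimal usage sketch, assuming `model` and `dataset` are pre-existing, finalized valor client objects:

# Sketch only: the metric names mirror those used in the tests in this diff;
# listing "PrecisionRecallCurve" replaces the old compute_pr_curves=True flag.
evaluation = model.evaluate_classification(
    datasets=[dataset],
    metrics_to_return=[
        "Precision",
        "Recall",
        "F1",
        "Accuracy",
        "ROCAUC",
        "PrecisionRecallCurve",
    ],
)
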
20 changes: 18 additions & 2 deletions api/tests/functional-tests/backend/core/test_evaluation.py
@@ -829,15 +829,31 @@ def test__fetch_evaluations_and_mark_for_deletion(
db: Session, finalized_dataset: str, finalized_model: str
):
# create two evaluations
for pr_curves in [True, False]:
for metrics_to_return in [
[
"Precision",
"Recall",
"F1",
"Accuracy",
"ROCAUC",
],
[
"Precision",
"Recall",
"F1",
"Accuracy",
"ROCAUC",
"PrecisionRecallCurve",
],
]:
core.create_or_get_evaluations(
db,
schemas.EvaluationRequest(
model_names=[finalized_model],
datum_filter=schemas.Filter(dataset_names=[finalized_dataset]),
parameters=schemas.EvaluationParameters(
task_type=enums.TaskType.CLASSIFICATION,
compute_pr_curves=pr_curves,
metrics_to_return=metrics_to_return,
),
meta={},
),
@@ -704,7 +704,18 @@ def test_compute_classification(
)

confusion, metrics = _compute_clf_metrics(
db, model_filter, datum_filter, label_map=None, compute_pr_curves=True
db,
model_filter,
datum_filter,
label_map=None,
metrics_to_return=[
"Precision",
"Recall",
"F1",
"Accuracy",
"ROCAUC",
"PrecisionRecallCurve",
],
)

# Make matrices accessible by label_key
20 changes: 18 additions & 2 deletions api/tests/functional-tests/backend/metrics/test_detection.py
@@ -511,7 +511,15 @@ def test__compute_detection_metrics(
convert_annotations_to_type=enums.AnnotationType.BOX,
iou_thresholds_to_compute=list(iou_thresholds),
iou_thresholds_to_return=[0.5, 0.75],
compute_pr_curves=True,
metrics_to_return=[
"AP",
"AR",
"mAP",
"APAveragedOverIOUs",
"mAR",
"mAPAveragedOverIOUs",
"PrecisionRecallCurve",
],
),
prediction_filter=schemas.Filter(
model_names=["test_model"],
@@ -734,7 +742,15 @@ def test__compute_detection_metrics_with_rasters(
convert_annotations_to_type=enums.AnnotationType.RASTER,
iou_thresholds_to_compute=list(iou_thresholds),
iou_thresholds_to_return=[0.5, 0.75],
compute_pr_curves=True,
metrics_to_return=[
"AP",
"AR",
"mAP",
"APAveragedOverIOUs",
"mAR",
"mAPAveragedOverIOUs",
"PrecisionRecallCurve",
],
),
prediction_filter=schemas.Filter(
model_names=["test_model"],
9 changes: 8 additions & 1 deletion api/valor_api/backend/core/evaluation.py
@@ -271,7 +271,14 @@ def _create_response(
"""Converts a evaluation row into a response schema."""
metrics = db.query(
select(models.Metric)
.where(models.Metric.evaluation_id == evaluation.id)
.where(
and_(
models.Metric.evaluation_id == evaluation.id,
models.Metric.type.in_(
evaluation.parameters["metrics_to_return"]
),
)
)
.subquery()
).all()
confusion_matrices = db.query(
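
The _create_response change above means only metric rows whose type appears in the evaluation's stored metrics_to_return are returned to the caller. A self-contained sketch of the same SQLAlchemy filtering pattern, using a stand-in Metric table rather than the real valor models:

# Minimal sketch, not the valor schema: a toy Metric table filtered with
# and_() plus Column.in_(), mirroring the query in _create_response.
from sqlalchemy import Column, Integer, String, and_, create_engine, select
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()


class Metric(Base):  # stand-in for models.Metric
    __tablename__ = "metric"
    id = Column(Integer, primary_key=True)
    evaluation_id = Column(Integer)
    type = Column(String)


engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as db:
    db.add_all(
        [
            Metric(evaluation_id=1, type="Precision"),
            Metric(evaluation_id=1, type="PrecisionRecallCurve"),
            Metric(evaluation_id=2, type="Precision"),
        ]
    )
    db.commit()

    metrics_to_return = ["Precision", "Recall", "F1"]
    rows = (
        db.execute(
            select(Metric).where(
                and_(
                    Metric.evaluation_id == 1,
                    Metric.type.in_(metrics_to_return),
                )
            )
        )
        .scalars()
        .all()
    )
    print([m.type for m in rows])  # ['Precision'] -- the curve row is filtered out
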
22 changes: 9 additions & 13 deletions api/valor_api/backend/metrics/classification.py
@@ -627,7 +627,7 @@ def _compute_confusion_matrix_and_metrics_at_grouper_key(
groundtruth_filter: schemas.Filter,
grouper_key: str,
grouper_mappings: dict[str, dict[str, dict]],
compute_pr_curves: bool,
metrics_to_return: list[str],
) -> (
tuple[
schemas.ConfusionMatrix,
@@ -654,8 +654,8 @@ _compute_confusion_matrix_and_metrics_at_grouper_key(
The filter to be used to query groundtruths.
grouper_mappings: dict[str, dict[str, dict]]
A dictionary of mappings that connect groupers to their related labels.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: list[str]
The list of metrics to compute, store, and return to the user.
Returns
-------
@@ -728,7 +728,7 @@ def _compute_confusion_matrix_and_metrics_at_grouper_key(
),
]

if compute_pr_curves:
if "PrecisionRecallCurve" in metrics_to_return:
# calculate the number of unique datums
# used to determine the number of true negatives
pd_datums = db.query(
@@ -795,7 +795,7 @@ def _compute_clf_metrics(
db: Session,
prediction_filter: schemas.Filter,
groundtruth_filter: schemas.Filter,
compute_pr_curves: bool,
metrics_to_return: list[str],
label_map: LabelMapType | None = None,
) -> tuple[
list[schemas.ConfusionMatrix],
@@ -819,8 +819,8 @@ def _compute_clf_metrics(
The filter to be used to query predictions.
groundtruth_filter : schemas.Filter
The filter to be used to query groundtruths.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: list[str]
The list of metrics to compute, store, and return to the user.
label_map: LabelMapType, optional
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
@@ -853,7 +853,7 @@ def _compute_clf_metrics(
groundtruth_filter=groundtruth_filter,
grouper_key=grouper_key,
grouper_mappings=grouper_mappings,
compute_pr_curves=compute_pr_curves,
metrics_to_return=metrics_to_return,
)
if cm_and_metrics is not None:
confusion_matrices.append(cm_and_metrics[0])
@@ -909,11 +909,7 @@ def compute_clf_metrics(
prediction_filter=prediction_filter,
groundtruth_filter=groundtruth_filter,
label_map=parameters.label_map,
compute_pr_curves=(
parameters.compute_pr_curves
if parameters.compute_pr_curves is not None
else False
),
metrics_to_return=parameters.metrics_to_return, # type: ignore - metrics_to_return is guaranteed not to be None
)

confusion_matrices_mappings = create_metric_mappings(
2 changes: 1 addition & 1 deletion api/valor_api/backend/metrics/detection.py
@@ -663,7 +663,7 @@ def _annotation_type_to_geojson(
number_of_groundtruths_per_grouper[grouper_id] += 1

# Optionally compute precision-recall curves
if parameters.compute_pr_curves:
if "PrecisionRecallCurve" in parameters.metrics_to_return: # type: ignore - metrics_to_return is guaranteed not to be None
false_positive_entries = db.query(
select(
joint.c.dataset_name,
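
Both the classification and detection paths now gate the precision-recall-curve computation on a membership check rather than a dedicated flag. A tiny sketch of that pattern, with an explicit None guard in place of the # type: ignore assumption used above:

# Sketch of the gating logic; the helper name is hypothetical.
def should_compute_pr_curves(metrics_to_return: list[str] | None) -> bool:
    """Return True only when the caller explicitly requested PR curves."""
    return (
        metrics_to_return is not None
        and "PrecisionRecallCurve" in metrics_to_return
    )


assert should_compute_pr_curves(["AP", "mAP", "PrecisionRecallCurve"])
assert not should_compute_pr_curves(["AP", "mAP"])
assert not should_compute_pr_curves(None)
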
31 changes: 27 additions & 4 deletions api/valor_api/schemas/evaluation.py
@@ -16,6 +16,8 @@ class EvaluationParameters(BaseModel):
Attributes
----------
metrics_to_return: list[str], optional
The list of metrics to compute, store, and return to the user.
convert_annotations_to_type: AnnotationType | None = None
The type to convert all annotations to.
iou_thresholds_to_compute: List[float], optional
@@ -26,20 +28,18 @@ class EvaluationParameters(BaseModel):
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
recall_score_threshold: float, default=0
The confidence score threshold for use when determining whether to count a prediction as a true positive or not while calculating Average Recall.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
pr_curve_iou_threshold: float, optional
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5. Does nothing when compute_pr_curves is set to False or None.
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5.
"""

task_type: TaskType

metrics_to_return: list[str] | None = None
convert_annotations_to_type: AnnotationType | None = None
iou_thresholds_to_compute: list[float] | None = None
iou_thresholds_to_return: list[float] | None = None
label_map: LabelMapType | None = None
recall_score_threshold: float | None = 0
compute_pr_curves: bool | None = None
pr_curve_iou_threshold: float | None = 0.5

# pydantic setting
@@ -50,6 +50,29 @@ class EvaluationParameters(BaseModel):
def _validate_by_task_type(cls, values):
"""Validate the IOU thresholds."""

# set default metrics for each task type
if values.metrics_to_return is None:
match values.task_type:
case TaskType.CLASSIFICATION:
values.metrics_to_return = [
"Accuracy",
"Precision",
"Recall",
"F1",
"ROCAUC",
]
case TaskType.OBJECT_DETECTION:
values.metrics_to_return = [
"AP",
"AR",
"mAP",
"APAveragedOverIOUs",
"mAR",
"mAPAveragedOverIOUs",
]
case TaskType.SEMANTIC_SEGMENTATION:
values.metrics_to_return = ["IOU", "mIOU"]

match values.task_type:
case TaskType.CLASSIFICATION | TaskType.SEMANTIC_SEGMENTATION:
if values.convert_annotations_to_type is not None:
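
The validator above back-fills per-task-type defaults whenever the caller omits metrics_to_return. A self-contained sketch of that behaviour, assuming pydantic v2 and a simplified TaskType enum (the real valor schema has additional fields and checks):

# Minimal sketch of the default-filling validator; not the full valor model.
from enum import Enum

from pydantic import BaseModel, model_validator


class TaskType(str, Enum):
    CLASSIFICATION = "classification"
    OBJECT_DETECTION = "object-detection"
    SEMANTIC_SEGMENTATION = "semantic-segmentation"


class EvaluationParameters(BaseModel):
    task_type: TaskType
    metrics_to_return: list[str] | None = None

    @model_validator(mode="after")
    def _fill_default_metrics(self):
        if self.metrics_to_return is None:
            match self.task_type:
                case TaskType.CLASSIFICATION:
                    self.metrics_to_return = [
                        "Accuracy", "Precision", "Recall", "F1", "ROCAUC",
                    ]
                case TaskType.OBJECT_DETECTION:
                    self.metrics_to_return = [
                        "AP", "AR", "mAP", "APAveragedOverIOUs",
                        "mAR", "mAPAveragedOverIOUs",
                    ]
                case TaskType.SEMANTIC_SEGMENTATION:
                    self.metrics_to_return = ["IOU", "mIOU"]
        return self


print(EvaluationParameters(task_type=TaskType.CLASSIFICATION).metrics_to_return)
# ['Accuracy', 'Precision', 'Recall', 'F1', 'ROCAUC']
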
2 changes: 1 addition & 1 deletion api/valor_api/schemas/metrics.py
@@ -429,7 +429,7 @@ class PrecisionRecallCurve(BaseModel):
value: dict
A nested dictionary where the first key is the class label, the second key is the confidence threshold (e.g., 0.05), the third key is the metric name (e.g., "precision"), and the final key is either the value itself (for precision, recall, etc.) or a list of tuples containing data for each observation.
pr_curve_iou_threshold: float, optional
The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5. Does nothing when compute_pr_curves is set to False or None.
The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5.
"""

label_key: str
22 changes: 13 additions & 9 deletions client/valor/coretypes.py
@@ -908,7 +908,7 @@ def evaluate_classification(
datasets: Optional[Union[Dataset, List[Dataset]]] = None,
filter_by: Optional[FilterType] = None,
label_map: Optional[Dict[Label, Label]] = None,
compute_pr_curves: bool = False,
metrics_to_return: Optional[List[str]] = None,
allow_retries: bool = False,
) -> Evaluation:
"""
@@ -922,8 +922,8 @@ def evaluate_classification(
Optional set of constraints to filter evaluation by.
label_map : Dict[Label, Label], optional
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: List[str], optional
The list of metrics to compute, store, and return to the user.
allow_retries : bool, default = False
Option to retry previously failed evaluations.
@@ -945,7 +945,7 @@
parameters=EvaluationParameters(
task_type=TaskType.CLASSIFICATION,
label_map=self._create_label_map(label_map=label_map),
compute_pr_curves=compute_pr_curves,
metrics_to_return=metrics_to_return,
),
meta={},
)
@@ -967,7 +967,7 @@ def evaluate_detection(
iou_thresholds_to_return: Optional[List[float]] = None,
label_map: Optional[Dict[Label, Label]] = None,
recall_score_threshold: float = 0,
compute_pr_curves: bool = False,
metrics_to_return: Optional[List[str]] = None,
pr_curve_iou_threshold: float = 0.5,
allow_retries: bool = False,
) -> Evaluation:
@@ -990,10 +990,10 @@
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
recall_score_threshold: float, default=0
The confidence score threshold for use when determining whether to count a prediction as a true positive or not while calculating Average Recall.
compute_pr_curves: bool, optional
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: List[str], optional
The list of metrics to compute, store, and return to the user.
pr_curve_iou_threshold: float, optional
The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5. Does nothing when compute_pr_curves is set to False or None.
The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5.
allow_retries : bool, default = False
Option to retry previously failed evaluations.
@@ -1018,7 +1018,7 @@
iou_thresholds_to_return=iou_thresholds_to_return,
label_map=self._create_label_map(label_map=label_map),
recall_score_threshold=recall_score_threshold,
compute_pr_curves=compute_pr_curves,
metrics_to_return=metrics_to_return,
pr_curve_iou_threshold=pr_curve_iou_threshold,
)
datum_filter = self._format_constraints(datasets, filter_by)
@@ -1042,6 +1042,7 @@ def evaluate_segmentation(
datasets: Optional[Union[Dataset, List[Dataset]]] = None,
filter_by: Optional[FilterType] = None,
label_map: Optional[Dict[Label, Label]] = None,
metrics_to_return: Optional[List[str]] = None,
allow_retries: bool = False,
) -> Evaluation:
"""
@@ -1055,6 +1056,8 @@
Optional set of constraints to filter evaluation by.
label_map : Dict[Label, Label], optional
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
metrics_to_return: List[str], optional
The list of metrics to compute, store, and return to the user.
allow_retries : bool, default = False
Option to retry previously failed evaluations.
@@ -1071,6 +1074,7 @@
parameters=EvaluationParameters(
task_type=TaskType.SEMANTIC_SEGMENTATION,
label_map=self._create_label_map(label_map=label_map),
metrics_to_return=metrics_to_return,
),
meta={},
)
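
On the object-detection side, the client forwards metrics_to_return through EvaluationParameters in the same way. A usage sketch, again assuming `model` and `dataset` already exist and are finalized:

# Sketch only: metric names follow the object-detection defaults added in
# api/valor_api/schemas/evaluation.py; "PrecisionRecallCurve" is opt-in.
evaluation = model.evaluate_detection(
    datasets=[dataset],
    iou_thresholds_to_compute=[0.25, 0.5, 0.75],
    iou_thresholds_to_return=[0.5, 0.75],
    metrics_to_return=[
        "AP",
        "AR",
        "mAP",
        "APAveragedOverIOUs",
        "mAR",
        "mAPAveragedOverIOUs",
        "PrecisionRecallCurve",
    ],
    pr_curve_iou_threshold=0.5,
)

# evaluate_segmentation now accepts the argument too; when omitted, the
# server-side validator falls back to the segmentation defaults ("IOU", "mIOU").
model.evaluate_segmentation(datasets=[dataset])
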
8 changes: 4 additions & 4 deletions client/valor/schemas/evaluation.py
@@ -20,10 +20,10 @@ class EvaluationParameters:
Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
recall_score_threshold: float, default=0
The confidence score threshold for use when determining whether to count a prediction as a true positive or not while calculating Average Recall.
compute_pr_curves: bool
A boolean which determines whether we calculate precision-recall curves or not.
metrics_to_return: List[str], optional
The list of metrics to compute, store, and return to the user.
pr_curve_iou_threshold: float, optional
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5. Does nothing when compute_pr_curves is set to False or None.
The IOU threshold to use when calculating precision-recall curves for object detection tasks. Defaults to 0.5.
"""

@@ -35,7 +35,7 @@ class EvaluationParameters:
iou_thresholds_to_return: Optional[List[float]] = None
label_map: Optional[List[List[List[str]]]] = None
recall_score_threshold: float = 0
compute_pr_curves: bool = False
metrics_to_return: Optional[List[str]] = None
pr_curve_iou_threshold: float = 0.5

