diff --git a/api/tests/functional-tests/backend/core/test_dataset.py b/api/tests/functional-tests/backend/core/test_dataset.py
index b868e5788..2e6981dfb 100644
--- a/api/tests/functional-tests/backend/core/test_dataset.py
+++ b/api/tests/functional-tests/backend/core/test_dataset.py
@@ -153,7 +153,7 @@ def test_dataset_status_with_evaluations(
 ):
     # create an evaluation
     core.set_dataset_status(db, created_dataset, enums.TableStatus.FINALIZED)
-    evaluations, _ = core.create_or_get_evaluations(
+    evaluations = core.create_or_get_evaluations(
         db,
         schemas.EvaluationRequest(
             model_names=[created_model],
@@ -165,6 +165,7 @@ def test_dataset_status_with_evaluations(
         ),
     )
     assert len(evaluations) == 1
+    assert evaluations[0].status == enums.EvaluationStatus.PENDING
     evaluation_id = evaluations[0].id

     # set the evaluation to the running state
diff --git a/api/tests/functional-tests/backend/core/test_evaluation.py b/api/tests/functional-tests/backend/core/test_evaluation.py
index ccb75751d..46234c1a5 100644
--- a/api/tests/functional-tests/backend/core/test_evaluation.py
+++ b/api/tests/functional-tests/backend/core/test_evaluation.py
@@ -185,7 +185,7 @@ def test__fetch_evaluation_from_subrequest(
         ),
         meta={},
     )
-    created_1, _ = core.create_or_get_evaluations(db, job_request_1)
+    created_1 = core.create_or_get_evaluations(db, job_request_1)
     assert len(created_1) == 1

     # create evaluation 2
@@ -197,7 +197,7 @@ def test__fetch_evaluation_from_subrequest(
         ),
         meta={},
     )
-    created_2, _ = core.create_or_get_evaluations(db, job_request_2)
+    created_2 = core.create_or_get_evaluations(db, job_request_2)
     assert len(created_2) == 1

     # test fetching a subrequest
@@ -245,9 +245,9 @@ def test_create_evaluation(
         ),
         meta={},
     )
-    created, existing = core.create_or_get_evaluations(db, job_request_1)
-    assert len(existing) == 0
+    created = core.create_or_get_evaluations(db, job_request_1)
     assert len(created) == 1
+    assert created[0].status == enums.EvaluationStatus.PENDING
     evaluation_id = created[0].id

     assert (
@@ -256,9 +256,9 @@ def test_create_evaluation(
     )

     # test duplication check
-    created, existing = core.create_or_get_evaluations(db, job_request_1)
-    assert len(created) == 0
+    existing = core.create_or_get_evaluations(db, job_request_1)
     assert len(existing) == 1
+    assert existing[0].status == enums.EvaluationStatus.PENDING
     assert existing[0].id == evaluation_id

     assert (
@@ -322,8 +322,9 @@ def test_fetch_evaluation_from_id(
         ),
         meta={},
     )
-    created_1, _ = core.create_or_get_evaluations(db, job_request_1)
+    created_1 = core.create_or_get_evaluations(db, job_request_1)
     assert len(created_1) == 1
+    assert created_1[0].status == enums.EvaluationStatus.PENDING
     evaluation_id_1 = created_1[0].id

     # create evaluation 2
@@ -335,8 +336,9 @@ def test_fetch_evaluation_from_id(
         ),
         meta={},
     )
-    created_2, _ = core.create_or_get_evaluations(db, job_request_2)
+    created_2 = core.create_or_get_evaluations(db, job_request_2)
     assert len(created_2) == 1
+    assert created_2[0].status == enums.EvaluationStatus.PENDING
     evaluation_id_2 = created_2[0].id

     fetched_evaluation = core.fetch_evaluation_from_id(db, evaluation_id_1)
@@ -369,8 +371,9 @@ def test_get_evaluations(
         ),
         meta={},
     )
-    created_1, _ = core.create_or_get_evaluations(db, job_request_1)
+    created_1 = core.create_or_get_evaluations(db, job_request_1)
     assert len(created_1) == 1
+    assert created_1[0].status == enums.EvaluationStatus.PENDING

     # create evaluation 2
     job_request_2 = schemas.EvaluationRequest(
@@ -381,8 +384,9 @@ def test_get_evaluations(
         ),
         meta={},
     )
-    created_2, _ = core.create_or_get_evaluations(db, job_request_2)
+    created_2 = core.create_or_get_evaluations(db, job_request_2)
     assert len(created_2) == 1
+    assert created_2[0].status == enums.EvaluationStatus.PENDING

     # test get by dataset
     evaluations_by_dataset = core.get_paginated_evaluations(
@@ -542,10 +546,10 @@ def test_evaluation_status(
         ),
         meta={},
     )
-    created_1, existing = core.create_or_get_evaluations(db, job_request_1)
-    assert len(existing) == 0
-    assert len(created_1) == 1
-    evaluation_id = created_1[0].id
+    evaluations = core.create_or_get_evaluations(db, job_request_1)
+    assert len(evaluations) == 1
+    assert evaluations[0].status == enums.EvaluationStatus.PENDING
+    evaluation_id = evaluations[0].id

     # check that evaluation is created with PENDING status.
     assert (
@@ -656,7 +660,7 @@ def test_count_active_evaluations(
         ),
         meta={},
     )
-    created, _ = core.create_or_get_evaluations(db, job_request_1)
+    created = core.create_or_get_evaluations(db, job_request_1)
     assert len(created) == 1
     evaluation_1 = created[0].id

@@ -669,7 +673,7 @@ def test_count_active_evaluations(
         ),
         meta={},
     )
-    created, _ = core.create_or_get_evaluations(db, job_request_2)
+    created = core.create_or_get_evaluations(db, job_request_2)
     assert len(created) == 1
     evaluation_2 = created[0].id

@@ -716,7 +720,8 @@ def test_count_active_evaluations(
         ),
         meta={},
     )
-    evaluation_3, _ = core.create_or_get_evaluations(db, job_request_3)
+    evaluation_3 = core.create_or_get_evaluations(db, job_request_3)
+    assert len(evaluation_3) == 1
     evaluation_3 = evaluation_3[0].id

     assert (
diff --git a/api/tests/functional-tests/backend/core/test_model.py b/api/tests/functional-tests/backend/core/test_model.py
index 9424a6b0d..b358129be 100644
--- a/api/tests/functional-tests/backend/core/test_model.py
+++ b/api/tests/functional-tests/backend/core/test_model.py
@@ -191,7 +191,7 @@ def test_model_status_with_evaluations(
 ):
     # create an evaluation
     core.set_dataset_status(db, created_dataset, enums.TableStatus.FINALIZED)
-    created, _ = core.create_or_get_evaluations(
+    created = core.create_or_get_evaluations(
         db,
         schemas.EvaluationRequest(
             model_names=[created_model],
diff --git a/api/tests/functional-tests/backend/metrics/test_classification.py b/api/tests/functional-tests/backend/metrics/test_classification.py
index a1f8a8dc9..ce3744241 100644
--- a/api/tests/functional-tests/backend/metrics/test_classification.py
+++ b/api/tests/functional-tests/backend/metrics/test_classification.py
@@ -781,27 +781,26 @@ def test_classification(
     )

     # creates evaluation job
-    created_evaluations, existing_evaluations = create_or_get_evaluations(
-        db=db, job_request=job_request
-    )
-    assert len(created_evaluations) == 1
-    assert len(existing_evaluations) == 0
+    evaluations = create_or_get_evaluations(db=db, job_request=job_request)
+    assert len(evaluations) == 1
+    assert evaluations[0].status == enums.EvaluationStatus.PENDING

     # computation, normally run as background task
     _ = compute_clf_metrics(
         db=db,
-        evaluation_id=created_evaluations[0].id,
+        evaluation_id=evaluations[0].id,
     )

     # get evaluations
-    created_evaluations, existing_evaluations = create_or_get_evaluations(
-        db=db, job_request=job_request
-    )
-    assert len(created_evaluations) == 0
-    assert len(existing_evaluations) == 1
+    evaluations = create_or_get_evaluations(db=db, job_request=job_request)
+    assert len(evaluations) == 1
+    assert evaluations[0].status in {
+        enums.EvaluationStatus.RUNNING,
+        enums.EvaluationStatus.DONE,
+    }

-    metrics = existing_evaluations[0].metrics
-    confusion = existing_evaluations[0].confusion_matrices
+    metrics = evaluations[0].metrics
+    confusion = evaluations[0].confusion_matrices

     # Make matrices accessible by label_key
     assert confusion
diff --git a/api/tests/functional-tests/backend/metrics/test_metric_utils.py b/api/tests/functional-tests/backend/metrics/test_metric_utils.py
index 2e062b169..ebf2875e5 100644
--- a/api/tests/functional-tests/backend/metrics/test_metric_utils.py
+++ b/api/tests/functional-tests/backend/metrics/test_metric_utils.py
@@ -23,7 +23,7 @@ def test_validate_computation(
 ):
     # create evaluation
     core.set_dataset_status(db, created_dataset, enums.TableStatus.FINALIZED)
-    created, _ = core.create_or_get_evaluations(
+    created = core.create_or_get_evaluations(
         db,
         schemas.EvaluationRequest(
             model_names=[created_model],
diff --git a/api/tests/functional-tests/backend/metrics/test_segmentation.py b/api/tests/functional-tests/backend/metrics/test_segmentation.py
index 36dae8dcf..8a8be2cad 100644
--- a/api/tests/functional-tests/backend/metrics/test_segmentation.py
+++ b/api/tests/functional-tests/backend/metrics/test_segmentation.py
@@ -447,23 +447,22 @@ def test_compute_semantic_segmentation_metrics(
         meta={},
     )

-    created_evaluations, existing_evaluations = create_or_get_evaluations(
-        db=db, job_request=job_request
-    )
-    assert len(created_evaluations) == 1
-    assert len(existing_evaluations) == 0
+    evaluations = create_or_get_evaluations(db=db, job_request=job_request)
+    assert len(evaluations) == 1
+    assert evaluations[0].status == enums.EvaluationStatus.PENDING

     _ = compute_semantic_segmentation_metrics(
-        db=db, evaluation_id=created_evaluations[0].id
+        db=db, evaluation_id=evaluations[0].id
     )

-    created_evaluations, existing_evaluations = create_or_get_evaluations(
-        db=db, job_request=job_request
-    )
-    assert len(created_evaluations) == 0
-    assert len(existing_evaluations) == 1
+    evaluations = create_or_get_evaluations(db=db, job_request=job_request)
+    assert len(evaluations) == 1
+    assert evaluations[0].status in {
+        enums.EvaluationStatus.RUNNING,
+        enums.EvaluationStatus.DONE,
+    }

-    metrics = existing_evaluations[0].metrics
+    metrics = evaluations[0].metrics

     expected_metrics = {
         # none of these three labels have a predicted label
diff --git a/api/tests/functional-tests/crud/test_evaluation_crud.py b/api/tests/functional-tests/crud/test_evaluation_crud.py
new file mode 100644
index 000000000..78bc1f0b5
--- /dev/null
+++ b/api/tests/functional-tests/crud/test_evaluation_crud.py
@@ -0,0 +1,69 @@
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import Session
+
+from valor_api import crud, enums, schemas
+from valor_api.backend import core
+
+
+def test_restart_failed_evaluation(db: Session):
+    crud.create_dataset(db=db, dataset=schemas.Dataset(name="dataset"))
+    crud.create_model(db=db, model=schemas.Model(name="model"))
+    crud.finalize(db=db, dataset_name="dataset")
+
+    # create evaluation and overwrite status to failed
+    evaluations1 = core.create_or_get_evaluations(
+        db=db,
+        job_request=schemas.EvaluationRequest(
+            model_names=["model"],
+            datum_filter=schemas.Filter(dataset_names=["dataset"]),
+            parameters=schemas.EvaluationParameters(
+                task_type=enums.TaskType.CLASSIFICATION
+            ),
+            meta=None,
+        ),
+        allow_retries=False,
+    )
+    assert len(evaluations1) == 1
+    try:
+        evaluation = core.fetch_evaluation_from_id(
+            db=db, evaluation_id=evaluations1[0].id
+        )
+        evaluation.status = enums.EvaluationStatus.FAILED
+        db.commit()
+    except IntegrityError as e:
+        db.rollback()
+        raise e
+
+    # get evaluation and verify it is failed
+    evaluations2 = crud.create_or_get_evaluations(
+        db=db,
+        job_request=schemas.EvaluationRequest(
+            model_names=["model"],
+            datum_filter=schemas.Filter(dataset_names=["dataset"]),
+            parameters=schemas.EvaluationParameters(
+                task_type=enums.TaskType.CLASSIFICATION
+            ),
+            meta=None,
+        ),
+        allow_retries=False,
+    )
+    assert len(evaluations2) == 1
+    assert evaluations2[0].status == enums.EvaluationStatus.FAILED
+    assert evaluations2[0].id == evaluations1[0].id
+
+    # get evaluation and allow retries, this should result in a finished eval
+    evaluations3 = crud.create_or_get_evaluations(
+        db=db,
+        job_request=schemas.EvaluationRequest(
+            model_names=["model"],
+            datum_filter=schemas.Filter(dataset_names=["dataset"]),
+            parameters=schemas.EvaluationParameters(
+                task_type=enums.TaskType.CLASSIFICATION
+            ),
+            meta=None,
+        ),
+        allow_retries=True,
+    )
+    assert len(evaluations3) == 1
+    assert evaluations3[0].status == enums.EvaluationStatus.DONE
+    assert evaluations3[0].id == evaluations1[0].id
diff --git a/api/valor_api/backend/core/evaluation.py b/api/valor_api/backend/core/evaluation.py
index 513e2d816..e3d15ab21 100644
--- a/api/valor_api/backend/core/evaluation.py
+++ b/api/valor_api/backend/core/evaluation.py
@@ -11,35 +11,6 @@
 from valor_api.backend.query import Query


-def _validate_classification_task(
-    db: Session,
-    evaluation: models.Evaluation,
-):
-    """
-    Validate that a classification evaluation is possible.
-
-    Parameters
-    ----------
-    db : Session
-        The database session.
-    evaluation : models.Evaluation
-        The uncommitted evaluation row.
-    """
-    # unpack filters and params
-    groundtruth_filter = schemas.Filter(**evaluation.datum_filter)
-    prediction_filter = groundtruth_filter.model_copy()
-    prediction_filter.model_names = [evaluation.model_name]
-    parameters = schemas.EvaluationParameters(**evaluation.parameters)
-
-    # check that prediction label keys match ground truth label keys
-    core.validate_matching_label_keys(
-        db=db,
-        label_map=parameters.label_map,
-        groundtruth_filter=groundtruth_filter,
-        prediction_filter=prediction_filter,
-    )
-
-
 def _create_dataset_expr_from_list(
     dataset_names: list[str],
 ) -> BinaryExpression | None:
@@ -431,17 +402,62 @@ def _fetch_evaluation_from_subrequest(
     return evaluation


+def _validate_create_or_get_evaluations(
+    db: Session,
+    job_request: schemas.EvaluationRequest,
+    evaluation: models.Evaluation,
+) -> models.Evaluation:
+    """
+    Validates whether a new or failed evaluation should proceed to computation.
+
+    Parameters
+    ----------
+    db : Session
+        The database session.
+    job_request : schemas.EvaluationRequest
+        The evaluations to create.
+    evaluation : models.Evaluation
+        The evaluation row to validate.
+
+    Returns
+    -------
+    models.Evaluation
+        The row that was passed as input.
+
+    """
+
+    # unpack filters and params
+    groundtruth_filter = job_request.datum_filter
+    prediction_filter = groundtruth_filter.model_copy()
+    prediction_filter.model_names = [evaluation.model_name]
+    parameters = job_request.parameters
+
+    datasets = db.query(Query(models.Dataset).filter(groundtruth_filter).any()).distinct().all()  # type: ignore - SQLAlchemy type issue
+    model = db.query(Query(models.Model).filter(prediction_filter).any()).distinct().one_or_none()  # type: ignore - SQLAlchemy type issue
+
+    # verify model and datasets have data for this evaluation
+    if len(datasets) == 0 or model is None:
+        evaluation.status = enums.EvaluationStatus.DONE
+    elif job_request.parameters.task_type == enums.TaskType.CLASSIFICATION:
+        # check that prediction label keys match ground truth label keys
+        core.validate_matching_label_keys(
+            db=db,
+            label_map=parameters.label_map,
+            groundtruth_filter=groundtruth_filter,
+            prediction_filter=prediction_filter,
+        )
+    return evaluation
+
+
 def create_or_get_evaluations(
     db: Session,
     job_request: schemas.EvaluationRequest,
-) -> tuple[
-    list[schemas.EvaluationResponse],
-    list[schemas.EvaluationResponse],
-]:
+    allow_retries: bool = False,
+) -> list[schemas.EvaluationResponse]:
     """
     Creates evaluations from evaluation request.

-    If an evaluation already exists, it will be returned as running.
+    If an evaluation already exists, it will be returned with its existing status.

     Parameters
     ----------
@@ -452,8 +468,8 @@ def create_or_get_evaluations(

     Returns
     -------
-    tuple[list[schemas.EvaluationResponse], list[schemas.EvaluationResponse]]
-        A tuple of evaluation response lists following the pattern (list[created_evaluations], list[existing_evaluations])
+    list[schemas.EvaluationResponse]
+        A list of evaluation responses.
     """
     created_rows = []
@@ -469,6 +485,15 @@ def create_or_get_evaluations(
             db=db,
             subrequest=subrequest,
         ):
+            if (
+                allow_retries
+                and evaluation.status == enums.EvaluationStatus.FAILED
+            ):
+                evaluation = _validate_create_or_get_evaluations(
+                    db=db,
+                    job_request=subrequest,
+                    evaluation=evaluation,
+                )
             existing_rows.append(evaluation)

         # create evaluation row
@@ -480,13 +505,11 @@ def create_or_get_evaluations(
                 status=enums.EvaluationStatus.PENDING,
                 meta={},  # meta stores data about the run after it completes; should be an empty dictionary at creation time
             )
-
-            if (
-                subrequest.parameters.task_type
-                == enums.TaskType.CLASSIFICATION
-            ):
-                _validate_classification_task(db=db, evaluation=evaluation)
-
+            evaluation = _validate_create_or_get_evaluations(
+                db=db,
+                job_request=subrequest,
+                evaluation=evaluation,
+            )
             created_rows.append(evaluation)

     try:
@@ -496,10 +519,7 @@ def create_or_get_evaluations(
         db.rollback()
         raise exceptions.EvaluationAlreadyExistsError()

-    return (
-        _create_responses(db, created_rows),
-        _create_responses(db, existing_rows),
-    )
+    return _create_responses(db, created_rows + existing_rows)


 def _fetch_evaluations_and_mark_for_deletion(
diff --git a/api/valor_api/crud/_create.py b/api/valor_api/crud/_create.py
index b0756e138..375bfd16c 100644
--- a/api/valor_api/crud/_create.py
+++ b/api/valor_api/crud/_create.py
@@ -91,6 +91,7 @@ def create_or_get_evaluations(
     db: Session,
     job_request: schemas.EvaluationRequest,
     task_handler: BackgroundTasks | None = None,
+    allow_retries: bool = False,
 ) -> list[schemas.EvaluationResponse]:
     """
     Create or get evaluations.
@@ -101,33 +102,48 @@ def create_or_get_evaluations(
         The database Session to query against.
     job_request: schemas.EvaluationRequest
         The evaluation request.
+    task_handler: BackgroundTasks, optional
+        An optional FastAPI background task handler.
+    allow_retries: bool, default = False
+        Allow restarting of failed evaluations.

     Returns
     ----------
-    tuple[list[schemas.EvaluatationResponse], list[schemas.EvaluatationResponse]]
-        Tuple of evaluation id lists following the form ([created], [existing])
+    list[schemas.EvaluationResponse]
+        A list of evaluations in response format.
     """
-    created, existing = backend.create_or_get_evaluations(db, job_request)
-
-    # start computations
-    for evaluation in created:
-        match evaluation.parameters.task_type:
-            case enums.TaskType.CLASSIFICATION:
-                compute_func = backend.compute_clf_metrics
-            case enums.TaskType.OBJECT_DETECTION:
-                compute_func = backend.compute_detection_metrics
-            case enums.TaskType.SEMANTIC_SEGMENTATION:
-                compute_func = backend.compute_semantic_segmentation_metrics
-            case _:
-                raise RuntimeError
-
-        if task_handler:
-            task_handler.add_task(
-                compute_func,
-                db=db,
-                evaluation_id=evaluation.id,
-            )
-        else:
-            compute_func(db=db, evaluation_id=evaluation.id)
-
-    return created + existing
+    evaluations = backend.create_or_get_evaluations(
+        db=db, job_request=job_request, allow_retries=allow_retries
+    )
+    run_conditions = (
+        {
+            enums.EvaluationStatus.PENDING,
+            enums.EvaluationStatus.FAILED,
+        }
+        if allow_retries
+        else {enums.EvaluationStatus.PENDING}
+    )
+    for evaluation in evaluations:
+        if evaluation.status in run_conditions:
+            match evaluation.parameters.task_type:
+                case enums.TaskType.CLASSIFICATION:
+                    compute_func = backend.compute_clf_metrics
+                case enums.TaskType.OBJECT_DETECTION:
+                    compute_func = backend.compute_detection_metrics
+                case enums.TaskType.SEMANTIC_SEGMENTATION:
+                    compute_func = (
+                        backend.compute_semantic_segmentation_metrics
+                    )
+                case _:
+                    raise RuntimeError
+
+            if task_handler:
+                task_handler.add_task(
+                    compute_func,
+                    db=db,
+                    evaluation_id=evaluation.id,
+                )
+            else:
+                compute_func(db=db, evaluation_id=evaluation.id)
+
+    return evaluations
diff --git a/api/valor_api/main.py b/api/valor_api/main.py
index 483e45a90..eedb7e70e 100644
--- a/api/valor_api/main.py
+++ b/api/valor_api/main.py
@@ -1034,8 +1034,10 @@ def delete_model(
     tags=["Evaluations"],
 )
 def create_or_get_evaluations(
+    response: Response,
     job_request: schemas.EvaluationRequest,
     background_tasks: BackgroundTasks,
+    allow_retries: bool = False,
     db: Session = Depends(get_db),
 ) -> list[schemas.EvaluationResponse]:
     """
@@ -1045,10 +1047,14 @@ def create_or_get_evaluations(

     Parameters
     ----------
+    response: Response
+        The FastAPI response object. Used to return a content-range header to the user.
     job_request: schemas.EvaluationJob
         The job request for the evaluation.
     background_tasks: BackgroundTasks
         A FastAPI `BackgroundTasks` object to process the creation asyncronously. This parameter is a FastAPI dependency and shouldn't be submitted by the user.
+    allow_retries: bool, default = False
+        Determines whether failed evaluations are restarted.
     db : Session
         The database session to use. This parameter is a sqlalchemy dependency and shouldn't be submitted by the user.
@@ -1073,6 +1079,7 @@ def create_or_get_evaluations(
             db=db,
             job_request=job_request,
             task_handler=background_tasks,
+            allow_retries=allow_retries,
         )
     except Exception as e:
         raise exceptions.create_http_error(e)
diff --git a/client/valor/client.py b/client/valor/client.py
index 22ea483c0..9b8ea8995 100644
--- a/client/valor/client.py
+++ b/client/valor/client.py
@@ -798,7 +798,9 @@ def delete_model(self, name: str) -> None:
         """
         self._requests_delete_rel_host(f"models/{name}")

-    def evaluate(self, request: EvaluationRequest) -> List[dict]:
+    def evaluate(
+        self, request: EvaluationRequest, allow_retries: bool = False
+    ) -> List[dict]:
         """
         Creates as many evaluations as necessary to fulfill the request.

@@ -808,14 +810,18 @@
         ----------
         request : schemas.EvaluationRequest
             The requested evaluation parameters.
+        allow_retries : bool, default = False
+            Option to retry previously failed evaluations.

         Returns
         -------
         List[dict]
             A list of evaluations that meet the parameters.
         """
+        query_str = urlencode({"allow_retries": allow_retries})
+        endpoint = f"evaluations?{query_str}"
         return self._requests_post_rel_host(
-            "evaluations", json=asdict(request)
+            endpoint, json=asdict(request)
         ).json()

     def get_evaluations(
diff --git a/client/valor/coretypes.py b/client/valor/coretypes.py
index 9100f8bec..e0801bf3a 100644
--- a/client/valor/coretypes.py
+++ b/client/valor/coretypes.py
@@ -900,6 +900,7 @@ def evaluate_classification(
         filter_by: Optional[FilterType] = None,
         label_map: Optional[Dict[Label, Label]] = None,
         compute_pr_curves: bool = False,
+        allow_retries: bool = False,
     ) -> Evaluation:
         """
         Start a classification evaluation job.
@@ -914,6 +915,8 @@ def evaluate_classification(
             Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
         compute_pr_curves: bool
             A boolean which determines whether we calculate precision-recall curves or not.
+        allow_retries : bool, default = False
+            Option to retry previously failed evaluations.

         Returns
         -------
@@ -939,7 +942,9 @@ def evaluate_classification(
         )

         # create evaluation
-        evaluation = Client(self.conn).evaluate(request)
+        evaluation = Client(self.conn).evaluate(
+            request, allow_retries=allow_retries
+        )
         if len(evaluation) != 1:
             raise RuntimeError
         return evaluation[0]
@@ -955,6 +960,7 @@ def evaluate_detection(
         recall_score_threshold: float = 0,
         compute_pr_curves: bool = False,
         pr_curve_iou_threshold: float = 0.5,
+        allow_retries: bool = False,
     ) -> Evaluation:
         """
         Start an object-detection evaluation job.
@@ -979,6 +985,8 @@ def evaluate_detection(
             A boolean which determines whether we calculate precision-recall curves or not.
         pr_curve_iou_threshold: float, optional
            The IOU threshold to use when calculating precision-recall curves. Defaults to 0.5. Does nothing when compute_pr_curves is set to False or None.
+        allow_retries : bool, default = False
+            Option to retry previously failed evaluations.

         Returns
         -------
@@ -1013,7 +1021,9 @@ def evaluate_detection(
         )

         # create evaluation
-        evaluation = Client(self.conn).evaluate(request)
+        evaluation = Client(self.conn).evaluate(
+            request, allow_retries=allow_retries
+        )
         if len(evaluation) != 1:
             raise RuntimeError
         return evaluation[0]
@@ -1023,6 +1033,7 @@ def evaluate_segmentation(
         datasets: Optional[Union[Dataset, List[Dataset]]] = None,
         filter_by: Optional[FilterType] = None,
         label_map: Optional[Dict[Label, Label]] = None,
+        allow_retries: bool = False,
     ) -> Evaluation:
         """
         Start a semantic-segmentation evaluation job.
@@ -1035,6 +1046,8 @@ def evaluate_segmentation(
             Optional set of constraints to filter evaluation by.
         label_map : Dict[Label, Label], optional
             Optional mapping of individual labels to a grouper label. Useful when you need to evaluate performance using labels that differ across datasets and models.
+        allow_retries : bool, default = False
+            Option to retry previously failed evaluations.

         Returns
         -------
@@ -1054,7 +1067,9 @@ def evaluate_segmentation(
         )

         # create evaluation
-        evaluation = Client(self.conn).evaluate(request)
+        evaluation = Client(self.conn).evaluate(
+            request, allow_retries=allow_retries
+        )
         if len(evaluation) != 1:
             raise RuntimeError
         return evaluation[0]
@@ -1744,7 +1759,9 @@ def get_evaluations(
             )
         ]

-    def evaluate(self, request: EvaluationRequest) -> List[Evaluation]:
+    def evaluate(
+        self, request: EvaluationRequest, allow_retries: bool = False
+    ) -> List[Evaluation]:
         """
         Creates as many evaluations as necessary to fulfill the request.

@@ -1752,6 +1769,8 @@ def evaluate(
         ----------
         request : schemas.EvaluationRequest
             The requested evaluation parameters.
+        allow_retries : bool, default = False
+            Option to retry previously failed evaluations.

         Returns
         -------
@@ -1760,5 +1779,7 @@ def evaluate(
         """
         return [
             Evaluation(**evaluation)
-            for evaluation in self.conn.evaluate(request)
+            for evaluation in self.conn.evaluate(
+                request, allow_retries=allow_retries
+            )
         ]
diff --git a/integration_tests/client/metrics/test_detection.py b/integration_tests/client/metrics/test_detection.py
index 649de071c..cd358dab4 100644
--- a/integration_tests/client/metrics/test_detection.py
+++ b/integration_tests/client/metrics/test_detection.py
@@ -328,7 +328,7 @@ def test_evaluate_detection(
         ],
         convert_annotations_to_type=AnnotationType.BOX,
     )
-    # this computation will return 'EvaluationStatus.FAILED' as no predictions exist that meet the filter requirements.
+    # this computation will return 'EvaluationStatus.DONE' immediately as no predictions exist that meet the filter requirements.
     eval_job_max_area_1200.wait_for_completion(timeout=30)
     result = eval_job_max_area_1200.to_dict()
     result.pop("meta")
@@ -358,7 +358,7 @@ def test_evaluate_detection(
            "pr_curve_iou_threshold": 0.5,
        },
        # check metrics below
-        "status": EvaluationStatus.FAILED.value,
+        "status": EvaluationStatus.DONE.value,
        "confusion_matrices": [],
        "missing_pred_labels": [{"key": "k1", "value": "v1"}],
        "ignored_pred_labels": [],
diff --git a/integration_tests/client/metrics/test_evaluations.py b/integration_tests/client/metrics/test_evaluations.py
new file mode 100644
index 000000000..144122ecc
--- /dev/null
+++ b/integration_tests/client/metrics/test_evaluations.py
@@ -0,0 +1,41 @@
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import Session
+
+from valor import Client, Dataset, Model
+from valor_api import crud, enums, schemas
+from valor_api.backend import core
+
+
+def test_restart_failed_evaluation(db: Session, client: Client):
+    crud.create_dataset(db=db, dataset=schemas.Dataset(name="dataset"))
+    crud.create_model(db=db, model=schemas.Model(name="model"))
+    crud.finalize(db=db, dataset_name="dataset")
+
+    # retrieve dataset and model on the client-side
+    dataset = Dataset.get("dataset")
+    model = Model.get("model")
+    assert dataset
+    assert model
+
+    # create evaluation and overwrite status to failed
+    eval1 = model.evaluate_classification(dataset, allow_retries=False)
+    assert eval1.status == enums.EvaluationStatus.DONE
+    try:
+        evaluation = core.fetch_evaluation_from_id(
+            db=db, evaluation_id=eval1.id
+        )
+        evaluation.status = enums.EvaluationStatus.FAILED
+        db.commit()
+    except IntegrityError as e:
+        db.rollback()
+        raise e
+
+    # get evaluation and verify it is failed
+    eval2 = model.evaluate_classification(dataset, allow_retries=False)
+    assert eval2.id == eval1.id
+    assert eval2.status == enums.EvaluationStatus.FAILED
+
+    # get evaluation and allow retries, this should result in a finished eval
+    eval3 = model.evaluate_classification(dataset, allow_retries=True)
+    assert eval3.id == eval1.id
+    assert eval3.status == enums.EvaluationStatus.DONE
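Usage sketch (not part of the diff): a minimal client-side example of the new `allow_retries` flag. It assumes a finalized dataset named "dataset" and a model named "model" with predictions already exist, and that the valor client connection has already been configured, mirroring the integration test above.

```python
# Minimal sketch of the retry flow introduced in this diff. Assumes a
# finalized dataset "dataset" and a model "model" already exist and that the
# valor client connection has been established.
from valor import Dataset, Model

dataset = Dataset.get("dataset")
model = Model.get("model")

# First request: creates the evaluation. If the computation fails, the
# evaluation row is left in the FAILED state.
evaluation = model.evaluate_classification(dataset, allow_retries=False)

# Re-issuing the same request with allow_retries=True re-validates the FAILED
# evaluation and re-queues the metric computation instead of returning the
# failed row unchanged.
evaluation = model.evaluate_classification(dataset, allow_retries=True)
evaluation.wait_for_completion(timeout=30)
print(evaluation.status)
```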