Evaluation validation & restart #557

Merged 11 commits on Apr 23, 2024
3 changes: 2 additions & 1 deletion api/tests/functional-tests/backend/core/test_dataset.py
@@ -153,7 +153,7 @@ def test_dataset_status_with_evaluations(
):
# create an evaluation
core.set_dataset_status(db, created_dataset, enums.TableStatus.FINALIZED)
evaluations, _ = core.create_or_get_evaluations(
evaluations = core.create_or_get_evaluations(
db,
schemas.EvaluationRequest(
model_names=[created_model],
@@ -165,6 +165,7 @@ def test_dataset_status_with_evaluations(
),
)
assert len(evaluations) == 1
assert evaluations[0].status == enums.EvaluationStatus.PENDING
evaluation_id = evaluations[0].id

# set the evaluation to the running state
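Because the rendered diff above does not mark which of each pair of lines is removed and which is added, the change common to all of the test files in this PR is worth stating once: core.create_or_get_evaluations no longer returns a (created, existing) tuple, so the tuple unpacking is dropped and tests instead read each response's status field. A minimal before/after sketch, assuming the db session fixture and the EvaluationRequest constructed in the test above:

# Before this PR: two positionally distinguished lists.
# evaluations, _ = core.create_or_get_evaluations(db, job_request)

# After this PR: a single list; a freshly created evaluation reports PENDING.
evaluations = core.create_or_get_evaluations(db, job_request)
assert len(evaluations) == 1
assert evaluations[0].status == enums.EvaluationStatus.PENDING
evaluation_id = evaluations[0].id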
39 changes: 22 additions & 17 deletions api/tests/functional-tests/backend/core/test_evaluation.py
@@ -185,7 +185,7 @@ def test__fetch_evaluation_from_subrequest(
),
meta={},
)
created_1, _ = core.create_or_get_evaluations(db, job_request_1)
created_1 = core.create_or_get_evaluations(db, job_request_1)
assert len(created_1) == 1

# create evaluation 2
@@ -197,7 +197,7 @@ def test__fetch_evaluation_from_subrequest(
),
meta={},
)
created_2, _ = core.create_or_get_evaluations(db, job_request_2)
created_2 = core.create_or_get_evaluations(db, job_request_2)
assert len(created_2) == 1

# test fetching a subrequest
@@ -245,9 +245,9 @@ def test_create_evaluation(
),
meta={},
)
created, existing = core.create_or_get_evaluations(db, job_request_1)
assert len(existing) == 0
created = core.create_or_get_evaluations(db, job_request_1)
assert len(created) == 1
assert created[0].status == enums.EvaluationStatus.PENDING
evaluation_id = created[0].id

assert (
@@ -256,9 +256,9 @@ def test_create_evaluation(
)

# test duplication check
created, existing = core.create_or_get_evaluations(db, job_request_1)
assert len(created) == 0
existing = core.create_or_get_evaluations(db, job_request_1)
assert len(existing) == 1
assert existing[0].status == enums.EvaluationStatus.PENDING
assert existing[0].id == evaluation_id

assert (
@@ -322,8 +322,9 @@ def test_fetch_evaluation_from_id(
),
meta={},
)
created_1, _ = core.create_or_get_evaluations(db, job_request_1)
created_1 = core.create_or_get_evaluations(db, job_request_1)
assert len(created_1) == 1
assert created_1[0].status == enums.EvaluationStatus.PENDING
evaluation_id_1 = created_1[0].id

# create evaluation 2
@@ -335,8 +336,9 @@ def test_fetch_evaluation_from_id(
),
meta={},
)
created_2, _ = core.create_or_get_evaluations(db, job_request_2)
created_2 = core.create_or_get_evaluations(db, job_request_2)
assert len(created_2) == 1
assert created_2[0].status == enums.EvaluationStatus.PENDING
evaluation_id_2 = created_2[0].id

fetched_evaluation = core.fetch_evaluation_from_id(db, evaluation_id_1)
@@ -369,8 +371,9 @@ def test_get_evaluations(
),
meta={},
)
created_1, _ = core.create_or_get_evaluations(db, job_request_1)
created_1 = core.create_or_get_evaluations(db, job_request_1)
assert len(created_1) == 1
assert created_1[0].status == enums.EvaluationStatus.PENDING

# create evaluation 2
job_request_2 = schemas.EvaluationRequest(
@@ -381,8 +384,9 @@ def test_get_evaluations(
),
meta={},
)
created_2, _ = core.create_or_get_evaluations(db, job_request_2)
created_2 = core.create_or_get_evaluations(db, job_request_2)
assert len(created_2) == 1
assert created_2[0].status == enums.EvaluationStatus.PENDING

# test get by dataset
evaluations_by_dataset = core.get_paginated_evaluations(
@@ -542,10 +546,10 @@ def test_evaluation_status(
),
meta={},
)
created_1, existing = core.create_or_get_evaluations(db, job_request_1)
assert len(existing) == 0
assert len(created_1) == 1
evaluation_id = created_1[0].id
evaluations = core.create_or_get_evaluations(db, job_request_1)
assert len(evaluations) == 1
assert evaluations[0].status == enums.EvaluationStatus.PENDING
evaluation_id = evaluations[0].id

# check that evaluation is created with PENDING status.
assert (
@@ -656,7 +660,7 @@ def test_count_active_evaluations(
),
meta={},
)
created, _ = core.create_or_get_evaluations(db, job_request_1)
created = core.create_or_get_evaluations(db, job_request_1)
assert len(created) == 1
evaluation_1 = created[0].id

@@ -669,7 +673,7 @@ def test_count_active_evaluations(
),
meta={},
)
created, _ = core.create_or_get_evaluations(db, job_request_2)
created = core.create_or_get_evaluations(db, job_request_2)
assert len(created) == 1
evaluation_2 = created[0].id

@@ -716,7 +720,8 @@ def test_count_active_evaluations(
),
meta={},
)
evaluation_3, _ = core.create_or_get_evaluations(db, job_request_3)
evaluation_3 = core.create_or_get_evaluations(db, job_request_3)
assert len(evaluation_3) == 1
evaluation_3 = evaluation_3[0].id

assert (
2 changes: 1 addition & 1 deletion api/tests/functional-tests/backend/core/test_model.py
@@ -191,7 +191,7 @@ def test_model_status_with_evaluations(
):
# create an evaluation
core.set_dataset_status(db, created_dataset, enums.TableStatus.FINALIZED)
created, _ = core.create_or_get_evaluations(
created = core.create_or_get_evaluations(
db,
schemas.EvaluationRequest(
model_names=[created_model],
25 changes: 12 additions & 13 deletions api/tests/functional-tests/backend/metrics/test_classification.py
@@ -782,27 +782,26 @@ def test_classification(
)

# creates evaluation job
created_evaluations, existing_evaluations = create_or_get_evaluations(
db=db, job_request=job_request
)
assert len(created_evaluations) == 1
assert len(existing_evaluations) == 0
evaluations = create_or_get_evaluations(db=db, job_request=job_request)
assert len(evaluations) == 1
assert evaluations[0].status == enums.EvaluationStatus.PENDING

# computation, normally run as background task
_ = compute_clf_metrics(
db=db,
evaluation_id=created_evaluations[0].id,
evaluation_id=evaluations[0].id,
)

# get evaluations
created_evaluations, existing_evaluations = create_or_get_evaluations(
db=db, job_request=job_request
)
assert len(created_evaluations) == 0
assert len(existing_evaluations) == 1
evaluations = create_or_get_evaluations(db=db, job_request=job_request)
assert len(evaluations) == 1
assert evaluations[0].status in {
enums.EvaluationStatus.RUNNING,
enums.EvaluationStatus.DONE,
}

metrics = existing_evaluations[0].metrics
confusion = existing_evaluations[0].confusion_matrices
metrics = evaluations[0].metrics
confusion = evaluations[0].confusion_matrices

# Make matrices accessible by label_key
assert confusion
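Beyond the assertions themselves, the single-list return simplifies dispatch for whatever code triggers the metric computation. The sketch below is a hypothetical caller, not code from this PR, reusing the create_or_get_evaluations, compute_clf_metrics, and enums names from the test above: only responses still in the PENDING state need a metrics run, while anything else was either already in progress or already finished.

# Hypothetical dispatch loop (illustrative only): decide what to compute
# from each response's status rather than from its tuple position.
evaluations = create_or_get_evaluations(db=db, job_request=job_request)
for response in evaluations:
    if response.status == enums.EvaluationStatus.PENDING:
        # Only freshly created evaluations still need metrics; in production
        # this call would be scheduled as a background task.
        compute_clf_metrics(db=db, evaluation_id=response.id)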
@@ -23,7 +23,7 @@ def test_validate_computation(
):
# create evaluation
core.set_dataset_status(db, created_dataset, enums.TableStatus.FINALIZED)
created, _ = core.create_or_get_evaluations(
created = core.create_or_get_evaluations(
db,
schemas.EvaluationRequest(
model_names=[created_model],
23 changes: 11 additions & 12 deletions api/tests/functional-tests/backend/metrics/test_segmentation.py
@@ -447,23 +447,22 @@ def test_compute_semantic_segmentation_metrics(
meta={},
)

created_evaluations, existing_evaluations = create_or_get_evaluations(
db=db, job_request=job_request
)
assert len(created_evaluations) == 1
assert len(existing_evaluations) == 0
evaluations = create_or_get_evaluations(db=db, job_request=job_request)
assert len(evaluations) == 1
assert evaluations[0].status == enums.EvaluationStatus.PENDING

_ = compute_semantic_segmentation_metrics(
db=db, evaluation_id=created_evaluations[0].id
db=db, evaluation_id=evaluations[0].id
)

created_evaluations, existing_evaluations = create_or_get_evaluations(
db=db, job_request=job_request
)
assert len(created_evaluations) == 0
assert len(existing_evaluations) == 1
evaluations = create_or_get_evaluations(db=db, job_request=job_request)
assert len(evaluations) == 1
assert evaluations[0].status in {
enums.EvaluationStatus.RUNNING,
enums.EvaluationStatus.DONE,
}

metrics = existing_evaluations[0].metrics
metrics = evaluations[0].metrics

expected_metrics = {
# none of these three labels have a predicted label
73 changes: 32 additions & 41 deletions api/valor_api/backend/core/evaluation.py
@@ -11,35 +11,6 @@
from valor_api.backend.query import Query


def _validate_classification_task(
db: Session,
evaluation: models.Evaluation,
):
"""
Validate that a classification evaluation is possible.

Parameters
----------
db : Session
The database session.
evaluation : models.Evaluation
The uncommitted evaluation row.
"""
# unpack filters and params
groundtruth_filter = schemas.Filter(**evaluation.datum_filter)
prediction_filter = groundtruth_filter.model_copy()
prediction_filter.model_names = [evaluation.model_name]
parameters = schemas.EvaluationParameters(**evaluation.parameters)

# check that prediction label keys match ground truth label keys
core.validate_matching_label_keys(
db=db,
label_map=parameters.label_map,
groundtruth_filter=groundtruth_filter,
prediction_filter=prediction_filter,
)


def _create_dataset_expr_from_list(
dataset_names: list[str],
) -> BinaryExpression | None:
@@ -434,14 +405,11 @@ def _fetch_evaluation_from_subrequest(
def create_or_get_evaluations(
db: Session,
job_request: schemas.EvaluationRequest,
) -> tuple[
list[schemas.EvaluationResponse],
list[schemas.EvaluationResponse],
]:
) -> list[schemas.EvaluationResponse]:
"""
Creates evaluations from evaluation request.

If an evaluation already exists, it will be returned as running.
If an evaluation already exists, it will be returned with a non-pending status.

Parameters
----------
@@ -452,8 +420,8 @@ def create_or_get_evaluations(

Returns
-------
tuple[list[schemas.EvaluationResponse], list[schemas.EvaluationResponse]]
A tuple of evaluation response lists following the pattern (list[created_evaluations], list[existing_evaluations])
list[schemas.EvaluationResponse]
A list of evaluation responses.
"""

created_rows = []
@@ -481,11 +449,37 @@ def create_or_get_evaluations(
meta={}, # meta stores data about the run after it completes; should be an empty dictionary at creation time
)

# unpack filters and params
groundtruth_filter = job_request.datum_filter
prediction_filter = groundtruth_filter.model_copy()
prediction_filter.model_names = [evaluation.model_name]
parameters = job_request.parameters

# verify model and datasets have data for this evaluation
if not (
db.query(Query(models.Dataset).filter(groundtruth_filter).any()) # type: ignore - SQLAlchemy type issue
.distinct()
.all()
):
evaluation.status = enums.EvaluationStatus.DONE
if (
db.query(Query(models.Model).filter(prediction_filter).any()) # type: ignore - SQLAlchemy type issue
.distinct()
.one_or_none()
) is None:
evaluation.status = enums.EvaluationStatus.DONE

if (
subrequest.parameters.task_type
== enums.TaskType.CLASSIFICATION
):
_validate_classification_task(db=db, evaluation=evaluation)
# check that prediction label keys match ground truth label keys
core.validate_matching_label_keys(
db=db,
label_map=parameters.label_map,
groundtruth_filter=groundtruth_filter,
prediction_filter=prediction_filter,
)

created_rows.append(evaluation)

@@ -496,10 +490,7 @@ def create_or_get_evaluations(
db.rollback()
raise exceptions.EvaluationAlreadyExistsError()

return (
_create_responses(db, created_rows),
_create_responses(db, existing_rows),
)
return _create_responses(db, created_rows + existing_rows)


def _fetch_evaluations_and_mark_for_deletion(
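To summarize the contract change in create_or_get_evaluations: the (created, existing) tuple collapses into one list[schemas.EvaluationResponse], the classification label-key check now runs inline instead of in the removed _validate_classification_task helper, and a request whose filters match no dataset or model data comes back already marked DONE. Below is a hedged sketch of how a client might interpret the combined result; the wrapper is illustrative, not part of the PR, and assumes the import paths used elsewhere in this repository.

from valor_api import enums, schemas
from valor_api.backend import core


def request_evaluations(db, job_request: schemas.EvaluationRequest):
    """Illustrative wrapper around the new single-list return shape."""
    responses = core.create_or_get_evaluations(db, job_request)

    # Freshly created, runnable evaluations are the ones still PENDING;
    # everything else either existed before this request or was
    # short-circuited to DONE because no matching data was found.
    to_run = [r for r in responses if r.status == enums.EvaluationStatus.PENDING]
    already_handled = [r for r in responses if r.status != enums.EvaluationStatus.PENDING]
    return to_run, already_handled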