Evaluation validation & restart #557

Merged · 11 commits · Apr 23, 2024
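
Summary of the API change these tests exercise: `core.create_or_get_evaluations` no longer returns a `(created, existing)` tuple; it returns a single list of evaluation responses, and callers distinguish new from pre-existing evaluations by each entry's `status` (`PENDING` for newly created, `RUNNING`/`DONE` for ones that already ran). The sketch below shows that calling pattern. It is inferred from the call sites in this diff rather than copied from the library, and `request_classification_evaluation` together with its fixture-style arguments is hypothetical.

```python
# Sketch only: illustrates the call pattern exercised by the updated tests.
# `db`, `created_model`, and `created_dataset` are assumed fixtures; the
# single-list return shape is inferred from the assertions in this diff.
from valor_api import enums, schemas
from valor_api.backend import core


def request_classification_evaluation(db, created_model, created_dataset):
    job_request = schemas.EvaluationRequest(
        model_names=[created_model],
        datum_filter=schemas.Filter(dataset_names=[created_dataset]),
        parameters=schemas.EvaluationParameters(
            task_type=enums.TaskType.CLASSIFICATION
        ),
        meta={},
    )

    # Previously this returned a (created, existing) tuple; per this PR it
    # returns one list, and callers branch on each evaluation's status.
    evaluations = core.create_or_get_evaluations(db, job_request)
    for evaluation in evaluations:
        if evaluation.status == enums.EvaluationStatus.PENDING:
            print(f"evaluation {evaluation.id} was just created")
        else:
            print(f"evaluation {evaluation.id} already exists: {evaluation.status}")
    return evaluations
```
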
3 changes: 2 additions & 1 deletion api/tests/functional-tests/backend/core/test_dataset.py
@@ -153,7 +153,7 @@ def test_dataset_status_with_evaluations(
):
# create an evaluation
core.set_dataset_status(db, created_dataset, enums.TableStatus.FINALIZED)
evaluations, _ = core.create_or_get_evaluations(
evaluations = core.create_or_get_evaluations(
db,
schemas.EvaluationRequest(
model_names=[created_model],
@@ -165,6 +165,7 @@
),
)
assert len(evaluations) == 1
assert evaluations[0].status == enums.EvaluationStatus.PENDING
evaluation_id = evaluations[0].id

# set the evaluation to the running state
39 changes: 22 additions & 17 deletions api/tests/functional-tests/backend/core/test_evaluation.py
@@ -185,7 +185,7 @@ def test__fetch_evaluation_from_subrequest(
),
meta={},
)
created_1, _ = core.create_or_get_evaluations(db, job_request_1)
created_1 = core.create_or_get_evaluations(db, job_request_1)
assert len(created_1) == 1

# create evaluation 2
@@ -197,7 +197,7 @@
),
meta={},
)
created_2, _ = core.create_or_get_evaluations(db, job_request_2)
created_2 = core.create_or_get_evaluations(db, job_request_2)
assert len(created_2) == 1

# test fetching a subrequest
@@ -245,9 +245,9 @@ def test_create_evaluation(
),
meta={},
)
created, existing = core.create_or_get_evaluations(db, job_request_1)
assert len(existing) == 0
created = core.create_or_get_evaluations(db, job_request_1)
assert len(created) == 1
assert created[0].status == enums.EvaluationStatus.PENDING
evaluation_id = created[0].id

assert (
@@ -256,9 +256,9 @@
)

# test duplication check
created, existing = core.create_or_get_evaluations(db, job_request_1)
assert len(created) == 0
existing = core.create_or_get_evaluations(db, job_request_1)
assert len(existing) == 1
assert existing[0].status == enums.EvaluationStatus.PENDING
assert existing[0].id == evaluation_id

assert (
@@ -322,8 +322,9 @@ def test_fetch_evaluation_from_id(
),
meta={},
)
created_1, _ = core.create_or_get_evaluations(db, job_request_1)
created_1 = core.create_or_get_evaluations(db, job_request_1)
assert len(created_1) == 1
assert created_1[0].status == enums.EvaluationStatus.PENDING
evaluation_id_1 = created_1[0].id

# create evaluation 2
@@ -335,8 +336,9 @@
),
meta={},
)
created_2, _ = core.create_or_get_evaluations(db, job_request_2)
created_2 = core.create_or_get_evaluations(db, job_request_2)
assert len(created_2) == 1
assert created_2[0].status == enums.EvaluationStatus.PENDING
evaluation_id_2 = created_2[0].id

fetched_evaluation = core.fetch_evaluation_from_id(db, evaluation_id_1)
@@ -369,8 +371,9 @@ def test_get_evaluations(
),
meta={},
)
created_1, _ = core.create_or_get_evaluations(db, job_request_1)
created_1 = core.create_or_get_evaluations(db, job_request_1)
assert len(created_1) == 1
assert created_1[0].status == enums.EvaluationStatus.PENDING

# create evaluation 2
job_request_2 = schemas.EvaluationRequest(
@@ -381,8 +384,9 @@
),
meta={},
)
created_2, _ = core.create_or_get_evaluations(db, job_request_2)
created_2 = core.create_or_get_evaluations(db, job_request_2)
assert len(created_2) == 1
assert created_2[0].status == enums.EvaluationStatus.PENDING

# test get by dataset
evaluations_by_dataset = core.get_paginated_evaluations(
@@ -542,10 +546,10 @@ def test_evaluation_status(
),
meta={},
)
created_1, existing = core.create_or_get_evaluations(db, job_request_1)
assert len(existing) == 0
assert len(created_1) == 1
evaluation_id = created_1[0].id
evaluations = core.create_or_get_evaluations(db, job_request_1)
assert len(evaluations) == 1
assert evaluations[0].status == enums.EvaluationStatus.PENDING
evaluation_id = evaluations[0].id

# check that evaluation is created with PENDING status.
assert (
@@ -656,7 +660,7 @@ def test_count_active_evaluations(
),
meta={},
)
created, _ = core.create_or_get_evaluations(db, job_request_1)
created = core.create_or_get_evaluations(db, job_request_1)
assert len(created) == 1
evaluation_1 = created[0].id

@@ -669,7 +673,7 @@
),
meta={},
)
created, _ = core.create_or_get_evaluations(db, job_request_2)
created = core.create_or_get_evaluations(db, job_request_2)
assert len(created) == 1
evaluation_2 = created[0].id

@@ -716,7 +720,8 @@ def test_count_active_evaluations(
),
meta={},
)
evaluation_3, _ = core.create_or_get_evaluations(db, job_request_3)
evaluation_3 = core.create_or_get_evaluations(db, job_request_3)
assert len(evaluation_3) == 1
evaluation_3 = evaluation_3[0].id

assert (
2 changes: 1 addition & 1 deletion api/tests/functional-tests/backend/core/test_model.py
@@ -191,7 +191,7 @@ def test_model_status_with_evaluations(
):
# create an evaluation
core.set_dataset_status(db, created_dataset, enums.TableStatus.FINALIZED)
created, _ = core.create_or_get_evaluations(
created = core.create_or_get_evaluations(
db,
schemas.EvaluationRequest(
model_names=[created_model],
25 changes: 12 additions & 13 deletions api/tests/functional-tests/backend/metrics/test_classification.py
@@ -781,27 +781,26 @@ def test_classification(
)

# creates evaluation job
created_evaluations, existing_evaluations = create_or_get_evaluations(
db=db, job_request=job_request
)
assert len(created_evaluations) == 1
assert len(existing_evaluations) == 0
evaluations = create_or_get_evaluations(db=db, job_request=job_request)
assert len(evaluations) == 1
assert evaluations[0].status == enums.EvaluationStatus.PENDING

# computation, normally run as background task
_ = compute_clf_metrics(
db=db,
evaluation_id=created_evaluations[0].id,
evaluation_id=evaluations[0].id,
)

# get evaluations
created_evaluations, existing_evaluations = create_or_get_evaluations(
db=db, job_request=job_request
)
assert len(created_evaluations) == 0
assert len(existing_evaluations) == 1
evaluations = create_or_get_evaluations(db=db, job_request=job_request)
assert len(evaluations) == 1
assert evaluations[0].status in {
enums.EvaluationStatus.RUNNING,
enums.EvaluationStatus.DONE,
}

metrics = existing_evaluations[0].metrics
confusion = existing_evaluations[0].confusion_matrices
metrics = evaluations[0].metrics
confusion = evaluations[0].confusion_matrices

# Make matrices accessible by label_key
assert confusion
@@ -23,7 +23,7 @@ def test_validate_computation(
):
# create evaluation
core.set_dataset_status(db, created_dataset, enums.TableStatus.FINALIZED)
created, _ = core.create_or_get_evaluations(
created = core.create_or_get_evaluations(
db,
schemas.EvaluationRequest(
model_names=[created_model],
23 changes: 11 additions & 12 deletions api/tests/functional-tests/backend/metrics/test_segmentation.py
@@ -447,23 +447,22 @@ def test_compute_semantic_segmentation_metrics(
meta={},
)

created_evaluations, existing_evaluations = create_or_get_evaluations(
db=db, job_request=job_request
)
assert len(created_evaluations) == 1
assert len(existing_evaluations) == 0
evaluations = create_or_get_evaluations(db=db, job_request=job_request)
assert len(evaluations) == 1
assert evaluations[0].status == enums.EvaluationStatus.PENDING

_ = compute_semantic_segmentation_metrics(
db=db, evaluation_id=created_evaluations[0].id
db=db, evaluation_id=evaluations[0].id
)

created_evaluations, existing_evaluations = create_or_get_evaluations(
db=db, job_request=job_request
)
assert len(created_evaluations) == 0
assert len(existing_evaluations) == 1
evaluations = create_or_get_evaluations(db=db, job_request=job_request)
assert len(evaluations) == 1
assert evaluations[0].status in {
enums.EvaluationStatus.RUNNING,
enums.EvaluationStatus.DONE,
}

metrics = existing_evaluations[0].metrics
metrics = evaluations[0].metrics

expected_metrics = {
# none of these three labels have a predicted label
69 changes: 69 additions & 0 deletions api/tests/functional-tests/crud/test_evaluation_crud.py
@@ -0,0 +1,69 @@
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session

from valor_api import crud, enums, schemas
from valor_api.backend import core


def test_restart_failed_evaluation(db: Session):
crud.create_dataset(db=db, dataset=schemas.Dataset(name="dataset"))
crud.create_model(db=db, model=schemas.Model(name="model"))
crud.finalize(db=db, dataset_name="dataset")

# create evaluation and overwrite status to failed
evaluations1 = core.create_or_get_evaluations(
db=db,
job_request=schemas.EvaluationRequest(
model_names=["model"],
datum_filter=schemas.Filter(dataset_names=["dataset"]),
parameters=schemas.EvaluationParameters(
task_type=enums.TaskType.CLASSIFICATION
),
meta=None,
),
allow_retries=False,
)
assert len(evaluations1) == 1
try:
evaluation = core.fetch_evaluation_from_id(
db=db, evaluation_id=evaluations1[0].id
)
evaluation.status = enums.EvaluationStatus.FAILED
db.commit()
except IntegrityError as e:
db.rollback()
raise e

# get evaluation and verify it is failed
evaluations2 = crud.create_or_get_evaluations(
db=db,
job_request=schemas.EvaluationRequest(
model_names=["model"],
datum_filter=schemas.Filter(dataset_names=["dataset"]),
parameters=schemas.EvaluationParameters(
task_type=enums.TaskType.CLASSIFICATION
),
meta=None,
),
allow_retries=False,
)
assert len(evaluations2) == 1
assert evaluations2[0].status == enums.EvaluationStatus.FAILED
assert evaluations2[0].id == evaluations1[0].id

# get evaluation and allow retries, this should result in a finished eval
evaluations3 = crud.create_or_get_evaluations(
db=db,
job_request=schemas.EvaluationRequest(
model_names=["model"],
datum_filter=schemas.Filter(dataset_names=["dataset"]),
parameters=schemas.EvaluationParameters(
task_type=enums.TaskType.CLASSIFICATION
),
meta=None,
),
allow_retries=True,
)
assert len(evaluations3) == 1
assert evaluations3[0].status == enums.EvaluationStatus.DONE
assert evaluations3[0].id == evaluations1[0].id
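
The new test above implies the retry semantics sketched below: with `allow_retries=False` a `FAILED` evaluation is returned unchanged, while `allow_retries=True` restarts it under the same id until it reaches `DONE`. `rerun_failed_evaluation` is a hypothetical helper written against that observed behavior, not part of valor_api.

```python
# Hypothetical helper mirroring the retry behavior the test above asserts.
# It is NOT the valor_api implementation; it only reflects the observable
# contract: FAILED evaluations are left alone unless allow_retries=True,
# in which case they are re-run under the same id and finish as DONE.
from valor_api import crud, enums, schemas


def rerun_failed_evaluation(db, model_name: str, dataset_name: str):
    job_request = schemas.EvaluationRequest(
        model_names=[model_name],
        datum_filter=schemas.Filter(dataset_names=[dataset_name]),
        parameters=schemas.EvaluationParameters(
            task_type=enums.TaskType.CLASSIFICATION
        ),
        meta=None,
    )

    # Without retries, a FAILED evaluation is simply returned as FAILED.
    evaluations = crud.create_or_get_evaluations(
        db=db, job_request=job_request, allow_retries=False
    )
    if evaluations[0].status != enums.EvaluationStatus.FAILED:
        return evaluations[0]

    # With allow_retries=True the failed evaluation is restarted under the
    # same id; the test above asserts it ends in the DONE state.
    evaluations = crud.create_or_get_evaluations(
        db=db, job_request=job_request, allow_retries=True
    )
    return evaluations[0]
```
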