Merged

Changes from all commits (22 commits)
813e969
Remove unnecessary test file
gabegma Jun 3, 2022
f69febe
Merge pull request #107 from ServiceNow/hotfix/remove-unnecessary-tes…
gabegma Jun 3, 2022
3ba4a6b
Fix unexpected borders in Performance Analysis table (#118)
JosephMarinier Jun 6, 2022
c4ded87
Update release notes to v2.1.1
gabegma Jun 6, 2022
eb82420
Merge pull request #119 from ServiceNow/hotfix/update-release-notes-v…
gabegma Jun 6, 2022
ee820a9
Fix issue where dependency caching was not done properly in the backe…
Dref360 Jun 6, 2022
f84612e
Show 20 similar utterances instead of 10 (#121)
Dref360 Jun 7, 2022
0b1efdc
Merge branch 'main' into dev
JosephMarinier Jun 8, 2022
aa1b57d
Toggle normalized confusion matrix (#86)
nandhinibsn Jun 8, 2022
1145959
Offer hiding columns from the Performance Analysis table (#129)
JosephMarinier Jun 10, 2022
94cc329
Add documentation around custom utterances (#143)
Dref360 Jun 13, 2022
ec038f5
Raise exceptions inside ValidationModule to avoid hanging without inf…
Dref360 Jun 14, 2022
071841f
Move confidence threshold out of `/dataset_info` (#61)
JosephMarinier Jun 16, 2022
579a61a
Smart tag families (#145)
JosephMarinier Jun 20, 2022
0aad890
Add F1 (#155)
JosephMarinier Jun 21, 2022
4c4e176
Merge remote-tracking branch 'origin/main' into dev
JosephMarinier Jun 22, 2022
fe896d0
Description in UI (#146)
nandhinibsn Jun 27, 2022
8173b4e
Add pipeline comparison smart tag family (#161)
JosephMarinier Jul 7, 2022
a21020a
Add visual bars to Performance Analysis table (#159)
nandhinibsn Jul 8, 2022
0df14df
Merge remote-tracking branch 'origin/main' into dev
gabegma Jul 8, 2022
18d54ab
Various fixes before releasing (#163)
JosephMarinier Jul 8, 2022
a64ef74
Update documentation for v2.2 (#162)
gabegma Jul 11, 2022
6 changes: 3 additions & 3 deletions CITATION.cff
@@ -1,4 +1,4 @@
-cff-version: 1.1.0
+cff-version: 1.0.0
 message: "If you use Azimuth in your projects, please cite it as below."
 authors:
 - family-names: "Branchaud-Charron"
@@ -18,7 +18,7 @@ authors:
 - family-names: "Babu"
   given-names: "Nandhini"
 title: "Azimuth, an open-source dataset and error analysis tool for text classification"
-version: 2.1
+version: 2.2
 doi: 10.5281/zenodo.6511558
-date-released: 2022-05-27
+date-released: 2022-07-08
 url: "https://github.com/ServiceNow/azimuth"
14 changes: 4 additions & 10 deletions azimuth/app.py
@@ -3,7 +3,7 @@
 # in the root directory of this source tree.
 import logging
 from threading import Event
-from typing import Dict, Optional, cast
+from typing import Dict, Optional

 import structlog
 from fastapi import APIRouter, Depends, FastAPI, HTTPException
@@ -18,10 +18,9 @@
 from azimuth.modules.utilities.validation import ValidationModule
 from azimuth.task_manager import TaskManager
 from azimuth.types import DatasetSplitName, ModuleOptions
-from azimuth.types.validation import ValidationResponse
 from azimuth.utils.cluster import default_cluster
 from azimuth.utils.conversion import JSONResponseIgnoreNan
-from azimuth.utils.logs import MultipleException, set_logger_config
+from azimuth.utils.logs import set_logger_config
 from azimuth.utils.project import load_dataset_split_managers_from_config
 from azimuth.utils.validation import assert_not_none

@@ -267,20 +266,15 @@ def run_validation(
         MultipleException if the validation failed.
     """

-    def raise_exception_if_needed(validation_module):
-        validation_module.result()
-        response = cast(ValidationResponse, validation_module.result()[0])
-        if response.exceptions:
-            raise MultipleException(response.exceptions)
-
     def run_validation_module(pipeline_index=None):
         validation_module = ValidationModule(
             config=config,
             dataset_split_name=dataset_split,
             mod_options=ModuleOptions(pipeline_index=pipeline_index),
         )
         validation_module.start_task_on_dataset_split(task_manager.client)
-        raise_exception_if_needed(validation_module)
+        # Will raise exceptions as needed.
+        validation_module.result()

     if config.pipelines is None:
         run_validation_module()
5 changes: 5 additions & 0 deletions azimuth/config.py
@@ -257,6 +257,11 @@ class ModelContractConfig(CommonFieldsConfig):
             kwargs={"path": "recall"},
             additional_kwargs={"average": "weighted"},
         ),
+        "F1": MetricDefinition(
+            class_name="datasets.load_metric",
+            kwargs={"path": "f1"},
+            additional_kwargs={"average": "weighted"},
+        ),
     }

     @validator("pipelines", pre=True)
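Note: the new "F1" entry mirrors the existing Precision and Recall definitions: `class_name` points at Hugging Face's `datasets.load_metric`, `kwargs` are passed at load time and `additional_kwargs` at compute time. A minimal sketch of how such a definition resolves; the dict below is illustrative, not Azimuth's actual loader:

```python
# Hedged sketch: how a MetricDefinition-style entry resolves to a computed
# metric. `datasets.load_metric` is the real HF API the config points to;
# the surrounding dict mirrors the config above but is illustrative only.
from datasets import load_metric

definition = {
    "class_name": "datasets.load_metric",
    "kwargs": {"path": "f1"},  # used at load time
    "additional_kwargs": {"average": "weighted"},  # used at compute time
}

metric = load_metric(**definition["kwargs"])  # -> datasets.Metric for "f1"
result = metric.compute(
    predictions=[0, 1, 2, 1],
    references=[0, 1, 1, 1],
    **definition["additional_kwargs"],  # weighted average across classes
)
print(result)  # {'f1': 0.85}
```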
4 changes: 3 additions & 1 deletion azimuth/modules/base_classes/dask_module.py
@@ -138,7 +138,9 @@ def start_task(self, client: Client, custom_query: Dict[str, Any]) -> "DaskModul
         log.info(f"Starting custom query {self.name}")
         # pure=false to be sure that everything is rerun.
         # Using self.name as key as we don't have indices
-        self.future = client.submit(self.compute, custom_query, key=self.name, pure=False)
+        self.future = client.submit(
+            self.compute, custom_query, key=f"{self.name}_{hash(str(custom_query))}", pure=False
+        )
         # Tell that this future is for custom use only.
         self.future.is_custom = True
         self.add_done_callback(self.on_end)
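Note: Dask deduplicates tasks by future key, so reusing the bare `self.name` as the key meant a second custom query against the same module could be served the first query's cached result (the dependency-caching bug from commit ee820a9). A minimal sketch of the idea, assuming a local `dask.distributed` client; the module name and queries are made up:

```python
# Hedged sketch of the keying issue, not Azimuth code.
from dask.distributed import Client

client = Client(processes=False)  # local in-process cluster

def compute(query):
    return f"result for {query}"

# Same key twice: while the first future is alive, dask can reuse the
# existing task for that key, so f2 may resolve to the *first* result.
f1 = client.submit(compute, {"text": "hello"}, key="MyModule", pure=False)
f2 = client.submit(compute, {"text": "world"}, key="MyModule", pure=False)

# Distinct key per query, as in the fix above: each query is computed.
g2 = client.submit(
    compute,
    {"text": "world"},
    key=f"MyModule_{hash(str({'text': 'world'}))}",
    pure=False,
)
print(g2.result())  # "result for {'text': 'world'}"
```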
20 changes: 10 additions & 10 deletions azimuth/modules/model_performance/confidence_binning.py
@@ -38,13 +38,13 @@ def compute_on_dataset_split(self) -> List[ConfidenceHistogramResponse]: # type

         ds: Dataset = assert_not_none(self.get_dataset_split())

-        result = []
         if len(ds) > 0:
             # Get the bin index for each prediction.
             confidences = np.max(self._get_confidences_from_ds(), axis=1)
             bin_indices = np.floor(confidences * CONFIDENCE_BINS_COUNT)

             # Create the records. We drop the last bin as it's the maximum.
+            result = []
             for bin_index, bin_min_value in enumerate(bins[:-1]):
                 bin_mask = bin_indices == bin_index
                 outcome_count = defaultdict(int)
@@ -65,17 +65,17 @@ def compute_on_dataset_split(self) -> List[ConfidenceHistogramResponse]: # type
             )
         else:
             # Create empty bins
-            for bin_index, bin_min_value in enumerate(bins[:-1]):
-                result.append(
-                    ConfidenceBinDetails(
-                        bin_index=bin_index,
-                        bin_confidence=0,
-                        mean_bin_confidence=0,
-                        outcome_count={outcome: 0 for outcome in ALL_OUTCOMES},
-                    )
-                )
+            result = [
+                ConfidenceBinDetails(
+                    bin_index=bin_index,
+                    bin_confidence=0,
+                    mean_bin_confidence=0,
+                    outcome_count={outcome: 0 for outcome in ALL_OUTCOMES},
+                )
+                for bin_index, bin_min_value in enumerate(bins[:-1])
+            ]

-        return [ConfidenceHistogramResponse(details_all_bins=result)]
+        return [ConfidenceHistogramResponse(bins=result, confidence_threshold=self.get_threshold())]


 class ConfidenceBinIndexModule(DatasetResultModule[ModelContractConfig]):
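Note: the histogram maps each prediction's top confidence into equal-width bins with `np.floor`. A standalone sketch of that arithmetic, assuming `CONFIDENCE_BINS_COUNT = 20` (the real constant lives in Azimuth's code):

```python
import numpy as np

CONFIDENCE_BINS_COUNT = 20  # assumption for this sketch: 20 bins of width 0.05

# One softmax row per prediction; the top confidence decides the bin.
confidences = np.max(
    np.array([[0.10, 0.90], [0.52, 0.48], [0.99, 0.01]]), axis=1
)  # -> [0.90, 0.52, 0.99]

bin_indices = np.floor(confidences * CONFIDENCE_BINS_COUNT)  # -> [18., 10., 19.]

# A confidence of exactly 1.0 would land in index 20, one past the last
# bin, which is presumably why the module iterates over bins[:-1]
# ("We drop the last bin as it's the maximum").
bins = np.linspace(0, 1, CONFIDENCE_BINS_COUNT + 1)
for bin_index, bin_min_value in enumerate(bins[:-1]):
    mask = bin_indices == bin_index
    print(bin_index, round(float(bin_min_value), 2), int(mask.sum()))
```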
4 changes: 3 additions & 1 deletion azimuth/modules/model_performance/confusion_matrix.py
@@ -38,4 +38,6 @@ def compute_on_dataset_split(self) -> List[ConfusionMatrixResponse]: # type: ig
             labels=class_ids,
             normalize="true" if self.mod_options.cf_normalized else None,
         )
-        return [ConfusionMatrixResponse(confusion_matrix=cf)]
+        return [
+            ConfusionMatrixResponse(confusion_matrix=cf, normalized=self.mod_options.cf_normalized)
+        ]
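Note: `normalize="true"` is scikit-learn's row-wise normalization: each row of the confusion matrix is divided by the count of true instances of that class, which is what the normalized/raw toggle from commit aa1b57d switches between. A quick runnable illustration:

```python
from sklearn.metrics import confusion_matrix

y_true = [0, 0, 0, 1, 1, 2]
y_pred = [0, 0, 1, 1, 1, 0]
labels = [0, 1, 2]

cf_counts = confusion_matrix(y_true, y_pred, labels=labels, normalize=None)
# [[2 1 0]
#  [0 2 0]
#  [1 0 0]]

cf_normalized = confusion_matrix(y_true, y_pred, labels=labels, normalize="true")
# Each row divided by its true-class count; rows sum to 1:
# [[0.667 0.333 0.   ]
#  [0.    1.    0.   ]
#  [1.    0.    0.   ]]
```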
39 changes: 26 additions & 13 deletions azimuth/modules/model_performance/metrics.py
@@ -5,7 +5,7 @@
 import json
 import warnings
 from collections import Counter
-from typing import Dict, List, Optional, Sequence
+from typing import Dict, List, Optional

 import numpy as np
 from datasets import Dataset, Metric
@@ -27,7 +27,12 @@
     MetricsPerFilterValue,
 )
 from azimuth.types.outcomes import ALL_OUTCOMES
-from azimuth.types.tag import ALL_DATA_ACTION_FILTERS, ALL_SMART_TAG_FILTERS
+from azimuth.types.tag import (
+    ALL_DATA_ACTION_FILTERS,
+    SMART_TAGS_FAMILY_MAPPING,
+    SmartTag,
+    SmartTagFamily,
+)
 from azimuth.utils.ml.ece import compute_ece_from_bins
 from azimuth.utils.ml.model_performance import sorted_by_utterance_count_with_last
 from azimuth.utils.validation import assert_not_none
@@ -70,7 +75,7 @@ def compute_on_dataset_split(self) -> List[MetricsModuleResponse]: # type: igno
             config=self.config,
             mod_options=self.mod_options,
         )
-        bins = conf_hist_mod.compute_on_dataset_split()[0].details_all_bins
+        bins = conf_hist_mod.compute_on_dataset_split()[0].bins
         ece, acc, expected = compute_ece_from_bins(bins)
         count_per_bin = [sum(b.outcome_count.values()) for b in bins]
@@ -157,7 +162,7 @@ class MetricsPerFilterModule(AggregationModule[AzimuthConfig]):
     """Computes the metrics for each filter."""

     def get_metrics_for_filter(
-        self, filters_dict: Dict[str, Sequence[DatasetFilters]]
+        self, filters_dict: Dict[str, DatasetFilters]
     ) -> List[MetricsPerFilterValue]:
         """Get metrics for a list of filters.

@@ -218,16 +223,24 @@ def compute_on_dataset_split(self) -> List[MetricsPerFilterModuleResponse]: # t
            )
            pbar.update()

-           smart_tag_filters = {
-               smart_tag: self.edit_filter(self.mod_options.filters, smart_tag=smart_tag)
-               for smart_tag in ALL_SMART_TAG_FILTERS
+           smart_tag_filters: Dict[SmartTagFamily, Dict[str, DatasetFilters]] = {
+               tag_family: {
+                   smart_tag: self.edit_filter(
+                       self.mod_options.filters, smart_tag={tag_family: [smart_tag]}
+                   )
+                   for smart_tag in tags + [SmartTag.no_smart_tag]
+               }
+               for tag_family, tags in SMART_TAGS_FAMILY_MAPPING.items()
            }
-           metrics_per_smart_tag = sorted_by_utterance_count_with_last(
-               self.get_metrics_for_filter(smart_tag_filters), -1
-           )
+           metrics_per_smart_tag = {
+               tag_family.value: sorted_by_utterance_count_with_last(
+                   self.get_metrics_for_filter(filters_for_family), -1
+               )
+               for tag_family, filters_for_family in smart_tag_filters.items()
+           }
            pbar.update()

-           outcomes_filters: Dict[str, Sequence[DatasetFilters]] = {
+           outcomes_filters: Dict[str, DatasetFilters] = {
                outcome: self.edit_filter(self.mod_options.filters, outcome=outcome)
                for outcome in ALL_OUTCOMES
            }
@@ -240,7 +253,7 @@ def compute_on_dataset_split(self) -> List[MetricsPerFilterModuleResponse]: # t
                    label=metrics_per_label,
                    prediction=metrics_per_prediction,
                    data_action=metrics_per_data_action,
-                   smart_tag=metrics_per_smart_tag,
+                   **metrics_per_smart_tag,
                    outcome=metrics_per_outcome,
                ),
                utterance_count=len(ds),
@@ -279,5 +292,5 @@ def edit_filter(
         if outcome is not None:
             filter_copy.outcomes.append(outcome)
         if smart_tag is not None:
-            filter_copy.smart_tags.append(smart_tag)
+            filter_copy.smart_tags.update(smart_tag)
         return filter_copy
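Note: the `edit_filter` change at the bottom is the crux of this diff: `smart_tags` on a filter is no longer a flat list extended with `append`, but a mapping from family to tags merged with `dict.update`. A toy sketch of the difference; the enum members below are illustrative stand-ins for Azimuth's real `SmartTagFamily` and `SmartTag`:

```python
# Hedged sketch of flat-list vs per-family smart-tag filters.
from enum import Enum
from typing import Dict, List


class SmartTagFamily(Enum):
    extreme_length = "extreme_length"
    pipeline_comparison = "pipeline_comparison"  # family added in PR #161


class SmartTag(Enum):
    long_sentence = "long_sentence"
    incorrect_for_all_pipelines = "incorrect_for_all_pipelines"
    no_smart_tag = "NO_SMART_TAGS"


# Before: one flat list, so a tag could not be scoped to its family.
flat_filter: List[SmartTag] = []
flat_filter.append(SmartTag.long_sentence)

# After: one entry per family, merged with dict.update as in edit_filter.
family_filter: Dict[SmartTagFamily, List[SmartTag]] = {}
family_filter.update({SmartTagFamily.extreme_length: [SmartTag.long_sentence]})
family_filter.update(
    {SmartTagFamily.pipeline_comparison: [SmartTag.incorrect_for_all_pipelines]}
)
print(family_filter)
```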
13 changes: 11 additions & 2 deletions azimuth/modules/model_performance/outcome_count.py
@@ -18,7 +18,11 @@
     OutcomeCountPerThresholdValue,
 )
 from azimuth.types.outcomes import ALL_OUTCOMES, OutcomeName
-from azimuth.types.tag import ALL_DATA_ACTION_FILTERS, ALL_SMART_TAG_FILTERS
+from azimuth.types.tag import (
+    ALL_DATA_ACTION_FILTERS,
+    SMART_TAGS_FAMILY_MAPPING,
+    SmartTag,
+)
 from azimuth.utils.ml.model_performance import (
     sorted_by_utterance_count,
     sorted_by_utterance_count_with_last,
@@ -152,7 +156,12 @@ def compute_on_dataset_split(self) -> List[OutcomeCountPerFilterResponse]: # ty
                 ),
                 data_action=self.get_outcome_count_per_tag(dm, ds, ALL_DATA_ACTION_FILTERS),
                 outcome=self.get_outcome_count_per_outcome(ds),
-                smart_tag=self.get_outcome_count_per_tag(dm, ds, ALL_SMART_TAG_FILTERS),
+                **{
+                    family.value: self.get_outcome_count_per_tag(
+                        dm, ds, [t.value for t in tags + [SmartTag.no_smart_tag]]
+                    )
+                    for family, tags in SMART_TAGS_FAMILY_MAPPING.items()
+                },
             ),
             utterance_count=len(ds),
         )
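Note: in both this module and metrics.py, the response now carries one field per smart-tag family instead of a single `smart_tag` field, so per-family results are splatted in as keyword arguments keyed by the family enum's `.value`. A minimal sketch of that pattern with made-up names, not Azimuth's real models:

```python
# Hedged sketch: building keyword arguments per smart-tag family.
from enum import Enum


class Family(Enum):
    extreme_length = "extreme_length"
    dissimilar = "dissimilar"


class Response:
    def __init__(self, outcome, **per_family_counts):
        self.outcome = outcome
        self.per_family_counts = per_family_counts


counts = {family.value: {"correct": 10, "incorrect": 2} for family in Family}

# Equivalent to Response(outcome=..., extreme_length=..., dissimilar=...)
response = Response(outcome={"correct": 12}, **counts)
print(response.per_family_counts["extreme_length"])
```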
4 changes: 3 additions & 1 deletion azimuth/modules/utilities/validation.py
@@ -9,6 +9,7 @@
 from azimuth.modules.model_contract_task_mapping import model_contract_task_mapping
 from azimuth.types import ModuleOptions, SupportedMethod, SupportedModelContract
 from azimuth.types.validation import ValidationResponse
+from azimuth.utils.logs import MultipleExceptions
 from azimuth.utils.validation import assert_not_none


@@ -69,6 +70,8 @@ def compute_on_dataset_split(self) -> List[ValidationResponse]: # type: ignore
             can_make_saliency = False

         # Should we raise instead?
+        if exception_gatherer.exceptions:
+            raise MultipleExceptions(exceptions=exception_gatherer.exceptions)
         return [
             ValidationResponse(
                 is_cuda_available=cuda_available,
@@ -77,7 +80,6 @@ def compute_on_dataset_split(self) -> List[ValidationResponse]: # type: ignore
                 model_has_correct_type=model_has_correct_type,
                 can_make_prediction=can_make_prediction,
                 can_make_saliency=can_make_saliency,
-                exceptions=exception_gatherer.exceptions,
             )
         ]

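Note: raising inside `compute_on_dataset_split` lets the exceptions propagate through the Dask future, so the caller in `azimuth/app.py` can simply call `result()` and fail loudly instead of hanging (commit ec038f5). A self-contained sketch of the gather-then-raise pattern; the `MultipleExceptions` class below is a stand-in, the real one lives in `azimuth.utils.logs`:

```python
# Hedged sketch of "gather exceptions, then raise one aggregate error".
from typing import List


class MultipleExceptions(Exception):
    """Stand-in for azimuth.utils.logs.MultipleExceptions."""

    def __init__(self, exceptions: List[Exception]):
        self.exceptions = exceptions
        super().__init__(f"{len(exceptions)} check(s) failed: {exceptions!r}")


def run_checks() -> None:
    gathered: List[Exception] = []
    for check in (lambda: 1 / 0, lambda: int("not a number")):
        try:
            check()
        except Exception as e:  # keep going; report everything at once
            gathered.append(e)
    if gathered:
        # Raising here (instead of returning the list in a response)
        # lets callers simply call result() and get a real failure.
        raise MultipleExceptions(gathered)


try:
    run_checks()
except MultipleExceptions as err:
    print(err)  # 2 check(s) failed: [ZeroDivisionError(...), ValueError(...)]
```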
11 changes: 0 additions & 11 deletions azimuth/routers/v1/app.py
@@ -35,7 +35,6 @@
 from azimuth.utils.project import (
     perturbation_testing_available,
     postprocessing_editable,
-    postprocessing_known,
     predictions_available,
     similarity_available,
 )
@@ -96,15 +95,6 @@ def get_dataset_info(

     model_contract = task_manager.config.model_contract

-    threshold = (
-        None
-        if config.pipelines is None
-        else [
-            pipeline.threshold if postprocessing_known(task_manager.config, idx) else None
-            for idx, pipeline in enumerate(config.pipelines)
-        ]
-    )
-
     return DatasetInfoResponse(
         project_name=config.name,
         class_names=eval_dm.get_class_names(),
@@ -115,7 +105,6 @@ def get_dataset_info(
         if training_dm is not None
         else [],
         startup_tasks={k: v.status() for k, v in startup_tasks.items()},
-        default_threshold=threshold,
         model_contract=model_contract,
         prediction_available=predictions_available(task_manager.config),
         perturbation_testing_available=perturbation_testing_available(task_manager.config),
7 changes: 4 additions & 3 deletions azimuth/routers/v1/custom_utterances.py
@@ -13,7 +13,7 @@
 from azimuth.config import AzimuthConfig
 from azimuth.modules.perturbation_testing import PerturbationTestingModule
 from azimuth.task_manager import TaskManager
-from azimuth.types import DatasetSplitName, SupportedMethod
+from azimuth.types import DatasetSplitName, ModuleOptions, SupportedMethod
 from azimuth.types.perturbation_testing import (
     PRETTY_PERTURBATION_TYPES,
     PerturbationTestFailureReason,
@@ -22,7 +22,7 @@
 )
 from azimuth.types.task import SaliencyResponse
 from azimuth.utils.conversion import orjson_dumps
-from azimuth.utils.routers import get_custom_task_result, require_available_model
+from azimuth.utils.routers import get_custom_task_result, require_pipeline_index

 router = APIRouter()

@@ -92,16 +92,17 @@ def get_perturbed_utterances(
     description="Get saliency for custom utterances.",
     tags=TAGS,
     response_model=List[SaliencyResponse],
-    dependencies=[Depends(require_available_model)],
 )
 def get_saliency(
     utterances: List[str] = Query([], title="Utterances"),
+    pipeline_index: int = Depends(require_pipeline_index),
     task_manager: TaskManager = Depends(get_task_manager),
 ) -> List[SaliencyResponse]:
     task_result: List[SaliencyResponse] = get_custom_task_result(
         SupportedMethod.Saliency,
         task_manager=task_manager,
         custom_query={task_manager.config.columns.text_input: utterances},
+        mod_options=ModuleOptions(pipeline_index=pipeline_index),
     )

     return task_result
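Note: swapping the blanket `require_available_model` dependency for an injected `pipeline_index` ties the saliency computation to one configured pipeline via `ModuleOptions`. A rough sketch of the FastAPI dependency pattern in use; the validation inside `require_pipeline_index` is assumed here, as Azimuth's real version lives in `azimuth.utils.routers`:

```python
# Hedged sketch of a pipeline_index dependency, not Azimuth's actual code.
from typing import List

from fastapi import Depends, FastAPI, HTTPException, Query

app = FastAPI()

NUM_PIPELINES = 2  # assumption: the loaded config defines two pipelines


def require_pipeline_index(pipeline_index: int = Query(...)) -> int:
    # Reject indices that don't match any configured pipeline.
    if not 0 <= pipeline_index < NUM_PIPELINES:
        raise HTTPException(status_code=400, detail="Invalid pipeline_index")
    return pipeline_index


@app.get("/saliency")
def get_saliency(
    utterances: List[str] = Query([]),
    pipeline_index: int = Depends(require_pipeline_index),
):
    # The real endpoint forwards pipeline_index via ModuleOptions.
    return {"pipeline_index": pipeline_index, "utterances": utterances}
```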
13 changes: 4 additions & 9 deletions azimuth/routers/v1/model_performance/confidence_histogram.py
@@ -2,8 +2,6 @@
 # This source code is licensed under the Apache 2.0 license found in the LICENSE file
 # in the root directory of this source tree.

-from typing import List
-
 from fastapi import APIRouter, Depends, Query

 from azimuth.app import get_dataset_split_manager, get_task_manager
@@ -15,10 +13,7 @@
     NamedDatasetFilters,
     SupportedModule,
 )
-from azimuth.types.model_performance import (
-    ConfidenceBinDetails,
-    ConfidenceHistogramResponse,
-)
+from azimuth.types.model_performance import ConfidenceHistogramResponse
 from azimuth.utils.routers import (
     build_named_dataset_filters,
     get_standard_task_result,
@@ -35,7 +30,7 @@
     summary="Get confidence histogram values",
     description="Get all confidence bins with their confidence and the outcome count",
     tags=TAGS,
-    response_model=List[ConfidenceBinDetails],
+    response_model=ConfidenceHistogramResponse,
 )
 def get_confidence_histogram(
     dataset_split_name: DatasetSplitName,
@@ -46,7 +41,7 @@ def get_confidence_histogram(
     without_postprocessing: bool = Query(
         False, title="Without Postprocessing", alias="withoutPostprocessing"
     ),
-) -> List[ConfidenceBinDetails]:
+) -> ConfidenceHistogramResponse:
     mod_options = ModuleOptions(
         filters=named_filters.to_dataset_filters(dataset_split_manager.get_class_names()),
         pipeline_index=pipeline_index,
@@ -61,4 +56,4 @@ def get_confidence_histogram(
         last_update=dataset_split_manager.last_update,
     )[0]

-    return result.details_all_bins
+    return result
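Note: the endpoint now returns the full `ConfidenceHistogramResponse` (bins plus the pipeline's confidence threshold) rather than unwrapping the bare bin list, matching the module change in confidence_binning.py. A sketch of what the new shape plausibly looks like, with field names inferred from the diffs above; the real models are in `azimuth.types.model_performance`:

```python
# Hedged sketch of the new response shape, inferred from the diff.
from typing import Dict, List, Optional

from pydantic import BaseModel


class ConfidenceBinDetails(BaseModel):
    bin_index: int
    bin_confidence: float
    mean_bin_confidence: float
    outcome_count: Dict[str, int]


class ConfidenceHistogramResponse(BaseModel):
    # Previously the endpoint returned List[ConfidenceBinDetails] directly;
    # now the bins travel with the threshold used by the pipeline.
    bins: List[ConfidenceBinDetails]
    confidence_threshold: Optional[float]


response = ConfidenceHistogramResponse(
    bins=[
        ConfidenceBinDetails(
            bin_index=0,
            bin_confidence=0.025,
            mean_bin_confidence=0.0,
            outcome_count={"CorrectAndPredicted": 0},
        )
    ],
    confidence_threshold=0.5,
)
print(response.json())
```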