Hotfix (#56)
* add `add_tasks` to the model and only compute running scores when evaluation is triggered

* fix the roc_auc scorer

* more tests for metrics

* more tests

* more unit tests
senwu committed Mar 16, 2020
1 parent 474aac5 commit 368a95a
Showing 11 changed files with 186 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .isort.cfg
@@ -6,4 +6,4 @@ force_grid_wrap=0
combine_as_imports=True
line_length=88
known_first_party = emmental,tests
known_third_party = numpy,scipy,setuptools,sklearn,torch,yaml
known_third_party = numpy,pytest,scipy,setuptools,sklearn,torch,yaml
8 changes: 7 additions & 1 deletion CHANGELOG.rst
@@ -1,9 +1,15 @@
Unreleased_
-----------

Added
^^^^^
* `@senwu`_: Support probabilistic gold label in scorer.
* `@senwu`_: Add `add_tasks` to support adding one task or multiple tasks into the model.

Changed
^^^^^^^
* `@senwu`_: Change to run evaluation only when evaluation is triggered.


0.0.5_ - 2020-03-01
-------------------
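The "Support probabilistic gold label in scorer" entry above is what the new metric tests later in this commit exercise. A minimal usage sketch with toy data (the emmental.metrics import path and the scorer signature are assumed from the test suite, not shown in this diff):

import numpy as np

from emmental.metrics import accuracy_scorer  # import path assumed

# Toy data (not taken from the tests): hard gold labels and a probabilistic
# encoding whose argmax matches them.
golds = np.array([0, 1, 1, 0])
gold_probs = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]])
probs = np.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4], [0.9, 0.1]])
preds = np.array([0, 1, 0, 0])

# Both gold encodings should yield the same score ({"accuracy": 0.75} here).
print(accuracy_scorer(golds, probs, preds))
print(accuracy_scorer(gold_probs, probs, preds))

# normalize=False reports the raw count of correct predictions instead.
print(accuracy_scorer(golds, probs, preds, normalize=False))  # {"accuracy": 3}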
66 changes: 38 additions & 28 deletions src/emmental/learner.py
@@ -364,11 +364,12 @@ def _logging(

self.logging_manager.update(batch_size)

trigger_evaluation = self.logging_manager.trigger_evaluation()

# Log the loss and lr
metric_dict.update(self._aggregate_running_metrics(model))
metric_dict.update(self._aggregate_running_metrics(model, trigger_evaluation))

# Evaluate the model and log the metric
trigger_evaluation = self.logging_manager.trigger_evaluation()
if trigger_evaluation:

# Log task specific metric
@@ -407,11 +408,14 @@ def _logging(

return metric_dict

def _aggregate_running_metrics(self, model: EmmentalModel) -> Dict[str, float]:
def _aggregate_running_metrics(
self, model: EmmentalModel, calc_running_scores: bool = False
) -> Dict[str, float]:
r"""Calculate the running overall and task specific metrics.
Args:
model(EmmentalModel): The model to evaluate.
calc_running_scores(bool): Whether to calculate the running scores.
Returns:
dict: The score dict.
@@ -435,36 +439,43 @@ def _aggregate_running_metrics(self, model: EmmentalModel) -> Dict[str, float]:
total_loss = sum(self.running_losses.values())
metric_dict["model/all/train/loss"] = total_loss / total_count

micro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
macro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
if calc_running_scores:
micro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
macro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)

# Calculate training metric
for identifier in self.running_uids.keys():
task_name, data_name, split = identifier.split("/")
# Calculate training metric
for identifier in self.running_uids.keys():
task_name, data_name, split = identifier.split("/")

metric_score = model.scorers[task_name].score(
self.running_golds[identifier],
self.running_probs[identifier],
prob_to_pred(self.running_probs[identifier]),
self.running_uids[identifier],
)
for metric_name, metric_value in metric_score.items():
metric_dict[f"{identifier}/{metric_name}"] = metric_value
metric_score = model.scorers[task_name].score(
self.running_golds[identifier],
self.running_probs[identifier],
prob_to_pred(self.running_probs[identifier]),
self.running_uids[identifier],
)
for metric_name, metric_value in metric_score.items():
metric_dict[f"{identifier}/{metric_name}"] = metric_value

# Collect average score
identifier = construct_identifier(task_name, data_name, split, "average")
# Collect average score
identifier = construct_identifier(
task_name, data_name, split, "average"
)

metric_dict[identifier] = np.mean(list(metric_score.values()))
metric_dict[identifier] = np.mean(list(metric_score.values()))

micro_score_dict[split].extend(list(metric_score.values()))
macro_score_dict[split].append(metric_dict[identifier])
micro_score_dict[split].extend(list(metric_score.values()))
macro_score_dict[split].append(metric_dict[identifier])

# Collect split-wise micro/macro average score
for split in micro_score_dict.keys():
identifier = construct_identifier("model", "all", split, "micro_average")
metric_dict[identifier] = np.mean(micro_score_dict[split])
identifier = construct_identifier("model", "all", split, "macro_average")
metric_dict[identifier] = np.mean(macro_score_dict[split])
# Collect split-wise micro/macro average score
for split in micro_score_dict.keys():
identifier = construct_identifier(
"model", "all", split, "micro_average"
)
metric_dict[identifier] = np.mean(micro_score_dict[split])
identifier = construct_identifier(
"model", "all", split, "macro_average"
)
metric_dict[identifier] = np.mean(macro_score_dict[split])

# Log the learning rate
metric_dict["model/all/train/lr"] = self.optimizer.param_groups[0]["lr"]
@@ -539,7 +550,6 @@ def learn(
)

for batch_num, batch in batches:

# Covert single batch into a batch list
if not isinstance(batch, list):
batch = [batch]
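The net effect of the learner change is that running scores are aggregated only when the logging manager triggers an evaluation; per-batch logging otherwise only touches the loss. A self-contained toy sketch of that gating (plain Python with illustrative names, not the real Emmental learner API):

from typing import Callable, Dict


def aggregate_running_metrics(
    running_losses: Dict[str, float],
    running_counts: Dict[str, int],
    score_fn: Callable[[str], Dict[str, float]],
    calc_running_scores: bool = False,
) -> Dict[str, float]:
    """Toy analogue of _aggregate_running_metrics: the loss is always cheap to
    aggregate, while scoring the buffered predictions is skipped unless asked."""
    metric_dict: Dict[str, float] = {
        "model/all/train/loss": sum(running_losses.values()) / sum(running_counts.values())
    }
    if calc_running_scores:
        for identifier in running_losses:
            for name, value in score_fn(identifier).items():
                metric_dict[f"{identifier}/{name}"] = value
    return metric_dict


def dummy_score(identifier: str) -> Dict[str, float]:
    return {"accuracy": 0.75}


losses = {"task1/data/train": 12.8}
counts = {"task1/data/train": 32}
# The learner passes logging_manager.trigger_evaluation() through as the flag,
# so scores are only computed when an evaluation is actually due.
print(aggregate_running_metrics(losses, counts, dummy_score, calc_running_scores=False))
print(aggregate_running_metrics(losses, counts, dummy_score, calc_running_scores=True))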
3 changes: 0 additions & 3 deletions src/emmental/metrics/pearson_correlation.py
@@ -28,9 +28,6 @@ def pearson_correlation_scorer(

probs = np.vstack(probs).squeeze()
correlation, pvalue = pearsonr(golds, probs)
if np.isnan(correlation):
correlation = 0.0
pvalue = 0.0

if return_pvalue:
return {"pearson_correlation": correlation, "pearson_pvalue": pvalue}
16 changes: 13 additions & 3 deletions src/emmental/metrics/roc_auc.py
@@ -29,10 +29,20 @@ def roc_auc_scorer(
"""

if len(golds.shape) == 1:
golds = pred_to_prob(golds, n_classes=probs.shape[1])
if len(probs.shape) == 2 and probs.shape[1] == 1:
probs = probs.reshape(probs.shape[0])

if len(golds.shape) == 2 and golds.shape[1] == 1:
golds = golds.reshape(golds.shape[0])

if len(probs.shape) > 1:
if len(golds.shape) > 1:
golds = pred_to_prob(prob_to_pred(golds), n_classes=probs.shape[1])
else:
golds = pred_to_prob(golds, n_classes=probs.shape[1])
else:
golds = pred_to_prob(prob_to_pred(golds), n_classes=probs.shape[1])
if len(golds.shape) > 1:
golds = prob_to_pred(golds)

try:
roc_auc = roc_auc_score(golds, probs)
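The reshaped handling above is exactly what the updated test_roc_auc below checks: (n, 1) hard labels, probabilistic golds, full probability matrices, and (n, 1) positive-class scores should all reduce to the same ROC AUC. A sketch reusing the test's arrays (the emmental.metrics import path is assumed):

import numpy as np

from emmental.metrics import roc_auc_scorer  # import path assumed

golds = np.array([[1], [0], [1], [0], [1], [0]])  # (n, 1) hard labels
gold_probs = np.array(
    [[0.4, 0.6], [0.9, 0.1], [0.3, 0.7], [0.8, 0.2], [0.1, 0.9], [0.6, 0.4]]
)  # probabilistic golds
probs = np.array(
    [[0.2, 0.8], [0.4, 0.6], [0.1, 0.9], [0.3, 0.7], [0.3, 0.7], [0.8, 0.2]]
)  # class-probability matrix
preds = np.array([[0.8], [0.6], [0.9], [0.7], [0.7], [0.2]])  # (n, 1) positive-class scores

# All four combinations should report the same value (~0.944).
for g in (golds, gold_probs):
    for p in (probs, preds):
        print(roc_auc_scorer(g, p, None))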
3 changes: 0 additions & 3 deletions src/emmental/metrics/spearman_correlation.py
@@ -29,9 +29,6 @@ def spearman_correlation_scorer(

probs = np.vstack(probs).squeeze()
correlation, pvalue = spearmanr(golds, probs)
if np.isnan(correlation):
correlation = 0.0
pvalue = 0.0

if return_pvalue:
return {"spearman_correlation": correlation, "spearman_pvalue": pvalue}
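With the NaN-to-zero coercion removed from both correlation scorers, an undefined correlation now propagates as NaN instead of being silently reported as 0.0. A quick illustration using scipy directly, which is what the scorers wrap:

import numpy as np
from scipy.stats import pearsonr, spearmanr

golds = np.array([1.0, 2.0, 3.0, 4.0])
probs = np.array([0.5, 0.5, 0.5, 0.5])  # constant predictions: correlation is undefined

correlation, _ = pearsonr(golds, probs)
print(np.isnan(correlation))  # True; the scorer previously coerced this to 0.0

correlation, _ = spearmanr(golds, probs)
print(np.isnan(correlation))  # True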
19 changes: 10 additions & 9 deletions src/emmental/model.py
@@ -49,7 +49,7 @@ def __init__(

# Build network with given tasks
if tasks is not None:
self._build_network(tasks)
self.add_tasks(tasks)

if Meta.config["meta_config"]["verbose"]:
logger.info(
@@ -75,7 +75,7 @@ def _move_to_device(self) -> None:
if Meta.config["meta_config"]["verbose"]:
logger.info("No cuda device available. Switch to cpu instead.")

def _build_network(self, tasks: Union[EmmentalTask, List[EmmentalTask]]) -> None:
def add_tasks(self, tasks: Union[EmmentalTask, List[EmmentalTask]]) -> None:
r"""Build the MTL network using all tasks.
Args:
@@ -86,13 +86,6 @@ def _build_network(self, tasks: Union[EmmentalTask, List[EmmentalTask]]) -> None
if not isinstance(tasks, Iterable):
tasks = [tasks]
for task in tasks:
if task.name in self.task_names:
raise ValueError(
f"Found duplicate task {task.name}, different task should use "
f"different task name."
)
if not isinstance(task, EmmentalTask):
raise ValueError(f"Unrecognized task type {task}.")
self.add_task(task)

def add_task(self, task: EmmentalTask) -> None:
@@ -102,6 +95,14 @@ def add_task(self, task: EmmentalTask) -> None:
task(EmmentalTask): A task to add.
"""
if not isinstance(task, EmmentalTask):
raise ValueError(f"Unrecognized task type {task}.")

if task.name in self.task_names:
raise ValueError(
f"Found duplicate task {task.name}, different task should use "
f"different task name."
)

# Combine module_pool from all tasks
for key in task.module_pool.keys():
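With the validation moved into add_task, both entry points behave the same. A hedged usage sketch: task_a and task_b are placeholders for EmmentalTask objects constructed elsewhere (module pool, task flow, loss/output functions, scorer), and emmental.init() is assumed to set up the Meta config with its defaults:

import emmental
from emmental.model import EmmentalModel

emmental.init()  # assumed default-argument initialization of the Meta config

# task_a and task_b are hypothetical, pre-built EmmentalTask instances.
model = EmmentalModel(name="mtl_model", tasks=task_a)  # __init__ now routes through add_tasks
model.add_tasks([task_b])                              # accepts a single task or a list

# Duplicate names and non-EmmentalTask objects now raise ValueError from
# add_task itself, whichever entry point is used:
try:
    model.add_tasks(task_a)
except ValueError as err:
    print(err)  # Found duplicate task ...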
28 changes: 28 additions & 0 deletions tests/data/test_data.py
@@ -1,5 +1,6 @@
import logging

import pytest
import torch

from emmental.data import EmmentalDataLoader, EmmentalDataset
@@ -38,13 +39,21 @@ def test_emmental_dataset(caplog):

dataset.add_features(X_dict={"data2": x2})

dataset.remove_feature("data2")
assert "data2" not in dataset.X_dict

dataset.add_features(X_dict={"data2": x2})

# Check add one more feature to dataset
assert torch.equal(dataset[0][0]["data2"], x2[0])

y2 = torch.Tensor([1, 1, 1, 1, 1])

dataset.add_labels(Y_dict={"label2": y2})

with pytest.raises(ValueError):
dataset.add_labels(Y_dict={"label2": x2})

# Check add one more label to dataset
assert torch.equal(dataset[0][1]["label2"], y2[0])

@@ -53,6 +62,25 @@ def test_emmental_dataset(caplog):
# Check remove one more label to dataset
assert "label1" not in dataset.Y_dict

with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": y1}, name="new_data", uid="ids"
)

dataset = EmmentalDataset(
X_dict={"_uids_": x1}, Y_dict={"label1": y1}, name="new_data"
)

with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
)

with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
)


def test_emmental_dataloader(caplog):
"""Unit test of emmental dataloader"""
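For reference, the behaviors exercised by the new dataset tests, condensed into a small sketch with toy tensors (API usage follows the test file above):

import torch

from emmental.data import EmmentalDataset

x1 = torch.randn(5, 2)
y1 = torch.tensor([0, 1, 0, 1, 0])

dataset = EmmentalDataset(X_dict={"data1": x1}, Y_dict={"label1": y1}, name="demo")

# Features and labels can be added or removed after construction.
dataset.add_features(X_dict={"data2": torch.randn(5, 3)})
dataset.remove_feature("data2")
dataset.add_labels(Y_dict={"label2": torch.ones(5)})

x_dict, y_dict = dataset[0]
print(x_dict["data1"], y_dict["label2"])

# As the tests above check, passing a feature-style tensor as a label, or a
# uid field that is missing from X_dict, raises ValueError.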
27 changes: 22 additions & 5 deletions tests/metrics/test_metrics.py
@@ -52,6 +52,14 @@ def test_accuracy(caplog):

assert isequal(metric_dict, {"accuracy@2": 1.0})

metric_dict = accuracy_scorer(golds, None, preds, normalize=False)

assert isequal(metric_dict, {"accuracy": 4})

metric_dict = accuracy_scorer(gold_probs, probs, preds, topk=2, normalize=False)

assert isequal(metric_dict, {"accuracy@2": 6})


def test_precision(caplog):
"""Unit test of precision_scorer"""
@@ -238,27 +246,36 @@ def test_roc_auc(caplog):

caplog.set_level(logging.INFO)

golds = np.array([1, 0, 1, 0, 1, 0])
golds = np.array([[1], [0], [1], [0], [1], [0]])
gold_probs = np.array(
[[0.4, 0.6], [0.9, 0.1], [0.3, 0.7], [0.8, 0.2], [0.1, 0.9], [0.6, 0.4]]
)
probs = np.array(
[[0.2, 0.8], [0.4, 0.6], [0.1, 0.9], [0.3, 0.7], [0.3, 0.7], [0.8, 0.2]]
)
preds = np.array([[0.8], [0.6], [0.9], [0.7], [0.7], [0.2]])

metric_dict = roc_auc_scorer(golds, probs, None)

assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})

golds = np.array([1, 1, 1, 1, 1, 1])
metric_dict = roc_auc_scorer(gold_probs, probs, None)

metric_dict = roc_auc_scorer(golds, probs, None)
assert isequal(metric_dict, {"roc_auc": float("nan")})
assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})

metric_dict = roc_auc_scorer(gold_probs, probs, None)
metric_dict = roc_auc_scorer(golds, preds, None)

assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})

metric_dict = roc_auc_scorer(gold_probs, preds, None)

assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})

golds = np.array([1, 1, 1, 1, 1, 1])

metric_dict = roc_auc_scorer(golds, probs, None)
assert isequal(metric_dict, {"roc_auc": float("nan")})


def test_accuracy_f1(caplog):
"""Unit test of accuracy_f1_scorer"""
