Hotfix (#56)
* add `add_tasks` to the model and only compute running scores when evaluation is triggered

* fix the roc_auc scorer

* more tests for metrics

* more tests

* more unit tests
senwu committed Mar 16, 2020
1 parent 474aac5 commit 368a95a
Showing 11 changed files with 186 additions and 53 deletions.
2 changes: 1 addition & 1 deletion .isort.cfg
@@ -6,4 +6,4 @@ force_grid_wrap=0
combine_as_imports=True
line_length=88
known_first_party = emmental,tests
known_third_party = numpy,scipy,setuptools,sklearn,torch,yaml
known_third_party = numpy,pytest,scipy,setuptools,sklearn,torch,yaml
8 changes: 7 additions & 1 deletion CHANGELOG.rst
@@ -1,9 +1,15 @@
Unreleased_
-----------

Added
^^^^^
* `@senwu`_: Support probabilistic gold label in scorer.
* `@senwu`_: Add `add_tasks` to support adding one task or multiple tasks into the model.

Changed
^^^^^^^
* `@senwu`_: Change to run evaluation only when evaluation is triggered.


0.0.5_ - 2020-03-01
-------------------
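The "Support probabilistic gold label in scorer" entry above is what the new metric tests later in this commit exercise. A minimal usage sketch with toy data (the emmental.metrics import path and the scorer signature are assumed from the test suite, not shown in this diff):

import numpy as np

from emmental.metrics import accuracy_scorer  # import path assumed

# Toy data (not taken from the tests): hard gold labels and a probabilistic
# encoding whose argmax matches them.
golds = np.array([0, 1, 1, 0])
gold_probs = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]])
probs = np.array([[0.8, 0.2], [0.3, 0.7], [0.6, 0.4], [0.9, 0.1]])
preds = np.array([0, 1, 0, 0])

# Both gold encodings should yield the same score ({"accuracy": 0.75} here).
print(accuracy_scorer(golds, probs, preds))
print(accuracy_scorer(gold_probs, probs, preds))

# normalize=False reports the raw count of correct predictions instead.
print(accuracy_scorer(golds, probs, preds, normalize=False))  # {"accuracy": 3}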
66 changes: 38 additions & 28 deletions src/emmental/learner.py
@@ -364,11 +364,12 @@ def _logging(

self.logging_manager.update(batch_size)

trigger_evaluation = self.logging_manager.trigger_evaluation()

# Log the loss and lr
metric_dict.update(self._aggregate_running_metrics(model))
metric_dict.update(self._aggregate_running_metrics(model, trigger_evaluation))

# Evaluate the model and log the metric
trigger_evaluation = self.logging_manager.trigger_evaluation()
if trigger_evaluation:

# Log task specific metric
@@ -407,11 +408,14 @@ def _logging(

return metric_dict

def _aggregate_running_metrics(self, model: EmmentalModel) -> Dict[str, float]:
def _aggregate_running_metrics(
self, model: EmmentalModel, calc_running_scores: bool = False
) -> Dict[str, float]:
r"""Calculate the running overall and task specific metrics.
Args:
model(EmmentalModel): The model to evaluate.
calc_running_scores(bool): Whether to calculate the running scores.
Returns:
dict: The score dict.
@@ -435,36 +439,43 @@ def _aggregate_running_metrics(self, model: EmmentalModel) -> Dict[str, float]:
total_loss = sum(self.running_losses.values())
metric_dict["model/all/train/loss"] = total_loss / total_count

micro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
macro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
if calc_running_scores:
micro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)
macro_score_dict: Dict[str, List[ndarray]] = defaultdict(list)

# Calculate training metric
for identifier in self.running_uids.keys():
task_name, data_name, split = identifier.split("/")
# Calculate training metric
for identifier in self.running_uids.keys():
task_name, data_name, split = identifier.split("/")

metric_score = model.scorers[task_name].score(
self.running_golds[identifier],
self.running_probs[identifier],
prob_to_pred(self.running_probs[identifier]),
self.running_uids[identifier],
)
for metric_name, metric_value in metric_score.items():
metric_dict[f"{identifier}/{metric_name}"] = metric_value
metric_score = model.scorers[task_name].score(
self.running_golds[identifier],
self.running_probs[identifier],
prob_to_pred(self.running_probs[identifier]),
self.running_uids[identifier],
)
for metric_name, metric_value in metric_score.items():
metric_dict[f"{identifier}/{metric_name}"] = metric_value

# Collect average score
identifier = construct_identifier(task_name, data_name, split, "average")
# Collect average score
identifier = construct_identifier(
task_name, data_name, split, "average"
)

metric_dict[identifier] = np.mean(list(metric_score.values()))
metric_dict[identifier] = np.mean(list(metric_score.values()))

micro_score_dict[split].extend(list(metric_score.values()))
macro_score_dict[split].append(metric_dict[identifier])
micro_score_dict[split].extend(list(metric_score.values()))
macro_score_dict[split].append(metric_dict[identifier])

# Collect split-wise micro/macro average score
for split in micro_score_dict.keys():
identifier = construct_identifier("model", "all", split, "micro_average")
metric_dict[identifier] = np.mean(micro_score_dict[split])
identifier = construct_identifier("model", "all", split, "macro_average")
metric_dict[identifier] = np.mean(macro_score_dict[split])
# Collect split-wise micro/macro average score
for split in micro_score_dict.keys():
identifier = construct_identifier(
"model", "all", split, "micro_average"
)
metric_dict[identifier] = np.mean(micro_score_dict[split])
identifier = construct_identifier(
"model", "all", split, "macro_average"
)
metric_dict[identifier] = np.mean(macro_score_dict[split])

# Log the learning rate
metric_dict["model/all/train/lr"] = self.optimizer.param_groups[0]["lr"]
@@ -539,7 +550,6 @@ def learn(
)

for batch_num, batch in batches:

# Covert single batch into a batch list
if not isinstance(batch, list):
batch = [batch]
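The net effect of the learner change is that running scores are aggregated only when the logging manager triggers an evaluation; per-batch logging otherwise only touches the loss. A self-contained toy sketch of that gating (plain Python with illustrative names, not the real Emmental learner API):

from typing import Callable, Dict


def aggregate_running_metrics(
    running_losses: Dict[str, float],
    running_counts: Dict[str, int],
    score_fn: Callable[[str], Dict[str, float]],
    calc_running_scores: bool = False,
) -> Dict[str, float]:
    """Toy analogue of _aggregate_running_metrics: the loss is always cheap to
    aggregate, while scoring the buffered predictions is skipped unless asked."""
    metric_dict: Dict[str, float] = {
        "model/all/train/loss": sum(running_losses.values()) / sum(running_counts.values())
    }
    if calc_running_scores:
        for identifier in running_losses:
            for name, value in score_fn(identifier).items():
                metric_dict[f"{identifier}/{name}"] = value
    return metric_dict


def dummy_score(identifier: str) -> Dict[str, float]:
    return {"accuracy": 0.75}


losses = {"task1/data/train": 12.8}
counts = {"task1/data/train": 32}
# The learner passes logging_manager.trigger_evaluation() through as the flag,
# so scores are only computed when an evaluation is actually due.
print(aggregate_running_metrics(losses, counts, dummy_score, calc_running_scores=False))
print(aggregate_running_metrics(losses, counts, dummy_score, calc_running_scores=True))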
3 changes: 0 additions & 3 deletions src/emmental/metrics/pearson_correlation.py
@@ -28,9 +28,6 @@ def pearson_correlation_scorer(

probs = np.vstack(probs).squeeze()
correlation, pvalue = pearsonr(golds, probs)
if np.isnan(correlation):
correlation = 0.0
pvalue = 0.0

if return_pvalue:
return {"pearson_correlation": correlation, "pearson_pvalue": pvalue}
16 changes: 13 additions & 3 deletions src/emmental/metrics/roc_auc.py
@@ -29,10 +29,20 @@ def roc_auc_scorer(
"""

if len(golds.shape) == 1:
golds = pred_to_prob(golds, n_classes=probs.shape[1])
if len(probs.shape) == 2 and probs.shape[1] == 1:
probs = probs.reshape(probs.shape[0])

if len(golds.shape) == 2 and golds.shape[1] == 1:
golds = golds.reshape(golds.shape[0])

if len(probs.shape) > 1:
if len(golds.shape) > 1:
golds = pred_to_prob(prob_to_pred(golds), n_classes=probs.shape[1])
else:
golds = pred_to_prob(golds, n_classes=probs.shape[1])
else:
golds = pred_to_prob(prob_to_pred(golds), n_classes=probs.shape[1])
if len(golds.shape) > 1:
golds = prob_to_pred(golds)

try:
roc_auc = roc_auc_score(golds, probs)
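The reshaped handling above is exactly what the updated test_roc_auc below checks: (n, 1) hard labels, probabilistic golds, full probability matrices, and (n, 1) positive-class scores should all reduce to the same ROC AUC. A sketch reusing the test's arrays (the emmental.metrics import path is assumed):

import numpy as np

from emmental.metrics import roc_auc_scorer  # import path assumed

golds = np.array([[1], [0], [1], [0], [1], [0]])  # (n, 1) hard labels
gold_probs = np.array(
    [[0.4, 0.6], [0.9, 0.1], [0.3, 0.7], [0.8, 0.2], [0.1, 0.9], [0.6, 0.4]]
)  # probabilistic golds
probs = np.array(
    [[0.2, 0.8], [0.4, 0.6], [0.1, 0.9], [0.3, 0.7], [0.3, 0.7], [0.8, 0.2]]
)  # class-probability matrix
preds = np.array([[0.8], [0.6], [0.9], [0.7], [0.7], [0.2]])  # (n, 1) positive-class scores

# All four combinations should report the same value (~0.944).
for g in (golds, gold_probs):
    for p in (probs, preds):
        print(roc_auc_scorer(g, p, None))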
3 changes: 0 additions & 3 deletions src/emmental/metrics/spearman_correlation.py
@@ -29,9 +29,6 @@ def spearman_correlation_scorer(

probs = np.vstack(probs).squeeze()
correlation, pvalue = spearmanr(golds, probs)
if np.isnan(correlation):
correlation = 0.0
pvalue = 0.0

if return_pvalue:
return {"spearman_correlation": correlation, "spearman_pvalue": pvalue}
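With the NaN-to-zero coercion removed from both correlation scorers, an undefined correlation now propagates as NaN instead of being silently reported as 0.0. A quick illustration using scipy directly, which is what the scorers wrap:

import numpy as np
from scipy.stats import pearsonr, spearmanr

golds = np.array([1.0, 2.0, 3.0, 4.0])
probs = np.array([0.5, 0.5, 0.5, 0.5])  # constant predictions: correlation is undefined

correlation, _ = pearsonr(golds, probs)
print(np.isnan(correlation))  # True; the scorer previously coerced this to 0.0

correlation, _ = spearmanr(golds, probs)
print(np.isnan(correlation))  # True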
19 changes: 10 additions & 9 deletions src/emmental/model.py
@@ -49,7 +49,7 @@ def __init__(

# Build network with given tasks
if tasks is not None:
self._build_network(tasks)
self.add_tasks(tasks)

if Meta.config["meta_config"]["verbose"]:
logger.info(
@@ -75,7 +75,7 @@ def _move_to_device(self) -> None:
if Meta.config["meta_config"]["verbose"]:
logger.info("No cuda device available. Switch to cpu instead.")

def _build_network(self, tasks: Union[EmmentalTask, List[EmmentalTask]]) -> None:
def add_tasks(self, tasks: Union[EmmentalTask, List[EmmentalTask]]) -> None:
r"""Build the MTL network using all tasks.
Args:
@@ -86,13 +86,6 @@ def _build_network(self, tasks: Union[EmmentalTask, List[EmmentalTask]]) -> None
if not isinstance(tasks, Iterable):
tasks = [tasks]
for task in tasks:
if task.name in self.task_names:
raise ValueError(
f"Found duplicate task {task.name}, different task should use "
f"different task name."
)
if not isinstance(task, EmmentalTask):
raise ValueError(f"Unrecognized task type {task}.")
self.add_task(task)

def add_task(self, task: EmmentalTask) -> None:
@@ -102,6 +95,14 @@ def add_task(self, task: EmmentalTask) -> None:
task(EmmentalTask): A task to add.
"""
if not isinstance(task, EmmentalTask):
raise ValueError(f"Unrecognized task type {task}.")

if task.name in self.task_names:
raise ValueError(
f"Found duplicate task {task.name}, different task should use "
f"different task name."
)

# Combine module_pool from all tasks
for key in task.module_pool.keys():
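With the validation moved into add_task, both entry points behave the same. A hedged usage sketch: task_a and task_b are placeholders for EmmentalTask objects constructed elsewhere (module pool, task flow, loss/output functions, scorer), and emmental.init() is assumed to set up the Meta config with its defaults:

import emmental
from emmental.model import EmmentalModel

emmental.init()  # assumed default-argument initialization of the Meta config

# task_a and task_b are hypothetical, pre-built EmmentalTask instances.
model = EmmentalModel(name="mtl_model", tasks=task_a)  # __init__ now routes through add_tasks
model.add_tasks([task_b])                              # accepts a single task or a list

# Duplicate names and non-EmmentalTask objects now raise ValueError from
# add_task itself, whichever entry point is used:
try:
    model.add_tasks(task_a)
except ValueError as err:
    print(err)  # Found duplicate task ...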
28 changes: 28 additions & 0 deletions tests/data/test_data.py
@@ -1,5 +1,6 @@
import logging

import pytest
import torch

from emmental.data import EmmentalDataLoader, EmmentalDataset
@@ -38,13 +39,21 @@ def test_emmental_dataset(caplog):

dataset.add_features(X_dict={"data2": x2})

dataset.remove_feature("data2")
assert "data2" not in dataset.X_dict

dataset.add_features(X_dict={"data2": x2})

# Check add one more feature to dataset
assert torch.equal(dataset[0][0]["data2"], x2[0])

y2 = torch.Tensor([1, 1, 1, 1, 1])

dataset.add_labels(Y_dict={"label2": y2})

with pytest.raises(ValueError):
dataset.add_labels(Y_dict={"label2": x2})

# Check add one more label to dataset
assert torch.equal(dataset[0][1]["label2"], y2[0])

@@ -53,6 +62,25 @@ def test_emmental_dataset(caplog):
# Check remove one more label to dataset
assert "label1" not in dataset.Y_dict

with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": y1}, name="new_data", uid="ids"
)

dataset = EmmentalDataset(
X_dict={"_uids_": x1}, Y_dict={"label1": y1}, name="new_data"
)

with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
)

with pytest.raises(ValueError):
dataset = EmmentalDataset(
X_dict={"data1": x1}, Y_dict={"label1": x1}, name="new_data"
)


def test_emmental_dataloader(caplog):
"""Unit test of emmental dataloader"""
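For reference, the behaviors exercised by the new dataset tests, condensed into a small sketch with toy tensors (API usage follows the test file above):

import torch

from emmental.data import EmmentalDataset

x1 = torch.randn(5, 2)
y1 = torch.tensor([0, 1, 0, 1, 0])

dataset = EmmentalDataset(X_dict={"data1": x1}, Y_dict={"label1": y1}, name="demo")

# Features and labels can be added or removed after construction.
dataset.add_features(X_dict={"data2": torch.randn(5, 3)})
dataset.remove_feature("data2")
dataset.add_labels(Y_dict={"label2": torch.ones(5)})

x_dict, y_dict = dataset[0]
print(x_dict["data1"], y_dict["label2"])

# As the tests above check, passing a feature-style tensor as a label, or a
# uid field that is missing from X_dict, raises ValueError.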
27 changes: 22 additions & 5 deletions tests/metrics/test_metrics.py
@@ -52,6 +52,14 @@ def test_accuracy(caplog):

assert isequal(metric_dict, {"accuracy@2": 1.0})

metric_dict = accuracy_scorer(golds, None, preds, normalize=False)

assert isequal(metric_dict, {"accuracy": 4})

metric_dict = accuracy_scorer(gold_probs, probs, preds, topk=2, normalize=False)

assert isequal(metric_dict, {"accuracy@2": 6})


def test_precision(caplog):
"""Unit test of precision_scorer"""
@@ -238,27 +246,36 @@ def test_roc_auc(caplog):

caplog.set_level(logging.INFO)

golds = np.array([1, 0, 1, 0, 1, 0])
golds = np.array([[1], [0], [1], [0], [1], [0]])
gold_probs = np.array(
[[0.4, 0.6], [0.9, 0.1], [0.3, 0.7], [0.8, 0.2], [0.1, 0.9], [0.6, 0.4]]
)
probs = np.array(
[[0.2, 0.8], [0.4, 0.6], [0.1, 0.9], [0.3, 0.7], [0.3, 0.7], [0.8, 0.2]]
)
preds = np.array([[0.8], [0.6], [0.9], [0.7], [0.7], [0.2]])

metric_dict = roc_auc_scorer(golds, probs, None)

assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})

golds = np.array([1, 1, 1, 1, 1, 1])
metric_dict = roc_auc_scorer(gold_probs, probs, None)

metric_dict = roc_auc_scorer(golds, probs, None)
assert isequal(metric_dict, {"roc_auc": float("nan")})
assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})

metric_dict = roc_auc_scorer(gold_probs, probs, None)
metric_dict = roc_auc_scorer(golds, preds, None)

assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})

metric_dict = roc_auc_scorer(gold_probs, preds, None)

assert isequal(metric_dict, {"roc_auc": 0.9444444444444444})

golds = np.array([1, 1, 1, 1, 1, 1])

metric_dict = roc_auc_scorer(golds, probs, None)
assert isequal(metric_dict, {"roc_auc": float("nan")})


def test_accuracy_f1(caplog):
"""Unit test of accuracy_f1_scorer"""
