# model_testing.py
import logging
import os
from typing import Text, Dict, Optional, List, Any, Iterable, Tuple, Union
from pathlib import Path
import rasa.shared.utils.cli
import rasa.shared.utils.common
import rasa.shared.utils.io
import rasa.utils.common
from rasa.constants import RESULTS_FILE, NUMBER_OF_TRAINING_STORIES_FILE
from rasa.shared.constants import DEFAULT_RESULTS_PATH
from rasa.exceptions import ModelNotFound
import rasa.shared.nlu.training_data.loading
import rasa.shared.importers.autoconfig
from rasa.shared.nlu.training_data.training_data import TrainingData

logger = logging.getLogger(__name__)


def test_core_models_in_directory(
model_directory: Text,
stories: Text,
output: Text,
use_conversation_test_files: bool = False,
) -> None:
"""Evaluates a directory with multiple Core models using test data.
Args:
model_directory: Directory containing multiple model files.
stories: Path to a conversation test file.
output: Output directory to store results to.
use_conversation_test_files: `True` if conversation test files should be used
for testing instead of regular Core story files.
"""
from rasa.core.test import compare_models_in_dir
model_directory = _get_sanitized_model_directory(model_directory)
rasa.utils.common.run_in_loop(
compare_models_in_dir(
model_directory,
stories,
output,
use_conversation_test_files=use_conversation_test_files,
)
)
story_n_path = os.path.join(model_directory, NUMBER_OF_TRAINING_STORIES_FILE)
number_of_stories = rasa.shared.utils.io.read_json_file(story_n_path)
plot_core_results(output, number_of_stories)
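
# NOTE: Illustrative usage sketch, not part of the original module. The paths
# below are assumed placeholders; the directory is expected to contain the
# models produced by a comparative `rasa train core` run together with the
# JSON file named by `NUMBER_OF_TRAINING_STORIES_FILE`.
#
#     test_core_models_in_directory(
#         model_directory="comparison_models",
#         stories="tests/test_stories.yml",
#         output="results",
#         use_conversation_test_files=True,
#     )
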
def plot_core_results(output_directory: Text, number_of_examples: List[int]) -> None:
"""Plot core model comparison graph.
Args:
output_directory: path to the output directory
number_of_examples: number of examples per run
"""
import rasa.utils.plotting as plotting_utils
graph_path = os.path.join(output_directory, "core_model_comparison_graph.pdf")
plotting_utils.plot_curve(
output_directory,
number_of_examples,
x_label_text="Number of stories present during training",
y_label_text="Number of correct test stories",
graph_path=graph_path,
    )


def _get_sanitized_model_directory(model_directory: Text) -> Text:
"""Adjusts the `--model` argument of `rasa test core` when called with
`--evaluate-model-directory`.
By default rasa uses the latest model for the `--model` parameter. However, for
`--evaluate-model-directory` we need a directory. This function checks if the
passed parameter is a model or an individual model file.
Args:
model_directory: The model_directory argument that was given to
`test_core_models_in_directory`.
Returns: The adjusted model_directory that should be used in
`test_core_models_in_directory`.
"""
import rasa.model
p = Path(model_directory)
if p.is_file():
if model_directory != rasa.model.get_latest_model():
rasa.shared.utils.cli.print_warning(
"You passed a file as '--model'. Will use the directory containing "
"this file instead."
)
model_directory = str(p.parent)
    return model_directory


def test_core_models(
models: List[Text],
stories: Text,
output: Text,
use_conversation_test_files: bool = False,
) -> None:
"""Compares multiple Core models based on test data.
Args:
models: A list of models files.
stories: Path to test data.
output: Path to output directory for test results.
use_conversation_test_files: `True` if conversation test files should be used
for testing instead of regular Core story files.
"""
from rasa.core.test import compare_models
rasa.utils.common.run_in_loop(
compare_models(
models,
stories,
output,
use_conversation_test_files=use_conversation_test_files,
)
)
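
# NOTE: Illustrative usage sketch (assumed file names, not from the original
# module): compare two separately trained Core models on the same test stories.
#
#     test_core_models(
#         models=["models/run_1.tar.gz", "models/run_2.tar.gz"],
#         stories="tests/test_stories.yml",
#         output="results",
#     )
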
def test_core(
model: Optional[Text] = None,
stories: Optional[Text] = None,
output: Text = DEFAULT_RESULTS_PATH,
additional_arguments: Optional[Dict] = None,
use_conversation_test_files: bool = False,
) -> None:
"""Tests a trained Core model against a set of test stories."""
import rasa.model
from rasa.shared.nlu.interpreter import RegexInterpreter
from rasa.core.agent import Agent
if additional_arguments is None:
additional_arguments = {}
if output:
rasa.shared.utils.io.create_directory(output)
try:
unpacked_model = rasa.model.get_model(model)
except ModelNotFound:
rasa.shared.utils.cli.print_error(
"Unable to test: could not find a model. Use 'rasa train' to train a "
"Rasa model and provide it via the '--model' argument."
)
return
_agent = Agent.load(unpacked_model)
    if _agent.policy_ensemble is None:
        rasa.shared.utils.cli.print_error(
            "Unable to test: could not find a Core model. Use 'rasa train' to train a "
            "Rasa model and provide it via the '--model' argument."
        )
        return
if isinstance(_agent.interpreter, RegexInterpreter):
rasa.shared.utils.cli.print_warning(
"No NLU model found. Using default 'RegexInterpreter' for end-to-end "
"evaluation. If you added actual user messages to your test stories "
"this will likely lead to the tests failing. In that case, you need "
"to train a NLU model first, e.g. using `rasa train`."
)
from rasa.core.test import test as core_test
kwargs = rasa.shared.utils.common.minimal_kwargs(
additional_arguments, core_test, ["stories", "agent", "e2e"]
)
rasa.utils.common.run_in_loop(
core_test(
stories,
_agent,
e2e=use_conversation_test_files,
out_directory=output,
**kwargs,
)
)
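
# NOTE: Illustrative usage sketch (assumed file names, not from the original
# module): evaluate the latest trained model against conversation test files.
#
#     test_core(
#         model="models",
#         stories="tests/test_stories.yml",
#         output="results",
#         use_conversation_test_files=True,
#     )
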
async def test_nlu(
model: Optional[Text],
nlu_data: Optional[Text],
output_directory: Text = DEFAULT_RESULTS_PATH,
additional_arguments: Optional[Dict] = None,
) -> None:
"""Tests the NLU Model."""
from rasa.nlu.test import run_evaluation
    from rasa.model import get_model

    if additional_arguments is None:
        additional_arguments = {}
try:
unpacked_model = get_model(model)
except ModelNotFound:
rasa.shared.utils.cli.print_error(
"Could not find any model. Use 'rasa train nlu' to train a "
"Rasa model and provide it via the '--model' argument."
)
return
rasa.shared.utils.io.create_directory(output_directory)
nlu_model = os.path.join(unpacked_model, "nlu")
if os.path.exists(nlu_model):
kwargs = rasa.shared.utils.common.minimal_kwargs(
additional_arguments, run_evaluation, ["data_path", "model"]
)
await run_evaluation(
nlu_data, nlu_model, output_directory=output_directory, **kwargs
)
    else:
        rasa.shared.utils.cli.print_error(
            "Could not find an NLU model. Use 'rasa train nlu' to train a "
            "Rasa model and provide it via the '--model' argument."
        )
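
# NOTE: Illustrative usage sketch (assumed file names, not from the original
# module). `test_nlu` is a coroutine, so it has to be driven by an event loop,
# e.g. via `rasa.utils.common.run_in_loop` or `asyncio.run`:
#
#     rasa.utils.common.run_in_loop(
#         test_nlu(
#             model="models",
#             nlu_data="data/nlu.yml",
#             output_directory="results",
#             additional_arguments={},
#         )
#     )
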
async def compare_nlu_models(
configs: List[Text],
test_data: TrainingData,
output: Text,
runs: int,
exclusion_percentages: List[int],
) -> None:
"""Trains multiple models, compares them and saves the results."""
from rasa.nlu.test import drop_intents_below_freq
from rasa.nlu.utils import write_json_to_file
from rasa.utils.io import create_path
from rasa.nlu.test import compare_nlu
test_data = drop_intents_below_freq(test_data, cutoff=5)
create_path(output)
bases = [os.path.basename(nlu_config) for nlu_config in configs]
model_names = [os.path.splitext(base)[0] for base in bases]
f1_score_results = {
model_name: [[] for _ in range(runs)] for model_name in model_names
}
training_examples_per_run = await compare_nlu(
configs,
test_data,
exclusion_percentages,
f1_score_results,
model_names,
output,
runs,
)
f1_path = os.path.join(output, RESULTS_FILE)
write_json_to_file(f1_path, f1_score_results)
plot_nlu_results(output, training_examples_per_run)
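
# NOTE: Illustrative usage sketch (assumed paths, not from the original
# module). `test_data` has to be a `TrainingData` instance, e.g. loaded with
# `rasa.shared.nlu.training_data.loading.load_data`:
#
#     nlu_data = rasa.shared.nlu.training_data.loading.load_data("data/nlu.yml")
#     rasa.utils.common.run_in_loop(
#         compare_nlu_models(
#             configs=["config_1.yml", "config_2.yml"],
#             test_data=nlu_data,
#             output="results/nlu_comparison",
#             runs=3,
#             exclusion_percentages=[0, 25, 50, 75],
#         )
#     )
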
def plot_nlu_results(output_directory: Text, number_of_examples: List[int]) -> None:
"""Plot NLU model comparison graph.
Args:
output_directory: path to the output directory
number_of_examples: number of examples per run
"""
import rasa.utils.plotting as plotting_utils
graph_path = os.path.join(output_directory, "nlu_model_comparison_graph.pdf")
plotting_utils.plot_curve(
output_directory,
number_of_examples,
x_label_text="Number of intent examples present during training",
y_label_text="Label-weighted average F1 score on test set",
graph_path=graph_path,
    )


def perform_nlu_cross_validation(
config: Dict[Text, Any],
data: TrainingData,
output: Text,
additional_arguments: Optional[Dict[Text, Any]],
) -> None:
"""Runs cross-validation on test data.
Args:
config: The model configuration.
data: The data which is used for the cross-validation.
output: Output directory for the cross-validation results.
additional_arguments: Additional arguments which are passed to the
cross-validation, like number of `disable_plotting`.
"""
import rasa.nlu.config
from rasa.nlu.test import (
drop_intents_below_freq,
cross_validate,
log_results,
log_entity_results,
)
additional_arguments = additional_arguments or {}
folds = int(additional_arguments.get("folds", 3))
data = drop_intents_below_freq(data, cutoff=folds)
kwargs = rasa.shared.utils.common.minimal_kwargs(
additional_arguments, cross_validate
)
results, entity_results, response_selection_results = cross_validate(
data, folds, config, output, **kwargs
)
logger.info(f"CV evaluation (n={folds})")
if any(results):
logger.info("Intent evaluation results")
log_results(results.train, "train")
log_results(results.test, "test")
if any(entity_results):
logger.info("Entity evaluation results")
log_entity_results(entity_results.train, "train")
log_entity_results(entity_results.test, "test")
if any(response_selection_results):
logger.info("Response Selection evaluation results")
log_results(response_selection_results.train, "train")
log_results(response_selection_results.test, "test")
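
# NOTE: Illustrative usage sketch (assumed paths, not from the original
# module): run a 5-fold cross-validation over an NLU training file. The config
# is read as a plain dict to match the `config: Dict[Text, Any]` annotation.
#
#     nlu_config = rasa.shared.utils.io.read_yaml_file("config.yml")
#     nlu_data = rasa.shared.nlu.training_data.loading.load_data("data/nlu.yml")
#     perform_nlu_cross_validation(
#         config=nlu_config,
#         data=nlu_data,
#         output="results",
#         additional_arguments={"folds": 5},
#     )
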
def get_evaluation_metrics(
targets: Iterable[Any],
predictions: Iterable[Any],
output_dict: bool = False,
exclude_label: Optional[Text] = None,
) -> Tuple[Union[Text, Dict[Text, Dict[Text, float]]], float, float, float]:
"""Compute the f1, precision, accuracy and summary report from sklearn.
Args:
targets: target labels
predictions: predicted labels
output_dict: if True sklearn returns a summary report as dict, if False the
report is in string format
exclude_label: labels to exclude from evaluation
Returns:
Report from sklearn, precision, f1, and accuracy values.
"""
from sklearn import metrics
targets = clean_labels(targets)
predictions = clean_labels(predictions)
labels = get_unique_labels(targets, exclude_label)
if not labels:
        logger.warning("No labels to evaluate. Skipping evaluation.")
return {}, 0.0, 0.0, 0.0
report = metrics.classification_report(
targets, predictions, labels=labels, output_dict=output_dict
)
precision = metrics.precision_score(
targets, predictions, labels=labels, average="weighted"
)
f1 = metrics.f1_score(targets, predictions, labels=labels, average="weighted")
accuracy = metrics.accuracy_score(targets, predictions)
return report, precision, f1, accuracy
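
# NOTE: Illustrative behaviour with toy labels (not from the original module).
# With `targets=["greet", "bye", "greet"]` and `predictions=["greet", "greet",
# "greet"]` the accuracy is 2/3, while precision and f1 are weighted averages
# over the labels "greet" and "bye":
#
#     report, precision, f1, accuracy = get_evaluation_metrics(
#         ["greet", "bye", "greet"],
#         ["greet", "greet", "greet"],
#         output_dict=True,
#     )
#     # accuracy == 2 / 3; `report` is a nested dict keyed by label.
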
def clean_labels(labels: Iterable[Text]) -> List[Text]:
"""Remove `None` labels. sklearn metrics do not support them.
Args:
labels: list of labels
Returns:
Cleaned labels.
"""
    return [label if label is not None else "" for label in labels]


def get_unique_labels(
targets: Iterable[Text], exclude_label: Optional[Text]
) -> List[Text]:
"""Get unique labels. Exclude 'exclude_label' if specified.
Args:
targets: labels
exclude_label: label to exclude
Returns:
Unique labels.
"""
labels = set(targets)
if exclude_label and exclude_label in labels:
labels.remove(exclude_label)
return list(labels)
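
# NOTE: Illustrative behaviour of the two label helpers above (toy values, not
# from the original module):
#
#     clean_labels(["greet", None, "bye"])                 # -> ["greet", "", "bye"]
#     get_unique_labels(["greet", "bye", "greet"], "bye")  # -> ["greet"]
#
# `get_unique_labels` returns the remaining labels in arbitrary (set) order.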