From 392e12f93e31bec07ea7f7071aff76c703209576 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Tue, 12 Feb 2019 17:17:37 +0100 Subject: [PATCH 01/17] refactor stuff to make it accessible by rasa stack --- rasa_nlu/evaluate.py | 11 +++++---- rasa_nlu/server.py | 55 ++++++++++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 29 deletions(-) diff --git a/rasa_nlu/evaluate.py b/rasa_nlu/evaluate.py index 1fe76f343705..3e4f5a8586e9 100644 --- a/rasa_nlu/evaluate.py +++ b/rasa_nlu/evaluate.py @@ -39,6 +39,13 @@ def create_argument_parser(): description='evaluate a Rasa NLU pipeline with cross ' 'validation or on external data') + utils.add_logging_option_arguments(parser, default=logging.INFO) + _add_arguments(parser) + + return parser + + +def _add_arguments(parser): parser.add_argument('-d', '--data', required=True, help="file containing training/evaluation data") @@ -75,10 +82,6 @@ def create_argument_parser(): parser.add_argument('--confmat', required=False, default="confmat.png", help="output path for the confusion matrix plot") - utils.add_logging_option_arguments(parser, default=logging.INFO) - - return parser - def plot_confusion_matrix(cm, classes, diff --git a/rasa_nlu/server.py b/rasa_nlu/server.py index fa5749bf508c..3009c50cf87d 100644 --- a/rasa_nlu/server.py +++ b/rasa_nlu/server.py @@ -21,7 +21,13 @@ def create_argument_parser(): parser = argparse.ArgumentParser(description='parse incoming text') + add_run_arguments(parser) + utils.add_logging_option_arguments(parser) + + return parser + +def add_run_arguments(parser): parser.add_argument('-e', '--emulate', choices=['wit', 'luis', 'dialogflow'], help='which service to emulate (default: None i.e. 
use' @@ -90,10 +96,6 @@ def create_argument_parser(): help="Default model configuration file used for " "training.") - utils.add_logging_option_arguments(parser) - - return parser - def check_cors(f): """Wraps a request handler with CORS headers checking.""" @@ -406,23 +408,20 @@ def unload_model(self, request): return simplejson.dumps({"error": "{}".format(e)}) -if __name__ == '__main__': - # Running as standalone python application - cmdline_args = create_argument_parser().parse_args() +def main(args): + utils.configure_colored_logging(args.loglevel) + pre_load = args.pre_load - utils.configure_colored_logging(cmdline_args.loglevel) - pre_load = cmdline_args.pre_load - - _endpoints = read_endpoints(cmdline_args.endpoints) + _endpoints = read_endpoints(args.endpoints) router = DataRouter( - cmdline_args.path, - cmdline_args.max_training_processes, - cmdline_args.response_log, - cmdline_args.emulate, - cmdline_args.storage, + args.path, + args.max_training_processes, + args.response_log, + args.emulate, + args.storage, model_server=_endpoints.model, - wait_time_between_pulls=cmdline_args.wait_time_between_pulls + wait_time_between_pulls=args.wait_time_between_pulls ) if pre_load: logger.debug('Preloading....') @@ -432,13 +431,19 @@ def unload_model(self, request): rasa = RasaNLU( router, - cmdline_args.loglevel, - cmdline_args.write, - cmdline_args.num_threads, - cmdline_args.token, - cmdline_args.cors, - default_config_path=cmdline_args.config + args.loglevel, + args.write, + args.num_threads, + args.token, + args.cors, + default_config_path=args.config ) - logger.info('Started http server on port %s' % cmdline_args.port) - rasa.app.run('0.0.0.0', cmdline_args.port) + logger.info('Started http server on port %s' % args.port) + rasa.app.run('0.0.0.0', args.port) + + +if __name__ == '__main__': + # Running as standalone python application + cmdline_args = create_argument_parser().parse_args() + main(cmdline_args) From 91995794ac1e8a8dc68810c396f6a76015c9e698 Mon Sep 
17 00:00:00 2001 From: Tobias Wochinger Date: Tue, 12 Feb 2019 18:57:14 +0100 Subject: [PATCH 02/17] refactored convert so it's accessible by rasa stack --- rasa_nlu/convert.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/rasa_nlu/convert.py b/rasa_nlu/convert.py index 86680ad1a5bb..e92fa5392563 100644 --- a/rasa_nlu/convert.py +++ b/rasa_nlu/convert.py @@ -4,10 +4,7 @@ from rasa_nlu.utils import write_to_file -def create_argument_parser(): - parser = argparse.ArgumentParser( - description='Convert training data formats into one another') - +def add_arguments(parser): parser.add_argument('-d', '--data_file', required=True, help='file or dir containing training data') @@ -39,11 +36,16 @@ def convert_training_data(data_file, out_file, output_format, language): write_to_file(out_file, output) -if __name__ == "__main__": - arg_parser = create_argument_parser() - args = arg_parser.parse_args() - +def main(args): convert_training_data(args.data_file, args.out_file, args.format, args.language) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description='Convert training data formats into one another') + add_arguments(parser) + + main(parser.parse_args()) From e1c946d3413b155fcacfc9c272faa00d53cdf548 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Wed, 13 Feb 2019 14:21:12 +0100 Subject: [PATCH 03/17] move arguments to subdirectory to enable fast imports of these --- rasa_nlu/cli/__init__.py | 1 + rasa_nlu/cli/server.py | 68 ++++++++++++++++++++++++++++++++++++ rasa_nlu/server.py | 74 ++-------------------------------------- 3 files changed, 71 insertions(+), 72 deletions(-) create mode 100644 rasa_nlu/cli/__init__.py create mode 100644 rasa_nlu/cli/server.py diff --git a/rasa_nlu/cli/__init__.py b/rasa_nlu/cli/__init__.py new file mode 100644 index 000000000000..71b760f46232 --- /dev/null +++ b/rasa_nlu/cli/__init__.py @@ -0,0 +1 @@ +import rasa_nlu.cli.server diff --git a/rasa_nlu/cli/server.py 
b/rasa_nlu/cli/server.py new file mode 100644 index 000000000000..79747b9b0ab9 --- /dev/null +++ b/rasa_nlu/cli/server.py @@ -0,0 +1,68 @@ +def add_server_arguments(parser): + parser.add_argument('-e', '--emulate', + choices=['wit', 'luis', 'dialogflow'], + help='which service to emulate (default: None i.e. use' + ' simple built in format)') + parser.add_argument('-P', '--port', + type=int, + default=5000, + help='port on which to run server') + parser.add_argument('--pre_load', + nargs='+', + default=[], + help='Preload models into memory before starting the ' + 'server. \nIf given `all` as input all the models ' + 'will be loaded.\nElse you can specify a list of ' + 'specific project names.\nEg: python -m ' + 'rasa_nlu.server --pre_load project1 ' + '--path projects ' + '-c config.yaml') + parser.add_argument('-t', '--token', + help="auth token. If set, reject requests which don't " + "provide this token as a query parameter") + parser.add_argument('-w', '--write', + help='file where logs will be saved') + parser.add_argument('--path', + required=True, + help="working directory of the server. Models are" + "loaded from this directory and trained models " + "will be saved here.") + parser.add_argument('--cors', + nargs="*", + help='List of domain patterns from where CORS ' + '(cross-origin resource sharing) calls are ' + 'allowed. The default value is `[]` which ' + 'forbids all CORS requests.') + + parser.add_argument('--max_training_processes', + type=int, + default=1, + help='Number of processes used to handle training ' + 'requests. Increasing this value will have a ' + 'great impact on memory usage. 
It is ' + 'recommended to keep the default value.') + parser.add_argument('--num_threads', + type=int, + default=1, + help='Number of parallel threads to use for ' + 'handling parse requests.') + parser.add_argument('--endpoints', + help='Configuration file for the model server ' + 'as a yaml file') + parser.add_argument('--wait_time_between_pulls', + type=int, + default=10, + help='Wait time in seconds between NLU model server' + 'queries.') + parser.add_argument('--response_log', + help='Directory where logs will be saved ' + '(containing queries and responses).' + 'If set to ``null`` logging will be disabled.') + parser.add_argument('--storage', + help='Set the remote location where models are stored. ' + 'E.g. on AWS. If nothing is configured, the ' + 'server will only serve the models that are ' + 'on disk in the configured `path`.') + parser.add_argument('-c', '--config', + help="Default model configuration file used for " + "training.") diff --git a/rasa_nlu/server.py b/rasa_nlu/server.py index 3009c50cf87d..69bbec6d82b5 100644 --- a/rasa_nlu/server.py +++ b/rasa_nlu/server.py @@ -6,7 +6,7 @@ from twisted.internet import reactor, threads from twisted.internet.defer import inlineCallbacks, returnValue -from rasa_nlu import config, utils +from rasa_nlu import config, utils, cli from rasa_nlu.config import RasaNLUModelConfig from rasa_nlu.data_router import ( DataRouter, InvalidProjectError, @@ -21,82 +21,12 @@ def create_argument_parser(): parser = argparse.ArgumentParser(description='parse incoming text') - add_run_arguments(parser) + cli.server.add_server_arguments(parser) utils.add_logging_option_arguments(parser) return parser -def add_run_arguments(parser): - parser.add_argument('-e', '--emulate', - choices=['wit', 'luis', 'dialogflow'], - help='which service to emulate (default: None i.e. 
use' - ' simple built in format)') - parser.add_argument('-P', '--port', - type=int, - default=5000, - help='port on which to run server') - parser.add_argument('--pre_load', - nargs='+', - default=[], - help='Preload models into memory before starting the ' - 'server. \nIf given `all` as input all the models ' - 'will be loaded.\nElse you can specify a list of ' - 'specific project names.\nEg: python -m ' - 'rasa_nlu.server --pre_load project1 ' - '--path projects ' - '-c config.yaml') - parser.add_argument('-t', '--token', - help="auth token. If set, reject requests which don't " - "provide this token as a query parameter") - parser.add_argument('-w', '--write', - help='file where logs will be saved') - parser.add_argument('--path', - required=True, - help="working directory of the server. Models are" - "loaded from this directory and trained models " - "will be saved here.") - parser.add_argument('--cors', - nargs="*", - help='List of domain patterns from where CORS ' - '(cross-origin resource sharing) calls are ' - 'allowed. The default value is `[]` which ' - 'forbids all CORS requests.') - - parser.add_argument('--max_training_processes', - type=int, - default=1, - help='Number of processes used to handle training ' - 'requests. Increasing this value will have a ' - 'great impact on memory usage. It is ' - 'recommended to keep the default value.') - parser.add_argument('--num_threads', - type=int, - default=1, - help='Number of parallel threads to use for ' - 'handling parse requests.') - parser.add_argument('--endpoints', - help='Configuration file for the model server ' - 'as a yaml file') - parser.add_argument('--wait_time_between_pulls', - type=int, - default=10, - help='Wait time in seconds between NLU model server' - 'queries.') - parser.add_argument('--response_log', - help='Directory where logs will be saved ' - '(containing queries and responses).' 
- 'If set to ``null`` logging will be disabled.') - parser.add_argument('--storage', - help='Set the remote location where models are stored. ' - 'E.g. on AWS. If nothing is configured, the ' - 'server will only serve the models that are ' - 'on disk in the configured `path`.') - parser.add_argument('-c', '--config', - help="Default model configuration file used for " - "training.") - - def check_cors(f): """Wraps a request handler with CORS headers checking.""" From 78f349646d2941729e347b6cb64bf9de58f451fb Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Thu, 21 Feb 2019 11:47:24 +0100 Subject: [PATCH 04/17] rename evaluate script to test --- rasa_nlu/data_router.py | 2 +- rasa_nlu/{evaluate.py => test.py} | 0 tests/base/test_evaluation.py | 10 +++++----- 3 files changed, 6 insertions(+), 6 deletions(-) rename rasa_nlu/{evaluate.py => test.py} (100%) diff --git a/rasa_nlu/data_router.py b/rasa_nlu/data_router.py index a24c8a8f4254..5c4f9d816b2e 100644 --- a/rasa_nlu/data_router.py +++ b/rasa_nlu/data_router.py @@ -14,7 +14,7 @@ from rasa_nlu.components import ComponentBuilder from rasa_nlu.config import RasaNLUModelConfig from rasa_nlu.emulators import NoEmulator -from rasa_nlu.evaluate import run_evaluation +from rasa_nlu.test import run_evaluation from rasa_nlu.model import InvalidProjectError from rasa_nlu.project import ( Project, STATUS_FAILED, STATUS_READY, STATUS_TRAINING, load_from_server) diff --git a/rasa_nlu/evaluate.py b/rasa_nlu/test.py similarity index 100% rename from rasa_nlu/evaluate.py rename to rasa_nlu/test.py diff --git a/tests/base/test_evaluation.py b/tests/base/test_evaluation.py index d6b53a5ef1ea..9f4ddb2755db 100644 --- a/tests/base/test_evaluation.py +++ b/tests/base/test_evaluation.py @@ -4,7 +4,7 @@ import pytest -from rasa_nlu.evaluate import ( +from rasa_nlu.test import ( is_token_within_entity, do_entities_overlap, merge_labels, remove_duckling_entities, remove_empty_intent_examples, get_entity_extractors, @@ -12,10 +12,10 @@ 
find_component, remove_duckling_extractors, drop_intents_below_freq, run_cv_evaluation, substitute_labels, IntentEvaluationResult, evaluate_intents, evaluate_entities) -from rasa_nlu.evaluate import does_token_cross_borders -from rasa_nlu.evaluate import align_entity_predictions -from rasa_nlu.evaluate import determine_intersection -from rasa_nlu.evaluate import determine_token_labels +from rasa_nlu.test import does_token_cross_borders +from rasa_nlu.test import align_entity_predictions +from rasa_nlu.test import determine_intersection +from rasa_nlu.test import determine_token_labels from rasa_nlu.config import RasaNLUModelConfig from rasa_nlu.tokenizers import Token from rasa_nlu import utils From 14bdd726eda8ec2112c31af523b695c875b7b129 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Thu, 21 Feb 2019 11:48:01 +0100 Subject: [PATCH 05/17] make dirs for data persistence if not existing --- rasa_nlu/training_data/training_data.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rasa_nlu/training_data/training_data.py b/rasa_nlu/training_data/training_data.py index 195ee752bdce..a9a0104d944d 100644 --- a/rasa_nlu/training_data/training_data.py +++ b/rasa_nlu/training_data/training_data.py @@ -125,11 +125,15 @@ def as_markdown(self) -> Text: from rasa_nlu.training_data.formats import MarkdownWriter return MarkdownWriter().dumps(self) - def persist(self, dir_name: Text) -> Dict[Text, Any]: + def persist(self, dir_name: Text, filename: Text = "training_data.json" + ) -> Dict[Text, Any]: """Persists this training data to disk and returns necessary information to load it again.""" - data_file = os.path.join(dir_name, "training_data.json") + if not os.path.exists(dir_name): + os.makedirs(dir_name) + + data_file = os.path.join(dir_name, filename) write_to_file(data_file, self.as_json(indent=2)) return { From 7da605674d02dbe60c7f91acbc2ceb365b652e11 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Thu, 21 Feb 2019 11:53:09 +0100 Subject: 
[PATCH 06/17] add missing deprecated evalute module --- rasa_nlu/evaluate.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 rasa_nlu/evaluate.py diff --git a/rasa_nlu/evaluate.py b/rasa_nlu/evaluate.py new file mode 100644 index 000000000000..f9794254815f --- /dev/null +++ b/rasa_nlu/evaluate.py @@ -0,0 +1,10 @@ +import logging + +import rasa_nlu.test as test + +logger = logging.getLogger(__name__) + +if __name__ == '__main__': # pragma: no cover + logger.warning("Calling `rasa_nlu.evaluate` is deprecated. " + "Please use `rasa_nlu.test` instead.") + test.main() From c7163d5fdf28deec07ca07937676637fbd0f77d9 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Thu, 21 Feb 2019 12:01:36 +0100 Subject: [PATCH 07/17] rename do_train to train and make it compatible with config file paths --- rasa_nlu/train.py | 52 +++++++++++++++++++----------------- tests/base/test_persistor.py | 2 +- tests/training/test_train.py | 18 ++++++------- tests/utilities.py | 6 ++--- 4 files changed, 40 insertions(+), 38 deletions(-) diff --git a/rasa_nlu/train.py b/rasa_nlu/train.py index e3b9be7eacf7..dd86fd2342a3 100644 --- a/rasa_nlu/train.py +++ b/rasa_nlu/train.py @@ -1,6 +1,6 @@ import argparse import logging -from typing import Any, Optional, Text, Tuple +from typing import Any, Optional, Text, Tuple, Union from rasa_nlu import config, utils from rasa_nlu.components import ComponentBuilder @@ -106,37 +106,39 @@ def do_train_in_worker(cfg: RasaNLUModelConfig, """Loads the trainer and the data and runs the training in a worker.""" try: - _, _, persisted_path = do_train(cfg, data, path, project, - fixed_model_name, storage, - component_builder) + _, _, persisted_path = train(cfg, data, path, project, + fixed_model_name, storage, + component_builder) return persisted_path except BaseException as e: logger.exception("Failed to train project '{}'.".format(project)) raise TrainingException(project, e) -def do_train(cfg: RasaNLUModelConfig, - data: Text, - path: 
Optional[Text] = None, - project: Optional[Text] = None, - fixed_model_name: Optional[Text] = None, - storage: Optional[Text] = None, - component_builder: Optional[ComponentBuilder] = None, - training_data_endpoint: Optional[EndpointConfig] = None, - **kwargs: Any - ) -> Tuple[Trainer, Interpreter, Text]: +def train(nlu_config: Union[Text, RasaNLUModelConfig], + data: Text, + path: Optional[Text] = None, + project: Optional[Text] = None, + fixed_model_name: Optional[Text] = None, + storage: Optional[Text] = None, + component_builder: Optional[ComponentBuilder] = None, + training_data_endpoint: Optional[EndpointConfig] = None, + **kwargs: Any + ) -> Tuple[Trainer, Interpreter, Text]: """Loads the trainer and the data and runs the training of the model.""" + if isinstance(nlu_config, str): + nlu_config = config.load(nlu_config) # Ensure we are training a model that we can save in the end # WARN: there is still a race condition if a model with the same name is # trained in another subprocess - trainer = Trainer(cfg, component_builder) + trainer = Trainer(nlu_config, component_builder) persistor = create_persistor(storage) if training_data_endpoint is not None: training_data = load_data_from_endpoint(training_data_endpoint, - cfg.language) + nlu_config.language) else: - training_data = load_data(data, cfg.language) + training_data = load_data(data, nlu_config.language) interpreter = trainer.train(training_data, **kwargs) if path: @@ -160,12 +162,12 @@ def do_train(cfg: RasaNLUModelConfig, else: data_endpoint = read_endpoints(cmdline_args.endpoints).data - do_train(config.load(cmdline_args.config), - cmdline_args.data, - cmdline_args.path, - cmdline_args.project, - cmdline_args.fixed_model_name, - cmdline_args.storage, - data_endpoint=data_endpoint, - num_threads=cmdline_args.num_threads) + train(cmdline_args.config, + cmdline_args.data, + cmdline_args.path, + cmdline_args.project, + cmdline_args.fixed_model_name, + cmdline_args.storage, + data_endpoint=data_endpoint, + 
num_threads=cmdline_args.num_threads) logger.info("Finished training") diff --git a/tests/base/test_persistor.py b/tests/base/test_persistor.py index 46de6f0a5aeb..954e061e33b5 100644 --- a/tests/base/test_persistor.py +++ b/tests/base/test_persistor.py @@ -24,7 +24,7 @@ def test_list_projects_method_in_AWSPersistor(component_builder, tmpdir): os.environ["BUCKET_NAME"] = 'rasa-test' os.environ["AWS_DEFAULT_REGION"] = 'us-east-1' - (trained, _, persisted_path) = train.do_train( + (trained, _, persisted_path) = train.train( _config, data="data/test/demo-rasa-small.json", path=tmpdir.strpath, diff --git a/tests/training/test_train.py b/tests/training/test_train.py index b67a7b0b3330..5b08cee6f22c 100644 --- a/tests/training/test_train.py +++ b/tests/training/test_train.py @@ -68,7 +68,7 @@ def test_all_components_are_in_at_least_one_test_pipeline(): list(registry.registered_pipeline_templates.keys())) def test_train_model(pipeline_template, component_builder, tmpdir): _config = utilities.base_test_conf(pipeline_template) - (trained, _, persisted_path) = train.do_train( + (trained, _, persisted_path) = train.train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, @@ -89,13 +89,13 @@ def test_random_seed(component_builder, tmpdir): _config.set_component_attr("intent_classifier_tensorflow_embedding", random_seed=1) # first run - (trained_a, _, persisted_path_a) = train.do_train( + (trained_a, _, persisted_path_a) = train.train( _config, path=tmpdir.strpath + "_a", data=DEFAULT_DATA_PATH, component_builder=component_builder) # second run - (trained_b, _, persisted_path_b) = train.do_train( + (trained_b, _, persisted_path_b) = train.train( _config, path=tmpdir.strpath + "_b", data=DEFAULT_DATA_PATH, @@ -112,7 +112,7 @@ def test_random_seed(component_builder, tmpdir): def test_train_model_on_test_pipelines(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - (trained, _, persisted_path) = 
train.do_train( + (trained, _, persisted_path) = train.train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, @@ -128,7 +128,7 @@ def test_train_model_on_test_pipelines(language, pipeline, @pytest.mark.parametrize("language, pipeline", pipelines_for_tests()) def test_train_model_noents(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - (trained, _, persisted_path) = train.do_train( + (trained, _, persisted_path) = train.train( _config, path=tmpdir.strpath, data="./data/test/demo-rasa-noents.json", @@ -144,7 +144,7 @@ def test_train_model_noents(language, pipeline, component_builder, tmpdir): @pytest.mark.parametrize("language, pipeline", pipelines_for_tests()) def test_train_model_multithread(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - (trained, _, persisted_path) = train.do_train( + (trained, _, persisted_path) = train.train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, @@ -161,7 +161,7 @@ def test_train_model_empty_pipeline(component_builder): # Should return an empty pipeline _config = utilities.base_test_conf(pipeline_template=None) with pytest.raises(ValueError): - train.do_train( + train.train( _config, data=DEFAULT_DATA_PATH, component_builder=component_builder) @@ -169,7 +169,7 @@ def test_train_model_empty_pipeline(component_builder): def test_train_named_model(component_builder, tmpdir): _config = utilities.base_test_conf("keyword") - (trained, _, persisted_path) = train.do_train( + (trained, _, persisted_path) = train.train( _config, path=tmpdir.strpath, project="my_keyword_model", @@ -185,7 +185,7 @@ def test_handles_pipeline_with_non_existing_component(component_builder): _config = utilities.base_test_conf("spacy_sklearn") _config.pipeline.append({"name": "my_made_up_component"}) with pytest.raises(Exception) as execinfo: - train.do_train( + train.train( _config, 
data=DEFAULT_DATA_PATH, component_builder=component_builder) diff --git a/tests/utilities.py b/tests/utilities.py index 27b65260b562..0d8e691d2cf7 100644 --- a/tests/utilities.py +++ b/tests/utilities.py @@ -5,7 +5,7 @@ from rasa_nlu.config import RasaNLUModelConfig from rasa_nlu.model import Interpreter -from rasa_nlu.train import do_train +from rasa_nlu.train import train slowtest = pytest.mark.slowtest @@ -29,8 +29,8 @@ def write_file_config(file_config): def interpreter_for(component_builder, data, path, config): - (trained, _, path) = do_train(config, data, path, - component_builder=component_builder) + (trained, _, path) = train(config, data, path, + component_builder=component_builder) interpreter = Interpreter.load(path, component_builder) return interpreter From 2e5cda6b5158bb346ea74ee8aa9b02420a351188 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Fri, 22 Feb 2019 15:11:16 +0100 Subject: [PATCH 08/17] prepare for api --- rasa_nlu/__init__.py | 5 +++++ rasa_nlu/convert.py | 1 - rasa_nlu/test.py | 27 +++++++++++++++++---------- tests/base/test_evaluation.py | 4 ++-- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/rasa_nlu/__init__.py b/rasa_nlu/__init__.py index 02dee4d28c72..47e576327414 100644 --- a/rasa_nlu/__init__.py +++ b/rasa_nlu/__init__.py @@ -2,6 +2,11 @@ import rasa_nlu.version +from rasa_nlu.train import train +from rasa_nlu.test import run_evaluation as test +from rasa_nlu.test import cross_validate +from rasa_nlu.training_data import load_data + logging.getLogger(__name__).addHandler(logging.NullHandler()) __version__ = rasa_nlu.version.__version__ diff --git a/rasa_nlu/convert.py b/rasa_nlu/convert.py index e92fa5392563..fed74e8f2592 100644 --- a/rasa_nlu/convert.py +++ b/rasa_nlu/convert.py @@ -47,5 +47,4 @@ def main(args): parser = argparse.ArgumentParser( description='Convert training data formats into one another') add_arguments(parser) - main(parser.parse_args()) diff --git a/rasa_nlu/test.py b/rasa_nlu/test.py 
index 3e4f5a8586e9..0941b5d811d2 100644 --- a/rasa_nlu/test.py +++ b/rasa_nlu/test.py @@ -6,7 +6,7 @@ import logging import numpy as np import shutil -from typing import List, Optional, Text +from typing import List, Optional, Text, Union from rasa_nlu import config, training_data, utils from rasa_nlu.config import RasaNLUModelConfig @@ -804,19 +804,26 @@ def combine_entity_result(results, interpreter, data): return results -def run_cv_evaluation(data: TrainingData, - n_folds: int, - nlu_config: RasaNLUModelConfig) -> CVEvaluationResult: - """Stratified cross validation on data - :param data: Training Data - :param n_folds: integer, number of cv folds - :param nlu_config: nlu config file - :return: dictionary with key, list structure, where each entry in list +def cross_validate(data: TrainingData, n_folds: int, + nlu_config: Union[RasaNLUModelConfig, Text] + ) -> CVEvaluationResult: + """Stratified cross validation on data. + + Args: + data: Training Data + n_folds: integer, number of cv folds + nlu_config: nlu config file + + Returns: + dictionary with key, list structure, where each entry in list corresponds to the relevant result for one fold """ from collections import defaultdict import tempfile + if isinstance(nlu_config, str): + nlu_config = config.load(nlu_config) + trainer = Trainer(nlu_config) train_results = defaultdict(list) test_results = defaultdict(list) @@ -947,7 +954,7 @@ def main(): nlu_config = config.load(cmdline_args.config) data = training_data.load_data(cmdline_args.data) data = drop_intents_below_freq(data, cutoff=5) - results, entity_results = run_cv_evaluation( + results, entity_results = cross_validate( data, int(cmdline_args.folds), nlu_config) logger.info("CV evaluation (n={})".format(cmdline_args.folds)) diff --git a/tests/base/test_evaluation.py b/tests/base/test_evaluation.py index 9f4ddb2755db..9756f4ff209d 100644 --- a/tests/base/test_evaluation.py +++ b/tests/base/test_evaluation.py @@ -10,7 +10,7 @@ remove_empty_intent_examples, 
get_entity_extractors, get_duckling_dimensions, known_duckling_dimensions, find_component, remove_duckling_extractors, drop_intents_below_freq, - run_cv_evaluation, substitute_labels, IntentEvaluationResult, + cross_validate, substitute_labels, IntentEvaluationResult, evaluate_intents, evaluate_entities) from rasa_nlu.test import does_token_cross_borders from rasa_nlu.test import align_entity_predictions @@ -242,7 +242,7 @@ def test_run_cv_evaluation(): nlu_config = config.load("sample_configs/config_spacy.yml") n_folds = 2 - results, entity_results = run_cv_evaluation(td, n_folds, nlu_config) + results, entity_results = cross_validate(td, n_folds, nlu_config) assert len(results.train["Accuracy"]) == n_folds assert len(results.train["Precision"]) == n_folds From 9b4d716393a3bba6ca715e6cb5186db9605729b0 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Fri, 22 Feb 2019 15:23:10 +0100 Subject: [PATCH 09/17] refactor imports --- rasa_nlu/cli/__init__.py | 1 - rasa_nlu/server.py | 5 +++-- rasa_nlu/train.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/rasa_nlu/cli/__init__.py b/rasa_nlu/cli/__init__.py index 71b760f46232..e69de29bb2d1 100644 --- a/rasa_nlu/cli/__init__.py +++ b/rasa_nlu/cli/__init__.py @@ -1 +0,0 @@ -import rasa_nlu.cli.server diff --git a/rasa_nlu/server.py b/rasa_nlu/server.py index 69bbec6d82b5..1c4bfa90525d 100644 --- a/rasa_nlu/server.py +++ b/rasa_nlu/server.py @@ -6,7 +6,8 @@ from twisted.internet import reactor, threads from twisted.internet.defer import inlineCallbacks, returnValue -from rasa_nlu import config, utils, cli +from rasa_nlu import config, utils +import rasa_nlu.cli.server as cli from rasa_nlu.config import RasaNLUModelConfig from rasa_nlu.data_router import ( DataRouter, InvalidProjectError, @@ -21,7 +22,7 @@ def create_argument_parser(): parser = argparse.ArgumentParser(description='parse incoming text') - cli.server.add_server_arguments(parser) + cli.add_server_arguments(parser) 
utils.add_logging_option_arguments(parser) return parser diff --git a/rasa_nlu/train.py b/rasa_nlu/train.py index dd86fd2342a3..049f6e422377 100644 --- a/rasa_nlu/train.py +++ b/rasa_nlu/train.py @@ -129,6 +129,7 @@ def train(nlu_config: Union[Text, RasaNLUModelConfig], if isinstance(nlu_config, str): nlu_config = config.load(nlu_config) + # Ensure we are training a model that we can save in the end # WARN: there is still a race condition if a model with the same name is # trained in another subprocess From f1292a8d7a29b84877f234ef441280b0606d7c3d Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Fri, 22 Feb 2019 15:27:00 +0100 Subject: [PATCH 10/17] update changelog --- CHANGELOG.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 7cb91c84b81a..f82dd62c89a5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -14,6 +14,11 @@ Changed ------- - applied spacy guidelines on how to disable pipeline components - starter packs now also tested when attempting to merge a branch to master +- renamed ``rasa_nlu.evaluate`` to ``rasa_nlu.test`` +- renamed ``rasa_nlu.test.run_cv_evaluation`` to + ``rasa_nlu.test.cross_validate`` +- renamed ``rasa_nlu.train.do_train()`` to ``rasa_nlu.train.train()`` +- train command can now also load config from file ======= - replace pep8 with pycodestyle From 3b1007770451b7c8c553b8ae7fe81eae6a905d15 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Fri, 22 Feb 2019 15:38:44 +0100 Subject: [PATCH 11/17] fix travis --- tests/base/test_persistor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/base/test_persistor.py b/tests/base/test_persistor.py index 954e061e33b5..de46c3408acf 100644 --- a/tests/base/test_persistor.py +++ b/tests/base/test_persistor.py @@ -24,7 +24,7 @@ def test_list_projects_method_in_AWSPersistor(component_builder, tmpdir): os.environ["BUCKET_NAME"] = 'rasa-test' os.environ["AWS_DEFAULT_REGION"] = 'us-east-1' - (trained, _, persisted_path) = 
train.train( + (trained, _, persisted_path) = train( _config, data="data/test/demo-rasa-small.json", path=tmpdir.strpath, From 8cf0adbc5cbb416bc06da6d55018d98db137a6ae Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Fri, 22 Feb 2019 16:08:20 +0100 Subject: [PATCH 12/17] fix test imports --- tests/training/test_train.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/training/test_train.py b/tests/training/test_train.py index 5b08cee6f22c..34b8677870de 100644 --- a/tests/training/test_train.py +++ b/tests/training/test_train.py @@ -68,7 +68,7 @@ def test_all_components_are_in_at_least_one_test_pipeline(): list(registry.registered_pipeline_templates.keys())) def test_train_model(pipeline_template, component_builder, tmpdir): _config = utilities.base_test_conf(pipeline_template) - (trained, _, persisted_path) = train.train( + (trained, _, persisted_path) = train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, @@ -89,13 +89,13 @@ def test_random_seed(component_builder, tmpdir): _config.set_component_attr("intent_classifier_tensorflow_embedding", random_seed=1) # first run - (trained_a, _, persisted_path_a) = train.train( + (trained_a, _, persisted_path_a) = train( _config, path=tmpdir.strpath + "_a", data=DEFAULT_DATA_PATH, component_builder=component_builder) # second run - (trained_b, _, persisted_path_b) = train.train( + (trained_b, _, persisted_path_b) = train( _config, path=tmpdir.strpath + "_b", data=DEFAULT_DATA_PATH, @@ -112,7 +112,7 @@ def test_random_seed(component_builder, tmpdir): def test_train_model_on_test_pipelines(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - (trained, _, persisted_path) = train.train( + (trained, _, persisted_path) = train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, @@ -128,7 +128,7 @@ def test_train_model_on_test_pipelines(language, pipeline, @pytest.mark.parametrize("language, pipeline", 
pipelines_for_tests()) def test_train_model_noents(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - (trained, _, persisted_path) = train.train( + (trained, _, persisted_path) = train( _config, path=tmpdir.strpath, data="./data/test/demo-rasa-noents.json", @@ -144,7 +144,7 @@ def test_train_model_noents(language, pipeline, component_builder, tmpdir): @pytest.mark.parametrize("language, pipeline", pipelines_for_tests()) def test_train_model_multithread(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) - (trained, _, persisted_path) = train.train( + (trained, _, persisted_path) = train( _config, path=tmpdir.strpath, data=DEFAULT_DATA_PATH, @@ -161,7 +161,7 @@ def test_train_model_empty_pipeline(component_builder): # Should return an empty pipeline _config = utilities.base_test_conf(pipeline_template=None) with pytest.raises(ValueError): - train.train( + train( _config, data=DEFAULT_DATA_PATH, component_builder=component_builder) @@ -169,7 +169,7 @@ def test_train_model_empty_pipeline(component_builder): def test_train_named_model(component_builder, tmpdir): _config = utilities.base_test_conf("keyword") - (trained, _, persisted_path) = train.train( + (trained, _, persisted_path) = train( _config, path=tmpdir.strpath, project="my_keyword_model", From ed67813ea45db221089eb65b260e1c7c81ffa6cb Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Fri, 22 Feb 2019 17:29:06 +0100 Subject: [PATCH 13/17] fix test imports --- tests/training/test_train.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/training/test_train.py b/tests/training/test_train.py index 34b8677870de..7bf22fbb2436 100644 --- a/tests/training/test_train.py +++ b/tests/training/test_train.py @@ -185,10 +185,8 @@ def test_handles_pipeline_with_non_existing_component(component_builder): _config = 
utilities.base_test_conf("spacy_sklearn") _config.pipeline.append({"name": "my_made_up_component"}) with pytest.raises(Exception) as execinfo: - train.train( - _config, - data=DEFAULT_DATA_PATH, - component_builder=component_builder) + train(_config, data=DEFAULT_DATA_PATH, + component_builder=component_builder) assert "Failed to find component" in str(execinfo.value) From f11c1386e8a1b5149977738bc238ee7063b65e60 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Sun, 24 Feb 2019 14:23:15 +0100 Subject: [PATCH 14/17] improve imports --- rasa_nlu/training_data/formats/markdown.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rasa_nlu/training_data/formats/markdown.py b/rasa_nlu/training_data/formats/markdown.py index 524ea558721c..a9ad8e42c9df 100644 --- a/rasa_nlu/training_data/formats/markdown.py +++ b/rasa_nlu/training_data/formats/markdown.py @@ -1,15 +1,13 @@ import logging import re -import typing -from typing import Any, Text +from typing import Any, Text, TYPE_CHECKING from rasa_nlu.training_data.formats.readerwriter import ( TrainingDataReader, TrainingDataWriter) -from rasa_nlu.training_data.util import check_duplicate_synonym from rasa_nlu.utils import build_entity -if typing.TYPE_CHECKING: +if TYPE_CHECKING: from rasa_nlu.training_data import Message, TrainingData INTENT = "intent" @@ -143,6 +141,8 @@ def _find_entities_in_training_example(example): return entities def _add_synonym(self, text, value): + from rasa_nlu.training_data.util import check_duplicate_synonym + check_duplicate_synonym(self.entity_synonyms, text, value, "reading markdown") self.entity_synonyms[text] = value From 41fc97eef1c2d1d01040458010e403e3fe346e31 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Mon, 4 Mar 2019 11:28:25 +0100 Subject: [PATCH 15/17] add renamed functions to migration guide --- docs/migrations.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/migrations.rst b/docs/migrations.rst index 2834114ee17a..b65febeb6577 
100644 --- a/docs/migrations.rst +++ b/docs/migrations.rst @@ -35,6 +35,12 @@ custom components - ``persist(...)`` method additionally takes file name prefix Change your custom components accordingly. +function names +~~~~~~~~~~~~~~ +- ``rasa_nlu.evaluate`` was renamed to ``rasa_nlu.test`` +- ``rasa_nlu.test.run_cv_evaluation`` was renamed to + ``rasa_nlu.test.cross_validate`` +- ``rasa_nlu.train.do_train()`` was renamed to ``rasa_nlu.train.train()`` 0.13.x to 0.14.0 ---------------- From e6687438775b69115b7022b2d00635f586c19606 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Mon, 4 Mar 2019 11:30:35 +0100 Subject: [PATCH 16/17] revert import of TYPE_CHECKING variable --- rasa_nlu/training_data/formats/markdown.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rasa_nlu/training_data/formats/markdown.py b/rasa_nlu/training_data/formats/markdown.py index a9ad8e42c9df..876d4d20e117 100644 --- a/rasa_nlu/training_data/formats/markdown.py +++ b/rasa_nlu/training_data/formats/markdown.py @@ -1,13 +1,14 @@ import logging import re -from typing import Any, Text, TYPE_CHECKING +import typing +from typing import Any, Text from rasa_nlu.training_data.formats.readerwriter import ( TrainingDataReader, TrainingDataWriter) from rasa_nlu.utils import build_entity -if TYPE_CHECKING: +if typing.TYPE_CHECKING: from rasa_nlu.training_data import Message, TrainingData INTENT = "intent" From 7604f32a42eab91c93578234d26a2e1a7d8bac38 Mon Sep 17 00:00:00 2001 From: Tobias Wochinger Date: Mon, 4 Mar 2019 11:33:37 +0100 Subject: [PATCH 17/17] add constant for training_data.json --- rasa_nlu/training_data/training_data.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rasa_nlu/training_data/training_data.py b/rasa_nlu/training_data/training_data.py index a9a0104d944d..0526c9728bce 100644 --- a/rasa_nlu/training_data/training_data.py +++ b/rasa_nlu/training_data/training_data.py @@ -13,6 +13,8 @@ from rasa_nlu.training_data.util import 
check_duplicate_synonym from rasa_nlu.utils import lazyproperty, list_to_str, write_to_file +DEFAULT_TRAINING_DATA_OUTPUT_PATH = "training_data.json" + logger = logging.getLogger(__name__) @@ -125,7 +127,8 @@ def as_markdown(self) -> Text: from rasa_nlu.training_data.formats import MarkdownWriter return MarkdownWriter().dumps(self) - def persist(self, dir_name: Text, filename: Text = "training_data.json" + def persist(self, dir_name: Text, + filename: Text = DEFAULT_TRAINING_DATA_OUTPUT_PATH ) -> Dict[Text, Any]: """Persists this training data to disk and returns necessary information to load it again.""" @@ -137,7 +140,7 @@ def persist(self, dir_name: Text, filename: Text = "training_data.json" write_to_file(data_file, self.as_json(indent=2)) return { - "training_data": "training_data.json" + "training_data": DEFAULT_TRAINING_DATA_OUTPUT_PATH } def sorted_entities(self) -> List[Any]: