From 871b27a9e6bf45ec608f7a431125ad454bf9c064 Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Tue, 25 Jun 2019 11:42:09 -0300 Subject: [PATCH 01/13] Update microsoft LUIS nlu data version Signed-off-by: ArthurTemporim --- rasa/nlu/training_data/formats/luis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/training_data/formats/luis.py b/rasa/nlu/training_data/formats/luis.py index a8d27e9f8eac..22cffdc8db45 100644 --- a/rasa/nlu/training_data/formats/luis.py +++ b/rasa/nlu/training_data/formats/luis.py @@ -19,10 +19,10 @@ def read_from_json(self, js: Dict[Text, Any], **kwargs: Any) -> "TrainingData": regex_features = [] # Simple check to ensure we support this luis data schema version - if not js["luis_schema_version"].startswith("2"): + if not js["luis_schema_version"].startswith("3"): raise Exception( "Invalid luis data schema version {}, " - "should be 2.x.x. " + "should be 3.x.x. " "Make sure to use the latest luis version " "(e.g. by downloading your data again)." 
"".format(js["luis_schema_version"]) From 9cb0488619ae72769e9646157e4dd1a1a461789b Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Wed, 26 Jun 2019 14:27:29 -0300 Subject: [PATCH 02/13] Improve method _is_nlu_format to understand more nlu types Signed-off-by: ArthurTemporim --- rasa/data.py | 13 +++---------- rasa/nlu/__init__.py | 2 -- rasa/nlu/training_data/loading.py | 4 ++-- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/rasa/data.py b/rasa/data.py index 95e85dfb44ed..839da2c62145 100644 --- a/rasa/data.py +++ b/rasa/data.py @@ -4,8 +4,9 @@ import tempfile import uuid import typing -from typing import Tuple, List, Text, Set, Union, Optional import re +from typing import Tuple, List, Text, Set, Union, Optional +from rasa.nlu.training_data.loading import guess_format import rasa.utils.io as io_utils @@ -156,15 +157,7 @@ def _is_valid_filetype(path: Text) -> bool: def _is_nlu_file(file_path: Text) -> bool: - with open(file_path, encoding="utf-8") as f: - if file_path.endswith(".json"): - content = io_utils.read_json_file(file_path) - is_nlu_file = ( - isinstance(content, dict) and content.get("rasa_nlu_data") is not None - ) - else: - is_nlu_file = any(_contains_nlu_pattern(l) for l in f) - return is_nlu_file + return guess_format(file_path) != 'unk' def _contains_nlu_pattern(text: Text) -> bool: diff --git a/rasa/nlu/__init__.py b/rasa/nlu/__init__.py index 5fd59bf6feba..7c542d054fb2 100644 --- a/rasa/nlu/__init__.py +++ b/rasa/nlu/__init__.py @@ -2,8 +2,6 @@ import rasa from rasa.nlu.train import train -from rasa.nlu.test import run_evaluation as test -from rasa.nlu.test import cross_validate from rasa.nlu.training_data import load_data logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/rasa/nlu/training_data/loading.py b/rasa/nlu/training_data/loading.py index 0b3f76a08a8c..fba5ab363696 100644 --- a/rasa/nlu/training_data/loading.py +++ b/rasa/nlu/training_data/loading.py @@ -110,7 +110,7 @@ def 
_reader_factory(fformat: Text) -> Optional["TrainingDataReader"]: def _load(filename: Text, language: Optional[Text] = "en") -> Optional["TrainingData"]: """Loads a single training data file from disk.""" - fformat = _guess_format(filename) + fformat = guess_format(filename) if fformat == UNK: raise ValueError("Unknown data format for file {}".format(filename)) @@ -123,7 +123,7 @@ def _load(filename: Text, language: Optional[Text] = "en") -> Optional["Training return None -def _guess_format(filename: Text) -> Text: +def guess_format(filename: Text) -> Text: """Applies heuristics to guess the data format of a file.""" guess = UNK content = rasa.utils.io.read_file(filename) From 8b16fb58086b11640648b9291810377208a4ee20 Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Thu, 27 Jun 2019 10:58:58 -0300 Subject: [PATCH 03/13] Update method _guess_format to guess_format Signed-off-by: ArthurTemporim --- rasa/core/training/interactive.py | 4 ++-- rasa/data.py | 18 ++++++++---------- rasa/nlu/training_data/util.py | 4 ++-- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/rasa/core/training/interactive.py b/rasa/core/training/interactive.py index 646256f1cfc2..2cfb959cdda0 100644 --- a/rasa/core/training/interactive.py +++ b/rasa/core/training/interactive.py @@ -58,7 +58,7 @@ from rasa.utils.endpoints import EndpointConfig # noinspection PyProtectedMember -from rasa.nlu.training_data.loading import _guess_format, load_data +from rasa.nlu.training_data.loading import guess_format, load_data from rasa.nlu.training_data.message import Message # WARNING: This command line UI is using an external library @@ -797,7 +797,7 @@ async def _write_nlu_to_file( # need to guess the format of the file before opening it to avoid a read # in a write - if _guess_format(export_nlu_path) in {"md", "unk"}: + if guess_format(export_nlu_path) in {"md", "unk"}: fformat = "md" else: fformat = "json" diff --git a/rasa/data.py b/rasa/data.py index 839da2c62145..dbaaace64066 100644 --- 
a/rasa/data.py +++ b/rasa/data.py @@ -8,8 +8,6 @@ from typing import Tuple, List, Text, Set, Union, Optional from rasa.nlu.training_data.loading import guess_format -import rasa.utils.io as io_utils - logger = logging.getLogger(__name__) if typing.TYPE_CHECKING: @@ -24,8 +22,8 @@ def get_core_directory( Args: paths: List of paths to training files or folders containing them. - skill_imports: `SkillSelector` instance which determines which files should - be loaded. + skill_imports: `SkillSelector` instance which determines which files + should be loaded. Returns: Path to temporary directory containing all found Core training files. @@ -42,8 +40,8 @@ def get_nlu_directory( Args: paths: List of paths to training files or folders containing them. - skill_imports: `SkillSelector` instance which determines which files should - be loaded. + skill_imports: `SkillSelector` instance which determines which files + should be loaded. Returns: Path to temporary directory containing all found NLU training files. @@ -60,8 +58,8 @@ def get_core_nlu_directories( Args: paths: List of paths to training files or folders containing them. - skill_imports: `SkillSelector` instance which determines which files should - be loaded. + skill_imports: `SkillSelector` instance which determines which files + should be loaded. Returns: Path to directory containing the Core files and path to directory @@ -84,8 +82,8 @@ def get_core_nlu_files( Args: paths: List of paths to training files or folders containing them. - skill_imports: `SkillSelector` instance which determines which files should - be loaded. + skill_imports: `SkillSelector` instance which determines which files + should be loaded. Returns: Tuple of paths to story and NLU files. 
diff --git a/rasa/nlu/training_data/util.py b/rasa/nlu/training_data/util.py index aa965962a9fc..b65d2c4b8a94 100644 --- a/rasa/nlu/training_data/util.py +++ b/rasa/nlu/training_data/util.py @@ -30,14 +30,14 @@ def check_duplicate_synonym(entity_synonyms, text, syn, context_str=""): def get_file_format(resource_name: Text) -> Text: - from rasa.nlu.training_data.loading import _guess_format + from rasa.nlu.training_data.loading import guess_format if resource_name is None or not os.path.exists(resource_name): raise AttributeError("Resource '{}' does not exist.".format(resource_name)) files = utils.list_files(resource_name) - file_formats = list(map(lambda f: _guess_format(f), files)) + file_formats = list(map(lambda f: guess_format(f), files)) if not file_formats: return "json" From eb6155e4b677c411c2757a555c5b2fd783f48ee7 Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Fri, 28 Jun 2019 00:55:54 -0300 Subject: [PATCH 04/13] Add unk constant, remove unnecessary method and change luis version back to 2 Signed-off-by: ArthurTemporim --- rasa/constants.py | 2 ++ rasa/data.py | 9 ++------- rasa/nlu/training_data/formats/luis.py | 4 ++-- rasa/nlu/training_data/loading.py | 6 +++--- 4 files changed, 9 insertions(+), 12 deletions(-) diff --git a/rasa/constants.py b/rasa/constants.py index 346fd89439b3..439f66443124 100644 --- a/rasa/constants.py +++ b/rasa/constants.py @@ -12,6 +12,8 @@ DEFAULT_NLU_RESULTS_PATH = "nlu_comparison_results" DEFAULT_REQUEST_TIMEOUT = 60 * 5 # 5 minutes +NLU_FORMAT_UNKNOWN = "unk" + TEST_DATA_FILE = "test.md" TRAIN_DATA_FILE = "train.md" RESULTS_FILE = "results.json" diff --git a/rasa/data.py b/rasa/data.py index dbaaace64066..2b42c13100f9 100644 --- a/rasa/data.py +++ b/rasa/data.py @@ -7,6 +7,7 @@ import re from typing import Tuple, List, Text, Set, Union, Optional from rasa.nlu.training_data.loading import guess_format +from rasa import constants logger = logging.getLogger(__name__) @@ -155,13 +156,7 @@ def _is_valid_filetype(path: Text) 
-> bool: def _is_nlu_file(file_path: Text) -> bool: - return guess_format(file_path) != 'unk' - - -def _contains_nlu_pattern(text: Text) -> bool: - nlu_pattern = r"\s*##\s*(intent|regex||synonym|lookup):" - - return re.match(nlu_pattern, text) is not None + return guess_format(file_path) != constants.NLU_FORMAT_UNKNOWN def _is_story_file(file_path: Text) -> bool: diff --git a/rasa/nlu/training_data/formats/luis.py b/rasa/nlu/training_data/formats/luis.py index 22cffdc8db45..a8d27e9f8eac 100644 --- a/rasa/nlu/training_data/formats/luis.py +++ b/rasa/nlu/training_data/formats/luis.py @@ -19,10 +19,10 @@ def read_from_json(self, js: Dict[Text, Any], **kwargs: Any) -> "TrainingData": regex_features = [] # Simple check to ensure we support this luis data schema version - if not js["luis_schema_version"].startswith("3"): + if not js["luis_schema_version"].startswith("2"): raise Exception( "Invalid luis data schema version {}, " - "should be 3.x.x. " + "should be 2.x.x. " "Make sure to use the latest luis version " "(e.g. by downloading your data again)." 
"".format(js["luis_schema_version"]) diff --git a/rasa/nlu/training_data/loading.py b/rasa/nlu/training_data/loading.py index fba5ab363696..a819a200b11c 100644 --- a/rasa/nlu/training_data/loading.py +++ b/rasa/nlu/training_data/loading.py @@ -3,6 +3,7 @@ import requests import typing from typing import Optional, Text +from rasa.constants import NLU_FORMAT_UNKNOWN import rasa.utils.io from rasa.nlu import utils @@ -28,7 +29,6 @@ WIT = "wit" LUIS = "luis" RASA = "rasa_nlu" -UNK = "unk" MARKDOWN = "md" DIALOGFLOW_RELEVANT = {DIALOGFLOW_ENTITIES, DIALOGFLOW_INTENT} @@ -111,7 +111,7 @@ def _load(filename: Text, language: Optional[Text] = "en") -> Optional["Training """Loads a single training data file from disk.""" fformat = guess_format(filename) - if fformat == UNK: + if fformat == NLU_FORMAT_UNKNOWN: raise ValueError("Unknown data format for file {}".format(filename)) logger.info("Training data format of {} is {}".format(filename, fformat)) @@ -125,7 +125,7 @@ def _load(filename: Text, language: Optional[Text] = "en") -> Optional["Training def guess_format(filename: Text) -> Text: """Applies heuristics to guess the data format of a file.""" - guess = UNK + guess = NLU_FORMAT_UNKNOWN content = rasa.utils.io.read_file(filename) try: js = json.loads(content) From 410302aa639c13386585388a30176af60ceadc04 Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Fri, 28 Jun 2019 10:47:36 -0300 Subject: [PATCH 05/13] Update UNK constant, improve loading imports and add docstrings to loading nlu file methods Signed-off-by: ArthurTemporim --- rasa/constants.py | 2 -- rasa/core/training/interactive.py | 6 +++--- rasa/data.py | 11 +++-------- rasa/nlu/training_data/loading.py | 28 ++++++++++++++++++++++++---- 4 files changed, 30 insertions(+), 17 deletions(-) diff --git a/rasa/constants.py b/rasa/constants.py index 439f66443124..346fd89439b3 100644 --- a/rasa/constants.py +++ b/rasa/constants.py @@ -12,8 +12,6 @@ DEFAULT_NLU_RESULTS_PATH = "nlu_comparison_results" 
DEFAULT_REQUEST_TIMEOUT = 60 * 5 # 5 minutes -NLU_FORMAT_UNKNOWN = "unk" - TEST_DATA_FILE = "test.md" TRAIN_DATA_FILE = "train.md" RESULTS_FILE = "results.json" diff --git a/rasa/core/training/interactive.py b/rasa/core/training/interactive.py index 2cfb959cdda0..5c521d6255c7 100644 --- a/rasa/core/training/interactive.py +++ b/rasa/core/training/interactive.py @@ -58,7 +58,7 @@ from rasa.utils.endpoints import EndpointConfig # noinspection PyProtectedMember -from rasa.nlu.training_data.loading import guess_format, load_data +from rasa.nlu.training_data import loading from rasa.nlu.training_data.message import Message # WARNING: This command line UI is using an external library @@ -776,7 +776,7 @@ async def _write_nlu_to_file( # noinspection PyBroadException try: - previous_examples = load_data(export_nlu_path) + previous_examples = loading.load_data(export_nlu_path) except Exception as e: logger.exception("An exception occurred while trying to load the NLU data.") @@ -797,7 +797,7 @@ async def _write_nlu_to_file( # need to guess the format of the file before opening it to avoid a read # in a write - if guess_format(export_nlu_path) in {"md", "unk"}: + if loading.guess_format(export_nlu_path) in {"md", "unk"}: fformat = "md" else: fformat = "json" diff --git a/rasa/data.py b/rasa/data.py index 2b42c13100f9..f631172dcd58 100644 --- a/rasa/data.py +++ b/rasa/data.py @@ -6,8 +6,7 @@ import typing import re from typing import Tuple, List, Text, Set, Union, Optional -from rasa.nlu.training_data.loading import guess_format -from rasa import constants +from rasa.nlu.training_data import loading logger = logging.getLogger(__name__) @@ -109,7 +108,7 @@ def get_core_nlu_files( continue if _is_valid_filetype(path) and skill_imports.is_imported(path): - if _is_nlu_file(path): + if loading.is_nlu_file(path): nlu_data_files.add(os.path.abspath(path)) elif _is_story_file(path): story_files.add(os.path.abspath(path)) @@ -140,7 +139,7 @@ def _find_core_nlu_files_in_directory( if 
not _is_valid_filetype(full_path): continue - if _is_nlu_file(full_path): + if loading.is_nlu_file(full_path): nlu_data_files.add(full_path) elif _is_story_file(full_path): story_files.add(full_path) @@ -155,10 +154,6 @@ def _is_valid_filetype(path: Text) -> bool: return is_file and is_datafile -def _is_nlu_file(file_path: Text) -> bool: - return guess_format(file_path) != constants.NLU_FORMAT_UNKNOWN - - def _is_story_file(file_path: Text) -> bool: is_story_file = False diff --git a/rasa/nlu/training_data/loading.py b/rasa/nlu/training_data/loading.py index a819a200b11c..be7003a98728 100644 --- a/rasa/nlu/training_data/loading.py +++ b/rasa/nlu/training_data/loading.py @@ -3,7 +3,6 @@ import requests import typing from typing import Optional, Text -from rasa.constants import NLU_FORMAT_UNKNOWN import rasa.utils.io from rasa.nlu import utils @@ -30,6 +29,7 @@ LUIS = "luis" RASA = "rasa_nlu" MARKDOWN = "md" +UNK = "unk" DIALOGFLOW_RELEVANT = {DIALOGFLOW_ENTITIES, DIALOGFLOW_INTENT} _markdown_section_markers = ["## {}:".format(s) for s in markdown.available_sections] @@ -111,7 +111,7 @@ def _load(filename: Text, language: Optional[Text] = "en") -> Optional["Training """Loads a single training data file from disk.""" fformat = guess_format(filename) - if fformat == NLU_FORMAT_UNKNOWN: + if fformat == UNK: raise ValueError("Unknown data format for file {}".format(filename)) logger.info("Training data format of {} is {}".format(filename, fformat)) @@ -124,8 +124,15 @@ def _load(filename: Text, language: Optional[Text] = "en") -> Optional["Training def guess_format(filename: Text) -> Text: - """Applies heuristics to guess the data format of a file.""" - guess = NLU_FORMAT_UNKNOWN + """Applies heuristics to guess the data format of a file. + + Args: + filename: Text type with the file name. + + Returns: + Text tupe with the guessed nlu format. 
+ """ + guess = UNK content = rasa.utils.io.read_file(filename) try: js = json.loads(content) @@ -139,3 +146,16 @@ def guess_format(filename: Text) -> Text: break return guess + + +def is_nlu_file(file_path: Text) -> bool: + """Verifies if the nlu file is in a valid format. + + Args: + file_path: Text type with the file name and path. + + Returns: + Boolean type with True if the file is a valid nlu format or False if + it isn't. + """ + return guess_format(file_path) != UNK From a63ff6788a461db92b0696e2a4b8eb57ffbb7988 Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Fri, 28 Jun 2019 12:40:45 -0300 Subject: [PATCH 06/13] Remove tests from removed method and update _is_nlu_file method Signed-off-by: ArthurTemporim --- rasa/data.py | 9 +++++++-- rasa/nlu/training_data/loading.py | 13 ------------- tests/core/test_data.py | 18 ------------------ 3 files changed, 7 insertions(+), 33 deletions(-) diff --git a/rasa/data.py b/rasa/data.py index f631172dcd58..ba0e556becd0 100644 --- a/rasa/data.py +++ b/rasa/data.py @@ -108,7 +108,7 @@ def get_core_nlu_files( continue if _is_valid_filetype(path) and skill_imports.is_imported(path): - if loading.is_nlu_file(path): + if _is_nlu_file(path): nlu_data_files.add(os.path.abspath(path)) elif _is_story_file(path): story_files.add(os.path.abspath(path)) @@ -139,7 +139,7 @@ def _find_core_nlu_files_in_directory( if not _is_valid_filetype(full_path): continue - if loading.is_nlu_file(full_path): + if _is_nlu_file(full_path): nlu_data_files.add(full_path) elif _is_story_file(full_path): story_files.add(full_path) @@ -154,6 +154,11 @@ def _is_valid_filetype(path: Text) -> bool: return is_file and is_datafile +def _is_nlu_file(file_path: Text) -> bool: + """Checks whether a file is an NLU file.""" + return loading.guess_format(file_path) != loading.UNK + + def _is_story_file(file_path: Text) -> bool: is_story_file = False diff --git a/rasa/nlu/training_data/loading.py b/rasa/nlu/training_data/loading.py index 
be7003a98728..59ec729f225a 100644 --- a/rasa/nlu/training_data/loading.py +++ b/rasa/nlu/training_data/loading.py @@ -146,16 +146,3 @@ def guess_format(filename: Text) -> Text: break return guess - - -def is_nlu_file(file_path: Text) -> bool: - """Verifies if the nlu file is in a valid format. - - Args: - file_path: Text type with the file name and path. - - Returns: - Boolean type with True if the file is a valid nlu format or False if - it isn't. - """ - return guess_format(file_path) != UNK diff --git a/tests/core/test_data.py b/tests/core/test_data.py index aa8ab818f3e9..70d681fc033b 100644 --- a/tests/core/test_data.py +++ b/tests/core/test_data.py @@ -105,19 +105,6 @@ def test_same_file_names_get_resolved(tmpdir): assert all([f.endswith("stories.md") for f in stories]) -@pytest.mark.parametrize( - "line", - [ - "##intent:aintent", - "##synonym: synonym", - "##regex:a_regex", - " ##lookup:additional", - ], -) -def test_contains_nlu_pattern(line): - assert data._contains_nlu_pattern(line) - - def test_is_nlu_file_with_json(): test = { "rasa_nlu_data": { @@ -142,8 +129,3 @@ def test_is_not_nlu_file_with_json(): f.write('{"test": "a"}') assert not data._is_nlu_file(file) - - -@pytest.mark.parametrize("line", ["- example", "## story intent 1 + two##slots* entry"]) -def test_not_contains_nlu_pattern(line): - assert not data._contains_nlu_pattern(line) From d41192d33ee37edd84e8a3311bfe92fbf3111b8a Mon Sep 17 00:00:00 2001 From: Arthur Temporim Date: Mon, 1 Jul 2019 02:13:51 -0300 Subject: [PATCH 07/13] Update rasa/nlu/training_data/loading.py Co-Authored-By: Tobias Wochinger --- rasa/nlu/training_data/loading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/training_data/loading.py b/rasa/nlu/training_data/loading.py index 59ec729f225a..8da8a4de3aaa 100644 --- a/rasa/nlu/training_data/loading.py +++ b/rasa/nlu/training_data/loading.py @@ -130,7 +130,7 @@ def guess_format(filename: Text) -> Text: filename: Text type with the file name. 
Returns: - Text tupe with the guessed nlu format. + Guessed file format. """ guess = UNK content = rasa.utils.io.read_file(filename) From ba01210c263413ee26a27e3c0b4bcc91d8dffbb7 Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Wed, 3 Jul 2019 02:02:43 -0300 Subject: [PATCH 08/13] Create test to _find_core_nlu_files_in_directory Signed-off-by: ArthurTemporim --- rasa/nlu/training_data/loading.py | 2 +- tests/core/test_data.py | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/training_data/loading.py b/rasa/nlu/training_data/loading.py index 8da8a4de3aaa..5b2f4f4db4de 100644 --- a/rasa/nlu/training_data/loading.py +++ b/rasa/nlu/training_data/loading.py @@ -127,7 +127,7 @@ def guess_format(filename: Text) -> Text: """Applies heuristics to guess the data format of a file. Args: - filename: Text type with the file name. + filename: file whose type should be guessed Returns: Guessed file format. diff --git a/tests/core/test_data.py b/tests/core/test_data.py index 70d681fc033b..081ab45bc200 100644 --- a/tests/core/test_data.py +++ b/tests/core/test_data.py @@ -9,6 +9,8 @@ from tests.core.conftest import DEFAULT_STORIES_FILE, DEFAULT_NLU_DATA from rasa.nlu.training_data import load_data from rasa.nlu.utils import json_to_string +from rasa.skill import SkillSelector + def test_get_core_directory(project): @@ -105,6 +107,16 @@ def test_same_file_names_get_resolved(tmpdir): assert all([f.endswith("stories.md") for f in stories]) +def test_find_core_nlu_files_in_directory(): + examples_dir = 'data/examples' + examples_dirs = os.listdir(examples_dir) + for example in examples_dirs: + data_dir = os.path.join(examples_dir, example) + skill_import = SkillSelector(data_dir) + nlu_files = data._find_core_nlu_files_in_directory(data_dir, skill_import)[1] + assert nlu_files + + def test_is_nlu_file_with_json(): test = { "rasa_nlu_data": { From 39e540c1c15aa6f323d6e722a32285394778ed6e Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Wed, 3 
Jul 2019 06:23:10 -0300 Subject: [PATCH 09/13] Solve simple black issues Signed-off-by: ArthurTemporim --- tests/core/test_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/core/test_data.py b/tests/core/test_data.py index 081ab45bc200..80a0aba7884b 100644 --- a/tests/core/test_data.py +++ b/tests/core/test_data.py @@ -12,7 +12,6 @@ from rasa.skill import SkillSelector - def test_get_core_directory(project): data_dir = os.path.join(project, "data") core_directory = data.get_core_directory([data_dir]) @@ -108,7 +107,7 @@ def test_same_file_names_get_resolved(tmpdir): def test_find_core_nlu_files_in_directory(): - examples_dir = 'data/examples' + examples_dir = "data/examples" examples_dirs = os.listdir(examples_dir) for example in examples_dirs: data_dir = os.path.join(examples_dir, example) From d372ba1b309c3216608279b4eafdabf41a79b9f2 Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Thu, 4 Jul 2019 14:56:23 -0300 Subject: [PATCH 10/13] Correct data.py imports Signed-off-by: ArthurTemporim --- rasa/data.py | 3 ++- rasa/nlu/__init__.py | 2 ++ rasa/nlu/test.py | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/rasa/data.py b/rasa/data.py index ba0e556becd0..03b5691a8577 100644 --- a/rasa/data.py +++ b/rasa/data.py @@ -6,7 +6,6 @@ import typing import re from typing import Tuple, List, Text, Set, Union, Optional -from rasa.nlu.training_data import loading logger = logging.getLogger(__name__) @@ -156,6 +155,8 @@ def _is_valid_filetype(path: Text) -> bool: def _is_nlu_file(file_path: Text) -> bool: """Checks whether a file is an NLU file.""" + from rasa.nlu.training_data import loading + return loading.guess_format(file_path) != loading.UNK diff --git a/rasa/nlu/__init__.py b/rasa/nlu/__init__.py index 7c542d054fb2..5fd59bf6feba 100644 --- a/rasa/nlu/__init__.py +++ b/rasa/nlu/__init__.py @@ -2,6 +2,8 @@ import rasa from rasa.nlu.train import train +from rasa.nlu.test import run_evaluation as test +from rasa.nlu.test 
import cross_validate from rasa.nlu.training_data import load_data logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index fc4ccd612710..8609f3eee3bd 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -19,7 +19,6 @@ from rasa.constants import TEST_DATA_FILE, TRAIN_DATA_FILE from rasa.model import get_model -from rasa.train import train_nlu from rasa.utils.io import create_path from rasa.nlu import config, training_data, utils from rasa.nlu.utils import write_to_file @@ -939,6 +938,8 @@ def compare_nlu( Returns: training examples per run """ + from rasa.train import train_nlu + training_examples_per_run = [] for run in range(runs): From 03057ae5df954c2c7782cc0669fb08d178f2f9bc Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Thu, 4 Jul 2019 15:01:19 -0300 Subject: [PATCH 11/13] Add nlu file read tests Signed-off-by: ArthurTemporim --- rasa/nlu/training_data/util.py | 4 ++-- tests/core/test_data.py | 39 +++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/training_data/util.py b/rasa/nlu/training_data/util.py index b65d2c4b8a94..4ee1ee68a253 100644 --- a/rasa/nlu/training_data/util.py +++ b/rasa/nlu/training_data/util.py @@ -30,14 +30,14 @@ def check_duplicate_synonym(entity_synonyms, text, syn, context_str=""): def get_file_format(resource_name: Text) -> Text: - from rasa.nlu.training_data.loading import guess_format + from rasa.nlu.training_data import loading if resource_name is None or not os.path.exists(resource_name): raise AttributeError("Resource '{}' does not exist.".format(resource_name)) files = utils.list_files(resource_name) - file_formats = list(map(lambda f: guess_format(f), files)) + file_formats = list(map(lambda f: loading.guess_format(f), files)) if not file_formats: return "json" diff --git a/tests/core/test_data.py b/tests/core/test_data.py index 80a0aba7884b..90a5724d11c6 100644 --- a/tests/core/test_data.py +++ 
b/tests/core/test_data.py @@ -106,7 +106,44 @@ def test_same_file_names_get_resolved(tmpdir): assert all([f.endswith("stories.md") for f in stories]) -def test_find_core_nlu_files_in_directory(): +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + "dialogflow", + { + "data/examples/dialogflow/entities/cuisine.json", + "data/examples/dialogflow/intents/affirm.json", + "data/examples/dialogflow/entities/location_entries_es.json", + "data/examples/dialogflow/intents/affirm_usersays_en.json", + "data/examples/dialogflow/intents/hi_usersays_es.json", + "data/examples/dialogflow/entities/cuisine_entries_es.json", + "data/examples/dialogflow/intents/inform_usersays_en.json", + "data/examples/dialogflow/intents/hi.json", + "data/examples/dialogflow/intents/goodbye_usersays_en.json", + "data/examples/dialogflow/agent.json", + "data/examples/dialogflow/intents/hi_usersays_en.json", + "data/examples/dialogflow/entities/location.json", + "data/examples/dialogflow/intents/affirm_usersays_es.json", + "data/examples/dialogflow/entities/cuisine_entries_en.json", + "data/examples/dialogflow/package.json", + "data/examples/dialogflow/intents/Default Fallback Intent.json", + "data/examples/dialogflow/intents/goodbye_usersays_es.json", + "data/examples/dialogflow/intents/goodbye.json", + "data/examples/dialogflow/entities/location_entries_en.json", + "data/examples/dialogflow/intents/inform.json", + "data/examples/dialogflow/intents/inform_usersays_es.json", + }, + ), + ("luis", {"data/examples/luis/demo-restaurants.json"}), + ( + "rasa", + {"data/examples/rasa/demo-rasa.json", "data/examples/rasa/demo-rasa.md"}, + ), + ("wit", {"data/examples/wit/demo-flights.json"}), + ], +) +def test_find_nlu_files_with_different_formats(test_input, expected): examples_dir = "data/examples" examples_dirs = os.listdir(examples_dir) for example in examples_dirs: From 421e8884cc04a5f94803177b964e21b88ee9a500 Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Thu, 4 Jul 2019 16:07:24 -0300 
Subject: [PATCH 12/13] Improve test_data.py with get_core_nlu_files method Signed-off-by: ArthurTemporim --- tests/core/test_data.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/core/test_data.py b/tests/core/test_data.py index 90a5724d11c6..0abddd12e093 100644 --- a/tests/core/test_data.py +++ b/tests/core/test_data.py @@ -9,7 +9,6 @@ from tests.core.conftest import DEFAULT_STORIES_FILE, DEFAULT_NLU_DATA from rasa.nlu.training_data import load_data from rasa.nlu.utils import json_to_string -from rasa.skill import SkillSelector def test_get_core_directory(project): @@ -145,12 +144,9 @@ def test_same_file_names_get_resolved(tmpdir): ) def test_find_nlu_files_with_different_formats(test_input, expected): examples_dir = "data/examples" - examples_dirs = os.listdir(examples_dir) - for example in examples_dirs: - data_dir = os.path.join(examples_dir, example) - skill_import = SkillSelector(data_dir) - nlu_files = data._find_core_nlu_files_in_directory(data_dir, skill_import)[1] - assert nlu_files + data_dir = os.path.join(examples_dir, test_input) + core_files, nlu_files = data.get_core_nlu_files([data_dir]) + assert nlu_files == expected def test_is_nlu_file_with_json(): From d2b623f783279c2a00c48d69ae429d0064ef68ef Mon Sep 17 00:00:00 2001 From: ArthurTemporim Date: Thu, 4 Jul 2019 20:39:16 -0300 Subject: [PATCH 13/13] Improve data imports and add a description to CHANGELOG Signed-off-by: ArthurTemporim --- CHANGELOG.rst | 1 + rasa/core/domain.py | 2 +- rasa/data.py | 3 +-- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dcc7abe46252..c5e377c91322 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -32,6 +32,7 @@ Fixed ----- - all temporal model files are now deleted after stopping the Rasa server - ``rasa shell nlu`` now outputs unicode characters instead of ``\uxxxx`` codes +- ``rasa train nlu`` now can train different nlu file formats [1.1.4] - 2019-06-18 diff --git 
a/rasa/core/domain.py b/rasa/core/domain.py index 481c96b925a9..8319771c5374 100644 --- a/rasa/core/domain.py +++ b/rasa/core/domain.py @@ -6,7 +6,6 @@ from typing import Any, Dict, List, Optional, Text, Tuple, Union, Set import rasa.utils.io -from rasa import data from rasa.cli.utils import bcolors from rasa.constants import DOMAIN_SCHEMA_FILE from rasa.core import utils @@ -131,6 +130,7 @@ def from_directory( cls, path: Text, skill_imports: Optional[SkillSelector] = None ) -> "Domain": """Loads and merges multiple domain files recursively from a directory tree.""" + from rasa import data domain = Domain.empty() skill_imports = skill_imports or SkillSelector.all_skills() diff --git a/rasa/data.py b/rasa/data.py index 03b5691a8577..ba0e556becd0 100644 --- a/rasa/data.py +++ b/rasa/data.py @@ -6,6 +6,7 @@ import typing import re from typing import Tuple, List, Text, Set, Union, Optional +from rasa.nlu.training_data import loading logger = logging.getLogger(__name__) @@ -155,8 +156,6 @@ def _is_valid_filetype(path: Text) -> bool: def _is_nlu_file(file_path: Text) -> bool: """Checks whether a file is an NLU file.""" - from rasa.nlu.training_data import loading - return loading.guess_format(file_path) != loading.UNK