diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 94f6842baccc..5058ae399431 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -33,6 +33,7 @@ Fixed
 -----
 - all temporal model files are now deleted after stopping the Rasa server
 - ``rasa shell nlu`` now outputs unicode characters instead of ``\uxxxx`` codes
+- ``rasa train`` now also includes NLU files in other formats than the Rasa format
 
 
 [1.1.4] - 2019-06-18
diff --git a/rasa/core/domain.py b/rasa/core/domain.py
index 481c96b925a9..8319771c5374 100644
--- a/rasa/core/domain.py
+++ b/rasa/core/domain.py
@@ -6,7 +6,6 @@
 from typing import Any, Dict, List, Optional, Text, Tuple, Union, Set
 
 import rasa.utils.io
-from rasa import data
 from rasa.cli.utils import bcolors
 from rasa.constants import DOMAIN_SCHEMA_FILE
 from rasa.core import utils
@@ -131,6 +130,7 @@ def from_directory(
         cls, path: Text, skill_imports: Optional[SkillSelector] = None
     ) -> "Domain":
         """Loads and merges multiple domain files recursively from a directory tree."""
+        from rasa import data
 
         domain = Domain.empty()
         skill_imports = skill_imports or SkillSelector.all_skills()
diff --git a/rasa/core/training/interactive.py b/rasa/core/training/interactive.py
index 646256f1cfc2..5c521d6255c7 100644
--- a/rasa/core/training/interactive.py
+++ b/rasa/core/training/interactive.py
@@ -58,7 +58,7 @@
 from rasa.utils.endpoints import EndpointConfig
 
 # noinspection PyProtectedMember
-from rasa.nlu.training_data.loading import _guess_format, load_data
+from rasa.nlu.training_data import loading
 from rasa.nlu.training_data.message import Message
 
 # WARNING: This command line UI is using an external library
@@ -776,7 +776,7 @@ async def _write_nlu_to_file(
 
     # noinspection PyBroadException
    try:
-        previous_examples = load_data(export_nlu_path)
+        previous_examples = loading.load_data(export_nlu_path)
 
     except Exception as e:
         logger.exception("An exception occurred while trying to load the NLU data.")
@@ -797,7 +797,7 @@ async def _write_nlu_to_file(
 
     # need to guess the format of the file before opening it to avoid a read
     # in a write
-    if _guess_format(export_nlu_path) in {"md", "unk"}:
+    if loading.guess_format(export_nlu_path) in {"md", "unk"}:
         fformat = "md"
     else:
         fformat = "json"
diff --git a/rasa/data.py b/rasa/data.py
index 95e85dfb44ed..ba0e556becd0 100644
--- a/rasa/data.py
+++ b/rasa/data.py
@@ -4,10 +4,9 @@
 import tempfile
 import uuid
 import typing
-from typing import Tuple, List, Text, Set, Union, Optional
 import re
-
-import rasa.utils.io as io_utils
+from typing import Tuple, List, Text, Set, Union, Optional
+from rasa.nlu.training_data import loading
 
 logger = logging.getLogger(__name__)
 
@@ -23,8 +22,8 @@ def get_core_directory(
 
     Args:
         paths: List of paths to training files or folders containing them.
-        skill_imports: `SkillSelector` instance which determines which files should
-            be loaded.
+        skill_imports: `SkillSelector` instance which determines which files
+            should be loaded.
 
     Returns:
         Path to temporary directory containing all found Core training files.
@@ -41,8 +40,8 @@ def get_nlu_directory(
 
     Args:
         paths: List of paths to training files or folders containing them.
-        skill_imports: `SkillSelector` instance which determines which files should
-            be loaded.
+        skill_imports: `SkillSelector` instance which determines which files
+            should be loaded.
 
     Returns:
         Path to temporary directory containing all found NLU training files.
@@ -59,8 +58,8 @@ def get_core_nlu_directories(
 
     Args:
         paths: List of paths to training files or folders containing them.
-        skill_imports: `SkillSelector` instance which determines which files should
-            be loaded.
+        skill_imports: `SkillSelector` instance which determines which files
+            should be loaded.
 
     Returns:
         Path to directory containing the Core files and path to directory
@@ -83,8 +82,8 @@ def get_core_nlu_files(
 
     Args:
         paths: List of paths to training files or folders containing them.
-        skill_imports: `SkillSelector` instance which determines which files should
-            be loaded.
+        skill_imports: `SkillSelector` instance which determines which files
+            should be loaded.
 
     Returns:
         Tuple of paths to story and NLU files.
@@ -156,21 +155,8 @@ def _is_valid_filetype(path: Text) -> bool:
 
 
 def _is_nlu_file(file_path: Text) -> bool:
-    with open(file_path, encoding="utf-8") as f:
-        if file_path.endswith(".json"):
-            content = io_utils.read_json_file(file_path)
-            is_nlu_file = (
-                isinstance(content, dict) and content.get("rasa_nlu_data") is not None
-            )
-        else:
-            is_nlu_file = any(_contains_nlu_pattern(l) for l in f)
-    return is_nlu_file
-
-
-def _contains_nlu_pattern(text: Text) -> bool:
-    nlu_pattern = r"\s*##\s*(intent|regex||synonym|lookup):"
-
-    return re.match(nlu_pattern, text) is not None
+    """Checks whether a file is an NLU file."""
+    return loading.guess_format(file_path) != loading.UNK
 
 
 def _is_story_file(file_path: Text) -> bool:
diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py
index fc4ccd612710..8609f3eee3bd 100644
--- a/rasa/nlu/test.py
+++ b/rasa/nlu/test.py
@@ -19,7 +19,6 @@
 
 from rasa.constants import TEST_DATA_FILE, TRAIN_DATA_FILE
 from rasa.model import get_model
-from rasa.train import train_nlu
 from rasa.utils.io import create_path
 from rasa.nlu import config, training_data, utils
 from rasa.nlu.utils import write_to_file
@@ -939,6 +938,8 @@ def compare_nlu(
 
     Returns: training examples per run
     """
+    from rasa.train import train_nlu
+
     training_examples_per_run = []
 
     for run in range(runs):
diff --git a/rasa/nlu/training_data/loading.py b/rasa/nlu/training_data/loading.py
index 0b3f76a08a8c..5b2f4f4db4de 100644
--- a/rasa/nlu/training_data/loading.py
+++ b/rasa/nlu/training_data/loading.py
@@ -28,8 +28,8 @@
 WIT = "wit"
 LUIS = "luis"
 RASA = "rasa_nlu"
-UNK = "unk"
 MARKDOWN = "md"
+UNK = "unk"
 DIALOGFLOW_RELEVANT = {DIALOGFLOW_ENTITIES, DIALOGFLOW_INTENT}
 
 _markdown_section_markers = ["## {}:".format(s) for s in markdown.available_sections]
@@ -110,7 +110,7 @@ def _reader_factory(fformat: Text) -> Optional["TrainingDataReader"]:
 
 def _load(filename: Text, language: Optional[Text] = "en") -> Optional["TrainingData"]:
     """Loads a single training data file from disk."""
-    fformat = _guess_format(filename)
+    fformat = guess_format(filename)
     if fformat == UNK:
         raise ValueError("Unknown data format for file {}".format(filename))
@@ -123,8 +123,15 @@
     return None
 
 
-def _guess_format(filename: Text) -> Text:
-    """Applies heuristics to guess the data format of a file."""
+def guess_format(filename: Text) -> Text:
+    """Applies heuristics to guess the data format of a file.
+
+    Args:
+        filename: file whose type should be guessed
+
+    Returns:
+        Guessed file format.
+    """
     guess = UNK
     content = rasa.utils.io.read_file(filename)
     try:
diff --git a/rasa/nlu/training_data/util.py b/rasa/nlu/training_data/util.py
index aa965962a9fc..4ee1ee68a253 100644
--- a/rasa/nlu/training_data/util.py
+++ b/rasa/nlu/training_data/util.py
@@ -30,14 +30,14 @@ def check_duplicate_synonym(entity_synonyms, text, syn, context_str=""):
 
 
 def get_file_format(resource_name: Text) -> Text:
-    from rasa.nlu.training_data.loading import _guess_format
+    from rasa.nlu.training_data import loading
 
     if resource_name is None or not os.path.exists(resource_name):
         raise AttributeError("Resource '{}' does not exist.".format(resource_name))
 
     files = utils.list_files(resource_name)
 
-    file_formats = list(map(lambda f: _guess_format(f), files))
+    file_formats = list(map(lambda f: loading.guess_format(f), files))
 
     if not file_formats:
         return "json"
diff --git a/tests/core/test_data.py b/tests/core/test_data.py
index aa8ab818f3e9..0abddd12e093 100644
--- a/tests/core/test_data.py
+++ b/tests/core/test_data.py
@@ -106,16 +106,47 @@ def test_same_file_names_get_resolved(tmpdir):
 
 
 @pytest.mark.parametrize(
-    "line",
+    "test_input,expected",
     [
-        "##intent:aintent",
-        "##synonym: synonym",
-        "##regex:a_regex",
-        " ##lookup:additional",
+        (
+            "dialogflow",
+            {
+                "data/examples/dialogflow/entities/cuisine.json",
+                "data/examples/dialogflow/intents/affirm.json",
+                "data/examples/dialogflow/entities/location_entries_es.json",
+                "data/examples/dialogflow/intents/affirm_usersays_en.json",
+                "data/examples/dialogflow/intents/hi_usersays_es.json",
+                "data/examples/dialogflow/entities/cuisine_entries_es.json",
+                "data/examples/dialogflow/intents/inform_usersays_en.json",
+                "data/examples/dialogflow/intents/hi.json",
+                "data/examples/dialogflow/intents/goodbye_usersays_en.json",
+                "data/examples/dialogflow/agent.json",
+                "data/examples/dialogflow/intents/hi_usersays_en.json",
+                "data/examples/dialogflow/entities/location.json",
+                "data/examples/dialogflow/intents/affirm_usersays_es.json",
+                "data/examples/dialogflow/entities/cuisine_entries_en.json",
+                "data/examples/dialogflow/package.json",
+                "data/examples/dialogflow/intents/Default Fallback Intent.json",
+                "data/examples/dialogflow/intents/goodbye_usersays_es.json",
+                "data/examples/dialogflow/intents/goodbye.json",
+                "data/examples/dialogflow/entities/location_entries_en.json",
+                "data/examples/dialogflow/intents/inform.json",
+                "data/examples/dialogflow/intents/inform_usersays_es.json",
+            },
+        ),
+        ("luis", {"data/examples/luis/demo-restaurants.json"}),
+        (
+            "rasa",
+            {"data/examples/rasa/demo-rasa.json", "data/examples/rasa/demo-rasa.md"},
+        ),
+        ("wit", {"data/examples/wit/demo-flights.json"}),
     ],
 )
-def test_contains_nlu_pattern(line):
-    assert data._contains_nlu_pattern(line)
+def test_find_nlu_files_with_different_formats(test_input, expected):
+    examples_dir = "data/examples"
+    data_dir = os.path.join(examples_dir, test_input)
+    core_files, nlu_files = data.get_core_nlu_files([data_dir])
+    assert nlu_files == expected
 
 
 def test_is_nlu_file_with_json():
@@ -142,8 +173,3 @@ def test_is_not_nlu_file_with_json():
         f.write('{"test": "a"}')
 
     assert not data._is_nlu_file(file)
-
-
-@pytest.mark.parametrize("line", ["- example", "## story intent 1 + two##slots* entry"])
-def test_not_contains_nlu_pattern(line):
-    assert not data._contains_nlu_pattern(line)