diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 949111a54350..dbdbc8ff0332 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -13,6 +13,7 @@ This project adheres to `Semantic Versioning`_ starting with version 1.0. Added ----- - add root route to server started without ``--enable-api`` parameter +- add ``--evaluate-model-directory`` to ``rasa test core`` to evaluate models from ``rasa train core -c `` Changed ------- @@ -22,6 +23,7 @@ Removed Fixed ----- +- ``rasa test core`` can handle compressed model files [1.1.8] - 2019-07-25 @@ -48,6 +50,7 @@ Fixed - ``rasa train core`` in comparison mode stores the model files compressed (``tar.gz`` files) - slot setting in interactive learning with the TwoStageFallbackPolicy + [1.1.7] - 2019-07-18 ^^^^^^^^^^^^^^^^^^^^ diff --git a/docs/user-guide/evaluating-models.rst b/docs/user-guide/evaluating-models.rst index b27837bde8a7..976cac7fb82d 100644 --- a/docs/user-guide/evaluating-models.rst +++ b/docs/user-guide/evaluating-models.rst @@ -204,11 +204,11 @@ mode to evaluate the models you just trained: .. code-block:: bash - $ rasa test core -m comparison_models/.tar.gz comparison_models/.tar.gz \ - --stories stories_folder --out comparison_results + $ rasa test core -m comparison_models --stories stories_folder + --out comparison_results --evaluate-model-directory This will evaluate each of the models on the training set and plot some graphs -to show you which policy performs best. By evaluating on the full set of stories, you +to show you which policy performs best. By evaluating on the full set of stories, you can measure how well Rasa Core is predicting the held-out stories. If you're not sure which policies to compare, we'd recommend trying out the diff --git a/rasa/cli/arguments/data.py b/rasa/cli/arguments/data.py index b951cff83bac..f70396c8be17 100644 --- a/rasa/cli/arguments/data.py +++ b/rasa/cli/arguments/data.py @@ -4,7 +4,6 @@ add_nlu_data_param, add_out_param, add_data_param, - add_stories_param, add_domain_param, ) diff --git a/rasa/cli/arguments/test.py b/rasa/cli/arguments/test.py index 61c695aa8474..0f3c768545f9 100644 --- a/rasa/cli/arguments/test.py +++ b/rasa/cli/arguments/test.py @@ -1,7 +1,7 @@ import argparse from typing import Union -from rasa.constants import DEFAULT_MODELS_PATH, DEFAULT_CONFIG_PATH +from rasa.constants import DEFAULT_MODELS_PATH, DEFAULT_RESULTS_PATH from rasa.cli.arguments.default_arguments import ( add_stories_param, @@ -42,7 +42,7 @@ def add_test_core_argument_group( ) add_out_param( parser, - default="results", + default=DEFAULT_RESULTS_PATH, help_text="Output path for any files created during the evaluation.", ) parser.add_argument( @@ -70,6 +70,15 @@ def add_test_core_argument_group( "trains on it. Fetches the data by sending a GET request " "to the supplied URL.", ) + parser.add_argument( + "--evaluate-model-directory", + default=False, + action="store_true", + help="Should be set to evaluate models trained via " + "'rasa train core --config '. " + "All models in the provided directory are evaluated " + "and compared against each other.", + ) def add_test_nlu_argument_group( @@ -150,7 +159,7 @@ def add_test_nlu_argument_group( required=False, nargs="+", type=int, - default=[0, 25, 50, 75, 90], + default=[0, 25, 50, 75], help="Percentages of training data to exclude during comparison.", ) @@ -164,6 +173,6 @@ def add_test_core_model_param(parser: argparse.ArgumentParser): default=[default_path], help="Path to a pre-trained model. If it is a 'tar.gz' file that model file " "will be used. 
If it is a directory, the latest model in that directory " - "will be used. If multiple 'tar.gz' files are provided, all those models " - "will be compared.", + "will be used (exception: '--evaluate-model-directory' flag is set). If multiple " + "'tar.gz' files are provided, all those models will be compared.", ) diff --git a/rasa/cli/arguments/train.py b/rasa/cli/arguments/train.py index 392618684171..6cb1d6be5062 100644 --- a/rasa/cli/arguments/train.py +++ b/rasa/cli/arguments/train.py @@ -88,7 +88,7 @@ def add_compare_params( "--percentages", nargs="*", type=int, - default=[0, 5, 25, 50, 70, 90, 95], + default=[0, 25, 50, 75], help="Range of exclusion percentages.", ) parser.add_argument( diff --git a/rasa/cli/test.py b/rasa/cli/test.py index 74b753324977..5150f35e7e76 100644 --- a/rasa/cli/test.py +++ b/rasa/cli/test.py @@ -3,9 +3,7 @@ import os from typing import List -from rasa import data from rasa.cli.arguments import test as arguments -from rasa.cli.utils import get_validated_path from rasa.constants import ( DEFAULT_CONFIG_PATH, DEFAULT_DATA_PATH, @@ -15,8 +13,8 @@ DEFAULT_NLU_RESULTS_PATH, CONFIG_SCHEMA_FILE, ) -from rasa.test import test_compare_core, compare_nlu_models -from rasa.utils.validation import validate_yaml_schema, InvalidYamlFileError +import rasa.utils.validation as validation_utils +import rasa.cli.utils as cli_utils logger = logging.getLogger(__name__) @@ -59,12 +57,13 @@ def add_subparser( def test_core(args: argparse.Namespace) -> None: - from rasa.test import test_core + from rasa import data + from rasa.test import test_core_models_in_directory, test_core, test_core_models - endpoints = get_validated_path( + endpoints = cli_utils.get_validated_path( args.endpoints, "endpoints", DEFAULT_ENDPOINTS_PATH, True ) - stories = get_validated_path(args.stories, "stories", DEFAULT_DATA_PATH) + stories = cli_utils.get_validated_path(args.stories, "stories", DEFAULT_DATA_PATH) stories = data.get_core_directory(stories) output = args.out or DEFAULT_RESULTS_PATH @@ -75,25 +74,31 @@ def test_core(args: argparse.Namespace) -> None: args.model = args.model[0] if isinstance(args.model, str): - model_path = get_validated_path(args.model, "model", DEFAULT_MODELS_PATH) - - test_core( - model=model_path, - stories=stories, - endpoints=endpoints, - output=output, - kwargs=vars(args), + model_path = cli_utils.get_validated_path( + args.model, "model", DEFAULT_MODELS_PATH ) + if args.evaluate_model_directory: + test_core_models_in_directory(args.model, stories, output) + else: + test_core( + model=model_path, + stories=stories, + endpoints=endpoints, + output=output, + kwargs=vars(args), + ) + else: - test_compare_core(args.model, stories, output) + test_core_models(args.model, stories, output) def test_nlu(args: argparse.Namespace) -> None: - from rasa.test import test_nlu, perform_nlu_cross_validation - import rasa.utils.io + from rasa import data + import rasa.utils.io as io_utils + from rasa.test import compare_nlu_models, perform_nlu_cross_validation, test_nlu - nlu_data = get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH) + nlu_data = cli_utils.get_validated_path(args.nlu, "nlu", DEFAULT_DATA_PATH) nlu_data = data.get_nlu_directory(nlu_data) if args.config is not None and len(args.config) == 1: @@ -114,13 +119,13 @@ def test_nlu(args: argparse.Namespace) -> None: config_files = [] for file in args.config: try: - validate_yaml_schema( - rasa.utils.io.read_file(file), + validation_utils.validate_yaml_schema( + io_utils.read_file(file), CONFIG_SCHEMA_FILE, 
show_validation_errors=False, ) config_files.append(file) - except InvalidYamlFileError: + except validation_utils.InvalidYamlFileError: logger.debug( "Ignoring file '{}' as it is not a valid config file.".format(file) ) @@ -136,10 +141,14 @@ def test_nlu(args: argparse.Namespace) -> None: ) elif args.cross_validation: logger.info("Test model using cross validation.") - config = get_validated_path(args.config, "config", DEFAULT_CONFIG_PATH) + config = cli_utils.get_validated_path( + args.config, "config", DEFAULT_CONFIG_PATH + ) perform_nlu_cross_validation(config, nlu_data, vars(args)) else: - model_path = get_validated_path(args.model, "model", DEFAULT_MODELS_PATH) + model_path = cli_utils.get_validated_path( + args.model, "model", DEFAULT_MODELS_PATH + ) test_nlu(model_path, nlu_data, vars(args)) diff --git a/rasa/cli/train.py b/rasa/cli/train.py index 8d61caa11ca8..be13f7033a86 100644 --- a/rasa/cli/train.py +++ b/rasa/cli/train.py @@ -90,7 +90,7 @@ def train_core( args.domain = get_validated_path( args.domain, "domain", DEFAULT_DOMAIN_PATH, none_is_valid=True ) - stories = get_validated_path( + story_file = get_validated_path( args.stories, "stories", DEFAULT_DATA_PATH, none_is_valid=True ) @@ -105,7 +105,7 @@ def train_core( return train_core( domain=args.domain, config=config, - stories=stories, + stories=story_file, output=output, train_path=train_path, fixed_model_name=args.fixed_model_name, @@ -114,7 +114,7 @@ def train_core( else: from rasa.core.train import do_compare_training - loop.run_until_complete(do_compare_training(args, stories)) + loop.run_until_complete(do_compare_training(args, story_file)) def train_nlu( diff --git a/rasa/core/test.py b/rasa/core/test.py index e1b523bf55a2..d6830603e938 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -400,9 +400,9 @@ def collect_story_predictions( story_eval_store = EvaluationStore() failed = [] correct_dialogues = [] - num_stories = len(completed_trackers) + number_of_stories = len(completed_trackers) - logger.info("Evaluating {} stories\nProgress:".format(num_stories)) + logger.info("Evaluating {} stories\nProgress:".format(number_of_stories)) action_list = [] @@ -451,7 +451,7 @@ def collect_story_predictions( action_list=action_list, in_training_data_fraction=in_training_data_fraction, ), - num_stories, + number_of_stories, ) @@ -587,38 +587,61 @@ def plot_story_evaluation( fig.savefig(os.path.join(out_directory, "story_confmat.pdf"), bbox_inches="tight") -async def compare(models: Text, stories_file: Text, output: Text) -> None: - """Evaluates multiple trained models on a test set.""" - from rasa.core.agent import Agent - import rasa.nlu.utils as nlu_utils +async def compare_models_in_dir( + model_dir: Text, stories_file: Text, output: Text +) -> None: + """Evaluates multiple trained models in a directory on a test set.""" from rasa.core import utils + import rasa.utils.io as io_utils - num_correct = defaultdict(list) - - for run in nlu_utils.list_subdirectories(models): - num_correct_run = defaultdict(list) + number_correct = defaultdict(list) - for model in sorted(nlu_utils.list_subdirectories(run)): - logger.info("Evaluating model {}".format(model)) + for run in io_utils.list_subdirectories(model_dir): + number_correct_in_run = defaultdict(list) - agent = Agent.load(model) + for model in sorted(io_utils.list_files(run)): + if not model.endswith("tar.gz"): + continue - completed_trackers = await _generate_trackers(stories_file, agent) - - story_eval_store, no_of_stories = collect_story_predictions( - 
completed_trackers, agent - ) - - failed_stories = story_eval_store.failed_stories + # The model files are named like .tar.gz + # Remove the number from the name to get the policy name policy_name = "".join( [i for i in os.path.basename(model) if not i.isdigit()] ) - num_correct_run[policy_name].append(no_of_stories - len(failed_stories)) + number_of_correct_stories = await _evaluate_core_model(model, stories_file) + number_correct_in_run[policy_name].append(number_of_correct_stories) + + for k, v in number_correct_in_run.items(): + number_correct[k].append(v) + + utils.dump_obj_as_json_to_file(os.path.join(output, RESULTS_FILE), number_correct) + + +async def compare_models(models: List[Text], stories_file: Text, output: Text) -> None: + """Evaluates provided trained models on a test set.""" + from rasa.core import utils + + number_correct = defaultdict(list) + + for model in models: + number_of_correct_stories = await _evaluate_core_model(model, stories_file) + number_correct[os.path.basename(model)].append(number_of_correct_stories) + + utils.dump_obj_as_json_to_file(os.path.join(output, RESULTS_FILE), number_correct) - for k, v in num_correct_run.items(): - num_correct[k].append(v) - utils.dump_obj_as_json_to_file(os.path.join(output, "results.json"), num_correct) +async def _evaluate_core_model(model: Text, stories_file: Text) -> int: + from rasa.core.agent import Agent + + logger.info("Evaluating model '{}'".format(model)) + + agent = Agent.load(model) + completed_trackers = await _generate_trackers(stories_file, agent) + story_eval_store, number_of_stories = collect_story_predictions( + completed_trackers, agent + ) + failed_stories = story_eval_store.failed_stories + return number_of_stories - len(failed_stories) def plot_nlu_results(output: Text, number_of_examples: List[int]) -> None: diff --git a/rasa/core/training/dsl.py b/rasa/core/training/dsl.py index a3258c3e88b0..01fe63ae5824 100644 --- a/rasa/core/training/dsl.py +++ b/rasa/core/training/dsl.py @@ -7,6 +7,7 @@ import warnings from typing import Optional, List, Text, Any, Dict, TYPE_CHECKING, Iterable +import rasa.utils.io as io_utils from rasa.constants import DOCS_BASE_URL from rasa.core import utils from rasa.core.constants import INTENT_MESSAGE_PREFIX @@ -175,8 +176,6 @@ async def read_from_folder( exclusion_percentage: Optional[int] = None, ) -> List[StoryStep]: """Given a path reads all contained story files.""" - import rasa.nlu.utils as nlu_utils - if not os.path.exists(resource_name): raise ValueError( "Story file or folder could not be found. 
Make " @@ -184,7 +183,7 @@ async def read_from_folder( "or file.".format(os.path.abspath(resource_name)) ) - files = nlu_utils.list_files(resource_name) + files = io_utils.list_files(resource_name) return await StoryFileReader.read_from_files( files, diff --git a/rasa/nlu/model.py b/rasa/nlu/model.py index 863b951f1c91..8efbe1a97486 100644 --- a/rasa/nlu/model.py +++ b/rasa/nlu/model.py @@ -15,7 +15,7 @@ from rasa.nlu.config import RasaNLUModelConfig, component_config_from_pipeline from rasa.nlu.persistor import Persistor from rasa.nlu.training_data import TrainingData, Message -from rasa.nlu.utils import create_dir, write_json_to_file +from rasa.nlu.utils import write_json_to_file import rasa.utils.io MODEL_NAME_PREFIX = "nlu_" @@ -221,7 +221,7 @@ def persist( path = os.path.abspath(path) dir_name = os.path.join(path, model_name) - create_dir(dir_name) + rasa.utils.io.create_directory(dir_name) if self.training_data: metadata.update(self.training_data.persist(dir_name)) diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index 8609f3eee3bd..869d9914f241 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -17,9 +17,10 @@ Any, ) +import rasa.utils.io as io_utils + from rasa.constants import TEST_DATA_FILE, TRAIN_DATA_FILE from rasa.model import get_model -from rasa.utils.io import create_path from rasa.nlu import config, training_data, utils from rasa.nlu.utils import write_to_file from rasa.nlu.components import ComponentBuilder @@ -713,7 +714,7 @@ def run_evaluation( } # type: Dict[Text, Optional[Dict]] if report: - utils.create_dir(report) + io_utils.create_directory(report) intent_results, entity_results = get_eval_data(interpreter, test_data) @@ -830,7 +831,7 @@ def cross_validate( nlu_config = config.load(nlu_config) if report: - utils.create_dir(report) + io_utils.create_directory(report) trainer = Trainer(nlu_config) trainer.pipeline = remove_pretrained_extractors(trainer.pipeline) @@ -947,10 +948,10 @@ def compare_nlu( logger.info("Beginning comparison run {}/{}".format(run + 1, runs)) run_path = os.path.join(output, "run_{}".format(run + 1)) - create_path(run_path) + io_utils.create_path(run_path) test_path = os.path.join(run_path, TEST_DATA_FILE) - create_path(test_path) + io_utils.create_path(test_path) train, test = data.train_test_split() write_to_file(test_path, test.as_markdown()) @@ -965,7 +966,7 @@ def compare_nlu( model_output_path = os.path.join(run_path, percent_string) train_split_path = os.path.join(model_output_path, TRAIN_DATA_FILE) - create_path(train_split_path) + io_utils.create_path(train_split_path) write_to_file(train_split_path, train.as_markdown()) for nlu_config, model_name in zip(configs, model_names): diff --git a/rasa/nlu/training_data/loading.py b/rasa/nlu/training_data/loading.py index fc7139007425..20b8336f3552 100644 --- a/rasa/nlu/training_data/loading.py +++ b/rasa/nlu/training_data/loading.py @@ -6,7 +6,6 @@ import typing from typing import Optional, Text -import rasa.utils.io from rasa.nlu import utils from rasa.nlu.training_data.formats import markdown from rasa.nlu.training_data.formats.dialogflow import ( @@ -57,7 +56,7 @@ def load_data(resource_name: Text, language: Optional[Text] = "en") -> "Training if not os.path.exists(resource_name): raise ValueError("File '{}' does not exist.".format(resource_name)) - files = utils.list_files(resource_name) + files = io_utils.list_files(resource_name) data_sets = [_load(f, language) for f in files] data_sets = [ds for ds in data_sets if ds] if len(data_sets) == 0: @@ -141,7 +140,7 @@ def 
guess_format(filename: Text) -> Text: content = "" try: - content = rasa.utils.io.read_file(filename) + content = io_utils.read_file(filename) js = json.loads(content) except ValueError: if any([marker in content for marker in _markdown_section_markers]): diff --git a/rasa/nlu/training_data/util.py b/rasa/nlu/training_data/util.py index 4ee1ee68a253..b7533c378fb5 100644 --- a/rasa/nlu/training_data/util.py +++ b/rasa/nlu/training_data/util.py @@ -4,7 +4,7 @@ import os from typing import Text -from rasa.nlu import utils +import rasa.utils.io as io_utils logger = logging.getLogger(__name__) @@ -35,7 +35,7 @@ def get_file_format(resource_name: Text) -> Text: if resource_name is None or not os.path.exists(resource_name): raise AttributeError("Resource '{}' does not exist.".format(resource_name)) - files = utils.list_files(resource_name) + files = io_utils.list_files(resource_name) file_formats = list(map(lambda f: loading.guess_format(f), files)) diff --git a/rasa/nlu/utils/__init__.py b/rasa/nlu/utils/__init__.py index 7025263ad0f9..210fbb9ae4ee 100644 --- a/rasa/nlu/utils/__init__.py +++ b/rasa/nlu/utils/__init__.py @@ -1,5 +1,4 @@ import errno -import glob import io import json import os @@ -20,62 +19,6 @@ def relative_normpath(f: Optional[Text], path: Text) -> Optional[Text]: return None -def create_dir(dir_path: Text) -> None: - """Creates a directory and its super paths. - - Succeeds even if the path already exists.""" - - try: - os.makedirs(dir_path) - except OSError as e: - # be happy if someone already created the path - if e.errno != errno.EEXIST: - raise - - -def list_directory(path: Text) -> List[Text]: - """Returns all files and folders excluding hidden files. - - If the path points to a file, returns the file. This is a recursive - implementation returning files in any depth of the path.""" - - if not isinstance(path, str): - raise ValueError( - "`resource_name` must be a string type. " - "Got `{}` instead".format(type(path)) - ) - - if os.path.isfile(path): - return [path] - elif os.path.isdir(path): - results = [] - for base, dirs, files in os.walk(path): - # remove hidden files - goodfiles = filter(lambda x: not x.startswith("."), files) - results.extend(os.path.join(base, f) for f in goodfiles) - return results - else: - raise ValueError( - "Could not locate the resource '{}'.".format(os.path.abspath(path)) - ) - - -def list_files(path: Text) -> List[Text]: - """Returns all files excluding hidden files. - - If the path points to a file, returns the file.""" - - return [fn for fn in list_directory(path) if os.path.isfile(fn)] - - -def list_subdirectories(path: Text) -> List[Text]: - """Returns all folders excluding hidden files. - - If the path points to a file, returns an empty list.""" - - return [fn for fn in glob.glob(os.path.join(path, "*")) if os.path.isdir(fn)] - - def lazyproperty(fn: Callable) -> Any: """Allows to avoid recomputing a property over and over. 
diff --git a/rasa/test.py b/rasa/test.py index 89d0b9dfd939..60488ffd7349 100644 --- a/rasa/test.py +++ b/rasa/test.py @@ -1,13 +1,14 @@ import asyncio import logging -import tempfile -from typing import Text, Dict, Optional, List, Any import os +from typing import Text, Dict, Optional, List, Any -from rasa.core.interpreter import RegexInterpreter - -from rasa.constants import DEFAULT_RESULTS_PATH, RESULTS_FILE -from rasa.model import get_model, get_model_subdirectories, unpack_model +import rasa.utils.io as io_utils +from rasa.constants import ( + DEFAULT_RESULTS_PATH, + RESULTS_FILE, + NUMBER_OF_TRAINING_STORIES_FILE, +) from rasa.cli.utils import print_error, print_warning import rasa.utils.common as utils from rasa.exceptions import ModelNotFound @@ -15,20 +16,24 @@ logger = logging.getLogger(__name__) -def test_compare_core(models: List[Text], stories: Text, output: Text): - from rasa.core.test import compare, plot_core_results - import rasa.utils.io - - model_directory = copy_models_to_compare(models) +def test_core_models_in_directory(model_directory: Text, stories: Text, output: Text): + from rasa.core.test import compare_models_in_dir, plot_core_results loop = asyncio.get_event_loop() - loop.run_until_complete(compare(model_directory, stories, output)) + loop.run_until_complete(compare_models_in_dir(model_directory, stories, output)) - story_n_path = os.path.join(model_directory, "num_stories.json") - number_of_stories = rasa.utils.io.read_json_file(story_n_path) + story_n_path = os.path.join(model_directory, NUMBER_OF_TRAINING_STORIES_FILE) + number_of_stories = io_utils.read_json_file(story_n_path) plot_core_results(output, number_of_stories) +def test_core_models(models: List[Text], stories: Text, output: Text): + from rasa.core.test import compare_models + + loop = asyncio.get_event_loop() + loop.run_until_complete(compare_models(models, stories, output)) + + def test( model: Text, stories: Text, @@ -53,9 +58,8 @@ def test_core( ): import rasa.core.test import rasa.core.utils as core_utils - from rasa.nlu import utils as nlu_utils - from rasa.model import get_model - from rasa.core.interpreter import NaturalLanguageInterpreter + import rasa.model + from rasa.core.interpreter import RegexInterpreter, NaturalLanguageInterpreter from rasa.core.agent import Agent _endpoints = core_utils.AvailableEndpoints.read_endpoints(endpoints) @@ -64,23 +68,23 @@ def test_core( kwargs = {} if output: - nlu_utils.create_dir(output) + io_utils.create_directory(output) try: - unpacked_model = get_model(model) + unpacked_model = rasa.model.get_model(model) except ModelNotFound: print_error( "Unable to test: could not find a model. Use 'rasa train' to train a " - "Rasa model." + "Rasa model and provide it via the '--model' argument." ) return - core_path, nlu_path = get_model_subdirectories(unpacked_model) + core_path, nlu_path = rasa.model.get_model_subdirectories(unpacked_model) if not core_path: print_error( - "Unable to test: could not find a Core model. Use 'rasa train' to " - "train a model." + "Unable to test: could not find a Core model. Use 'rasa train' to train a " + "Rasa model and provide it via the '--model' argument." ) use_e2e = kwargs["e2e"] if "e2e" in kwargs else False @@ -107,12 +111,14 @@ def test_core( def test_nlu(model: Optional[Text], nlu_data: Optional[Text], kwargs: Optional[Dict]): from rasa.nlu.test import run_evaluation + from rasa.model import get_model try: unpacked_model = get_model(model) except ModelNotFound: print_error( - "Could not find any model. 
Use 'rasa train nlu' to train an NLU model." + "Could not find any model. Use 'rasa train nlu' to train a " + "Rasa model and provide it via the '--model' argument." ) return @@ -123,7 +129,8 @@ def test_nlu(model: Optional[Text], nlu_data: Optional[Text], kwargs: Optional[D run_evaluation(nlu_data, nlu_model, **kwargs) else: print_error( - "Could not find any model. Use 'rasa train nlu' to train an NLU model." + "Could not find any model. Use 'rasa train nlu' to train a " + "Rasa model and provide it via the '--model' argument." ) @@ -199,18 +206,3 @@ def perform_nlu_cross_validation( logger.info("Entity evaluation results") return_entity_results(entity_results.train, "train") return_entity_results(entity_results.test, "test") - - -def copy_models_to_compare(models: List[str]) -> Text: - models_dir = tempfile.mkdtemp() - - for i, model in enumerate(models): - if os.path.exists(model) and os.path.isfile(model): - path = os.path.join(models_dir, "model_" + str(i)) - unpack_model(model, path) - else: - logger.warning("Ignore '{}' as it is not a valid model file.".format(model)) - - logger.debug("Unpacked models to compare to '{}'".format(models_dir)) - - return models_dir diff --git a/rasa/utils/io.py b/rasa/utils/io.py index b8531f3a8739..21ba7f10a4f8 100644 --- a/rasa/utils/io.py +++ b/rasa/utils/io.py @@ -6,6 +6,7 @@ import tempfile import warnings import zipfile +import glob from asyncio import AbstractEventLoop from typing import Text, Any, Dict, Union, List, Type, Callable import ruamel.yaml as yaml @@ -290,6 +291,62 @@ def validate(document: Document) -> None: return FunctionValidator +def list_files(path: Text) -> List[Text]: + """Returns all files excluding hidden files. + + If the path points to a file, returns the file.""" + + return [fn for fn in list_directory(path) if os.path.isfile(fn)] + + +def list_subdirectories(path: Text) -> List[Text]: + """Returns all folders excluding hidden files. + + If the path points to a file, returns an empty list.""" + + return [fn for fn in glob.glob(os.path.join(path, "*")) if os.path.isdir(fn)] + + +def list_directory(path: Text) -> List[Text]: + """Returns all files and folders excluding hidden files. + + If the path points to a file, returns the file. This is a recursive + implementation returning files in any depth of the path.""" + + if not isinstance(path, str): + raise ValueError( + "`resource_name` must be a string type. " + "Got `{}` instead".format(type(path)) + ) + + if os.path.isfile(path): + return [path] + elif os.path.isdir(path): + results = [] + for base, dirs, files in os.walk(path): + # remove hidden files + goodfiles = filter(lambda x: not x.startswith("."), files) + results.extend(os.path.join(base, f) for f in goodfiles) + return results + else: + raise ValueError( + "Could not locate the resource '{}'.".format(os.path.abspath(path)) + ) + + +def create_directory(directory_path: Text) -> None: + """Creates a directory and its super paths. 
+ + Succeeds even if the path already exists.""" + + try: + os.makedirs(directory_path) + except OSError as e: + # be happy if someone already created the path + if e.errno != errno.EEXIST: + raise + + def zip_folder(folder: Text) -> Text: """Create an archive from a folder.""" import tempfile diff --git a/tests/cli/test_rasa_test.py b/tests/cli/test_rasa_test.py index 2e1983903169..357bacb87c33 100644 --- a/tests/cli/test_rasa_test.py +++ b/tests/cli/test_rasa_test.py @@ -1,4 +1,7 @@ import os +from shutil import copyfile +from rasa.constants import DEFAULT_RESULTS_PATH, RESULTS_FILE +from rasa.utils.io import list_files, write_yaml_file def test_test_core(run_in_default_project): @@ -34,8 +37,6 @@ def test_test_nlu_cross_validation(run_in_default_project): def test_test_nlu_comparison(run_in_default_project): - from shutil import copyfile - copyfile("config.yml", "nlu-config.yml") run_in_default_project( @@ -53,15 +54,89 @@ def test_test_nlu_comparison(run_in_default_project): assert os.path.exists("nlu-report") +def test_test_core_comparison(run_in_default_project): + files = list_files("models") + copyfile(files[0], "models/copy-model.tar.gz") + + run_in_default_project( + "test", + "core", + "-m", + files[0], + "models/copy-model.tar.gz", + "--stories", + "data/stories.md", + ) + + assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE)) + + +def test_test_core_comparison_after_train(run_in_default_project): + write_yaml_file( + { + "language": "en", + "pipeline": "supervised_embeddings", + "policies": [{"name": "KerasPolicy"}], + }, + "config_1.yml", + ) + + write_yaml_file( + { + "language": "en", + "pipeline": "supervised_embeddings", + "policies": [{"name": "MemoizationPolicy"}], + }, + "config_2.yml", + ) + run_in_default_project( + "train", + "core", + "-c", + "config_1.yml", + "config_2.yml", + "--stories", + "data/stories.md", + "--runs", + "2", + "--percentages", + "25", + "75", + "--augmentation", + "5", + "--out", + "comparison_models", + ) + + assert os.path.exists("comparison_models") + assert os.path.exists("comparison_models/run_1") + assert os.path.exists("comparison_models/run_2") + + run_in_default_project( + "test", + "core", + "-m", + "comparison_models", + "--stories", + "data/stories", + "--evaluate-model-directory", + ) + + assert os.path.exists(os.path.join(DEFAULT_RESULTS_PATH, RESULTS_FILE)) + assert os.path.exists( + os.path.join(DEFAULT_RESULTS_PATH, "core_model_comparison_graph.pdf") + ) + + def test_test_help(run): output = run("test", "--help") help_text = """usage: rasa test [-h] [-v] [-vv] [--quiet] [-m MODEL] [-s STORIES] [--max-stories MAX_STORIES] [--out OUT] [--e2e] [--endpoints ENDPOINTS] [--fail-on-prediction-errors] - [--url URL] [-u NLU] [--report [REPORT]] - [--successes [SUCCESSES]] [--errors ERRORS] - [--histogram HISTOGRAM] [--confmat CONFMAT] + [--url URL] [--evaluate-model-directory] [-u NLU] + [--report [REPORT]] [--successes [SUCCESSES]] + [--errors ERRORS] [--histogram HISTOGRAM] [--confmat CONFMAT] [-c CONFIG [CONFIG ...]] [--cross-validation] [-f FOLDS] [-r RUNS] [-p PERCENTAGES [PERCENTAGES ...]] {core,nlu} ...""" @@ -94,7 +169,8 @@ def test_test_core_help(run): help_text = """usage: rasa test core [-h] [-v] [-vv] [--quiet] [-m MODEL [MODEL ...]] [-s STORIES] [--max-stories MAX_STORIES] [--out OUT] [--e2e] [--endpoints ENDPOINTS] - [--fail-on-prediction-errors] [--url URL]""" + [--fail-on-prediction-errors] [--url URL] + [--evaluate-model-directory]""" lines = help_text.split("\n") diff --git 
a/tests/cli/test_rasa_train.py b/tests/cli/test_rasa_train.py index b47c7a6ff282..358569fe0b2f 100644 --- a/tests/cli/test_rasa_train.py +++ b/tests/cli/test_rasa_train.py @@ -12,8 +12,7 @@ CONFIG_MANDATORY_KEYS, CONFIG_MANDATORY_KEYS_NLU, ) -from rasa.nlu.utils import list_files, list_subdirectories -from rasa.utils.io import write_yaml_file +import rasa.utils.io as io_utils def test_train(run_in_default_project): @@ -34,7 +33,7 @@ def test_train(run_in_default_project): ) assert os.path.exists(os.path.join(temp_dir, "train_models")) - files = list_files(os.path.join(temp_dir, "train_models")) + files = io_utils.list_files(os.path.join(temp_dir, "train_models")) assert len(files) == 1 assert os.path.basename(files[0]) == "test-model.tar.gz" @@ -42,7 +41,7 @@ def test_train(run_in_default_project): def test_train_core_compare(run_in_default_project): temp_dir = os.getcwd() - write_yaml_file( + io_utils.write_yaml_file( { "language": "en", "pipeline": "supervised_embeddings", @@ -51,7 +50,7 @@ def test_train_core_compare(run_in_default_project): "config_1.yml", ) - write_yaml_file( + io_utils.write_yaml_file( { "language": "en", "pipeline": "supervised_embeddings", @@ -80,11 +79,11 @@ def test_train_core_compare(run_in_default_project): ) assert os.path.exists(os.path.join(temp_dir, "core_comparison_results")) - run_directories = list_subdirectories( + run_directories = io_utils.list_subdirectories( os.path.join(temp_dir, "core_comparison_results") ) assert len(run_directories) == 2 - model_files = list_files( + model_files = io_utils.list_files( os.path.join(temp_dir, "core_comparison_results", run_directories[0]) ) assert len(model_files) == 4 @@ -107,7 +106,7 @@ def test_train_no_domain_exists(run_in_default_project): ) assert os.path.exists("train_models_no_domain") - files = list_files("train_models_no_domain") + files = io_utils.list_files("train_models_no_domain") assert len(files) == 1 trained_model_path = "train_models_no_domain/nlu-model-only.tar.gz" @@ -121,14 +120,14 @@ def test_train_skip_on_model_not_changed(run_in_default_project): temp_dir = os.getcwd() assert os.path.exists(os.path.join(temp_dir, "models")) - files = list_files(os.path.join(temp_dir, "models")) + files = io_utils.list_files(os.path.join(temp_dir, "models")) assert len(files) == 1 file_name = files[0] run_in_default_project("train") assert os.path.exists(os.path.join(temp_dir, "models")) - files = list_files(os.path.join(temp_dir, "models")) + files = io_utils.list_files(os.path.join(temp_dir, "models")) assert len(files) == 1 assert file_name == files[0] @@ -137,13 +136,13 @@ def test_train_force(run_in_default_project): temp_dir = os.getcwd() assert os.path.exists(os.path.join(temp_dir, "models")) - files = list_files(os.path.join(temp_dir, "models")) + files = io_utils.list_files(os.path.join(temp_dir, "models")) assert len(files) == 1 run_in_default_project("train", "--force") assert os.path.exists(os.path.join(temp_dir, "models")) - files = list_files(os.path.join(temp_dir, "models")) + files = io_utils.list_files(os.path.join(temp_dir, "models")) assert len(files) == 2 @@ -157,7 +156,7 @@ def test_train_with_only_nlu_data(run_in_default_project): run_in_default_project("train", "--fixed-model-name", "test-model") assert os.path.exists(os.path.join(temp_dir, "models")) - files = list_files(os.path.join(temp_dir, "models")) + files = io_utils.list_files(os.path.join(temp_dir, "models")) assert len(files) == 1 assert os.path.basename(files[0]) == "test-model.tar.gz" @@ -172,7 +171,7 @@ def 
test_train_with_only_core_data(run_in_default_project): run_in_default_project("train", "--fixed-model-name", "test-model") assert os.path.exists(os.path.join(temp_dir, "models")) - files = list_files(os.path.join(temp_dir, "models")) + files = io_utils.list_files(os.path.join(temp_dir, "models")) assert len(files) == 1 assert os.path.basename(files[0]) == "test-model.tar.gz" @@ -255,7 +254,7 @@ def test_train_nlu(run_in_default_project): ) assert os.path.exists("train_models") - files = list_files("train_models") + files = io_utils.list_files("train_models") assert len(files) == 1 assert os.path.basename(files[0]).startswith("nlu-") diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py index 16ccd2648624..02b66c89cdd4 100644 --- a/tests/nlu/base/test_evaluation.py +++ b/tests/nlu/base/test_evaluation.py @@ -273,7 +273,7 @@ def test_intent_evaluation_report(tmpdir_factory): report_folder = os.path.join(path, "reports") report_filename = os.path.join(report_folder, "intent_report.json") - utils.create_dir(report_folder) + rasa.utils.io.create_directory(report_folder) intent_results = [ IntentEvaluationResult("", "restaurant_search", "I am hungry", 0.12345), @@ -328,7 +328,7 @@ def __init__(self, component_config=None) -> None: report_filename_a = os.path.join(report_folder, "EntityExtractorA_report.json") report_filename_b = os.path.join(report_folder, "EntityExtractorB_report.json") - utils.create_dir(report_folder) + rasa.utils.io.create_directory(report_folder) mock_interpreter = Interpreter( [ EntityExtractorA({"provides": ["entities"]}), diff --git a/tests/nlu/base/test_utils.py b/tests/nlu/base/test_utils.py index 933cfb41f37b..fd59bc38632b 100644 --- a/tests/nlu/base/test_utils.py +++ b/tests/nlu/base/test_utils.py @@ -4,9 +4,8 @@ import pickle import pytest import tempfile -from rasa.nlu import utils +import rasa.utils.io as io_utils from rasa.nlu.utils import ( - create_dir, is_model_dir, is_url, ordered, @@ -33,13 +32,13 @@ def test_relative_normpath(): def test_list_files_invalid_resource(): with pytest.raises(ValueError) as execinfo: - utils.list_files(None) + io_utils.list_files(None) assert "must be a string type" in str(execinfo.value) def test_list_files_non_existing_dir(): with pytest.raises(ValueError) as execinfo: - utils.list_files("my/made_up/path") + io_utils.list_files("my/made_up/path") assert "Could not locate the resource" in str(execinfo.value) @@ -49,12 +48,12 @@ def test_list_files_ignores_hidden_files(tmpdir): # create a normal file normal_file = os.path.join(tmpdir.strpath, "normal_file") open(normal_file, "a").close() - assert utils.list_files(tmpdir.strpath) == [normal_file] + assert io_utils.list_files(tmpdir.strpath) == [normal_file] def test_creation_of_existing_dir(tmpdir): # makes sure there is no exception - assert create_dir(tmpdir.strpath) is None + assert io_utils.create_directory(tmpdir.strpath) is None def test_ordered():
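
For reference, a minimal end-to-end sketch of the comparison workflow this change enables, mirroring the new CLI test ``test_test_core_comparison_after_train`` above; the config file names, story paths, and run/percentage values are illustrative:

.. code-block:: bash

   # Train several policy configurations for comparison (config files are assumed to exist)
   $ rasa train core -c config_1.yml config_2.yml \
     --stories data/stories.md --runs 2 --percentages 25 75 \
     --out comparison_models

   # Evaluate every model in that directory against the test stories
   # and write results.json plus the comparison graph to the results path
   $ rasa test core -m comparison_models --stories data/stories.md \
     --evaluate-model-directory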