diff --git a/.travis.yml b/.travis.yml
index 8c365131eae0..cb861d1d3522 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -29,7 +29,7 @@ before_script:
   - mkdir $HOME/tmp
   - export TMPDIR=$HOME/tmp
 script:
-  - py.test --pep8 -m pep8
+  - py.test --codestyle -m codestyle
   - py.test tests/base --cov rasa_nlu -v --cov-append
   - py.test tests/training --cov rasa_nlu -v --cov-append
 after_success:
@@ -71,14 +71,16 @@ jobs:
     - git remote set-url --push origin "git@github.com:$TRAVIS_REPO_SLUG"
     - export ${!TRAVIS*}
     - sphinx-versioning push docs newdocs . -- -b dirhtml -A html_theme=rasabaster
-  - stage: Test starter packs
+  - stage: test
+    if: branch = "*.x" # only new NLU version builds test the starter packs
     name: "NLU starter pack"
     python: 3.6
     script:
     - git clone https://github.com/RasaHQ/starter-pack-rasa-nlu.git
     - cd starter-pack-rasa-nlu
     - python -m pytest tests/test_nlu.py
-  - stage: Test starter packs
+  - stage: test
+    if: branch = "*.x" # only new NLU version builds test the starter packs
     name: "Stack starter pack (NLU only)"
     python: 3.6
     script:
@@ -106,6 +108,7 @@ jobs:
     - git commit --allow-empty -m "trigger nlu docs update"
     - git push origin master
   - stage: deploy
+    name: "PyPI test"
     python: 3.6
     install: skip
     script: skip
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 2b9840828490..01e207e4bd09 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -15,6 +15,8 @@ Changed
 - validate training data only if used for training
 - applied spacy guidelines on how to disable pipeline components
+- replace pep8 with pycodestyle
+
 
 Removed
 -------
 - **removed python 2.7 support**
@@ -22,7 +24,22 @@ Removed
 
 Fixed
 -----
 
-.. _v0-14-0:
+[0.14.2] - 2018-01-29
+^^^^^^^^^^^^^^^^^^^^^
+
+Added
+-----
+
+- ``rasa_nlu.evaluate`` now exports reports into a folder and also
+  includes the entity extractor reports
+
+Changed
+-------
+- updated requirements to match Core and SDK
+- pinned keras dependencies
+- starter packs are now tested in parallel with the unittests,
+  and only on branches ending in ``.x`` (i.e. new version releases)
+
 [0.14.1] - 2018-01-23
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -31,6 +48,8 @@ Fixed
 -----
 - scikit-learn is a global requirement
 
+.. _v0-14-0:
+
 [0.14.0] - 2018-01-23
 ^^^^^^^^^^^^^^^^^^^^^
 
@@ -57,6 +76,8 @@ Changed
 - updated TensorFlow version to 1.12.0
 - updated scikit-learn version to 0.20.2
 - updated cloudpickle version to 0.6.1
+- updated requirements to match Core and SDK
+- pinned keras dependencies
 
 Removed
 -------
diff --git a/alt_requirements/requirements_bare.txt b/alt_requirements/requirements_bare.txt
index 2b220e9331f2..8d750dcbdbb2 100644
--- a/alt_requirements/requirements_bare.txt
+++ b/alt_requirements/requirements_bare.txt
@@ -2,7 +2,8 @@ gevent==1.2.2
 klein==17.10.0
 hyperlink==17.3.1
 typing==3.6.2
-future==0.16.0
+future==0.17.1
+six==1.11.0
 jsonschema==2.6.0
 matplotlib==2.1.0
 requests==2.20.0
@@ -10,7 +11,7 @@ tqdm==4.19.5
 numpy==1.14.5
 simplejson==3.13.2
 cloudpickle==0.6.1
-packaging==17.1
+packaging==18.0
 ruamel.yaml==0.15.78
-coloredlogs==9.0
+coloredlogs==10.0
 scikit-learn==0.20.2
diff --git a/alt_requirements/requirements_dev.txt b/alt_requirements/requirements_dev.txt
index 667b14c534a7..f700f172a072 100644
--- a/alt_requirements/requirements_dev.txt
+++ b/alt_requirements/requirements_dev.txt
@@ -4,7 +4,7 @@
 
 # test
 python-coveralls==2.9.1
-pytest-pep8==1.0.6
+pytest-pycodestyle==1.4.0
 pytest-cov==2.5.1
 pytest-twisted==1.6
 pytest==3.3.2
diff --git a/alt_requirements/requirements_tensorflow_sklearn.txt b/alt_requirements/requirements_tensorflow_sklearn.txt
index bf43ab5fcf81..6c5687e17077 100644
--- a/alt_requirements/requirements_tensorflow_sklearn.txt
+++ b/alt_requirements/requirements_tensorflow_sklearn.txt
@@ -4,3 +4,5 @@
 tensorflow==1.12.0
 scipy==1.1.0
 sklearn-crfsuite==0.3.6
+keras-applications==1.0.6
+keras-preprocessing==1.0.5
\ No newline at end of file
diff --git a/docs/evaluation.rst b/docs/evaluation.rst
index bd008beeb9a8..84d59de1afe9 100644
--- a/docs/evaluation.rst
+++ b/docs/evaluation.rst
@@ -80,9 +80,9 @@ Intent Classification
 The evaluation script will produce a report, confusion matrix and
 confidence histogram for your model.
 
-The report logs precision, recall, and f1 measure for
-each intent, as well as provide an overall average. You can save this
-report as a JSON file using the `--report` flag.
+The report logs precision, recall and f1 measure for
+each intent and entity, as well as providing an overall average.
+You can save these reports as JSON files using the `--report` flag.
The confusion matrix shows you which intents are mistaken for others; any samples which have been diff --git a/rasa_nlu/classifiers/embedding_intent_classifier.py b/rasa_nlu/classifiers/embedding_intent_classifier.py index 450001273ac7..3095577de50d 100644 --- a/rasa_nlu/classifiers/embedding_intent_classifier.py +++ b/rasa_nlu/classifiers/embedding_intent_classifier.py @@ -112,13 +112,13 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None, inv_intent_dict: Optional[Dict[int, Text]] = None, encoded_all_intents: Optional[np.ndarray] = None, - session: Optional[tf.Session] = None, - graph: Optional[tf.Graph] = None, - message_placeholder: Optional[tf.Tensor] = None, - intent_placeholder: Optional[tf.Tensor] = None, - similarity_op: Optional[tf.Tensor] = None, - word_embed: Optional[tf.Tensor] = None, - intent_embed: Optional[tf.Tensor] = None + session: Optional['tf.Session'] = None, + graph: Optional['tf.Graph'] = None, + message_placeholder: Optional['tf.Tensor'] = None, + intent_placeholder: Optional['tf.Tensor'] = None, + similarity_op: Optional['tf.Tensor'] = None, + word_embed: Optional['tf.Tensor'] = None, + intent_embed: Optional['tf.Tensor'] = None ) -> None: """Declare instant variables with default values""" @@ -198,9 +198,9 @@ def required_packages(cls) -> List[Text]: def _check_tensorflow(): if tf is None: raise ImportError( - 'Failed to import `tensorflow`. ' - 'Please install `tensorflow`. ' - 'For example with `pip install tensorflow`.') + 'Failed to import `tensorflow`. ' + 'Please install `tensorflow`. ' + 'For example with `pip install tensorflow`.') # training data helpers: @staticmethod @@ -232,7 +232,7 @@ def _create_encoded_intents(self, if self.intent_tokenization_flag: intent_token_dict = self._create_intent_token_dict( - list(intent_dict.keys()), self.intent_split_symbol) + list(intent_dict.keys()), self.intent_split_symbol) encoded_all_intents = np.zeros((len(intent_dict), len(intent_token_dict))) @@ -277,8 +277,8 @@ def _prepare_data_for_training( # tf helpers: - def _create_tf_embed_nn(self, x_in: tf.Tensor, is_training: tf.Tensor, - layer_sizes: List[int], name: Text) -> tf.Tensor: + def _create_tf_embed_nn(self, x_in: 'tf.Tensor', is_training: 'tf.Tensor', + layer_sizes: List[int], name: Text) -> 'tf.Tensor': """Create nn with hidden layers and name""" reg = tf.contrib.layers.l2_regularizer(self.C2) @@ -298,10 +298,10 @@ def _create_tf_embed_nn(self, x_in: tf.Tensor, is_training: tf.Tensor, return x def _create_tf_embed(self, - a_in: tf.Tensor, - b_in: tf.Tensor, - is_training: tf.Tensor - ) -> Tuple[tf.Tensor, tf.Tensor]: + a_in: 'tf.Tensor', + b_in: 'tf.Tensor', + is_training: 'tf.Tensor' + ) -> Tuple['tf.Tensor', 'tf.Tensor']: """Create tf graph for training""" emb_a = self._create_tf_embed_nn(a_in, is_training, @@ -313,8 +313,8 @@ def _create_tf_embed(self, return emb_a, emb_b def _tf_sim(self, - a: tf.Tensor, - b: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor]: + a: 'tf.Tensor', + b: 'tf.Tensor') -> Tuple['tf.Tensor', 'tf.Tensor']: """Define similarity in two cases: @@ -338,7 +338,7 @@ def _tf_sim(self, "should be 'cosine' or 'inner'" "".format(self.similarity_type)) - def _tf_loss(self, sim: tf.Tensor, sim_emb: tf.Tensor) -> tf.Tensor: + def _tf_loss(self, sim: 'tf.Tensor', sim_emb: 'tf.Tensor') -> 'tf.Tensor': """Define loss""" # loss for maximizing similarity with correct action @@ -379,8 +379,8 @@ def _create_batch_b(self, batch_pos_b: np.ndarray, for b in range(batch_pos_b.shape[0]): # create negative indexes out of possible ones # 
except for correct index of b - negative_indexes = [i for i in range( - self.encoded_all_intents.shape[0]) + negative_indexes = [i for i in + range(self.encoded_all_intents.shape[0]) if i != intent_ids[b]] negs = np.random.choice(negative_indexes, size=self.num_neg) @@ -410,9 +410,9 @@ def _train_tf(self, X: np.ndarray, Y: np.ndarray, intents_for_X: np.ndarray, - loss: tf.Tensor, - is_training: tf.Tensor, - train_op: tf.Tensor + loss: 'tf.Tensor', + is_training: 'tf.Tensor', + train_op: 'tf.Tensor' ) -> None: """Train tf graph""" @@ -443,10 +443,10 @@ def _train_tf(self, batch_b = self._create_batch_b(batch_pos_b, intents_for_b) sess_out = self.session.run( - {'loss': loss, 'train_op': train_op}, - feed_dict={self.a_in: batch_a, - self.b_in: batch_b, - is_training: True} + {'loss': loss, 'train_op': train_op}, + feed_dict={self.a_in: batch_a, + self.b_in: batch_b, + is_training: True} ) ep_loss += sess_out.get('loss') / batches_per_epoch @@ -477,7 +477,7 @@ def _train_tf(self, def _output_training_stat(self, X: np.ndarray, intents_for_X: np.ndarray, - is_training: tf.Tensor) -> np.ndarray: + is_training: 'tf.Tensor') -> np.ndarray: """Output training statistics""" n = self.evaluate_on_num_examples @@ -509,10 +509,10 @@ def train(self, self.inv_intent_dict = {v: k for k, v in intent_dict.items()} self.encoded_all_intents = self._create_encoded_intents( - intent_dict) + intent_dict) X, Y, intents_for_X = self._prepare_data_for_training( - training_data, intent_dict) + training_data, intent_dict) # check if number of negatives is less than number of intents logger.debug("Check if num_neg {} is smaller than " @@ -707,16 +707,16 @@ def load(cls, encoded_all_intents = pickle.load(f) return cls( - component_config=meta, - inv_intent_dict=inv_intent_dict, - encoded_all_intents=encoded_all_intents, - session=sess, - graph=graph, - message_placeholder=a_in, - intent_placeholder=b_in, - similarity_op=sim_op, - word_embed=word_embed, - intent_embed=intent_embed + component_config=meta, + inv_intent_dict=inv_intent_dict, + encoded_all_intents=encoded_all_intents, + session=sess, + graph=graph, + message_placeholder=a_in, + intent_placeholder=b_in, + similarity_op=sim_op, + word_embed=word_embed, + intent_embed=intent_embed ) else: diff --git a/rasa_nlu/components.py b/rasa_nlu/components.py index ce349d040b33..98e85d0930f0 100644 --- a/rasa_nlu/components.py +++ b/rasa_nlu/components.py @@ -37,7 +37,7 @@ def validate_requirements(component_names: List[Text]) -> None: for component_name in component_names: component_class = registry.get_component_class(component_name) failed_imports.update(find_unavailable_packages( - component_class.required_packages())) + component_class.required_packages())) if failed_imports: # pragma: no cover # if available, use the development file to figure out the correct # version numbers for each requirement @@ -170,7 +170,7 @@ def __init__(self, component_config["name"] = self.name self.component_config = config.override_defaults( - self.defaults, component_config) + self.defaults, component_config) self.partial_processing_pipeline = None self.partial_processing_context = None @@ -355,9 +355,9 @@ def __get_cached_component(self, component_class = registry.get_component_class(component_name) cache_key = component_class.cache_key(model_metadata) - if (cache_key is not None - and self.use_cache - and cache_key in self.component_cache): + if (cache_key is not None and + self.use_cache and + cache_key in self.component_cache): return self.component_cache[cache_key], cache_key 
else: return None, cache_key @@ -393,10 +393,10 @@ def load_component(self, try: cached_component, cache_key = self.__get_cached_component( - component_name, model_metadata) + component_name, model_metadata) component = registry.load_component_by_name( - component_name, model_dir, model_metadata, - cached_component, **context) + component_name, model_dir, model_metadata, + cached_component, **context) if not cached_component: # If the component wasn't in the cache, # let us add it if possible @@ -416,7 +416,7 @@ def create_component(self, try: component, cache_key = self.__get_cached_component( - component_name, Metadata(cfg.as_dict(), None)) + component_name, Metadata(cfg.as_dict(), None)) if component is None: component = registry.create_component_by_name(component_name, cfg) diff --git a/rasa_nlu/config.py b/rasa_nlu/config.py index 8d6acbf4ac2f..7e5c569766fe 100644 --- a/rasa_nlu/config.py +++ b/rasa_nlu/config.py @@ -115,7 +115,7 @@ def __init__(self, configuration_values=None): self.__dict__['pipeline'] = pipeline else: known_templates = ", ".join( - registry.registered_pipeline_templates.keys()) + registry.registered_pipeline_templates.keys()) raise InvalidConfigError("No pipeline specified and unknown " "pipeline template '{}' passed. Known " diff --git a/rasa_nlu/convert.py b/rasa_nlu/convert.py index b6d67db4731d..86680ad1a5bb 100644 --- a/rasa_nlu/convert.py +++ b/rasa_nlu/convert.py @@ -6,7 +6,7 @@ def create_argument_parser(): parser = argparse.ArgumentParser( - description='Convert training data formats into one another') + description='Convert training data formats into one another') parser.add_argument('-d', '--data_file', required=True, diff --git a/rasa_nlu/data_router.py b/rasa_nlu/data_router.py index c2955e558678..a24c8a8f4254 100644 --- a/rasa_nlu/data_router.py +++ b/rasa_nlu/data_router.py @@ -135,8 +135,8 @@ def _create_query_logger(response_log): out_file = io.open(response_logfile, 'a', encoding='utf8') # noinspection PyTypeChecker query_logger = Logger( - observer=jsonFileLogObserver(out_file, recordSeparator=''), - namespace='query-logger') + observer=jsonFileLogObserver(out_file, recordSeparator=''), + namespace='query-logger') # Prevents queries getting logged with parent logger # --> might log them to stdout logger.info("Logging requests to '{}'.".format(response_logfile)) @@ -166,12 +166,12 @@ def _create_project_store(self, if self.model_server is not None: project_store[default_project] = load_from_server( - self.component_builder, - default_project, - self.project_dir, - self.remote_storage, - self.model_server, - self.wait_time_between_pulls + self.component_builder, + default_project, + self.project_dir, + self.remote_storage, + self.model_server, + self.wait_time_between_pulls ) else: for project in projects: @@ -182,9 +182,9 @@ def _create_project_store(self, if not project_store: project_store[default_project] = Project( - project=default_project, - project_dir=self.project_dir, - remote_storage=self.remote_storage + project=default_project, + project_dir=self.project_dir, + remote_storage=self.remote_storage ) return project_store @@ -251,16 +251,16 @@ def parse(self, data: Dict[Text, Any]) -> Dict[Text, Any]: if project not in projects: raise InvalidProjectError( - "No project found with name '{}'.".format(project)) + "No project found with name '{}'.".format(project)) else: try: self.project_store[project] = Project( - self.component_builder, project, - self.project_dir, self.remote_storage) + self.component_builder, project, + self.project_dir, 
self.remote_storage)
             except Exception as e:
                 raise InvalidProjectError(
-                    "Unable to load project '{}'. "
-                    "Error: {}".format(project, e))
+                        "Unable to load project '{}'. "
+                        "Error: {}".format(project, e))
 
         time = data.get('time')
         response = self.project_store[project].parse(data['text'], time,
@@ -313,8 +313,8 @@ def start_train_process(self,
             self.project_store[project].status = STATUS_TRAINING
         elif project not in self.project_store:
             self.project_store[project] = Project(
-                self.component_builder, project,
-                self.project_dir, self.remote_storage)
+                    self.component_builder, project,
+                    self.project_dir, self.remote_storage)
             self.project_store[project].status = STATUS_TRAINING
 
         def training_callback(model_path):
@@ -383,9 +383,9 @@ def evaluate(self,
             self.project_store[project]._loader_lock.release()
 
         return run_evaluation(
-            data_path=file_name,
-            model=self.project_store[project]._models[model_name],
-            errors_filename=None
+                data_path=file_name,
+                model=self.project_store[project]._models[model_name],
+                errors_filename=None
         )
 
     def unload_model(self,
diff --git a/rasa_nlu/evaluate.py b/rasa_nlu/evaluate.py
index 756a98acd98c..1fe76f343705 100644
--- a/rasa_nlu/evaluate.py
+++ b/rasa_nlu/evaluate.py
@@ -2,6 +2,7 @@
 from collections import defaultdict, namedtuple
 import json
+import os
 import logging
 import numpy as np
 import shutil
@@ -35,8 +36,8 @@ def create_argument_parser():
     import argparse
 
     parser = argparse.ArgumentParser(
-        description='evaluate a Rasa NLU pipeline with cross '
-                    'validation or on external data')
+            description='evaluate a Rasa NLU pipeline with cross '
+                        'validation or on external data')
 
     parser.add_argument('-d', '--data', required=True,
                         help="file containing training/evaluation data")
@@ -57,8 +58,9 @@ def create_argument_parser():
                         help="number of CV folds (crossvalidation only)")
 
     parser.add_argument('--report', required=False, nargs='?',
-                        const="report.json", default=False,
-                        help="output path to save the metrics report")
+                        const="reports", default=False,
+                        help="output path to save the intent/entity "
+                             "metrics report")
 
     parser.add_argument('--successes', required=False, nargs='?',
                         const="successes.json", default=False,
@@ -86,7 +88,6 @@ def plot_confusion_matrix(cm,
                           zmin=1,
                           out=None) -> None:  # pragma: no cover
     """Print and plot the confusion matrix for the intent classification.
-
     Normalization can be applied by setting `normalize=True`."""
     import matplotlib.pyplot as plt
     from matplotlib.colors import LogNorm
@@ -207,7 +208,7 @@ def drop_intents_below_freq(td: TrainingData, cutoff: int = 5):
     """Remove intent groups with less than cutoff instances."""
 
     logger.debug(
-        "Raw data intent examples: {}".format(len(td.intent_examples)))
+            "Raw data intent examples: {}".format(len(td.intent_examples)))
     keep_examples = [ex
                      for ex in td.intent_examples
                      if td.examples_per_intent[ex.get("intent")] >= cutoff]
@@ -277,7 +278,7 @@ def plot_intent_confidences(intent_results, intent_hist_filename):
 
 
 def evaluate_intents(intent_results,
-                     report_filename,
+                     report_folder,
                      successes_filename,
                      errors_filename,
                      confmat_filename,
@@ -301,13 +302,15 @@ def evaluate_intents(intent_results,
 
     targets, predictions = _targets_predictions_from(intent_results)
 
-    if report_filename:
+    if report_folder:
         report, precision, f1, accuracy = get_evaluation_metrics(
-            targets, predictions, output_dict=True)
+                targets, predictions, output_dict=True)
+
+        report_filename = os.path.join(report_folder, 'intent_report.json')
 
         save_json(report, report_filename)
         logger.info("Classification report saved to {}."
- .format(report_filename)) + "".format(report_filename)) else: report, precision, f1, accuracy = get_evaluation_metrics(targets, @@ -359,7 +362,6 @@ def evaluate_intents(intent_results, def merge_labels(aligned_predictions, extractor=None): """Concatenates all labels of the aligned predictions. - Takes the aligned prediction labels which are grouped for each message and concatenates them.""" @@ -382,9 +384,9 @@ def substitute_labels(labels, old, new): def evaluate_entities(targets, predictions, tokens, - extractors): # pragma: no cover + extractors, + report_folder): # pragma: no cover """Creates summary statistics for each entity extractor. - Logs precision, recall, and F1 per entity type for each extractor.""" aligned_predictions = align_all_entity_predictions(targets, predictions, @@ -397,11 +399,24 @@ def evaluate_entities(targets, for extractor in extractors: merged_predictions = merge_labels(aligned_predictions, extractor) merged_predictions = substitute_labels( - merged_predictions, "O", "no_entity") + merged_predictions, "O", "no_entity") logger.info("Evaluation for entity extractor: {} ".format(extractor)) - report, precision, f1, accuracy = get_evaluation_metrics( + if report_folder: + report, precision, f1, accuracy = get_evaluation_metrics( + merged_targets, merged_predictions, output_dict=True) + + report_filename = extractor + "_report.json" + extractor_report = os.path.join(report_folder, report_filename) + + save_json(report, extractor_report) + logger.info("Classification report for '{}' saved to '{}'." + "".format(extractor, extractor_report)) + + else: + report, precision, f1, accuracy = get_evaluation_metrics( merged_targets, merged_predictions) - log_evaluation_table(report, precision, f1, accuracy) + log_evaluation_table(report, precision, f1, accuracy) + result[extractor] = { "report": report, "precision": precision, @@ -434,9 +449,7 @@ def determine_intersection(token, entity): def do_entities_overlap(entities): """Checks if entities overlap. - I.e. cross each others start and end boundaries. - :param entities: list of entities :return: boolean """ @@ -454,7 +467,6 @@ def do_entities_overlap(entities): def find_intersecting_entites(token, entities): """Finds the entities that intersect with a token. - :param token: a single token :param entities: entities found by a single extractor :return: list of entities @@ -474,7 +486,6 @@ def find_intersecting_entites(token, entities): def pick_best_entity_fit(token, candidates): """Determines the token label given intersecting entities. - :param token: a single token :param candidates: entities found by a single extractor :return: entity type @@ -518,11 +529,9 @@ def do_extractors_support_overlap(extractors): def align_entity_predictions(targets, predictions, tokens, extractors): """Aligns entity predictions to the message tokens. - Determines for every token the true label based on the prediction targets and the label assigned by each single extractor. 
- :param targets: list of target entities :param predictions: list of predicted entities :param tokens: original message tokens @@ -538,7 +547,7 @@ def align_entity_predictions(targets, predictions, tokens, extractors): extractor_labels = defaultdict(list) for t in tokens: true_token_labels.append( - determine_token_labels(t, targets, None)) + determine_token_labels(t, targets, None)) for extractor, entities in entities_by_extractors.items(): extracted = determine_token_labels(t, entities, extractor) extractor_labels[extractor].append(extracted) @@ -550,7 +559,6 @@ def align_entity_predictions(targets, predictions, tokens, extractors): def align_all_entity_predictions(targets, predictions, tokens, extractors): """ Aligns entity predictions to the message tokens for the whole dataset using align_entity_predictions - :param targets: list of lists of target entities :param predictions: list of lists of predicted entities :param tokens: list of original message tokens @@ -606,10 +614,10 @@ def get_intent_predictions(targets, interpreter, for e, target in zip(test_data.training_examples, targets): res = interpreter.parse(e.text, only_output_properties=False) intent_results.append(IntentEvaluationResult( - target, - extract_intent(res), - extract_message(res), - extract_confidence(res))) + target, + extract_intent(res), + extract_message(res), + extract_confidence(res))) return intent_results @@ -631,7 +639,6 @@ def get_entity_predictions(interpreter, test_data): # pragma: no cover def get_entity_extractors(interpreter): """Finds the names of entity extractors used by the interpreter. - Processors are removed since they do not detect the boundaries themselves.""" @@ -655,7 +662,6 @@ def combine_extractor_and_dimension_name(extractor, dim): def get_duckling_dimensions(interpreter, duckling_extractor_name): """Gets the activated dimensions of a duckling extractor. 
- If there are no activated dimensions, it uses all known dimensions as a fallback.""" @@ -700,7 +706,7 @@ def remove_duckling_entities(entity_predictions): def run_evaluation(data_path, model, - report_filename=None, + report_folder=None, successes_filename=None, errors_filename='errors.json', confmat_filename=None, @@ -728,14 +734,17 @@ def run_evaluation(data_path, model, "entity_evaluation": None } + if report_folder: + utils.create_dir(report_folder) + if is_intent_classifier_present(interpreter): intent_targets = get_intent_targets(test_data) intent_results = get_intent_predictions( - intent_targets, interpreter, test_data) + intent_targets, interpreter, test_data) logger.info("Intent evaluation results:") result['intent_evaluation'] = evaluate_intents(intent_results, - report_filename, + report_folder, successes_filename, errors_filename, confmat_filename, @@ -748,7 +757,8 @@ def run_evaluation(data_path, model, result['entity_evaluation'] = evaluate_entities(entity_targets, entity_predictions, tokens, - extractors) + extractors, + report_folder) return result @@ -795,7 +805,6 @@ def run_cv_evaluation(data: TrainingData, n_folds: int, nlu_config: RasaNLUModelConfig) -> CVEvaluationResult: """Stratified cross validation on data - :param data: Training Data :param n_folds: integer, number of cv folds :param nlu_config: nlu config file @@ -936,7 +945,7 @@ def main(): data = training_data.load_data(cmdline_args.data) data = drop_intents_below_freq(data, cutoff=5) results, entity_results = run_cv_evaluation( - data, int(cmdline_args.folds), nlu_config) + data, int(cmdline_args.folds), nlu_config) logger.info("CV evaluation (n={})".format(cmdline_args.folds)) if any(results): diff --git a/rasa_nlu/extractors/__init__.py b/rasa_nlu/extractors/__init__.py index 613e68a9db31..42fa08f261e8 100644 --- a/rasa_nlu/extractors/__init__.py +++ b/rasa_nlu/extractors/__init__.py @@ -62,9 +62,9 @@ def filter_trainable_entities(self, data = message.data.copy() data['entities'] = entities filtered.append( - Message(text=message.text, - data=data, - output_properties=message.output_properties, - time=message.time)) + Message(text=message.text, + data=data, + output_properties=message.output_properties, + time=message.time)) return filtered diff --git a/rasa_nlu/extractors/crf_entity_extractor.py b/rasa_nlu/extractors/crf_entity_extractor.py index f24c7b555653..8a8b28232dd2 100644 --- a/rasa_nlu/extractors/crf_entity_extractor.py +++ b/rasa_nlu/extractors/crf_entity_extractor.py @@ -97,10 +97,10 @@ def _check_pos_features_and_spacy(self): def _check_spacy(): if spacy is None: raise ImportError( - 'Failed to import `spaCy`. ' - '`spaCy` is required for POS features ' - 'See https://spacy.io/usage/ for installation' - 'instructions.') + 'Failed to import `spaCy`. 
'
+                '`spaCy` is required for POS features '
+                'See https://spacy.io/usage/ for installation '
+                'instructions.')
 
     def _validate_configuration(self):
         if len(self.component_config.get("features", [])) % 2 != 1:
@@ -127,7 +127,7 @@ def train(self,
 
         # filter out pre-trained entity examples
         filtered_entity_examples = self.filter_trainable_entities(
-                training_data.training_examples)
+                    training_data.training_examples)
 
         # convert the dataset into features
         # this will train on ALL examples, even the ones
@@ -148,12 +148,12 @@ def _create_dataset(self,
 
     def _check_spacy_doc(self, message):
         if self.pos_features and message.get("spacy_doc") is None:
            raise InvalidConfigError(
-            'Could not find `spacy_doc` attribute for '
-            'message {}\n'
-            'POS features require a pipeline component '
-            'that provides `spacy_doc` attributes, i.e. `nlp_spacy`. '
-            'See https://nlu.rasa.com/pipeline.html#nlp-spacy '
-            'for details'.format(message.text))
+                'Could not find `spacy_doc` attribute for '
+                'message {}\n'
+                'POS features require a pipeline component '
+                'that provides `spacy_doc` attributes, i.e. `nlp_spacy`. '
+                'See https://nlu.rasa.com/pipeline.html#nlp-spacy '
+                'for details'.format(message.text))
 
     def process(self, message: Message, **kwargs: Any) -> None:
 
@@ -241,7 +241,7 @@ def _find_bilou_end(self, word_idx, entities):
         while not finished:
             label, label_confidence = self.most_likely_entity(
-                ent_word_idx, entities)
+                    ent_word_idx, entities)
 
             confidence = min(confidence, label_confidence)
 
@@ -277,8 +277,8 @@ def _handle_bilou_label(self, word_idx, entities):
         elif self._bilou_from_label(label) == "B":
             # start of multi word-entity need to represent whole extent
             ent_word_idx, confidence = self._find_bilou_end(
-                word_idx, entities)
+                    word_idx, entities)
             return ent_word_idx, confidence, entity_label
 
         else:
@@ -298,11 +298,11 @@ def _from_crf_to_json(self,
 
         if self.component_config["BILOU_flag"]:
             return self._convert_bilou_tagging_to_entity_result(
-                tokens, entities)
+                    tokens, entities)
         else:
             # not using BILOU tagging scheme, multi-word entities are split.
return self._convert_simple_tagging_to_entity_result( - tokens, entities) + tokens, entities) def _convert_bilou_tagging_to_entity_result(self, tokens, entities): # using the BILOU tagging scheme @@ -310,7 +310,7 @@ def _convert_bilou_tagging_to_entity_result(self, tokens, entities): word_idx = 0 while word_idx < len(tokens): end_idx, confidence, entity_label = self._handle_bilou_label( - word_idx, entities) + word_idx, entities) if end_idx is not None: ent = self._create_entity_dict(tokens, @@ -329,7 +329,7 @@ def _convert_simple_tagging_to_entity_result(self, tokens, entities): for word_idx in range(len(tokens)): entity_label, confidence = self.most_likely_entity( - word_idx, entities) + word_idx, entities) word = tokens[word_idx] if entity_label != 'O': if self.pos_features: @@ -413,9 +413,9 @@ def _sentence_to_features(self, # add all regexes as a feature regex_patterns = self.function_dict[feature](word) for p_name, matched in regex_patterns.items(): - feature_name = (prefix + ":" - + feature - + ":" + p_name) + feature_name = (prefix + ":" + + feature + + ":" + p_name) word_features[feature_name] = matched else: # append each feature to a feature vector @@ -535,14 +535,14 @@ def _train_model(self, X_train = [self._sentence_to_features(sent) for sent in df_train] y_train = [self._sentence_to_labels(sent) for sent in df_train] self.ent_tagger = sklearn_crfsuite.CRF( - algorithm='lbfgs', - # coefficient for L1 penalty - c1=self.component_config["L1_c"], - # coefficient for L2 penalty - c2=self.component_config["L2_c"], - # stop earlier - max_iterations=self.component_config["max_iterations"], - # include transitions that are possible, but not observed - all_possible_transitions=True + algorithm='lbfgs', + # coefficient for L1 penalty + c1=self.component_config["L1_c"], + # coefficient for L2 penalty + c2=self.component_config["L2_c"], + # stop earlier + max_iterations=self.component_config["max_iterations"], + # include transitions that are possible, but not observed + all_possible_transitions=True ) self.ent_tagger.fit(X_train, y_train) diff --git a/rasa_nlu/extractors/entity_synonyms.py b/rasa_nlu/extractors/entity_synonyms.py index 586720ff6f82..dd3d396e865d 100644 --- a/rasa_nlu/extractors/entity_synonyms.py +++ b/rasa_nlu/extractors/entity_synonyms.py @@ -94,8 +94,8 @@ def add_entities_if_synonyms(self, entity_a, entity_b): if original != replacement: original = original.lower() - if (original in self.synonyms - and self.synonyms[original] != replacement): + if (original in self.synonyms and + self.synonyms[original] != replacement): warnings.warn("Found conflicting synonym definitions " "for {}. Overwriting target {} with {}. 
" "Check your training data and remove " diff --git a/rasa_nlu/extractors/mitie_entity_extractor.py b/rasa_nlu/extractors/mitie_entity_extractor.py index 0fb592bbe7d1..8ceca799a377 100644 --- a/rasa_nlu/extractors/mitie_entity_extractor.py +++ b/rasa_nlu/extractors/mitie_entity_extractor.py @@ -75,7 +75,7 @@ def train(self, # filter out pre-trained entity examples filtered_entity_examples = self.filter_trainable_entities( - training_data.training_examples) + training_data.training_examples) for example in filtered_entity_examples: sample = self._prepare_mitie_sample(example) @@ -97,7 +97,7 @@ def _prepare_mitie_sample(self, training_example): try: # if the token is not aligned an exception will be raised start, end = MitieEntityExtractor.find_entity( - ent, text, tokens) + ent, text, tokens) except ValueError as e: logger.warning("Example skipped: {}".format(str(e))) continue diff --git a/rasa_nlu/featurizers/mitie_featurizer.py b/rasa_nlu/featurizers/mitie_featurizer.py index a567862f0c3c..ba3a9780ac26 100644 --- a/rasa_nlu/featurizers/mitie_featurizer.py +++ b/rasa_nlu/featurizers/mitie_featurizer.py @@ -37,7 +37,7 @@ def train(self, mitie_feature_extractor) example.set("text_features", self._combine_with_existing_text_features( - example, features)) + example, features)) def process(self, message: Message, **kwargs: Any) -> None: diff --git a/rasa_nlu/featurizers/ngram_featurizer.py b/rasa_nlu/featurizers/ngram_featurizer.py index 2cfa20bf996f..63c388fbfba6 100644 --- a/rasa_nlu/featurizers/ngram_featurizer.py +++ b/rasa_nlu/featurizers/ngram_featurizer.py @@ -130,7 +130,7 @@ def _get_best_ngrams(self, examples, labels): oov_strings = self._remove_in_vocab_words(examples) ngrams = self._generate_all_ngrams( - oov_strings, self.component_config["ngram_min_length"]) + oov_strings, self.component_config["ngram_min_length"]) return self._sort_applicable_ngrams(ngrams, examples, labels) def _remove_in_vocab_words(self, examples): @@ -148,9 +148,9 @@ def _is_ngram_worthy(token): Excludes every word with digits in them, hyperlinks or an assigned word vector.""" - return (not token.has_vector and not token.like_url - and not token.like_num and not token.like_email - and not token.is_punct) + return (not token.has_vector and not token.like_url and not + token.like_num and not token.like_email and not + token.is_punct) def _remove_in_vocab_words_from_sentence(self, example): """Filter for words that do not have a word vector.""" @@ -221,7 +221,7 @@ def _sort_applicable_ngrams(self, ngrams_list, examples, labels): labels = np.array(labels)[mask] return self._rank_ngrams_using_cv( - examples, labels, ngrams_list) + examples, labels, ngrams_list) except ValueError as e: if "needs samples of at least 2 classes" in str(e): # we got unlucky during the random @@ -294,11 +294,11 @@ def _generate_all_ngrams(self, list_of_strings, ngram_min_length): begin = can[:-1] end = can[1:] if n >= ngram_min_length: - if (counters[n - 1][begin] == counters[n][can] - and begin in features[n - 1]): + if (counters[n - 1][begin] == counters[n][can] and + begin in features[n - 1]): features[n - 1].remove(begin) - if (counters[n - 1][end] == counters[n][can] - and end in features[n - 1]): + if (counters[n - 1][end] == counters[n][can] and + end in features[n - 1]): features[n - 1].remove(end) return [item for sublist in list(features.values()) for item in sublist] @@ -349,7 +349,7 @@ def _score_ngram_selection(self, examples, y, existing_text_features, clf = LogisticRegression(class_weight='balanced') no_ngrams_X = 
self._append_ngram_features( - examples, existing_text_features, max_ngrams) + examples, existing_text_features, max_ngrams) return np.mean(cross_val_score(clf, no_ngrams_X, y, cv=cv_splits)) @staticmethod diff --git a/rasa_nlu/model.py b/rasa_nlu/model.py index f557849149fc..9a343c886012 100644 --- a/rasa_nlu/model.py +++ b/rasa_nlu/model.py @@ -158,7 +158,7 @@ def _build_pipeline(cfg: RasaNLUModelConfig, return pipeline - def train(self, data: TrainingData, **kwargs: Any)-> 'Interpreter': + def train(self, data: TrainingData, **kwargs: Any) -> 'Interpreter': """Trains the underlying pipeline using the provided training data.""" self.training_data = data @@ -195,7 +195,7 @@ def persist(self, path: Text, persistor: Optional[Persistor] = None, project_name: Text = None, - fixed_model_name: Text = None)-> Text: + fixed_model_name: Text = None) -> Text: """Persist all components of the pipeline to the passed path. Returns the directory of the persisted model.""" diff --git a/rasa_nlu/persistor.py b/rasa_nlu/persistor.py index 215112234683..1628b5bdef5e 100644 --- a/rasa_nlu/persistor.py +++ b/rasa_nlu/persistor.py @@ -43,7 +43,7 @@ def persist(self, "found.".format(model_directory)) file_key, tar_path = self._compress( - model_directory, model_name, project) + model_directory, model_name, project) self._persist_tar(file_key, tar_path) def retrieve(self, @@ -122,7 +122,7 @@ def _decompress(compressed_path: Text, target_path: Text) -> None: with tarfile.open(compressed_path, "r:gz") as tar: tar.extractall( - target_path) # project dir will be created if it not exists + target_path) # project dir will be created if it not exists class AWSPersistor(Persistor): @@ -205,7 +205,7 @@ def list_models(self, project: Text) -> List[Text]: try: blob_iterator = self.bucket.list_blobs( - prefix=self._project_prefix(project)) + prefix=self._project_prefix(project)) return [self._project_and_model_from_filename(b.name)[1] for b in blob_iterator] except Exception as e: @@ -259,9 +259,9 @@ def __init__(self, super(AzurePersistor, self).__init__() self.blob_client = azureblob.BlockBlobService( - account_name=azure_account_name, - account_key=azure_account_key, - endpoint_suffix="core.windows.net") + account_name=azure_account_name, + account_key=azure_account_key, + endpoint_suffix="core.windows.net") self._ensure_container_exists(azure_container) self.container_name = azure_container @@ -276,8 +276,8 @@ def list_models(self, project: Text) -> List[Text]: try: blob_iterator = self.blob_client.list_blobs( - self.container_name, - prefix=self._project_prefix(project) + self.container_name, + prefix=self._project_prefix(project) ) return [self._project_and_model_from_filename(b.name)[1] for b in blob_iterator] @@ -290,8 +290,8 @@ def list_projects(self) -> List[Text]: try: # noinspection PyTypeChecker blob_iterator = self.blob_client.list_blobs( - self.container_name, - prefix=None + self.container_name, + prefix=None ) projects_set = {self._project_and_model_from_filename(b.name)[0] for b in blob_iterator} @@ -305,16 +305,16 @@ def _persist_tar(self, file_key: Text, tar_path: Text) -> None: """Uploads a model persisted in the `target_dir` to Azure.""" self.blob_client.create_blob_from_path( - self.container_name, - file_key, - tar_path + self.container_name, + file_key, + tar_path ) def _retrieve_tar(self, target_filename: Text) -> None: """Downloads a model that has previously been persisted to Azure.""" self.blob_client.get_blob_to_path( - self.container_name, - target_filename, - target_filename + 
self.container_name, + target_filename, + target_filename ) diff --git a/rasa_nlu/project.py b/rasa_nlu/project.py index 662da1106348..12d9fc4e097b 100644 --- a/rasa_nlu/project.py +++ b/rasa_nlu/project.py @@ -66,7 +66,7 @@ def _update_model_from_server(model_server: EndpointConfig, model_directory = tempfile.mkdtemp() new_model_fingerprint, filename = _pull_model_and_fingerprint( - model_server, model_directory, project.fingerprint) + model_server, model_directory, project.fingerprint) if new_model_fingerprint: model_name = _get_remote_model_name(filename) project.fingerprint = new_model_fingerprint @@ -296,13 +296,13 @@ def update_model_from_dir_and_unload_others(self, # noinspection PyUnusedLocal status = False - logger.debug('Loading model {} from directory {}'.format( - model_name, model_dir)) + logger.debug("Loading model '{}' from directory '{}'.".format( + model_name, model_dir)) self._loader_lock.acquire() try: interpreter = self._interpreter_for_model( - model_name, model_dir) + model_name, model_dir) self._models[model_name] = interpreter status = True finally: diff --git a/rasa_nlu/registry.py b/rasa_nlu/registry.py index 42cd43061243..6ba5704883ff 100644 --- a/rasa_nlu/registry.py +++ b/rasa_nlu/registry.py @@ -101,13 +101,13 @@ def get_component_class(component_name: Text) -> Type['Component']: return utils.class_from_module_path(component_name) except Exception: raise Exception( - "Failed to find component class for '{}'. Unknown " - "component name. Check your configured pipeline and make " - "sure the mentioned component is not misspelled. If you " - "are creating your own component, make sure it is either " - "listed as part of the `component_classes` in " - "`rasa_nlu.registry.py` or is a proper name of a class " - "in a module.".format(component_name)) + "Failed to find component class for '{}'. Unknown " + "component name. Check your configured pipeline and make " + "sure the mentioned component is not misspelled. 
If you " + "are creating your own component, make sure it is either " + "listed as part of the `component_classes` in " + "`rasa_nlu.registry.py` or is a proper name of a class " + "in a module.".format(component_name)) return registered_components[component_name] diff --git a/rasa_nlu/run.py b/rasa_nlu/run.py index 68fc95024769..926331613862 100644 --- a/rasa_nlu/run.py +++ b/rasa_nlu/run.py @@ -10,8 +10,8 @@ def create_argument_parser(): import argparse parser = argparse.ArgumentParser( - description='run a Rasa NLU model locally on the command line ' - 'for manual testing') + description='run a Rasa NLU model locally on the command line ' + 'for manual testing') parser.add_argument('-m', '--model', required=True, help="path to model") diff --git a/rasa_nlu/server.py b/rasa_nlu/server.py index d04ec40f08bb..fa5749bf508c 100644 --- a/rasa_nlu/server.py +++ b/rasa_nlu/server.py @@ -108,19 +108,17 @@ def decorated(*args, **kwargs): if '*' in self.cors_origins: request.setHeader('Access-Control-Allow-Origin', '*') request.setHeader( - 'Access-Control-Allow-Headers', - 'Content-Type') + 'Access-Control-Allow-Headers', 'Content-Type') request.setHeader( - 'Access-Control-Allow-Methods', - 'POST, GET, OPTIONS, PUT, DELETE') + 'Access-Control-Allow-Methods', + 'POST, GET, OPTIONS, PUT, DELETE') elif origin in self.cors_origins: request.setHeader('Access-Control-Allow-Origin', origin) request.setHeader( - 'Access-Control-Allow-Headers', - 'Content-Type') + 'Access-Control-Allow-Headers', 'Content-Type') request.setHeader( - 'Access-Control-Allow-Methods', - 'POST, GET, OPTIONS, PUT, DELETE') + 'Access-Control-Allow-Methods', + 'POST, GET, OPTIONS, PUT, DELETE') else: request.setResponseCode(403) return 'forbidden' @@ -191,7 +189,7 @@ def __init__(self, self._configure_logging(loglevel, logfile) self.default_model_config = self._load_default_config( - default_config_path) + default_config_path) self.data_router = data_router self._testing = testing @@ -228,7 +226,7 @@ def parse(self, request): request_params = decode_parameters(request) else: request_params = simplejson.loads( - request.content.read().decode('utf-8', 'strict')) + request.content.read().decode('utf-8', 'strict')) if 'query' in request_params: request_params['q'] = request_params.pop('query') @@ -236,7 +234,7 @@ def parse(self, request): if 'q' not in request_params: request.setResponseCode(404) dumped = json_to_string( - {"error": "Invalid parse parameter specified"}) + {"error": "Invalid parse parameter specified"}) returnValue(dumped) else: data = self.data_router.extract(request_params) @@ -244,7 +242,7 @@ def parse(self, request): request.setResponseCode(200) response = yield (self.data_router.parse(data) if self._testing else threads.deferToThread( - self.data_router.parse, data)) + self.data_router.parse, data)) returnValue(json_to_string(response)) except InvalidProjectError as e: request.setResponseCode(404) @@ -262,8 +260,8 @@ def version(self, request): request.setHeader('Content-Type', 'application/json') return json_to_string( - {'version': __version__, - 'minimum_compatible_version': MINIMUM_COMPATIBLE_VERSION} + {'version': __version__, + 'minimum_compatible_version': MINIMUM_COMPATIBLE_VERSION} ) @app.route("/status", methods=['GET', 'OPTIONS']) @@ -348,8 +346,8 @@ def train(self, request): request.setResponseCode(200) response = yield self.data_router.start_train_process( - data_file, project, - RasaNLUModelConfig(model_config), model_name) + data_file, project, + RasaNLUModelConfig(model_config), model_name) 
returnValue(json_to_string({'info': 'new model trained', 'model': response})) except MaxTrainingError as e: @@ -398,9 +396,8 @@ def unload_model(self, request): try: request.setResponseCode(200) response = self.data_router.unload_model( - params.get('project', - RasaNLUModelConfig.DEFAULT_PROJECT_NAME), - params.get('model') + params.get('project', RasaNLUModelConfig.DEFAULT_PROJECT_NAME), + params.get('model') ) return simplejson.dumps(response) except Exception as e: @@ -419,13 +416,13 @@ def unload_model(self, request): _endpoints = read_endpoints(cmdline_args.endpoints) router = DataRouter( - cmdline_args.path, - cmdline_args.max_training_processes, - cmdline_args.response_log, - cmdline_args.emulate, - cmdline_args.storage, - model_server=_endpoints.model, - wait_time_between_pulls=cmdline_args.wait_time_between_pulls + cmdline_args.path, + cmdline_args.max_training_processes, + cmdline_args.response_log, + cmdline_args.emulate, + cmdline_args.storage, + model_server=_endpoints.model, + wait_time_between_pulls=cmdline_args.wait_time_between_pulls ) if pre_load: logger.debug('Preloading....') @@ -434,13 +431,13 @@ def unload_model(self, request): router._pre_load(pre_load) rasa = RasaNLU( - router, - cmdline_args.loglevel, - cmdline_args.write, - cmdline_args.num_threads, - cmdline_args.token, - cmdline_args.cors, - default_config_path=cmdline_args.config + router, + cmdline_args.loglevel, + cmdline_args.write, + cmdline_args.num_threads, + cmdline_args.token, + cmdline_args.cors, + default_config_path=cmdline_args.config ) logger.info('Started http server on port %s' % cmdline_args.port) diff --git a/rasa_nlu/tokenizers/spacy_tokenizer.py b/rasa_nlu/tokenizers/spacy_tokenizer.py index dd8a0c62ef91..a16d7f2d6086 100644 --- a/rasa_nlu/tokenizers/spacy_tokenizer.py +++ b/rasa_nlu/tokenizers/spacy_tokenizer.py @@ -20,15 +20,15 @@ class SpacyTokenizer(Tokenizer, Component): def train(self, training_data: TrainingData, config: RasaNLUModelConfig, - **kwargs: Any)-> None: + **kwargs: Any) -> None: for example in training_data.training_examples: example.set("tokens", self.tokenize(example.get("spacy_doc"))) - def process(self, message: Message, **kwargs: Any)-> None: + def process(self, message: Message, **kwargs: Any) -> None: message.set("tokens", self.tokenize(message.get("spacy_doc"))) - def tokenize(self, doc: 'Doc')-> typing.List[Token]: + def tokenize(self, doc: 'Doc') -> typing.List[Token]: return [Token(t.text, t.idx) for t in doc] diff --git a/rasa_nlu/train.py b/rasa_nlu/train.py index f15bb5196401..e3b9be7eacf7 100644 --- a/rasa_nlu/train.py +++ b/rasa_nlu/train.py @@ -15,7 +15,7 @@ def create_argument_parser(): parser = argparse.ArgumentParser( - description='train a custom language parser') + description='train a custom language parser') parser.add_argument('-o', '--path', default="models/nlu/", diff --git a/rasa_nlu/training_data/formats/markdown.py b/rasa_nlu/training_data/formats/markdown.py index 56bd4b4713c4..524ea558721c 100644 --- a/rasa_nlu/training_data/formats/markdown.py +++ b/rasa_nlu/training_data/formats/markdown.py @@ -89,7 +89,7 @@ def _load_files(self, line): if match: fname = match.group(1) self.lookup_tables.append( - {"name": self.current_title, "elements": str(fname)}) + {"name": self.current_title, "elements": str(fname)}) def _parse_item(self, line): """Parses an md list item line based on the current section type.""" @@ -103,7 +103,7 @@ def _parse_item(self, line): self._add_synonym(item, self.current_title) elif self.current_section == REGEX: 
self.regex_features.append( - {"name": self.current_title, "pattern": item}) + {"name": self.current_title, "pattern": item}) elif self.current_section == LOOKUP: self._add_item_to_lookup(item) diff --git a/rasa_nlu/training_data/training_data.py b/rasa_nlu/training_data/training_data.py index e4ad1fdc5c1b..195ee752bdce 100644 --- a/rasa_nlu/training_data/training_data.py +++ b/rasa_nlu/training_data/training_data.py @@ -191,24 +191,24 @@ def train_test_split(self, test.extend(ex[n_train:]) data_train = TrainingData( - train, - entity_synonyms=self.entity_synonyms, - regex_features=self.regex_features, - lookup_tables=self.lookup_tables) + train, + entity_synonyms=self.entity_synonyms, + regex_features=self.regex_features, + lookup_tables=self.lookup_tables) data_test = TrainingData( - test, - entity_synonyms=self.entity_synonyms, - regex_features=self.regex_features, - lookup_tables=self.lookup_tables) + test, + entity_synonyms=self.entity_synonyms, + regex_features=self.regex_features, + lookup_tables=self.lookup_tables) return data_train, data_test def print_stats(self) -> None: logger.info("Training data stats: \n" + "\t- intent examples: {} ({} distinct intents)\n".format( - len(self.intent_examples), len(self.intents)) + + len(self.intent_examples), len(self.intents)) + "\t- Found intents: {}\n".format( - list_to_str(self.intents)) + + list_to_str(self.intents)) + "\t- entity examples: {} ({} distinct entities)\n".format( - len(self.entity_examples), len(self.entities)) + + len(self.entity_examples), len(self.entities)) + "\t- found entities: {}\n".format( - list_to_str(self.entities))) + list_to_str(self.entities))) diff --git a/rasa_nlu/utils/__init__.py b/rasa_nlu/utils/__init__.py index a09591dcd3a6..15490e470ce4 100644 --- a/rasa_nlu/utils/__init__.py +++ b/rasa_nlu/utils/__init__.py @@ -21,20 +21,20 @@ def add_logging_option_arguments(parser, default=logging.WARNING): # arguments for logging configuration parser.add_argument( - '--debug', - help="Print lots of debugging statements. " - "Sets logging level to DEBUG", - action="store_const", - dest="loglevel", - const=logging.DEBUG, - default=default, + '--debug', + help="Print lots of debugging statements. " + "Sets logging level to DEBUG", + action="store_const", + dest="loglevel", + const=logging.DEBUG, + default=default, ) parser.add_argument( - '-v', '--verbose', - help="Be verbose. Sets logging level to INFO", - action="store_const", - dest="loglevel", - const=logging.INFO, + '-v', '--verbose', + help="Be verbose. 
Sets logging level to INFO", + action="store_const", + dest="loglevel", + const=logging.INFO, ) @@ -330,11 +330,11 @@ def configure_colored_logging(loglevel: Text) -> None: level_styles = coloredlogs.DEFAULT_LEVEL_STYLES.copy() level_styles['debug'] = {} coloredlogs.install( - level=loglevel, - use_chroot=False, - fmt='%(asctime)s %(levelname)-8s %(name)s - %(message)s', - level_styles=level_styles, - field_styles=field_styles) + level=loglevel, + use_chroot=False, + fmt='%(asctime)s %(levelname)-8s %(name)s - %(message)s', + level_styles=level_styles, + field_styles=field_styles) def pycloud_unpickle(file_name: Text) -> Any: @@ -494,9 +494,7 @@ def request(self, @classmethod def from_dict(cls, data): - return EndpointConfig( - data.pop("url"), - **data) + return EndpointConfig(data.pop("url"), **data) def __eq__(self, other): if isinstance(self, type(other)): diff --git a/setup.cfg b/setup.cfg index 681b38148525..b75e577060f4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,14 +1,11 @@ -# pytest PEP8 configuration +# pytest pycodestyle configuration [tool:pytest] -pep8maxlinelength = 80 -pep8ignore = - docs/conf.py ALL - *.py E251 - *.py W503 - *.py E126 - -# ignoring W503: line break occurred before a binary operator -# ignoring E126: continuation line over-indented for hanging indent +codestyle_max_line_length = 80 +# ignoring W504: line break occurred after a binary operator +codestyle_ignore = + W504 +codestyle_exclude = + docs/conf.py [metadata] description-file = README.md diff --git a/setup.py b/setup.py index 936f013e2d1a..8b7f4a4dc9d8 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ tests_requires = [ "pytest~=3.3", - "pytest-pep8~=1.0", + "pytest-pycodestyle~=1.4", "pytest-cov~=2.5", "pytest-twisted<1.6", "treq~=17.8", @@ -50,7 +50,9 @@ ], 'tensorflow': ["sklearn-crfsuite~=0.3.6", "scipy~=1.1", - "tensorflow~=1.12" + "tensorflow~=1.12", + "keras-applications==1.0.6", + "keras-preprocessing==1.0.5" ], 'mitie': ["mitie"], } diff --git a/tests/base/test_components.py b/tests/base/test_components.py index 33cdb2a9f495..ffe8dc304629 100644 --- a/tests/base/test_components.py +++ b/tests/base/test_components.py @@ -80,10 +80,10 @@ def test_example_component(component_builder, tmpdir_factory): {"name": "tests.example_component.MyComponent"}]}) interpreter = utilities.interpreter_for( - component_builder, - data="./data/examples/rasa/demo-rasa.json", - path=tmpdir_factory.mktemp("projects").strpath, - config=conf) + component_builder, + data="./data/examples/rasa/demo-rasa.json", + path=tmpdir_factory.mktemp("projects").strpath, + config=conf) r = interpreter.parse("test") assert r is not None diff --git a/tests/base/test_evaluation.py b/tests/base/test_evaluation.py index cef8e14787b5..d6b53a5ef1ea 100644 --- a/tests/base/test_evaluation.py +++ b/tests/base/test_evaluation.py @@ -11,7 +11,7 @@ get_duckling_dimensions, known_duckling_dimensions, find_component, remove_duckling_extractors, drop_intents_below_freq, run_cv_evaluation, substitute_labels, IntentEvaluationResult, - evaluate_intents) + evaluate_intents, evaluate_entities) from rasa_nlu.evaluate import does_token_cross_borders from rasa_nlu.evaluate import align_entity_predictions from rasa_nlu.evaluate import determine_intersection @@ -20,6 +20,7 @@ from rasa_nlu.tokenizers import Token from rasa_nlu import utils import json +import os from rasa_nlu import training_data, config from tests import utilities @@ -257,10 +258,13 @@ def test_run_cv_evaluation(): assert len(entity_results.test['ner_crf']["F1-score"]) == n_folds 
-def test_evaluation_report(tmpdir_factory): +def test_intent_evaluation_report(tmpdir_factory): path = tmpdir_factory.mktemp("evaluation").strpath - report_filename = path + "report.json" + report_folder = os.path.join(path, "reports") + report_filename = os.path.join(report_folder, "intent_report.json") + + utils.create_dir(report_folder) intent_results = [ IntentEvaluationResult("", "restaurant_search", @@ -269,7 +273,7 @@ def test_evaluation_report(tmpdir_factory): "hello", 0.98765)] result = evaluate_intents(intent_results, - report_filename, + report_folder, successes_filename=None, errors_filename=None, confmat_filename=None, @@ -292,6 +296,32 @@ def test_evaluation_report(tmpdir_factory): assert result["predictions"][0] == prediction +def test_entity_evaluation_report(tmpdir_factory): + + path = tmpdir_factory.mktemp("evaluation").strpath + report_folder = os.path.join(path, "reports") + + mock_extractors = ["A", "B"] + report_filename_a = os.path.join(report_folder, "A_report.json") + report_filename_b = os.path.join(report_folder, "B_report.json") + + utils.create_dir(report_folder) + + result = evaluate_entities([EN_targets], + [EN_predicted], + [EN_tokens], + mock_extractors, + report_folder) + + report_a = json.loads(utils.read_file(report_filename_a)) + report_b = json.loads(utils.read_file(report_filename_b)) + + assert len(report_a) == 8 + assert report_a["datetime"]["support"] == 1.0 + assert report_b["macro avg"]["recall"] == 0.2 + assert result["A"]["accuracy"] == 0.75 + + def test_empty_intent_removal(): intent_results = [ IntentEvaluationResult("", "restaurant_search", @@ -308,7 +338,7 @@ def test_empty_intent_removal(): assert intent_results[0].message == "hello" -def test_evaluate_entities(): +def test_evaluate_entities_cv(): mock_extractors = ["A", "B"] result = align_entity_predictions(EN_targets, EN_predicted, EN_tokens, mock_extractors) diff --git a/tests/base/test_featurizers.py b/tests/base/test_featurizers.py index 855311957ac1..931b0cbfb5c5 100644 --- a/tests/base/test_featurizers.py +++ b/tests/base/test_featurizers.py @@ -44,14 +44,14 @@ def test_ngram_featurizer(spacy_nlp): greet = {"intent": "greet", "text_features": [0.5]} goodbye = {"intent": "goodbye", "text_features": [0.5]} labeled_sentences = [ - Message("heyheyheyhey", greet), - Message("howdyheyhowdy", greet), - Message("heyhey howdyheyhowdy", greet), - Message("howdyheyhowdy heyhey", greet), - Message("astalavistasista", goodbye), - Message("astalavistasista sistala", goodbye), - Message("sistala astalavistasista", goodbye), - ] * repetition_factor + Message("heyheyheyhey", greet), + Message("howdyheyhowdy", greet), + Message("heyhey howdyheyhowdy", greet), + Message("howdyheyhowdy heyhey", greet), + Message("astalavistasista", goodbye), + Message("astalavistasista sistala", goodbye), + Message("sistala astalavistasista", goodbye), + ] * repetition_factor for m in labeled_sentences: m.set("spacy_doc", spacy_nlp(m.text)) @@ -148,7 +148,7 @@ def test_spacy_featurizer_casing(spacy_nlp): assert np.allclose(vecs, vecs_capitalized, atol=1e-5), \ "Vectors are unequal for texts '{}' and '{}'".format( - e.text, e.text.capitalize()) + e.text, e.text.capitalize()) @pytest.mark.parametrize("sentence, expected", [ @@ -262,9 +262,9 @@ def test_count_vector_featurizer_using_tokens(tokens, expected): @pytest.mark.parametrize("sentence, expected", [ - ("ababab", [3, 3, 3, 2]), - ("ab ab ab", [2, 2, 3, 3, 3, 2]), - ("abc", [1, 1, 1, 1, 1]) + ("ababab", [3, 3, 3, 2]), + ("ab ab ab", [2, 2, 3, 3, 3, 2]), + ("abc", 
diff --git a/setup.py b/setup.py
index 936f013e2d1a..8b7f4a4dc9d8 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@

 tests_requires = [
     "pytest~=3.3",
-    "pytest-pep8~=1.0",
+    "pytest-pycodestyle~=1.4",
     "pytest-cov~=2.5",
     "pytest-twisted<1.6",
     "treq~=17.8",
@@ -50,7 +50,9 @@
     ],
     'tensorflow': ["sklearn-crfsuite~=0.3.6",
                    "scipy~=1.1",
-                   "tensorflow~=1.12"
+                   "tensorflow~=1.12",
+                   "keras-applications==1.0.6",
+                   "keras-preprocessing==1.0.5"
                    ],
     'mitie': ["mitie"],
 }
diff --git a/tests/base/test_components.py b/tests/base/test_components.py
index 33cdb2a9f495..ffe8dc304629 100644
--- a/tests/base/test_components.py
+++ b/tests/base/test_components.py
@@ -80,10 +80,10 @@ def test_example_component(component_builder, tmpdir_factory):
         {"name": "tests.example_component.MyComponent"}]})

     interpreter = utilities.interpreter_for(
-            component_builder,
-            data="./data/examples/rasa/demo-rasa.json",
-            path=tmpdir_factory.mktemp("projects").strpath,
-            config=conf)
+        component_builder,
+        data="./data/examples/rasa/demo-rasa.json",
+        path=tmpdir_factory.mktemp("projects").strpath,
+        config=conf)

     r = interpreter.parse("test")
     assert r is not None
diff --git a/tests/base/test_evaluation.py b/tests/base/test_evaluation.py
index cef8e14787b5..d6b53a5ef1ea 100644
--- a/tests/base/test_evaluation.py
+++ b/tests/base/test_evaluation.py
@@ -11,7 +11,7 @@
     get_duckling_dimensions, known_duckling_dimensions,
     find_component, remove_duckling_extractors, drop_intents_below_freq,
     run_cv_evaluation, substitute_labels, IntentEvaluationResult,
-    evaluate_intents)
+    evaluate_intents, evaluate_entities)
 from rasa_nlu.evaluate import does_token_cross_borders
 from rasa_nlu.evaluate import align_entity_predictions
 from rasa_nlu.evaluate import determine_intersection
@@ -20,6 +20,7 @@
 from rasa_nlu.tokenizers import Token
 from rasa_nlu import utils
 import json
+import os
 from rasa_nlu import training_data, config
 from tests import utilities

@@ -257,10 +258,13 @@ def test_run_cv_evaluation():
     assert len(entity_results.test['ner_crf']["F1-score"]) == n_folds

-def test_evaluation_report(tmpdir_factory):
+def test_intent_evaluation_report(tmpdir_factory):
     path = tmpdir_factory.mktemp("evaluation").strpath
-    report_filename = path + "report.json"
+    report_folder = os.path.join(path, "reports")
+    report_filename = os.path.join(report_folder, "intent_report.json")
+
+    utils.create_dir(report_folder)

     intent_results = [
         IntentEvaluationResult("", "restaurant_search",
                                "I am hungry", 0.12345),
         IntentEvaluationResult("greet", "greet",
@@ -269,7 +273,7 @@
                                "hello", 0.98765)]

     result = evaluate_intents(intent_results,
-                              report_filename,
+                              report_folder,
                               successes_filename=None,
                               errors_filename=None,
                               confmat_filename=None,
@@ -292,6 +296,32 @@
     assert result["predictions"][0] == prediction


+def test_entity_evaluation_report(tmpdir_factory):
+
+    path = tmpdir_factory.mktemp("evaluation").strpath
+    report_folder = os.path.join(path, "reports")
+
+    mock_extractors = ["A", "B"]
+    report_filename_a = os.path.join(report_folder, "A_report.json")
+    report_filename_b = os.path.join(report_folder, "B_report.json")
+
+    utils.create_dir(report_folder)
+
+    result = evaluate_entities([EN_targets],
+                               [EN_predicted],
+                               [EN_tokens],
+                               mock_extractors,
+                               report_folder)
+
+    report_a = json.loads(utils.read_file(report_filename_a))
+    report_b = json.loads(utils.read_file(report_filename_b))
+
+    assert len(report_a) == 8
+    assert report_a["datetime"]["support"] == 1.0
+    assert report_b["macro avg"]["recall"] == 0.2
+    assert result["A"]["accuracy"] == 0.75
+
+
 def test_empty_intent_removal():
     intent_results = [
         IntentEvaluationResult("", "restaurant_search",
@@ -308,7 +338,7 @@
     assert intent_results[0].message == "hello"


-def test_evaluate_entities():
+def test_evaluate_entities_cv():
     mock_extractors = ["A", "B"]
     result = align_entity_predictions(EN_targets, EN_predicted,
                                       EN_tokens, mock_extractors)
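As the updated tests show, `evaluate_intents` and `evaluate_entities` now take a report folder instead of a single report file and write `intent_report.json` plus one `<extractor>_report.json` per entity extractor into it. A sketch of reading the reports back after an evaluation run; the folder name and the `ner_crf` extractor file are illustrative:

import json
import os

report_folder = "reports"  # the folder handed to the evaluate functions
with open(os.path.join(report_folder, "intent_report.json")) as f:
    intent_report = json.load(f)
with open(os.path.join(report_folder, "ner_crf_report.json")) as f:
    entity_report = json.load(f)
# both reports follow scikit-learn's classification_report dict layout
print(intent_report["macro avg"]["f1-score"])
print(entity_report["macro avg"]["recall"])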
"food"}, + payload={"q": "food", "project": "test_project_spacy_sklearn"} ), ]) @pytest.inlineCallbacks @@ -128,14 +128,14 @@ def test_post_parse_specific_model(app): @pytest.mark.parametrize("response_test", [ ResponseTest( - "http://dummy-uri/parse", - {"error": "No project found with name 'default'."}, - payload={"q": "food"} + "http://dummy-uri/parse", + {"error": "No project found with name 'default'."}, + payload={"q": "food"} ), ResponseTest( - "http://dummy-uri/parse", - {"error": "No project found with name 'umpalumpa'."}, - payload={"q": "food", "project": "umpalumpa"} + "http://dummy-uri/parse", + {"error": "No project found with name 'umpalumpa'."}, + payload={"q": "food", "project": "umpalumpa"} ), ]) @pytest.inlineCallbacks diff --git a/tests/base/test_server.py b/tests/base/test_server.py index 0d2fc187647a..5e516cc427e5 100644 --- a/tests/base/test_server.py +++ b/tests/base/test_server.py @@ -65,25 +65,25 @@ def test_version(app): @pytest.mark.parametrize("response_test", [ ResponseTest( - "http://dummy-uri/parse?q=hello", - {'project': 'default', 'entities': [], 'model': 'fallback', - 'intent': {'confidence': 1.0, 'name': 'greet'}, 'text': 'hello'} + "http://dummy-uri/parse?q=hello", + {'project': 'default', 'entities': [], 'model': 'fallback', + 'intent': {'confidence': 1.0, 'name': 'greet'}, 'text': 'hello'} ), ResponseTest( - "http://dummy-uri/parse?query=hello", - {'project': 'default', 'entities': [], 'model': 'fallback', - 'intent': {'confidence': 1.0, 'name': 'greet'}, 'text': 'hello'} + "http://dummy-uri/parse?query=hello", + {'project': 'default', 'entities': [], 'model': 'fallback', + 'intent': {'confidence': 1.0, 'name': 'greet'}, 'text': 'hello'} ), ResponseTest( - "http://dummy-uri/parse?q=hello ńöñàśçií", - {'project': 'default', 'entities': [], 'model': 'fallback', - 'intent': {'confidence': 1.0, 'name': 'greet'}, - 'text': 'hello ńöñàśçií'} + "http://dummy-uri/parse?q=hello ńöñàśçií", + {'project': 'default', 'entities': [], 'model': 'fallback', + 'intent': {'confidence': 1.0, 'name': 'greet'}, + 'text': 'hello ńöñàśçií'} ), ResponseTest( - "http://dummy-uri/parse?q=", - {'project': 'default', 'entities': [], 'model': 'fallback', - 'intent': {'confidence': 0.0, 'name': None}, 'text': ''} + "http://dummy-uri/parse?q=", + {'project': 'default', 'entities': [], 'model': 'fallback', + 'intent': {'confidence': 0.0, 'name': None}, 'text': ''} ), ]) @pytest.inlineCallbacks @@ -99,25 +99,25 @@ def test_get_parse(app, response_test): @pytest.mark.parametrize("response_test", [ ResponseTest( - "http://dummy-uri/parse", - {'project': 'default', 'entities': [], 'model': 'fallback', - 'intent': {'confidence': 1.0, 'name': 'greet'}, - 'text': 'hello'}, - payload={"q": "hello"} + "http://dummy-uri/parse", + {'project': 'default', 'entities': [], 'model': 'fallback', + 'intent': {'confidence': 1.0, 'name': 'greet'}, + 'text': 'hello'}, + payload={"q": "hello"} ), ResponseTest( - "http://dummy-uri/parse", - {'project': 'default', 'entities': [], 'model': 'fallback', - 'intent': {'confidence': 1.0, 'name': 'greet'}, - 'text': 'hello'}, - payload={"query": "hello"} + "http://dummy-uri/parse", + {'project': 'default', 'entities': [], 'model': 'fallback', + 'intent': {'confidence': 1.0, 'name': 'greet'}, + 'text': 'hello'}, + payload={"query": "hello"} ), ResponseTest( - "http://dummy-uri/parse", - {'project': 'default', 'entities': [], 'model': 'fallback', - 'intent': {'confidence': 1.0, 'name': 'greet'}, - 'text': 'hello ńöñàśçií'}, - payload={"q": "hello ńöñàśçií"} + 
"http://dummy-uri/parse", + {'project': 'default', 'entities': [], 'model': 'fallback', + 'intent': {'confidence': 1.0, 'name': 'greet'}, + 'text': 'hello ńöñàśçií'}, + payload={"q": "hello ńöñàśçií"} ), ]) @pytest.inlineCallbacks diff --git a/tests/base/test_training_data.py b/tests/base/test_training_data.py index d93cc6332069..00f33dbb5f0b 100644 --- a/tests/base/test_training_data.py +++ b/tests/base/test_training_data.py @@ -73,7 +73,7 @@ def test_dialogflow_data(): def test_lookup_table_json(): lookup_fname = 'data/test/lookup_tables/plates.txt' td_lookup = training_data.load_data( - 'data/test/lookup_tables/lookup_table.json') + 'data/test/lookup_tables/lookup_table.json') assert td_lookup.lookup_tables[0]['name'] == 'plates' assert td_lookup.lookup_tables[0]['elements'] == lookup_fname assert td_lookup.lookup_tables[1]['name'] == 'drinks' @@ -84,7 +84,7 @@ def test_lookup_table_json(): def test_lookup_table_md(): lookup_fname = 'data/test/lookup_tables/plates.txt' td_lookup = training_data.load_data( - 'data/test/lookup_tables/lookup_table.md') + 'data/test/lookup_tables/lookup_table.md') assert td_lookup.lookup_tables[0]['name'] == 'plates' assert td_lookup.lookup_tables[0]['elements'] == lookup_fname assert td_lookup.lookup_tables[1]['name'] == 'drinks' @@ -108,8 +108,8 @@ def test_demo_data(filename): 'vegg': 'vegetarian', 'veggie': 'vegetarian'} - assert td.regex_features == [{"name": "greet", "pattern": "hey[^\s]*"}, - {"name": "zipcode", "pattern": "[0-9]{5}"}] + assert td.regex_features == [{"name": "greet", "pattern": r"hey[^\s]*"}, + {"name": "zipcode", "pattern": r"[0-9]{5}"}] @pytest.mark.parametrize("filename", ['data/examples/rasa/demo-rasa.md']) @@ -144,12 +144,12 @@ def test_data_merging(files): def test_markdown_single_sections(): td_regex_only = training_data.load_data( - 'data/test/markdown_single_sections/regex_only.md') + 'data/test/markdown_single_sections/regex_only.md') assert (td_regex_only.regex_features == - [{"name": "greet", "pattern": "hey[^\s]*"}]) + [{"name": "greet", "pattern": r"hey[^\s]*"}]) td_syn_only = training_data.load_data( - 'data/test/markdown_single_sections/synonyms_only.md') + 'data/test/markdown_single_sections/synonyms_only.md') assert td_syn_only.entity_synonyms == {'Chines': 'chinese', 'Chinese': 'chinese'} diff --git a/tests/base/test_utils.py b/tests/base/test_utils.py index b48bf13c52fc..bd0e9f41cc79 100644 --- a/tests/base/test_utils.py +++ b/tests/base/test_utils.py @@ -60,12 +60,12 @@ def test_ordered(): @pytest.mark.parametrize( - ("model_dir", "expected"), - [("test_models/test_model_mitie/model_20170628-002704", True), - ("test_models/test_model_mitie_sklearn/model_20170628-002712", True), - ("test_models/test_model_spacy_sklearn/model_20170628-002705", True), - ("test_models/", False), - ("test_models/nonexistent_for_sure_123", False)]) + ("model_dir", "expected"), + [("test_models/test_model_mitie/model_20170628-002704", True), + ("test_models/test_model_mitie_sklearn/model_20170628-002712", True), + ("test_models/test_model_spacy_sklearn/model_20170628-002705", True), + ("test_models/", False), + ("test_models/nonexistent_for_sure_123", False)]) def test_is_model_dir(model_dir, expected): assert is_model_dir(model_dir) == expected @@ -111,20 +111,20 @@ def test_is_url(): def test_endpoint_config(): endpoint = EndpointConfig( - "https://abc.defg/", - params={"A": "B"}, - headers={"X-Powered-By": "Rasa"}, - basic_auth={"username": "user", - "password": "pass"}, - token="mytoken", - token_name="letoken" + 
"https://abc.defg/", + params={"A": "B"}, + headers={"X-Powered-By": "Rasa"}, + basic_auth={"username": "user", + "password": "pass"}, + token="mytoken", + token_name="letoken" ) httpretty.register_uri( - httpretty.POST, - 'https://abc.defg/test', - status=500, - body='') + httpretty.POST, + 'https://abc.defg/test', + status=500, + body='') httpretty.enable() endpoint.request("post", subpath="test", diff --git a/tests/training/test_train.py b/tests/training/test_train.py index db8fa867988b..b67a7b0b3330 100644 --- a/tests/training/test_train.py +++ b/tests/training/test_train.py @@ -69,10 +69,10 @@ def test_all_components_are_in_at_least_one_test_pipeline(): def test_train_model(pipeline_template, component_builder, tmpdir): _config = utilities.base_test_conf(pipeline_template) (trained, _, persisted_path) = train.do_train( - _config, - path=tmpdir.strpath, - data=DEFAULT_DATA_PATH, - component_builder=component_builder) + _config, + path=tmpdir.strpath, + data=DEFAULT_DATA_PATH, + component_builder=component_builder) assert trained.pipeline loaded = Interpreter.load(persisted_path, component_builder) assert loaded.pipeline @@ -90,16 +90,16 @@ def test_random_seed(component_builder, tmpdir): random_seed=1) # first run (trained_a, _, persisted_path_a) = train.do_train( - _config, - path=tmpdir.strpath + "_a", - data=DEFAULT_DATA_PATH, - component_builder=component_builder) + _config, + path=tmpdir.strpath + "_a", + data=DEFAULT_DATA_PATH, + component_builder=component_builder) # second run (trained_b, _, persisted_path_b) = train.do_train( - _config, - path=tmpdir.strpath + "_b", - data=DEFAULT_DATA_PATH, - component_builder=component_builder) + _config, + path=tmpdir.strpath + "_b", + data=DEFAULT_DATA_PATH, + component_builder=component_builder) loaded_a = Interpreter.load(persisted_path_a, component_builder) loaded_b = Interpreter.load(persisted_path_b, component_builder) result_a = loaded_a.parse("hello")["intent"]["confidence"] @@ -113,10 +113,10 @@ def test_train_model_on_test_pipelines(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) (trained, _, persisted_path) = train.do_train( - _config, - path=tmpdir.strpath, - data=DEFAULT_DATA_PATH, - component_builder=component_builder) + _config, + path=tmpdir.strpath, + data=DEFAULT_DATA_PATH, + component_builder=component_builder) assert trained.pipeline loaded = Interpreter.load(persisted_path, component_builder) assert loaded.pipeline @@ -129,10 +129,10 @@ def test_train_model_on_test_pipelines(language, pipeline, def test_train_model_noents(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) (trained, _, persisted_path) = train.do_train( - _config, - path=tmpdir.strpath, - data="./data/test/demo-rasa-noents.json", - component_builder=component_builder) + _config, + path=tmpdir.strpath, + data="./data/test/demo-rasa-noents.json", + component_builder=component_builder) assert trained.pipeline loaded = Interpreter.load(persisted_path, component_builder) assert loaded.pipeline @@ -145,11 +145,11 @@ def test_train_model_noents(language, pipeline, component_builder, tmpdir): def test_train_model_multithread(language, pipeline, component_builder, tmpdir): _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language}) (trained, _, persisted_path) = train.do_train( - _config, - path=tmpdir.strpath, - data=DEFAULT_DATA_PATH, - component_builder=component_builder, - num_threads=2) + _config, + 
diff --git a/tests/base/test_utils.py b/tests/base/test_utils.py
index b48bf13c52fc..bd0e9f41cc79 100644
--- a/tests/base/test_utils.py
+++ b/tests/base/test_utils.py
@@ -60,12 +60,12 @@ def test_ordered():

 @pytest.mark.parametrize(
-        ("model_dir", "expected"),
-        [("test_models/test_model_mitie/model_20170628-002704", True),
-         ("test_models/test_model_mitie_sklearn/model_20170628-002712", True),
-         ("test_models/test_model_spacy_sklearn/model_20170628-002705", True),
-         ("test_models/", False),
-         ("test_models/nonexistent_for_sure_123", False)])
+    ("model_dir", "expected"),
+    [("test_models/test_model_mitie/model_20170628-002704", True),
+     ("test_models/test_model_mitie_sklearn/model_20170628-002712", True),
+     ("test_models/test_model_spacy_sklearn/model_20170628-002705", True),
+     ("test_models/", False),
+     ("test_models/nonexistent_for_sure_123", False)])
 def test_is_model_dir(model_dir, expected):
     assert is_model_dir(model_dir) == expected
@@ -111,20 +111,20 @@ def test_is_url():
 def test_endpoint_config():
     endpoint = EndpointConfig(
-            "https://abc.defg/",
-            params={"A": "B"},
-            headers={"X-Powered-By": "Rasa"},
-            basic_auth={"username": "user",
-                        "password": "pass"},
-            token="mytoken",
-            token_name="letoken"
+        "https://abc.defg/",
+        params={"A": "B"},
+        headers={"X-Powered-By": "Rasa"},
+        basic_auth={"username": "user",
+                    "password": "pass"},
+        token="mytoken",
+        token_name="letoken"
     )

     httpretty.register_uri(
-            httpretty.POST,
-            'https://abc.defg/test',
-            status=500,
-            body='')
+        httpretty.POST,
+        'https://abc.defg/test',
+        status=500,
+        body='')

     httpretty.enable()
     endpoint.request("post", subpath="test",
diff --git a/tests/training/test_train.py b/tests/training/test_train.py
index db8fa867988b..b67a7b0b3330 100644
--- a/tests/training/test_train.py
+++ b/tests/training/test_train.py
@@ -69,10 +69,10 @@ def test_all_components_are_in_at_least_one_test_pipeline():
 def test_train_model(pipeline_template, component_builder, tmpdir):
     _config = utilities.base_test_conf(pipeline_template)
     (trained, _, persisted_path) = train.do_train(
-            _config,
-            path=tmpdir.strpath,
-            data=DEFAULT_DATA_PATH,
-            component_builder=component_builder)
+        _config,
+        path=tmpdir.strpath,
+        data=DEFAULT_DATA_PATH,
+        component_builder=component_builder)
     assert trained.pipeline
     loaded = Interpreter.load(persisted_path, component_builder)
     assert loaded.pipeline
@@ -90,16 +90,16 @@ def test_random_seed(component_builder, tmpdir):
                                        random_seed=1)
     # first run
     (trained_a, _, persisted_path_a) = train.do_train(
-            _config,
-            path=tmpdir.strpath + "_a",
-            data=DEFAULT_DATA_PATH,
-            component_builder=component_builder)
+        _config,
+        path=tmpdir.strpath + "_a",
+        data=DEFAULT_DATA_PATH,
+        component_builder=component_builder)
     # second run
     (trained_b, _, persisted_path_b) = train.do_train(
-            _config,
-            path=tmpdir.strpath + "_b",
-            data=DEFAULT_DATA_PATH,
-            component_builder=component_builder)
+        _config,
+        path=tmpdir.strpath + "_b",
+        data=DEFAULT_DATA_PATH,
+        component_builder=component_builder)
     loaded_a = Interpreter.load(persisted_path_a, component_builder)
     loaded_b = Interpreter.load(persisted_path_b, component_builder)
     result_a = loaded_a.parse("hello")["intent"]["confidence"]
@@ -113,10 +113,10 @@ def test_train_model_on_test_pipelines(language, pipeline,
                                        component_builder, tmpdir):
     _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
     (trained, _, persisted_path) = train.do_train(
-            _config,
-            path=tmpdir.strpath,
-            data=DEFAULT_DATA_PATH,
-            component_builder=component_builder)
+        _config,
+        path=tmpdir.strpath,
+        data=DEFAULT_DATA_PATH,
+        component_builder=component_builder)
     assert trained.pipeline
     loaded = Interpreter.load(persisted_path, component_builder)
     assert loaded.pipeline
@@ -129,10 +129,10 @@ def test_train_model_noents(language, pipeline, component_builder, tmpdir):
     _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
     (trained, _, persisted_path) = train.do_train(
-            _config,
-            path=tmpdir.strpath,
-            data="./data/test/demo-rasa-noents.json",
-            component_builder=component_builder)
+        _config,
+        path=tmpdir.strpath,
+        data="./data/test/demo-rasa-noents.json",
+        component_builder=component_builder)
     assert trained.pipeline
     loaded = Interpreter.load(persisted_path, component_builder)
     assert loaded.pipeline
@@ -145,11 +145,11 @@ def test_train_model_noents(language, pipeline, component_builder, tmpdir):
 def test_train_model_multithread(language, pipeline, component_builder,
                                  tmpdir):
     _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
     (trained, _, persisted_path) = train.do_train(
-            _config,
-            path=tmpdir.strpath,
-            data=DEFAULT_DATA_PATH,
-            component_builder=component_builder,
-            num_threads=2)
+        _config,
+        path=tmpdir.strpath,
+        data=DEFAULT_DATA_PATH,
+        component_builder=component_builder,
+        num_threads=2)
     assert trained.pipeline
     loaded = Interpreter.load(persisted_path, component_builder)
     assert loaded.pipeline
@@ -162,19 +162,19 @@ def test_train_model_empty_pipeline(component_builder):
     _config = utilities.base_test_conf(pipeline_template=None)
     with pytest.raises(ValueError):
         train.do_train(
-                _config,
-                data=DEFAULT_DATA_PATH,
-                component_builder=component_builder)
+            _config,
+            data=DEFAULT_DATA_PATH,
+            component_builder=component_builder)


 def test_train_named_model(component_builder, tmpdir):
     _config = utilities.base_test_conf("keyword")
     (trained, _, persisted_path) = train.do_train(
-            _config,
-            path=tmpdir.strpath,
-            project="my_keyword_model",
-            data=DEFAULT_DATA_PATH,
-            component_builder=component_builder)
+        _config,
+        path=tmpdir.strpath,
+        project="my_keyword_model",
+        data=DEFAULT_DATA_PATH,
+        component_builder=component_builder)
     assert trained.pipeline
     normalized_path = os.path.dirname(os.path.normpath(persisted_path))
     # should be saved in a dir named after a project
@@ -186,9 +186,9 @@ def test_handles_pipeline_with_non_existing_component(component_builder):
     _config.pipeline.append({"name": "my_made_up_component"})
     with pytest.raises(Exception) as execinfo:
         train.do_train(
-                _config,
-                data=DEFAULT_DATA_PATH,
-                component_builder=component_builder)
+            _config,
+            data=DEFAULT_DATA_PATH,
+            component_builder=component_builder)
     assert "Failed to find component" in str(execinfo.value)
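Finally, all of the reformatted training calls above follow the same pattern, so a condensed sketch of the API as the tests use it may help; the config and data paths are illustrative stand-ins for the test fixtures:

from rasa_nlu import config, train
from rasa_nlu.model import Interpreter

# illustrative paths; the tests build their config via utilities.base_test_conf
nlu_config = config.load("sample_configs/config_spacy.yml")
(trainer, _, persisted_path) = train.do_train(
    nlu_config,
    path="models",
    data="data/examples/rasa/demo-rasa.json")
interpreter = Interpreter.load(persisted_path)
print(interpreter.parse("hello")["intent"])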