Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Rasa Entities Role and Groups #52

Open
wants to merge 28 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5b1cd72
added encoding utf-8 to io
tomgun132 Mar 25, 2020
eef5212
added rasa YAML adapter
tomgun132 Aug 17, 2020
5cddba4
Merge pull request #1 from tomgun132/yml_output
tomgun132 Mar 8, 2021
02e72f5
add annotation after slot in parser
tomgun132 Mar 8, 2021
95bdfa4
add annotation check in parser
tomgun132 Mar 9, 2021
70af8c8
add rolegroup annotation check
tomgun132 Mar 10, 2021
479cc32
add comment
tomgun132 Mar 10, 2021
ceb9da3
added choice to choose unitref
tomgun132 Mar 10, 2021
3d34c25
added new slotref class for rolegroup
tomgun132 Mar 10, 2021
f453135
added role group to Entity unit
tomgun132 Mar 10, 2021
7a34afe
fixed entity repr with rolegroup
tomgun132 Mar 10, 2021
5e62dcf
added role,group,value format to adapter
tomgun132 Mar 15, 2021
2d9ee0b
edited example to test role,group
tomgun132 Mar 15, 2021
4309095
added ruamel.yml to reqs
tomgun132 Mar 15, 2021
b52beef
added encoding utf-8 to io
tomgun132 Mar 25, 2020
aba7c75
added rasa YAML adapter
tomgun132 Aug 17, 2020
d4fb7b2
Merge branch 'master' of origin
tomgun132 Mar 15, 2021
62a233e
fixed wrong logic
tomgun132 Mar 26, 2021
398147e
changed str format to support older python
tomgun132 May 24, 2021
61d23bc
add base_file reading for yaml adapter
tomgun132 May 24, 2021
d80f300
add ruamel to requirements in setup.py
tomgun132 Jun 21, 2021
3883d52
remove unnecessary unit_type assignment
tomgun132 Jun 21, 2021
c8e7855
add rolegroupreference unit testing
tomgun132 Jun 21, 2021
41eddc4
add test system for rolegroup
tomgun132 Jun 21, 2021
2dfdd0e
added missing quotation to entity text
tomgun132 Jun 21, 2021
9b814ad
Merge pull request #2 from tomgun132/rasa_role_group
tomgun132 Jun 27, 2021
29f7109
changed OrderedDict to ruamel CommentedMap
tomgun132 Aug 12, 2021
b2be5f7
Merge pull request #3 from tomgun132/rasa_role_group
tomgun132 Aug 12, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 1 addition & 2 deletions chatette/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ def _add_optional_arguments(argument_parser):
argument_parser.add_argument(
    "-a", "--adapter", dest="adapter", required=False,
    type=str, default="rasa",
    # The names listed here must be the ones `create_adapter` accepts:
    # the YAML adapter is selected with 'rasayml' or 'rasa-yml'.
    help="Write adapter. Possible values: "
         "['rasa', 'jsonl', 'rasamd' or 'rasa-md', 'rasayml' or 'rasa-yml']"
)
argument_parser.add_argument(
"--base-file", dest="base_filepath",
Expand Down
2 changes: 1 addition & 1 deletion chatette/adapters/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def write(self, output_directory, examples, synonyms):
self.__get_file_name(
batch, output_directory, single_file_output
)
with io.open(output_file_path, 'w') as output_file:
with io.open(output_file_path, 'w', encoding='utf-8') as output_file:
self._write_batch(output_file, batch)

@classmethod
Expand Down
3 changes: 3 additions & 0 deletions chatette/adapters/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from chatette.adapters.jsonl import JsonListAdapter
from chatette.adapters.rasa import RasaAdapter
from chatette.adapters.rasa_md import RasaMdAdapter
from chatette.adapters.rasa_yml import RasaYMLAdapter


def create_adapter(adapter_name, base_filepath=None):
Expand All @@ -23,6 +24,8 @@ def create_adapter(adapter_name, base_filepath=None):
return RasaAdapter(base_filepath)
elif adapter_name in ('rasa-md', 'rasamd'):
return RasaMdAdapter(base_filepath)
elif adapter_name in ('rasa-yml', 'rasayml'):
return RasaYMLAdapter(base_filepath)
elif adapter_name == 'jsonl':
return JsonListAdapter(base_filepath)
raise ValueError("Unknown adapter was selected.")
10 changes: 8 additions & 2 deletions chatette/adapters/rasa.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,18 @@ def _write_batch(self, output_file_handle, batch):

def prepare_example(self, example):
def entity_to_rasa(entity):
return {
entity_dict = {
"entity": entity.slot_name,
"value": entity.value,
"start": entity._start_index,
"end": entity._start_index + entity._len,
}
if entity.role is not None:
entity_dict['role'] = entity.role
if entity.group is not None:
entity_dict['group'] = entity.group

return entity_dict

return {
"intent": example.intent_name,
Expand All @@ -68,7 +74,7 @@ def _get_base_to_extend(self):
if self._base_file_contents is None:
if self._base_filepath is None:
return self._get_empty_base()
with io.open(self._base_filepath, 'r') as base_file:
with io.open(self._base_filepath, 'r', encoding='utf-8') as base_file:
self._base_file_contents = json.load(base_file)
self.check_base_file_contents()
return self._base_file_contents
Expand Down
15 changes: 11 additions & 4 deletions chatette/adapters/rasa_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,18 @@ def prepare_example(self, example):
)
result = example.text[:]
for entity in sorted_entities:
entity_annotation_text = ']{"entity": "' + entity.slot_name
entity_text = result[entity._start_index:entity._start_index + entity._len]
if entity_text != entity.value:
entity_annotation_text += '", "value": "{}'.format(entity.value)
if entity.role is not None:
entity_annotation_text += '", "role": "{}'.format(entity.role)
if entity.group is not None:
entity_annotation_text += '", "group": "{}'.format(entity.group)
result = \
result[:entity._start_index] + "[" + \
result[entity._start_index:entity._start_index + entity._len] + \
"](" + entity.slot_name + ")" + \
result[entity._start_index + entity._len:]
entity_text + entity_annotation_text + '"}' + \
result[entity._start_index + entity._len:] # New rasa entity format
return result


Expand All @@ -105,7 +112,7 @@ def _get_base_to_extend(self):
if self._base_file_contents is None:
if self._base_filepath is None:
return self._get_empty_base()
with io.open(self._base_filepath, 'r') as base_file:
with io.open(self._base_filepath, 'r', encoding='utf-8') as base_file:
self._base_file_contents = ''.join(base_file.readlines())
self.check_base_file_contents()
return self._base_file_contents
Expand Down
189 changes: 189 additions & 0 deletions chatette/adapters/rasa_yml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
import os
import io
import ruamel.yaml as yaml
from ruamel.yaml.scalarstring import DoubleQuotedScalarString
from ruamel.yaml.error import YAMLError
from ruamel.yaml.constructor import DuplicateKeyError
from ruamel.yaml.comments import CommentedMap as OrderedDict
from chatette.adapters._base import Adapter
from chatette.utils import append_to_list_in_dict, cast_to_unicode

YAML_VERSION = (1, 2)

def intent_dict_to_list_of_dict(data):
    """
    Converts a mapping `{intent_name: [example_text]}` into a list with one
    dict per intent, of the form `{"intent": name, "examples": text}`.

    `text` holds one line per example, each prefixed with `- `, and always
    ends with a line break (so an intent without any example gets a single
    line break). The entries follow the iteration order of `data`.
    """
    return [
        {
            "intent": intent_name,
            "examples":
                '\n'.join('- ' + example for example in examples) + '\n',
        }
        for intent_name, examples in data.items()
    ]

def fix_yaml_loader():
    # type: () -> None
    """
    Ensure that any string read by yaml is represented as unicode.

    Registers a constructor for the YAML string tag on both `yaml.Loader`
    and `yaml.SafeLoader`; that constructor simply delegates to
    `construct_scalar`.

    Code adapted from the Rasa YAML reader.
    """
    def construct_yaml_str(self, node):
        # Override the default string handling function
        # to always return unicode objects
        return self.construct_scalar(node)

    for loader_class in (yaml.Loader, yaml.SafeLoader):
        loader_class.add_constructor(
            "tag:yaml.org,2002:str", construct_yaml_str
        )


class RasaYMLAdapter(Adapter):
    """
    Adapter that writes the generated examples and synonyms as a YAML
    document with a `version` key and an `nlu` list (Rasa YAML training
    data layout).
    """

    def __init__(self, base_filepath=None):
        super(RasaYMLAdapter, self).__init__(base_filepath, None)
        # Parsed contents of the base file, loaded lazily by
        # `_get_base_to_extend`.
        self._base_file_contents = None

    @classmethod
    def _get_file_extension(cls):
        return "yml"

    def __get_file_name(self, batch, output_directory, single_file):
        """
        Returns the path `<output_directory>/nlu.yml` when a single file is
        written. Raises a `ValueError` otherwise.
        """
        # NOTE(review): the leading double underscore makes Python mangle
        # this name to `_RasaYMLAdapter__get_file_name`; confirm that the
        # base `Adapter` really dispatches to this method.
        if not single_file:
            raise ValueError(
                "Tried to generate several files with Rasa YAML adapter."
            )
        return os.path.join(
            output_directory, "nlu." + self._get_file_extension()
        )

    def _write_batch(self, output_file_handle, batch):
        """
        Dumps the examples and synonyms of `batch` as YAML into
        `output_file_handle`, on top of the base file contents (if any).
        """
        data = self._get_base_to_extend()

        # Group the annotated example texts by intent name.
        examples_per_intent = dict()
        for example in batch.examples:
            append_to_list_in_dict(
                examples_per_intent,
                example.intent_name, self.prepare_example(example)
            )

        nlu_entries = intent_dict_to_list_of_dict(examples_per_intent)
        nlu_entries.extend(self.__format_synonyms(batch.synonyms))
        # NOTE(review): this replaces whatever was under `nlu` in the base
        # file rather than appending to it -- confirm this is intended.
        data['nlu'] = nlu_entries
        data = cast_to_unicode(data)

        # `walk_tree` rewrites the strings of `data` in place before the
        # dump (ruamel documents it as turning multi-line strings into
        # literal block scalars).
        yaml.scalarstring.walk_tree(data)
        # NOTE(review): `round_trip_dump` is reportedly deprecated in recent
        # ruamel.yaml releases -- check against the pinned version.
        yaml.round_trip_dump(
            data, output_file_handle,
            default_flow_style=False, allow_unicode=True
        )

    def prepare_example(self, example):
        """
        Returns the text of `example` where each entity is annotated as
        `[entity text]{"entity": "slot", ...}`.

        The annotation gets a `value` key when the entity value differs from
        the annotated text, and `role` / `group` keys when they are set.
        An example without entities is returned as its plain text.
        """
        # Local import: `json` is only needed to serialize the annotations.
        import json

        if not example.entities:
            return example.text

        # Annotate from the last entity to the first one, so that inserting
        # an annotation never shifts the offsets of the entities that still
        # have to be processed.
        entities_from_the_end = sorted(
            example.entities,
            key=lambda entity: entity._start_index,
            reverse=True
        )

        result = example.text[:]
        for entity in entities_from_the_end:
            start = entity._start_index
            end = start + entity._len
            entity_text = result[start:end]

            fields = [("entity", entity.slot_name)]
            if entity_text != entity.value:
                fields.append(("value", entity.value))
            if entity.role is not None:
                fields.append(("role", entity.role))
            if entity.group is not None:
                fields.append(("group", entity.group))

            # Serialize each value with `json.dumps` so that quotes and
            # backslashes are escaped and the annotation stays valid JSON;
            # `ensure_ascii=False` keeps non-ASCII characters as they are.
            annotation = ', '.join(
                json.dumps(key) + ': ' + json.dumps(value, ensure_ascii=False)
                for key, value in fields
            )
            result = \
                result[:start] + "[" + entity_text + "]" + \
                "{" + annotation + "}" + result[end:]  # New rasa entity format
        return result

    @classmethod
    def __format_synonyms(cls, synonyms):
        # {str: [str]} -> [{"synonym": str, "examples": str}]
        # Slots with a single value are skipped; `examples` holds one
        # `- ` prefixed line per value.
        return [
            {
                "synonym": slot_name,
                "examples":
                    '\n'.join('- ' + s for s in synonyms[slot_name]) + '\n'
            }
            for slot_name in synonyms
            if len(synonyms[slot_name]) > 1
        ]

    def _read_yaml(self, content):
        """Parses the YAML text `content` and returns the loaded data."""
        fix_yaml_loader()
        yaml_parser = yaml.YAML(typ='safe')
        yaml_parser.version = YAML_VERSION
        yaml_parser.preserve_quotes = True
        # This option has to be set on the parser instance: an attribute
        # set on the `yaml` module is never read by the parser.
        yaml_parser.allow_duplicate_keys = False

        return yaml_parser.load(content)

    def _get_base_to_extend(self):
        """
        Returns the data the output is built upon: the (cached) contents of
        the base file, or an empty base if no base file was provided.
        Raises a `YamlSyntaxException` if the base file cannot be parsed.
        """
        if self._base_file_contents is not None:
            return self._base_file_contents
        if self._base_filepath is None:
            return self._get_empty_base()

        with io.open(self._base_filepath, 'r', encoding='utf-8') as base_file:
            try:
                self._base_file_contents = self._read_yaml(base_file.read())
            except (YAMLError, DuplicateKeyError) as e:
                raise YamlSyntaxException(self._base_filepath, e)
        self.check_base_file_contents()
        return self._base_file_contents

    def _get_empty_base(self):
        """Returns a new base mapping: version "2.0" and an empty `nlu`."""
        base = OrderedDict()
        base['version'] = DoubleQuotedScalarString('2.0')
        base['nlu'] = list()
        return base

    def check_base_file_contents(self):
        """
        Checks that `self._base_file_contents` contains well formatted NLU dictionary.
        Throws a `SyntaxError` if the data is incorrect.
        """
        if self._base_file_contents is None:
            return
        if not isinstance(self._base_file_contents, dict):
            # Drop the invalid contents so that they are not reused.
            self._base_file_contents = None
            raise SyntaxError(
                "Couldn't load valid data from base file '" + \
                self._base_filepath + "'"
            )
        if "nlu" not in self._base_file_contents:
            self._base_file_contents = None
            raise SyntaxError(
                "Expected 'nlu' as a root of base file '" + \
                self._base_filepath + "'")


class YamlSyntaxException(Exception):
    """
    Raised when a YAML file can not be parsed properly due to a syntax error.

    Code adapted from `rasa.shared.exceptions.YamlSyntaxException`.
    """

    def __init__(self, filename, underlying_yaml_exception):
        # Path of the file that failed to parse (may be falsy if unknown).
        self.filename = filename
        # Exception raised by the YAML parser (may be falsy if unknown).
        self.underlying_yaml_exception = underlying_yaml_exception

    def __str__(self):
        """
        Builds the message: which file failed, the text of the underlying
        exception (if any) and a hint on how to validate the file.
        """
        if self.filename:
            exception_text = "Failed to read '{}'.".format(self.filename)
        else:
            exception_text = "Failed to read YAML."

        if self.underlying_yaml_exception:
            # Clear these attributes on the wrapped exception before it is
            # rendered (side effect: the wrapped exception is modified).
            self.underlying_yaml_exception.warn = None
            self.underlying_yaml_exception.note = None
            exception_text += " {}".format(self.underlying_yaml_exception)

        if self.filename:
            # The content is parsed from a string, hence the placeholder:
            # replace it by the actual file name.
            exception_text = exception_text.replace(
                'in "<unicode string>"', 'in "{}"'.format(self.filename)
            )

        exception_text += (
            "\n\nYou can use https://yamlchecker.com/ to validate the "
            "YAML syntax of your file."
        )
        return exception_text
9 changes: 8 additions & 1 deletion chatette/parsing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from future.utils import with_metaclass

from chatette.units.modifiable.choice import Choice
from chatette.units.modifiable.unit_reference import UnitReference
from chatette.units.modifiable.unit_reference import UnitReference, SlotRoleGroupReference
from chatette.units.modifiable.definitions.alias import AliasDefinition
from chatette.units.modifiable.definitions.slot import SlotDefinition
from chatette.units.modifiable.definitions.intent import IntentDefinition
Expand Down Expand Up @@ -91,6 +91,7 @@ def __init__(self):
self.identifier = None
self.variation = None
self.arg_value = None
self.slot_rolegroup = None

def _check_information(self):
super(UnitRefBuilder, self)._check_information()
Expand All @@ -108,6 +109,12 @@ def _build_modifiers_repr(self):

def create_concrete(self):
self._check_information()
if self.slot_rolegroup is not None:
return SlotRoleGroupReference(
self.identifier, self.type,
self.leading_space, self._build_modifiers_repr(),
self.slot_rolegroup
)
return UnitReference(
self.identifier, self.type,
self.leading_space, self._build_modifiers_repr()
Expand Down
15 changes: 14 additions & 1 deletion chatette/parsing/lexing/rule_unit_ref.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
extract_identifier, \
CASE_GEN_SYM, UNIT_END_SYM

from chatette.parsing.lexing.rule_annotation import RuleAnnotation
from chatette.parsing.lexing.rule_unit_start import RuleUnitStart
from chatette.parsing.lexing.rule_variation import RuleVariation
from chatette.parsing.lexing.rule_rand_gen import RuleRandGen
Expand Down Expand Up @@ -55,11 +56,13 @@ def _apply_strategy(self, **kwargs):
"using character '" + UNIT_END_SYM + "')."
return False

is_slot = False
# TODO maybe making a function for this would be useful
if self._tokens[0].type == TerminalType.alias_ref_start:
unit_end_type = TerminalType.alias_ref_end
elif self._tokens[0].type == TerminalType.slot_ref_start:
unit_end_type = TerminalType.slot_ref_end
is_slot = True
elif self._tokens[0].type == TerminalType.intent_ref_start:
unit_end_type = TerminalType.intent_ref_end
else: # Should never happen
Expand All @@ -72,5 +75,15 @@ def _apply_strategy(self, **kwargs):
self._next_index += 1
self._update_furthest_matched_index()
self._tokens.append(LexicalToken(unit_end_type, UNIT_END_SYM))


# This is for adding new rasa training mode that has role and group entity
# Reference: https://rasa.com/docs/rasa/nlu-training-data/#entities-roles-and-groups
annotation_rule = RuleAnnotation(self._text, self._next_index)

# ? Should we raise error if RuleAnnotation doesn't match, i.e. wrong pattern
if is_slot and annotation_rule.matches():
self._next_index = annotation_rule.get_next_index_to_match()
self._update_furthest_matched_index()
self._tokens.extend(annotation_rule.get_lexical_tokens())

return True
2 changes: 1 addition & 1 deletion chatette/parsing/line_count_file_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class LineCountFileWrapper(object):

def __init__(self, filepath, mode='r'):
self.name = cast_to_unicode(filepath)
self.f = io.open(filepath, mode)
self.f = io.open(filepath, mode, encoding='utf-8')
self.line_nb = 0

def close(self):
Expand Down