diff --git a/auchann/align_words.py b/auchann/align_words.py index 71aaced..08bdf85 100644 --- a/auchann/align_words.py +++ b/auchann/align_words.py @@ -14,6 +14,17 @@ class TokenOperation(Enum): class TokenCorrection: + """Represents a way in which a (group of) token(s) can be represented + in a CHAT annotation. + + Params: + insert: token(s) to potentially replace a token or be inserted + in the annotation (derived from correction string) + remove: token(s) to potentially be removed or replaced in the + annotation (derived from the transcript string) + operation: INSERT, REPLACE, REMOVE, or COPY + is_filler/is_fragment: True if removed token is a filler or fragment + """ insert: List[Optional[str]] remove: List[Optional[str]] operation: TokenOperation @@ -64,6 +75,7 @@ def __str__(self): return f'UNKNOWN OPERATION {self.operation}' def __str__remove(self): + """Adds special annotations for specific remove operations""" remove = ' '.join(self.remove) if self.is_filler: return f'&-{remove}' @@ -417,6 +429,7 @@ def align_split(self, transcript_offset+1, correction_offset+lookahead) distance = sum(len(token) for token in correction_tokens) - len(transcript_token) + distance += len(correction_tokens) - 1 # number of spaces added corrections += prepend_correction( correction, diff --git a/unit-tests/test_align.py b/unit-tests/test_align.py index 7bf0827..66934d0 100644 --- a/unit-tests/test_align.py +++ b/unit-tests/test_align.py @@ -6,7 +6,7 @@ def test_replace(): correction_line = "doet zij even de armen weg" expected_chat_line = "doet zij even de armen wes [: weg]" - assertAlign(transcript_line, correction_line, expected_chat_line) + assert_align(transcript_line, correction_line, expected_chat_line) def test_remove(): @@ -14,15 +14,18 @@ def test_remove(): correction_line = "alleen dit" expected_chat_line = "alleen dit [/] dit" - assertAlign(transcript_line, correction_line, expected_chat_line) + assert_align(transcript_line, correction_line, expected_chat_line) def
test_insert(): - transcript_line = "magge zien" - correction_line = "mag ik zien" - expected_chat_line = "magge [: mag] 0ik zien" + data = [ + ("magge zien", "mag ik zien", "magge [: mag] 0ik zien"), + ("dit is huis", "dit is het huis", "dit is 0het huis"), + ("dit is huis ja", "dit is het huis ja", "dit is 0het huis ja") + ] - assertAlign(transcript_line, correction_line, expected_chat_line) + for transcript_line, correction_line, expected_chat_line in data: + assert_align(transcript_line, correction_line, expected_chat_line) def test_repetition(): @@ -30,7 +33,7 @@ def test_repetition(): correction_line = "toen kwam hij bij een weiland" expected_chat_line = "toen kwam hij bij een [/] een weiland" - assertAlign(transcript_line, correction_line, expected_chat_line) + assert_align(transcript_line, correction_line, expected_chat_line) def test_error_detection(): @@ -38,7 +41,7 @@ def test_error_detection(): correction_line = "het meisje sliep thuis" expected_chat_line = "de [: het] [* s:r:gc:art] meisje slaapte [: sliep] [* m] thuis" - assertAlign(transcript_line, correction_line, expected_chat_line) + assert_align(transcript_line, correction_line, expected_chat_line) def test_split(): @@ -67,10 +70,10 @@ def test_multi_word(): ] for transcript_line, correction_line, expected_chat_line in data: - assertAlign(transcript_line, correction_line, expected_chat_line) + assert_align(transcript_line, correction_line, expected_chat_line) -def assertAlign(transcript_line: str, correction_line: str, expected_chat_line: str): +def assert_align(transcript_line: str, correction_line: str, expected_chat_line: str): settings = AlignmentSettings() def detect_error(original: str, correction: str): if original == "slaapte" and correction == "sliep":