Skip to content

Commit

Permalink
Split replace penalty (#21)
Browse files Browse the repository at this point in the history
Added splitting distance: spaces count as inserts

---------

Co-authored-by: Sheean Spoel <s.j.j.spoel@uu.nl>
  • Loading branch information
Meesch and oktaal committed May 31, 2023
1 parent 481309b commit 67ef6bb
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 10 deletions.
13 changes: 13 additions & 0 deletions auchann/align_words.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@ class TokenOperation(Enum):


class TokenCorrection:
"""Represents a way in which a (group of) token(s) can be represented
in a CHAT annotation.
Params:
insert: token(s) to potentially replace a token or be inserted
in the annotation (derived from correction string)
remove: token(s) to potentially be removed or replaced in the
annotation (derived from the transcript string)
operation: INSERT, REPLACE, REMOVE, or COPY
is_filler/is_fragment: True if removed token is a filler or fragment
"""
insert: List[Optional[str]]
remove: List[Optional[str]]
operation: TokenOperation
Expand Down Expand Up @@ -64,6 +75,7 @@ def __str__(self):
return f'UNKNOWN OPERATION {self.operation}'

def __str__remove(self):
"""Adds special annotations for specific remove operations"""
remove = ' '.join(self.remove)
if self.is_filler:
return f'&-{remove}'
Expand Down Expand Up @@ -417,6 +429,7 @@ def align_split(self,
transcript_offset+1, correction_offset+lookahead)
distance = sum(len(token)
for token in correction_tokens) - len(transcript_token)
distance += len(correction_tokens) - 1 # number of spaces added

corrections += prepend_correction(
correction,
Expand Down
23 changes: 13 additions & 10 deletions unit-tests/test_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,39 +6,42 @@ def test_replace():
correction_line = "doet zij even de armen weg"
expected_chat_line = "doet zij even de armen wes [: weg]"

assertAlign(transcript_line, correction_line, expected_chat_line)
assert_align(transcript_line, correction_line, expected_chat_line)


def test_remove():
transcript_line = "alleen dit dit"
correction_line = "alleen dit"
expected_chat_line = "alleen dit [/] dit"

assertAlign(transcript_line, correction_line, expected_chat_line)
assert_align(transcript_line, correction_line, expected_chat_line)


def test_insert():
transcript_line = "magge zien"
correction_line = "mag ik zien"
expected_chat_line = "magge [: mag] 0ik zien"
data = [
("magge zien", "mag ik zien", "magge [: mag] 0ik zien"),
("dit is huis", "dit is het huis", "dit is 0het huis"),
("dit is huis ja", "dit is het huis ja", "dit is 0het huis ja")
]

assertAlign(transcript_line, correction_line, expected_chat_line)
for transcript_line, correction_line, expected_chat_line in data:
assert_align(transcript_line, correction_line, expected_chat_line)


def test_repetition():
transcript_line = "toen kwam hij bij een een weiland"
correction_line = "toen kwam hij bij een weiland"
expected_chat_line = "toen kwam hij bij een [/] een weiland"

assertAlign(transcript_line, correction_line, expected_chat_line)
assert_align(transcript_line, correction_line, expected_chat_line)


def test_error_detection():
transcript_line = "de meisje slaapte thuis"
correction_line = "het meisje sliep thuis"
expected_chat_line = "de [: het] [* s:r:gc:art] meisje slaapte [: sliep] [* m] thuis"

assertAlign(transcript_line, correction_line, expected_chat_line)
assert_align(transcript_line, correction_line, expected_chat_line)


def test_split():
Expand Down Expand Up @@ -67,10 +70,10 @@ def test_multi_word():
]

for transcript_line, correction_line, expected_chat_line in data:
assertAlign(transcript_line, correction_line, expected_chat_line)
assert_align(transcript_line, correction_line, expected_chat_line)


def assertAlign(transcript_line: str, correction_line: str, expected_chat_line: str):
def assert_align(transcript_line: str, correction_line: str, expected_chat_line: str):
settings = AlignmentSettings()
def detect_error(original: str, correction: str):
if original == "slaapte" and correction == "sliep":
Expand Down

0 comments on commit 67ef6bb

Please sign in to comment.