-
Notifications
You must be signed in to change notification settings - Fork 63
/
parsing.py
45 lines (32 loc) · 1.44 KB
/
parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import re
# Character class listing the characters that get backslash-escaped before a
# piece of text is embedded in a regular expression.
BAD_CHARS_FOR_REGEX_REGEX = re.compile(r"[-\/\\^$*+?.()|[\]{}]")


def _sanitize_string_for_use_in_a_regex(string: str) -> str:
    '''
    Returns a copy of `string` in which every character matched by
    `BAD_CHARS_FOR_REGEX_REGEX` is preceded by a backslash, so the result can
    be embedded inside a regexp.
    '''
    # In the template, the doubled backslash emits one literal backslash and
    # the `g<0>` group reference re-inserts the character that was matched.
    backslash_prefixed_match = r"\\\g<0>"
    return re.sub(BAD_CHARS_FOR_REGEX_REGEX, backslash_prefixed_match, string)
def parse_messages_from_str(string: str, names: list[str]) -> list[str]:
    '''
    Given a big string containing raw chat history, this function attempts to
    parse it out into a list where each item is an individual message.

    A message starts at any line that begins with one of `names` followed by
    a colon, and runs until the next such line or the end of `string`.

    :param string: Raw chat history.
    :param names: Speaker names that may open a message. Names are matched
        literally, so regex metacharacters inside a name are safe.
    :returns: The messages in order of appearance, each stripped of
        surrounding whitespace. When fewer than two speaker prefixes are
        found, the whole stripped input is returned as a single item;
        otherwise any text before the first speaker prefix is discarded.
    '''
    # `re.escape` quotes each name so it is matched as literal text rather
    # than interpreted as a pattern.
    escaped_names = "|".join(re.escape(name) for name in names)
    speaker_regex = re.compile(rf"^({escaped_names}): ?", re.MULTILINE)

    message_start_indexes = [
        match.start() for match in speaker_regex.finditer(string)
    ]

    if len(message_start_indexes) < 2:
        # Zero or one speaker prefix: treat the whole input as one message.
        return [string.strip()]

    # Each message ends where the next one begins, and the final message ends
    # at the end of the input. Including that last boundary is what keeps the
    # trailing message from being dropped.
    message_end_indexes = message_start_indexes[1:] + [len(string)]
    return [
        string[start_idx:end_idx].strip()
        for start_idx, end_idx in zip(message_start_indexes,
                                      message_end_indexes)
    ]
def serialize_chat_history(history: list[str]) -> str:
    '''
    Flattens a structured chat history into one string, placing each entry of
    `history` on its own line.
    '''
    line_separator = "\n"
    return line_separator.join(history)