Add Claude Code and Gemini transcript support for context importer.#160
Add Claude Code and Gemini transcript support for context importer. #160 — Ankit-Kotnala wants to merge 2 commits into XortexAI:main from their fork.
Conversation
There was a problem hiding this comment.
Code Review
This pull request centralizes transcript parsing logic into a new utility module, src/utils/transcripts.py, and removes redundant implementations from server.py and src/api/routes/memory.py. The new shared parser adds support for Claude Code (JSONL) and Gemini formats while improving the filtering of non-message content like tool calls and thinking blocks. Feedback includes suggestions to improve the Cursor parser by accumulating multiple assistant blocks, removing a redundant check in the JSON record pairing logic to better handle mixed messages, and ensuring tool markdown is stripped from Antigravity transcripts for consistency.
def _parse_cursor_transcript(text: str) -> list[ParsedMessagePair]:
    """Parse a Cursor-exported markdown transcript into message pairs.

    Cursor exports separate turns with ``---`` rules; each section opens
    with a ``**User**``, ``**Cursor**`` or ``**Assistant**`` marker.
    Returns a list of ``{"user_query", "agent_response"}`` dicts.
    """
    pairs: list[ParsedMessagePair] = []
    sections = text.split("---")

    # Skip the export header section ("Exported on ...") when present.
    start_idx = 0
    if sections and "Exported on" in sections[0]:
        start_idx = 1

    current_user_query: str | None = None

    for section in sections[start_idx:]:
        section = section.strip()
        if not section:
            continue

        if section.startswith("**User**"):
            content = section.replace("**User**", "", 1).strip()
            # _append_user_text (defined elsewhere in the module) presumably
            # concatenates consecutive user chunks -- TODO confirm.
            current_user_query = _append_user_text(current_user_query, content)
        elif section.startswith("**Cursor**") or section.startswith("**Assistant**"):
            content = (
                section.replace("**Cursor**", "", 1)
                .replace("**Assistant**", "", 1)
                .strip()
            )
            # NOTE(review): only the first assistant section per turn is
            # captured here; later assistant sections for the same turn are
            # dropped (see review comment below).
            if current_user_query:
                pairs.append(
                    {
                        "user_query": current_user_query,
                        "agent_response": content,
                    }
                )
                current_user_query = None

    return pairs
There was a problem hiding this comment.
The Cursor transcript parser currently only captures the first assistant block per turn. For consistency with the Antigravity and JSON parsers, it should accumulate multiple assistant blocks (if they exist) and flush them as a single response when a new user turn begins.
def _parse_cursor_transcript(text: str) -> list[ParsedMessagePair]:
"""Parse a Cursor-exported markdown transcript into message pairs."""
pairs: list[ParsedMessagePair] = []
sections = text.split("---")
start_idx = 1 if sections and "Exported on" in sections[0] else 0
current_user_query: str | None = None
assistant_chunks: list[str] = []
def flush_pair() -> None:
nonlocal current_user_query, assistant_chunks
if current_user_query and assistant_chunks:
pairs.append(
{
"user_query": current_user_query,
"agent_response": "\n\n".join(assistant_chunks).strip(),
}
)
assistant_chunks = []
for section in sections[start_idx:]:
section = section.strip()
if not section:
continue
if section.startswith("**User**"):
if assistant_chunks:
flush_pair()
current_user_query = None
content = section.replace("**User**", "", 1).strip()
current_user_query = _append_user_text(current_user_query, content)
elif section.startswith("**Cursor**") or section.startswith("**Assistant**"):
content = (
section.replace("**Cursor**", "", 1)
.replace("**Assistant**", "", 1)
.strip()
)
if current_user_query:
assistant_chunks.append(content)
flush_pair()
return pairs| if role in _USER_ROLES and _record_has_tool_result(record): | ||
| continue |
There was a problem hiding this comment.
This check is redundant because _record_text (via _extract_text) already filters out tool-related blocks. If a message consists entirely of tool results, text will be empty and the record will be skipped at line 280. Removing this allows mixed messages (text + tool results) to still contribute their text content to the transcript.
def _parse_antigravity_transcript(text: str) -> list[ParsedMessagePair]:
    """Parse an Antigravity-exported markdown transcript into message pairs.

    Antigravity exports use ``### User Input`` and ``### Planner Response``
    headings; content follows each heading in the next split block.
    Returns a list of ``{"user_query", "agent_response"}`` dicts.
    """
    pairs: list[ParsedMessagePair] = []
    # Split on "### ..." headings; the capture group keeps each heading so
    # the result alternates between headings and their content blocks.
    blocks = re.split(r"(?m)^(###\s+.+)$", text)

    current_user_query: str | None = None
    planner_chunks: list[str] = []

    for i, block in enumerate(blocks):
        block = block.strip()
        if not block:
            continue

        if re.match(r"###\s+User Input", block, re.IGNORECASE):
            # Heading for a new user turn: flush any completed pair first.
            if current_user_query and planner_chunks:
                pairs.append(
                    {
                        "user_query": current_user_query,
                        "agent_response": "\n\n".join(planner_chunks).strip(),
                    }
                )
                planner_chunks = []
            current_user_query = None
        elif re.match(r"###\s+Planner Response", block, re.IGNORECASE):
            # Heading only; its content arrives in the next block.
            continue
        elif i > 0:
            prev_heading = blocks[i - 1].strip() if i >= 1 else ""
            if re.match(r"###\s+User Input", prev_heading, re.IGNORECASE):
                if current_user_query and planner_chunks:
                    # A pair is pending: flush it and start a new query.
                    pairs.append(
                        {
                            "user_query": current_user_query,
                            "agent_response": "\n\n".join(planner_chunks).strip(),
                        }
                    )
                    planner_chunks = []
                    current_user_query = block
                else:
                    current_user_query = _append_user_text(current_user_query, block)
            elif re.match(r"###\s+Planner Response", prev_heading, re.IGNORECASE):
                if block:
                    planner_chunks.append(block)

    # Flush the trailing pair, if any.
    if current_user_query and planner_chunks:
        pairs.append(
            {
                "user_query": current_user_query,
                "agent_response": "\n\n".join(planner_chunks).strip(),
            }
        )

    return pairs
There was a problem hiding this comment.
The Antigravity parser should also apply _strip_tool_markdown to its content blocks. While Antigravity exports usually separate tool steps into different Planner Response blocks, those blocks themselves might still contain the raw tool markdown that should be cleaned before ingestion.
def _parse_antigravity_transcript(text: str) -> list[ParsedMessagePair]:
    """Parse an Antigravity-exported markdown transcript into message pairs.

    Content blocks are cleaned with ``_strip_tool_markdown`` before use so
    raw tool markdown inside Planner Response blocks is not ingested.
    """
    pairs: list[ParsedMessagePair] = []
    # Split on "### ..." headings; the capture group keeps each heading so
    # the result alternates between headings and their content blocks.
    blocks = re.split(r"(?m)^(###\s+.+)$", text)

    current_user_query: str | None = None
    planner_chunks: list[str] = []

    for i, block in enumerate(blocks):
        block = block.strip()
        if not block:
            continue
        if re.match(r"###\s+User Input", block, re.IGNORECASE):
            # Heading for a new user turn: flush any completed pair first.
            if current_user_query and planner_chunks:
                pairs.append(
                    {
                        "user_query": current_user_query,
                        "agent_response": "\n\n".join(planner_chunks).strip(),
                    }
                )
                planner_chunks = []
            current_user_query = None
        elif re.match(r"###\s+Planner Response", block, re.IGNORECASE):
            # Heading only; its content arrives in the next block.
            continue
        elif i > 0:
            prev_heading = blocks[i - 1].strip() if i >= 1 else ""
            # Strip raw tool markdown from content blocks before ingestion.
            content = _strip_tool_markdown(block)
            if not content:
                continue
            if re.match(r"###\s+User Input", prev_heading, re.IGNORECASE):
                if current_user_query and planner_chunks:
                    # A pair is pending: flush it and start a new query.
                    pairs.append(
                        {
                            "user_query": current_user_query,
                            "agent_response": "\n\n".join(planner_chunks).strip(),
                        }
                    )
                    planner_chunks = []
                    current_user_query = content
                else:
                    current_user_query = _append_user_text(current_user_query, content)
            elif re.match(r"###\s+Planner Response", prev_heading, re.IGNORECASE):
                planner_chunks.append(content)

    # Flush the trailing pair, if any.
    if current_user_query and planner_chunks:
        pairs.append(
            {
                "user_query": current_user_query,
                "agent_response": "\n\n".join(planner_chunks).strip(),
            }
        )
    return pairs
Summary
Fixes #155.
Adds deterministic transcript parsing support for additional `/context upload` formats: Claude Code `/chat share` JSON exports and Gemini `/chat share` Markdown exports. Also keeps existing Cursor and Antigravity behavior by moving transcript parsing into a shared helper used by both the production memory route and the legacy server entrypoint.
Changes
- Adds `src/utils/transcripts.py` as the shared transcript parser module.
- Updates `/v1/memory/parse_transcript` to use the shared parser.
- Updates the `server.py` parsing wrapper to use the same shared parser.