In [1]:
import os
import json
from llm import get_llm_answers
import re
from typing import List, Tuple

def tokenize_code_with_lines(code: str) -> List[Tuple[int, int, str, int]]:
    """
    使用正则分词，直接在原始代码上通过 finditer() 获取匹配位置，
    并保留换行符，让 LLM 能识别多行结构。
    返回形式: [(start_offset, end_offset, token_text, line_number), ...]
    """
    token_pattern = (
        r'[A-Za-z_]\w*|'           # 标识符
        r'[0-9]+(?:\.[0-9]+)?|'    # 整数和浮点数
        r'"(?:\\.|[^"\\])*"|'      # 双引号字符串(支持转义)
        r"'(?:\\.|[^'\\])*'|"      # 单引号字符串(支持转义)
        r'`[^`]*`|'                # 反引号字符串(如Go/JS模板字符串)
        r'#.*|'                    # Python风格单行注释
        r'//.*|'                   # C++/Java风格单行注释
        r'/\*[\s\S]*?\*/|'         # C风格多行注释
        r'"""[\s\S]*?"""|'         # Python三引号多行字符串
        r"'''[\s\S]*?'''|"         # Python三引号多行字符串
        r'\\[ntr]|'                # 常见转义字符
        r'\$\{[^}]*\}|'            # 字符串插值语法
        r'\n|\r|\t|'               # 换行/回车/制表符
        r'\S'                      # 其他任意非空白字符
    )

    tokens_with_offset = []
    lines = code.splitlines(keepends=True)
    current_line = 1
    current_pos = 0
    
    for match in re.finditer(token_pattern, code, re.MULTILINE | re.DOTALL):
        tk = match.group(0)
        start_offset, end_offset = match.span()
        
        # 计算当前token所在行号
        while current_line <= len(lines) and current_pos + len(lines[current_line-1]) <= start_offset:
            current_pos += len(lines[current_line-1])
            current_line += 1
            
        tokens_with_offset.append((start_offset, end_offset, tk, current_line))

    return tokens_with_offset

def get_ast_prompt(tokens, language):
    allowed_types = [
        "ABSTRACT", "AS", "BOOLEAN", "BREAK", "CASE", "CATCH", "CHAR", "CLASS", "CONST", "CONTINUE",
        "DO", "ELSE", "ENUM", "ERROR", "EXTEND", "FALSE", "FINALLY", "FOR", "FOREIGN", "FROM", 
        "FUNC", "IF", "IMPORT", "IN", "INIT", "INOUT", "INTERFACE", "INTNATIVE", "IS", "LET",
        "MACRO", "MAIN", "MATCH", "MUT", "OPEN", "OPERATOR", "OVERRIDE", "PACKAGE", "PRIVATE",
        "PROP", "PROTECTED", "PUBLIC", "QUOTE", "REDEF", "RETURN", "SEALED", "SPAWN", "STATIC",
        "STRUCT", "SUPER", "SYNCHRONIZED", "THIS", "THROW", "TRUE", "TRY", "TYPE", "UINTNATIVE",
        "UNIT", "UNSAFE", "VAR", "WHERE", "WHILE", "argumentList", "arrayLiteral", "arrowType",
        "assignmentExpression", "binaryExpression", "block", "body", "booleanLiteral",
        "breakExpression", "builtinFunction", "callExpression", "caseBody", "charLangTypes",
        "characterLiteral", "classDefinition", "classType", "collectionLiteral", "comment",
        "constantPattern", "continueExpression", "dollarIdentifier", "element", "elements",
        "enumBody", "enumDefinition", "enumPattern", "enumPatternParameters", "escapeSeq",
        "exceptionTypePattern", "extendBody", "extendDefinition", "extendMemberDeclaration",
        "fieldExpression", "floatLiteral", "forInExpression", "foreignDeclaration",
        "foreignMemberDeclaration", "functionDefinition", "functionParameters", "genericConstraints",
        "genericsType", "identifier", "ifExpression", "ifLetExpression", "importAll", "importContent",
        "importList", "importSpecified", "incDecExpression", "initBody", "initialize",
        "integerLiteral", "interfaceBody", "interfaceDefinition", "interfaceMemberDeclaration",
        "lambdaExpression", "lambdaParameter", "lambdaParameters", "lineStringContent",
        "lineStringExpression", "lineStringLiteral", "macroAttrExpr", "macroDecl", "macroDefinition",
        "macroExpression", "macroInputExprWithParens", "macroInputExprWithoutParens",
        "macroWithAttrParam", "macroWithoutAttrParam", "mainDefinition", "matchCase",
        "matchExpression", "memberDeclaration", "modifiers", "multiLineStringContent",
        "multiLineStringExpression", "multiLineStringLiteral", "multilineRawStringLiteral",
        "operatorFunctionDefinition", "overloadedOperators", "packageHeader", "packageNameIdentifier",
        "parameter", "parameterList", "parenthesizedExpression", "patternGuard", "prefixType",
        "primaryInit", "propertyBody", "propertyDefinition", "propertyMemberDeclaration",
        "quoteClose", "quoteExpression", "quoteOpen", "quoteParameters", "quoteParametersToken",
        "quoteToken", "rangeExpression", "resourceSpecification", "resourceSpecifications",
        "returnExpression", "sourceFile", "spawnExpression", "stringLiteral", "structDefinition",
        "subscriptExpression", "superInterfaces", "synchronizedExpression", "throwExpression",
        "tripleQuoteClose", "tripleQuoteOpen", "tryExpression", "tupleLiteral", "tuplePattern",
        "tupleType", "typeAlias", "typeArguments", "typeExpression", "typeIdentifier",
        "typeParameters", "typePattern", "unaryExpression", "unitLiteral", "unsafeExpression",
        "upperBounds", "userType", "varBindingPattern", "variableDeclaration", "variableModifiers",
        "whileExpression", "wildcardPattern"
    ]
    allowed_types_str = ", ".join(allowed_types)

    prompt = f"""## Task Objective
Convert the provided {language} code into a maximally detailed Abstract Syntax Tree (AST) with atomic-level decomposition. The AST must reach the smallest parsable units per language grammar.

## Node Expansion Rules
1. **MANDATORY Decomposition** to these atomic units:
   - Literals (integerLiteral/stringLiteral/booleanLiteral/etc.)
   - Operators (operator='+', '-', etc.)
   - Identifiers (variable/function names)
   - Type annotations
   - Individual parameters in parameter lists
   - Sub-expressions in complex expressions

2. **STRICTLY FORBIDDEN** Merging:
   - Compound expressions must split into operator+left+right structure
   - Function parameters must be individual nodes
   - Multi-element statements (e.g., comma-separated imports) require separate nodes

## Type Constraints
VALID NODE TYPES ({len(allowed_types)} allowed):
{allowed_types_str}

- Root node MUST be type=module (exactly one)
- Control flow requires exact typing (ifStatement/forStatement/etc.)
- Expression types must specify subcategories (binaryExpression/assignmentExpression/etc.)

## Structural Validation
Your output MUST satisfy:
1. Parent-Child Containment: parent.start_token ≤ child.start_token AND parent.end_token ≥ child.end_token
2. Sibling Order: Sequential token ranges without overlap
3. Leaf Nodes: Must have start_token == end_token AND empty children array

## Required Output
ONLY output raw JSON with this structure:
{{
  "type": "module",
  "start_token": 0,
  "end_token": 42,
  "children": [
    {{
      "type": "functionDefinition",
      "start_token": 3,
      "end_token": 40,
      "children": [...] 
    }}
  ]
}}

## Input Format
A dictionary of tokens with key as token index and value as token text.
The tokenized code:
{tokens}
"""
    return prompt

def get_label_according_to_token_recursively(ast, code, tokenize_code):
    start_token = ast["start_token"]
    end_token = ast["end_token"]

    # 从tokenize_code中提取对应的代码片段
    start_offset = tokenize_code[start_token][0]
    end_offset = tokenize_code[end_token][1]
    label = code[start_offset:end_offset]
    
    # 将提取的代码片段添加到ast中
    # 保存原始属性
    original_type = ast["type"]
    original_start = ast["start_token"] 
    original_end = ast["end_token"]
    original_children = ast.get("children", [])
    
    # 重新按顺序构建字典
    ast.clear()
    ast["type"] = original_type
    ast["label"] = label
    ast["start_token"] = original_start
    ast["end_token"] = original_end
    if original_children:
        ast["children"] = original_children
    
    if "children" in ast:
        for child in ast["children"]:
            get_label_according_to_token_recursively(child, code, tokenize_code)
    return ast


In [2]:
from llm import get_llm_answers



def get_label_according_to_lines_recursively(cfg, code_lines):
    # 处理 entryBlock
    if "entryBlock" in cfg:
        entry = cfg["entryBlock"]
        if "start_line" in entry and "end_line" in entry:
            # 确保 label 在 id 和 type 后面
            label = code_lines[entry["start_line"]-1:entry["end_line"]]
            entry_new = {
                "id": entry["id"],
                "type": entry["type"],
                "label": label
            }
            for k,v in entry.items():
                if k not in ["id", "type", "label"]:
                    entry_new[k] = v
            cfg["entryBlock"] = entry_new
    
    # 处理 blocks
    if "blocks" in cfg:
        for i, block in enumerate(cfg["blocks"]):
            if "start_line" in block and "end_line" in block:
                # 确保 label 在 id 和 type 后面
                label = code_lines[block["start_line"]-1:block["end_line"]]
                block_new = {
                    "id": block["id"],
                    "type": block["type"], 
                    "label": label
                }
                for k,v in block.items():
                    if k not in ["id", "type", "label"]:
                        block_new[k] = v
                cfg["blocks"][i] = block_new
            # 递归处理 subCFG
            if "subCFG" in block:
                get_label_according_to_lines_recursively(block["subCFG"], code_lines)
    
    return cfg

def get_cfg_prompt(code_lines, language, ast):
    prompt = f"""

# Control Flow Graph Generation Protocol  
**Objective**: Convert source code into standardized CFG JSON format according to the Abstract Syntax Tree (AST)

## Input Requirements  
Submit code and language using this pattern:  
```
[CODE]  
{code_lines}
[/CODE]  

[AST]  
{ast}
[/AST]  

[LANGUAGE]  
{language}  
[/LANGUAGE]  
```

## Output Specifications  
### Required JSON Structure  
{{
  "name": "function_name",  
  "entryBlock": {{  
    "id": "B0",  
    "start_line": 1,
    "end_line": 1,
    "type": "branch|loop|normal|error"  
  }},  
  "blocks": [  
    {{  
      "id": "B1",  
      "start_line": 1,
      "end_line": 1,
      "type": "branch",  
      "subCFG": [{{ /* Nested structure */ }}]  
    }}  
  ],  
  "edges": [  
    {{  
      "sourceId": "B0",  
      "targetId": "B1", 
      "label": "edge_description",  
      "isError": false  
    }}  
  ]  
}}  
"""
    return prompt

In [5]:
from multiprocessing import cpu_count
import os
import json

language = "cangjie"
source_code_dir = "../dataset/100rows/cangjie"
ast_dir = "../dataset/100rows/cangjie/ast"
cfg_dir = "../dataset/100rows/cangjie/cfg"

os.makedirs(ast_dir, exist_ok=True)
os.makedirs(cfg_dir, exist_ok=True)

def process_code(index):
    code_path = os.path.join(source_code_dir, f"{index}.cj")
    if not os.path.exists(code_path):
        return
    
    if os.path.exists(os.path.join(ast_dir, f"{index}.json")) and os.path.exists(os.path.join(cfg_dir, f"{index}.json")):
        return
    
    code = open(code_path, "r").read()
    
    tokenize_code = tokenize_code_with_lines(code)
    tokens = {i: tokenize_code[i][2] for i in range(len(tokenize_code))}
    prompt = get_ast_prompt(tokens, language)
    answer = json.loads(get_llm_answers(prompt, model_name="deepseek-r1:70b", require_json=True))
    ast = get_label_according_to_token_recursively(answer, code, tokenize_code)
    
    ast_path = os.path.join(ast_dir, f"{index}.json")

    with open(ast_path, "w") as f:
        json.dump(ast, f, indent=4)
        
    code_lines = "\n".join([f"{i+1} {line}" for i, line in enumerate(code.splitlines())])
    prompt = get_cfg_prompt(code_lines, language, ast)
    # print(prompt)

    answer = get_llm_answers(prompt, model_name="deepseek-r1:70b", require_json=True)
    cfg = json.loads(answer)
    cfg = get_label_according_to_lines_recursively(cfg, code.splitlines())
    with open(os.path.join(cfg_dir, f"{index}.json"), "w") as f:
        json.dump(cfg, f, indent=4)


if __name__ == "__main__":
    from tqdm import tqdm
    from concurrent.futures import ThreadPoolExecutor
    for i in tqdm(range(200)):
        process_code(i)
    # process_code(0)
    # with tqdm(total=20) as pbar:
    #     with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
    #         def process_and_update(i):
    #             process_code(i)
    #             pbar.update(1)
    #         executor.map(process_and_update, range(20))

  4%|▎         | 7/200 [05:00<2:18:01, 42.91s/it]


KeyError: 'start_token'

In [3]:
code_path = os.path.join(source_code_dir, f"7.cj")
code = open(code_path, "r").read()

tokenize_code = tokenize_code_with_lines(code)
tokens = {i: tokenize_code[i][2] for i in range(len(tokenize_code))}
prompt = get_ast_prompt(tokens, language)
answer = json.loads(get_llm_answers(prompt, model_name="deepseek-r1:70b"))
ast = get_label_according_to_token_recursively(answer, code, tokenize_code)

ast_path = os.path.join(ast_dir, f"{i}.json")
with open(ast_path, "w") as f:
    json.dump(ast, f, indent=4)
    
code_lines = "\n".join([f"{i+1} {line}" for i, line in enumerate(code.splitlines())])
prompt = get_cfg_prompt(code_lines, language, ast)
# print(prompt)
answer = get_llm_answers(prompt, model_name="deepseek-r1:70b")
cfg = json.loads(answer)
cfg = get_label_according_to_lines_recursively(cfg, code.splitlines())
with open(os.path.join(cfg_dir, f"{i}.json"), "w") as f:
    json.dump(cfg, f, indent=4)

NameError: name 'source_code_dir' is not defined