In [5]:
# Directory holding the TypeScript sources; read below as "<index>.ts".
source_code_dir = "../../dataset/ts"
# Directory holding the matching AST dumps; read and rewritten below as "<index>.json".
static_ast_dir = "../../dataset/ts_ast"

from typing import List, Tuple
import re

def tokenize_code_with_lines(code: str) -> List[Tuple[int, int, str, int]]:
    """
    Tokenize source code with a regex and record where every token sits.

    Matching is done with ``re.finditer`` directly on the original text, so
    the offsets refer to ``code`` itself.  Newline, carriage-return and tab
    characters are emitted as tokens of their own (spaces are skipped), which
    keeps the multi-line structure visible to downstream consumers.

    Args:
        code: The raw source text to tokenize.

    Returns:
        A list of ``(start_offset, end_offset, token_text, line_number)``
        tuples in source order.  Offsets are character indices into ``code``
        (end exclusive); ``line_number`` is 1-based and refers to the line on
        which the token *starts*.  An empty input yields an empty list.
    """
    token_pattern = (
        r'[A-Za-z_]\w*|'  # identifiers
        r'[0-9]+|'        # runs of digits
        r'"[^"]*"|'       # double-quoted strings
        r"'[^']*'|"       # single-quoted strings
        r'\\[ntr]|'       # literal escape sequences \n \t \r
        # Single-line comment (C++/Java/JS style).  It must be written with
        # [^\n] instead of '.', because the pattern runs under re.DOTALL and
        # '.' would otherwise swallow everything up to the end of the file.
        r'//[^\n]*|'
        r'/\*.*?\*/|'     # block comment (C style); relies on DOTALL to span lines
        r'\n|\r|\t|'      # newline / carriage return / tab
        r'\S'             # any other single non-whitespace character (+, -, {, }, ...)
    )

    tokens_with_offset = []
    # keepends=True makes the line lengths add up to len(code), so the running
    # total below is always the offset at which the current line starts.
    lines = code.splitlines(keepends=True)
    current_line = 1
    current_pos = 0

    for match in re.finditer(token_pattern, code, re.DOTALL):
        tk = match.group(0)
        start_offset, end_offset = match.span()

        # Matches arrive in increasing offset order, so the line cursor only
        # ever moves forward: advance it until the line contains start_offset.
        while current_line <= len(lines) and current_pos + len(lines[current_line - 1]) <= start_offset:
            current_pos += len(lines[current_line - 1])
            current_line += 1

        tokens_with_offset.append((start_offset, end_offset, tk, current_line))

    return tokens_with_offset

import os
import json
def find_token_range(node, tokens):
    """Recursively annotate AST nodes with the token range matching their label.

    For every dict node, the node's ``'label'`` text is compared against the
    concatenation of consecutive token texts.  The first run of tokens whose
    joined text equals the label wins, and its indices are stored on the node
    as ``'start_token'`` and ``'end_token'`` (both inclusive, indices into
    ``tokens``).  Nodes without a match are left unchanged.  Every entry of
    the node's ``'children'`` list is then processed the same way.

    Args:
        node: An AST node; anything that is not a dict is ignored.
        tokens: Sequence of ``(start, end, text, line)`` tuples as produced
            by ``tokenize_code_with_lines``.

    Returns:
        None.  The nodes are modified in place.
    """
    if not isinstance(node, dict):
        return

    # Text of the node that has to be located in the token stream.
    label = node.get('label', '')

    start_token = None
    end_token = None
    last_index = len(tokens) - 1

    for idx, (_start, _end, text, _line) in enumerate(tokens):
        # Only a token that is a prefix of the label can start a match.
        if not label.startswith(text):
            continue

        # Keep appending following tokens while the joined text is still a
        # proper prefix of the label; anything else can never become equal.
        current_text = text
        current_end = idx
        while (
            current_end < last_index
            and len(current_text) < len(label)
            and label.startswith(current_text)
        ):
            current_end += 1
            current_text += tokens[current_end][2]

        # The equality test is done after the loop so that a label made of
        # exactly one token (no extension needed) is matched as well.
        if current_text == label:
            start_token = idx
            end_token = current_end
            break

    # Record the range on the node only when a complete match was found.
    if start_token is not None and end_token is not None:
        node['start_token'] = start_token
        node['end_token'] = end_token

    # Recurse into the child nodes.
    children = node.get('children', [])
    for child in children:
        find_token_range(child, tokens)


for i in range(200):
    file_path = os.path.join(source_code_dir, f"{i}.ts")
    if not os.path.exists(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as file:
        source_code = file.read()

    ast_file_path = os.path.join(static_ast_dir, f"{i}.json")
    if not os.path.exists(ast_file_path):
        continue
    with open(ast_file_path, 'r', encoding='utf-8') as file:
        ast = json.load(file)

    # Map each AST node's label onto the token stream of its source file.
    tokens = tokenize_code_with_lines(source_code)
    find_token_range(ast, tokens)

    # Write the annotated AST back over the original JSON file.
    with open(ast_file_path, 'w', encoding='utf-8') as f:
        json.dump(ast, f, indent=2, ensure_ascii=False)


In [1]:
import json
import os
from typing import Dict, Set

def extract_types_from_cfg(cfg: Dict) -> set:
    """Collect every value stored under a "type" key anywhere in ``cfg``.

    The structure is walked through nested dicts and lists; values of any
    other kind are ignored.  The result is the set of all "type" values found.
    """
    found = set()
    pending = [cfg]

    # Explicit work list instead of recursion; visiting order does not matter
    # because the result is a set.
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            if "type" in current:
                found.add(current["type"])
            pending.extend(current.values())
        elif isinstance(current, list):
            pending.extend(current)

    return found

# Walk every JSON file and collect the distinct node types.
all_types = set()
processed_files = set()

for root in ["../../dataset/ts_ast"]:
    for file in os.listdir(root):
        if not file.endswith('.json'):
            continue

        # Skip file names that were already handled (de-duplication by name).
        if file in processed_files:
            continue

        processed_files.add(file)
        file_path = os.path.join(root, file)

        # Decode explicitly as UTF-8: the AST files are written with
        # encoding='utf-8' and ensure_ascii=False, so relying on the
        # platform's default encoding could fail or mis-decode them.
        with open(file_path, encoding='utf-8') as f:
            cfg = json.load(f)
        types = extract_types_from_cfg(cfg)
        all_types.update(types)

print("所有文件中提取的唯一类型:")
for t in sorted(all_types):
    print(f"- {t}")


所有文件中提取的唯一类型:
- AmpersandAmpersandToken
- AnyKeyword
- ArrayLiteralExpression
- ArrayType
- ArrowFunction
- AsExpression
- AsteriskToken
- AsyncKeyword
- AwaitExpression
- BarBarToken
- BarEqualsToken
- BarToken
- BinaryExpression
- Block
- BooleanKeyword
- BreakStatement
- CallExpression
- CaseBlock
- CaseClause
- CatchClause
- ClassDeclaration
- ColonToken
- ConditionalExpression
- Constructor
- ContinueStatement
- DefaultClause
- DefaultKeyword
- DotDotDotToken
- ElementAccessExpression
- EmptyStatement
- EndOfFileToken
- EnumDeclaration
- EnumMember
- EqualsEqualsEqualsToken
- EqualsEqualsToken
- EqualsGreaterThanToken
- ExclamationEqualsEqualsToken
- ExclamationEqualsToken
- ExportAssignment
- ExportKeyword
- ExpressionStatement
- ExpressionWithTypeArguments
- FalseKeyword
- FirstAssignment
- FirstBinaryOperator
- FirstCompoundAssignment
- FirstContextualKeyword
- FirstLiteralToken
- FirstNode
- FirstStatement
- FirstTemplateToken
- ForInStatement
- ForOfStatement
- ForStatement
-