In [2]:
import os
import sys
import json
from typing import Any, Dict, List, Tuple
import re
import concurrent.futures
from multiprocessing import cpu_count

from tqdm import tqdm

# Compiled once at import time. Alternatives are tried left to right, so the
# multi-character forms must come before the single-character fallbacks.
_TOKEN_RE = re.compile(
    r'[A-Za-z_]\w*|'                  # identifier / keyword
    r'[0-9]+(?:\.[0-9]+)?|'           # integer or simple decimal number
    r'"(?:\\.|[^"\\])*"|'             # double-quoted string (backslash escapes honored)
    r"'(?:\\.|[^'\\])*'|"             # single-quoted string (backslash escapes honored)
    r'`(?:\\.|[^`\\])*`|'             # template string (backslash escapes honored)
    r'//[^\r\n]*|'                    # line comment, without its line terminator
    r'/\*[\s\S]*?\*/|'                # block comment (may span several lines)
    r'=>|'                            # arrow
    r'===|!==|==|!=|'                 # equality operators
    r'&&|\|\||'                       # logical operators
    r'[-+*/=<>!&|^~?:;,.(){}[\]]|'    # other operators and delimiters
    r'\S',                            # any other visible character (%, @, #, $, stray quote, ...)
    re.DOTALL,                        # lets an escaped newline inside a string match `\\.`
)


def tokenize_code_with_lines(code: str) -> List[Tuple[int, int, str, int]]:
    """Split source text into tokens with character offsets and line numbers.

    The text is scanned with a regular expression directly on the original
    string, so every offset indexes into ``code`` itself. Whitespace
    (including newlines) is not emitted as tokens; line structure is conveyed
    through the line number attached to each token instead.

    String literals honor backslash escapes, line comments stop before the
    line terminator, and any visible character that is not part of a
    recognized token is emitted as a single-character token rather than being
    dropped.

    Args:
        code: Source text to tokenize.

    Returns:
        A list of ``(start_offset, end_offset, token_text, line_number)``
        tuples in source order. Offsets are character positions with
        ``end_offset`` exclusive; ``line_number`` is 1-based and refers to the
        line on which the token starts (relevant for multi-line comments and
        strings).
    """
    # NOTE: str.splitlines() also breaks on form feed, vertical tab and the
    # Unicode line/paragraph separators, not only on \n, \r and \r\n.
    lines = code.splitlines(keepends=True)
    tokens_with_offset = []
    line_number = 1
    # Offset one past the last character of the current line.
    line_end = len(lines[0]) if lines else 0

    for match in _TOKEN_RE.finditer(code):
        start_offset, end_offset = match.span()

        # Matches arrive in increasing offset order, so the line cursor only
        # ever needs to move forward.
        while start_offset >= line_end and line_number < len(lines):
            line_end += len(lines[line_number])
            line_number += 1

        # No alternative can match whitespace, so every match is a real token.
        tokens_with_offset.append(
            (start_offset, end_offset, match.group(0), line_number)
        )

    return tokens_with_offset

# Tokenize one sample TypeScript file and report how many tokens it yields.
# The file is decoded explicitly as UTF-8 so that the text (and therefore the
# character offsets of the tokens) does not depend on the platform's default
# encoding.
with open("../../dataset/ts/0.ts", "r", encoding="utf-8") as f:
    code = f.read()

tokens = tokenize_code_with_lines(code)
print(len(tokens))

219
