In [None]:
import os
import sys
# NOTE: despite its name, `current_dir` is the PARENT of the working
# directory: abspath('.') is the cwd and dirname() strips its last component.
current_dir = os.path.dirname(os.path.abspath('.'))
# Make modules located under that directory importable.
sys.path.append(current_dir)
# Switch the working directory there so relative paths resolve against it.
# NOTE(review): re-running this cell moves up one more level each time,
# because the cwd it starts from has already been changed — confirm intended.
os.chdir(current_dir)

In [None]:
import json
from pathlib import Path
from typing import Dict, List, Any


def can_skip(field_data: Any, rewritten_data: Dict) -> bool:
    """Tell whether a config field carries nothing that needs generating.

    Args:
        field_data: The dimension/measure/filter value taken from the config.
        rewritten_data: The rewritten data; accepted for signature symmetry
            with the other checks but not consulted here.

    Returns:
        bool: True when ``field_data`` is an empty list, ``None`` or an
        empty dict; False for anything else.
    """
    # An empty list means there is nothing to produce for this field.
    if isinstance(field_data, list):
        if not field_data:
            return True

    # A missing value or an empty mapping is treated the same way.
    nothing_there = field_data is None or field_data == {}
    return bool(nothing_there)


def can_generate_from_rewritten(
    field_list: List[Dict],
    rewritten_data: Dict,
    *,
    min_match_ratio: float = 0.99,
) -> bool:
    """Decide whether a config field list can be derived from ``rewritten``.

    Matching is deliberately loose: a field counts as found when its text
    occurs anywhere in the lower-cased string form of ``rewritten_data``
    (keys and values alike).

    Args:
        field_list: Field entries from the config. Only dict entries are
            inspected, and only their "字段名称" and "条件" string values.
        rewritten_data: The rewritten data to search in.
        min_match_ratio: Minimum fraction (0..1) of distinct field texts that
            must be found. The default of 0.99 keeps the previous behaviour,
            which in practice requires every field to match unless there are
            100 or more distinct texts.

    Returns:
        bool: True when the fraction of matched field texts is at least
        ``min_match_ratio``; False when either input is empty or no usable
        field text could be extracted.
    """
    if not isinstance(field_list, list) or not field_list:
        return False

    if not rewritten_data:
        return False

    # Flatten the whole structure into one string for substring search.
    # NOTE: this is the Python repr of the data, so characters that repr
    # escapes (backslashes, some quotes, newlines) will not match verbatim.
    rewritten_str = str(rewritten_data).lower()

    # Collect the distinct, normalised texts that identify each config field.
    config_field_names = set()
    for item in field_list:
        if not isinstance(item, dict):
            continue
        for key in ("字段名称", "条件"):
            value = item.get(key)
            if not isinstance(value, str):
                continue
            cleaned_value = value.strip().lower()
            # Single-character texts would match almost anything; ignore them.
            if len(cleaned_value) > 1:
                config_field_names.add(cleaned_value)

    if not config_field_names:
        return False

    match_count = sum(
        1 for field_name in config_field_names if field_name in rewritten_str
    )
    match_ratio = match_count / len(config_field_names)
    return match_ratio >= min_match_ratio


def analyze_single_item(data: Dict) -> Dict:
    """Classify the dimension, measure and filter fields of one data item.

    For each of the three fields the result is one of:

    * ``skip``     - the config value is empty (see ``can_skip``);
    * ``generate`` - the value can be derived from ``rewritten``
      (see ``can_generate_from_rewritten``);
    * ``manual``   - neither of the above.

    Args:
        data: One item holding optional ``config`` and ``rewritten`` entries.

    Returns:
        Dict: Maps 'dimension', 'measure' and 'filter' to
        ``{'status': <str>, 'can_generate': <bool>}``.
    """
    # dict.get only falls back to its default when the key is absent; an
    # explicit null config would otherwise make config.get() below raise.
    config = data.get('config') or {}
    rewritten = data.get('rewritten', {})

    results = {}
    for field in ('dimension', 'measure', 'filter'):
        field_data = config.get(field, [])
        if can_skip(field_data, rewritten):
            results[field] = {'status': 'skip', 'can_generate': True}
        elif can_generate_from_rewritten(field_data, rewritten):
            results[field] = {'status': 'generate', 'can_generate': True}
        else:
            results[field] = {'status': 'manual', 'can_generate': False}

    return results


def analyze_file(file_path: Path) -> List[Dict]:
    """Load one JSON file and analyze every item found in it.

    A top-level dict is analyzed as a single item; a top-level list is
    analyzed element by element, ignoring elements that are not dicts.
    Any other top-level type, and any error raised while reading or
    analyzing, is reported on stdout/stderr and yields an empty list.

    Args:
        file_path: Location of the JSON file to read (UTF-8).

    Returns:
        List[Dict]: One analysis result per analyzed item.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        # Single item stored directly as an object.
        if isinstance(payload, dict):
            return [analyze_single_item(payload)]

        # Collection of items; only dict entries are meaningful.
        if isinstance(payload, list):
            analyses = []
            for entry in payload:
                if isinstance(entry, dict):
                    analyses.append(analyze_single_item(entry))
            return analyses

        print(f"Unexpected data type in {file_path}: {type(payload)}")
        return []

    except Exception as e:
        # Best effort: report the failure and carry on with other files.
        print(f"Error analyzing {file_path}: {e}")
        import traceback
        traceback.print_exc()
        return []


def main(json_files=None):
    """Analyze JSON files and print per-field skip/generate/manual statistics.

    Args:
        json_files: Iterable of JSON file paths to analyze. When omitted
            (``None``), the original single default file is used.

    Returns:
        None. The summary is written to stdout.
    """
    if json_files is None:
        json_files = {"data/others/20250916_AIO_correct.json"}

    fields = ('dimension', 'measure', 'filter')

    # Per-field counters, one bucket per status plus a running total.
    total_stats = {
        field: {'skip': 0, 'generate': 0, 'manual': 0, 'total': 0}
        for field in fields
    }

    analyzed_count = 0
    total_items = 0

    for json_file in json_files:
        results_list = analyze_file(json_file)
        # Files that failed to load or held no items are not counted.
        if not results_list:
            continue
        analyzed_count += 1
        for results in results_list:
            total_items += 1
            for field in fields:
                status = results[field]['status']
                total_stats[field][status] += 1
                total_stats[field]['total'] += 1

    print(f"\n分析了 {analyzed_count} 个文件，共 {total_items} 条数据\n")
    print("="*60)

    for field in fields:
        stats = total_stats[field]
        total = stats['total']
        # Nothing was analyzed for this field; also avoids dividing by zero.
        if total == 0:
            continue

        can_generate_count = stats['skip'] + stats['generate']
        generate_ratio = can_generate_count / total * 100

        print(f"\n【{field.upper()}】统计:")
        print(f"  总数: {total}")
        print(f"  可跳过: {stats['skip']} ({stats['skip']/total*100:.1f}%)")
        print(
            f"  可生成: {stats['generate']} ({stats['generate']/total*100:.1f}%)")
        print(f"  需手动: {stats['manual']} ({stats['manual']/total*100:.1f}%)")
        print(f"  ✓ 可跳过或生成比例: {generate_ratio:.1f}%")

    print("\n" + "="*60)


# Run the analysis only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


分析了 6 个文件，共 616 条数据

============================================================

【DIMENSION】统计:
  总数: 616
  可跳过: 107 (17.4%)
  可生成: 375 (60.9%)
  需手动: 134 (21.8%)
  ✓ 可跳过或生成比例: 78.2%

【MEASURE】统计:
  总数: 616
  可跳过: 4 (0.6%)
  可生成: 86 (14.0%)
  需手动: 526 (85.4%)
  ✓ 可跳过或生成比例: 14.6%

【FILTER】统计:
  总数: 616
  可跳过: 0 (0.0%)
  可生成: 244 (39.6%)
  需手动: 372 (60.4%)
  ✓ 可跳过或生成比例: 39.6%

============================================================

分析了 6 个文件，共 1152 条数据

============================================================

【DIMENSION】统计:
  总数: 1152
  可跳过: 256 (22.2%)
  可生成: 656 (56.9%)
  需手动: 240 (20.8%)
  ✓ 可跳过或生成比例: 79.2%

【MEASURE】统计:
  总数: 1152
  可跳过: 14 (1.2%)
  可生成: 174 (15.1%)
  需手动: 964 (83.7%)
  ✓ 可跳过或生成比例: 16.3%

【FILTER】统计:
  总数: 1152
  可跳过: 0 (0.0%)
  可生成: 408 (35.4%)
  需手动: 744 (64.6%)
  ✓ 可跳过或生成比例: 35.4%


分析了 6 个文件，共 2278 条数据

============================================================

【DIMENSION】统计:
  总数: 2278
  可跳过: 575 (25.2%)
  可生成: 1142 (50.1%)
  需手动: 561 (24.6%)
  ✓ 可跳过或生成比例: 75.4%

【MEASURE】统计:
  总数: 2278
  可跳过: 37 (1.6%)
  可生成: 365 (16.0%)
  需手动: 1876 (82.4%)
  ✓ 可跳过或生成比例: 17.6%

【FILTER】统计:
  总数: 2278
  可跳过: 0 (0.0%)
  可生成: 667 (29.3%)
  需手动: 1611 (70.7%)
  ✓ 可跳过或生成比例: 29.3%


分析了 1 个文件，共 4768 条数据

============================================================

【DIMENSION】统计:
  总数: 4768
  可跳过: 1293 (27.1%)
  可生成: 1952 (40.9%)
  需手动: 1523 (31.9%)
  ✓ 可跳过或生成比例: 68.1%

【MEASURE】统计:
  总数: 4768
  可跳过: 196 (4.1%)
  可生成: 779 (16.3%)
  需手动: 3793 (79.6%)
  ✓ 可跳过或生成比例: 20.4%

【FILTER】统计:
  总数: 4768
  可跳过: 0 (0.0%)
  可生成: 965 (20.2%)
  需手动: 3803 (79.8%)
  ✓ 可跳过或生成比例: 20.2%

============================================================