In [2]:
import json
from collections import Counter
from typing import Dict, List, Set

def analyze_jsonl_types(file_path: str) -> Dict:
    """
    Tally the ``type`` values stored under ``runtime_ground_truth`` in a JSONL file.

    Every non-blank line is parsed as one JSON value. When the parsed record
    has a ``runtime_ground_truth`` entry that is a dict containing a ``type``
    key, that value is counted. Records without such an entry are still
    counted in ``total_records`` but contribute no type.

    Args:
        file_path: Path of the JSONL file; it is opened as UTF-8 text.

    Returns:
        An empty dict when the file is missing or cannot be read (a message
        is printed in that case). Otherwise a dict with these keys:
            total_records: lines that were parsed as JSON successfully.
            error_records: lines that raised while being parsed or inspected.
            unique_types: distinct type values in sorted order.
            type_counts: mapping of type value to occurrence count, most
                frequent first.
            total_types: number of distinct type values.
    """
    types_counter = Counter()
    total_records = 0
    error_records = 0

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                stripped = line.strip()
                if not stripped:
                    # Blank lines (typically a trailing newline at the end of
                    # the file) hold no record; skip them rather than report
                    # a spurious JSON parse error.
                    continue

                try:
                    data = json.loads(stripped)
                    total_records += 1

                    if 'runtime_ground_truth' in data:
                        runtime_gt = data['runtime_ground_truth']
                        if isinstance(runtime_gt, dict) and 'type' in runtime_gt:
                            types_counter[runtime_gt['type']] += 1

                except json.JSONDecodeError as e:
                    error_records += 1
                    print(f"第 {line_num} 行JSON解析错误: {e}")
                except Exception as e:
                    # Best effort: a record with an unexpected shape (for
                    # example a non-dict top-level value or an unhashable
                    # type value) is reported and the scan continues.
                    error_records += 1
                    print(f"第 {line_num} 行处理错误: {e}")

    except FileNotFoundError:
        print(f"文件未找到: {file_path}")
        return {}
    except Exception as e:
        print(f"读取文件时发生错误: {e}")
        return {}

    try:
        unique_types = sorted(types_counter)
    except TypeError:
        # The type values are not mutually comparable (e.g. a string next to
        # null or a number). Fall back to a deterministic order by Python
        # type name and then text form instead of losing the whole result.
        unique_types = sorted(
            types_counter, key=lambda value: (type(value).__name__, str(value))
        )

    return {
        'total_records': total_records,
        'error_records': error_records,
        'unique_types': unique_types,
        'type_counts': dict(types_counter.most_common()),
        'total_types': len(types_counter)
    }

def print_analysis_report(stats: Dict):
    """
    Write a human-readable summary of the type statistics to stdout.

    Args:
        stats: Dictionary providing ``total_records``, ``error_records``,
            ``total_types``, ``unique_types`` and ``type_counts``. A falsy
            value only prints a short "no data" notice.
    """
    if not stats:
        print("没有统计数据")
        return

    divider = "=" * 60

    # Banner
    print("\n" + divider)
    print("JSONL Type字段统计报告")
    print(divider)

    # Headline figures
    print("\n📊 基本统计:")
    record_total = stats['total_records']
    print(f"  - 总记录数: {record_total}")
    print(f"  - 错误记录数: {stats['error_records']}")
    print(f"  - 不同type数量: {stats['total_types']}")

    # Numbered list of every distinct type value
    print("\n📝 所有的Type类型:")
    position = 0
    for label in stats['unique_types']:
        position += 1
        print(f"  {position}. {label}")

    # Frequency table; each share is relative to the total record count
    print("\n📈 Type分布统计:")
    for label, occurrences in stats['type_counts'].items():
        if record_total > 0:
            share = (occurrences / record_total) * 100
        else:
            share = 0
        print(f"  - {label}: {occurrences} 次 ({share:.2f}%)")

    print("\n" + divider)

# Example usage: analyze one JSONL file and print the report.
if __name__ == "__main__":
    # Path of the JSONL file to analyze; replace with your own file.
    file_path = "original.jsonl"
    
    # Run the analysis (returns an empty dict if the file could not be read).
    stats = analyze_jsonl_types(file_path)
    
    # Print the full report.
    print_analysis_report(stats)
    
    # Compact listing of only the distinct type values; skipped when the
    # analysis returned nothing.
    if stats:
        print("\n快速查看所有Type:")
        for t in stats['unique_types']:
            print(f"  - {t}")


JSONL Type字段统计报告

📊 基本统计:
  - 总记录数: 1033
  - 错误记录数: 0
  - 不同type数量: 30

📝 所有的Type类型:
  1. <class 'NoneType'>
  2. <class '_pytest.python_api.ApproxScalar'>
  3. <class 'bool'>
  4. <class 'bytes'>
  5. <class 'dict'>
  6. <class 'float'>
  7. <class 'int'>
  8. <class 'list'>
  9. <class 'numpy.dtype[int8]'>
  10. <class 'numpy.dtypes.Complex128DType'>
  11. <class 'numpy.dtypes.Float64DType'>
  12. <class 'numpy.dtypes.ObjectDType'>
  13. <class 'numpy.int32'>
  14. <class 'numpy.int64'>
  15. <class 'numpy.uint8'>
  16. <class 'plotly.graph_objs._figure.Figure'>
  17. <class 'plotly.graph_objs._layout.Layout'>
  18. <class 'pyproj.crs.crs.CRS'>
  19. <class 'range'>
  20. <class 'set'>
  21. <class 'shapely.geometry.collection.GeometryCollection'>
  22. <class 'shapely.geometry.linestring.LineString'>
  23. <class 'shapely.geometry.multilinestring.MultiLineString'>
  24. <class 'shapely.geometry.multipoint.MultiPoint'>
  25. <class 'shapely.geometry.multipolygon.MultiPolygon'>
  26.

In [3]:
import json
from collections import Counter
from typing import Dict, List, Set

def analyze_jsonl_types(file_path: str) -> Dict:
    """
    Tally the ``type`` values stored under ``runtime_ground_truth`` in a JSONL file.

    Every non-blank line is parsed as one JSON value. When the parsed record
    has a ``runtime_ground_truth`` entry that is a dict containing a ``type``
    key, that value is counted. Records without such an entry are still
    counted in ``total_records`` but contribute no type.

    Args:
        file_path: Path of the JSONL file; it is opened as UTF-8 text.

    Returns:
        An empty dict when the file is missing or cannot be read (a message
        is printed in that case). Otherwise a dict with these keys:
            total_records: lines that were parsed as JSON successfully.
            error_records: lines that raised while being parsed or inspected.
            unique_types: distinct type values in sorted order.
            type_counts: mapping of type value to occurrence count, most
                frequent first.
            total_types: number of distinct type values.
    """
    types_counter = Counter()
    total_records = 0
    error_records = 0

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                stripped = line.strip()
                if not stripped:
                    # Blank lines (typically a trailing newline at the end of
                    # the file) hold no record; skip them rather than report
                    # a spurious JSON parse error.
                    continue

                try:
                    data = json.loads(stripped)
                    total_records += 1

                    if 'runtime_ground_truth' in data:
                        runtime_gt = data['runtime_ground_truth']
                        if isinstance(runtime_gt, dict) and 'type' in runtime_gt:
                            types_counter[runtime_gt['type']] += 1

                except json.JSONDecodeError as e:
                    error_records += 1
                    print(f"第 {line_num} 行JSON解析错误: {e}")
                except Exception as e:
                    # Best effort: a record with an unexpected shape (for
                    # example a non-dict top-level value or an unhashable
                    # type value) is reported and the scan continues.
                    error_records += 1
                    print(f"第 {line_num} 行处理错误: {e}")

    except FileNotFoundError:
        print(f"文件未找到: {file_path}")
        return {}
    except Exception as e:
        print(f"读取文件时发生错误: {e}")
        return {}

    try:
        unique_types = sorted(types_counter)
    except TypeError:
        # The type values are not mutually comparable (e.g. a string next to
        # null or a number). Fall back to a deterministic order by Python
        # type name and then text form instead of losing the whole result.
        unique_types = sorted(
            types_counter, key=lambda value: (type(value).__name__, str(value))
        )

    return {
        'total_records': total_records,
        'error_records': error_records,
        'unique_types': unique_types,
        'type_counts': dict(types_counter.most_common()),
        'total_types': len(types_counter)
    }

def print_analysis_report(stats: Dict):
    """
    Write a human-readable summary of the type statistics to stdout.

    Args:
        stats: Dictionary providing ``total_records``, ``error_records``,
            ``total_types``, ``unique_types`` and ``type_counts``. A falsy
            value only prints a short "no data" notice.
    """
    if not stats:
        print("没有统计数据")
        return

    divider = "=" * 60

    # Banner
    print("\n" + divider)
    print("JSONL Type字段统计报告")
    print(divider)

    # Headline figures
    print("\n📊 基本统计:")
    record_total = stats['total_records']
    print(f"  - 总记录数: {record_total}")
    print(f"  - 错误记录数: {stats['error_records']}")
    print(f"  - 不同type数量: {stats['total_types']}")

    # Numbered list of every distinct type value
    print("\n📝 所有的Type类型:")
    position = 0
    for label in stats['unique_types']:
        position += 1
        print(f"  {position}. {label}")

    # Frequency table; each share is relative to the total record count
    print("\n📈 Type分布统计:")
    for label, occurrences in stats['type_counts'].items():
        if record_total > 0:
            share = (occurrences / record_total) * 100
        else:
            share = 0
        print(f"  - {label}: {occurrences} 次 ({share:.2f}%)")

    print("\n" + divider)

# Example usage: analyze one JSONL file and print the report.
if __name__ == "__main__":
    # Path of the JSONL file to analyze; replace with your own file.
    file_path = "rewrite.jsonl"
    
    # Run the analysis (returns an empty dict if the file could not be read).
    stats = analyze_jsonl_types(file_path)
    
    # Print the full report.
    print_analysis_report(stats)
    
    # Compact listing of only the distinct type values; skipped when the
    # analysis returned nothing.
    if stats:
        print("\n快速查看所有Type:")
        for t in stats['unique_types']:
            print(f"  - {t}")


JSONL Type字段统计报告

📊 基本统计:
  - 总记录数: 1033
  - 错误记录数: 0
  - 不同type数量: 28

📝 所有的Type类型:
  1. <class 'NoneType'>
  2. <class '_pytest.python_api.ApproxScalar'>
  3. <class 'bool'>
  4. <class 'bytes'>
  5. <class 'dict'>
  6. <class 'float'>
  7. <class 'int'>
  8. <class 'list'>
  9. <class 'numpy.dtype[int8]'>
  10. <class 'numpy.dtypes.Float64DType'>
  11. <class 'numpy.dtypes.ObjectDType'>
  12. <class 'numpy.float64'>
  13. <class 'numpy.int32'>
  14. <class 'numpy.int64'>
  15. <class 'plotly.graph_objs._layout.Layout'>
  16. <class 'pyproj.crs.crs.CRS'>
  17. <class 'range'>
  18. <class 'set'>
  19. <class 'shapely.geometry.collection.GeometryCollection'>
  20. <class 'shapely.geometry.linestring.LineString'>
  21. <class 'shapely.geometry.multilinestring.MultiLineString'>
  22. <class 'shapely.geometry.multipoint.MultiPoint'>
  23. <class 'shapely.geometry.multipolygon.MultiPolygon'>
  24. <class 'shapely.geometry.point.Point'>
  25. <class 'shapely.geometry.polygon.LinearRing'>
