In [27]:
import json
import os
import glob
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
from sklearn.metrics import f1_score, precision_score, recall_score
import re

class RolloutAnalyzer:
    """Score repeated LLM diagnosis rollouts against ground-truth ICD-10 codes.

    Records are loaded from JSON log files, reduced to ICD-10 major classes
    (restricted to ``ALLOWED_DISEASES``), grouped by ``visit_number`` and then
    summarised as exact-match rates, pooled precision/recall/F1 and
    per-disease hit-rate histograms.
    """

    # The 11 disease major classes that are evaluated; every other code is ignored.
    ALLOWED_DISEASES = {'F32', 'F41', 'F39', 'F51', 'F98', 'F42', 'F31', 'F43', 'F45', 'F20', 'Z71'}

    # Compiled once instead of on every call.
    _MAJOR_CLASS_RE = re.compile(r'(F\d+|Z71)')
    _BOX_RE = re.compile(r'<box>(.*?)</box>', re.DOTALL)

    # Histogram bucket labels, in display order, and the inclusive upper bound
    # of every bucket between the special '0%' bucket and the open-ended last one.
    _BIN_LABELS = ('0%', '0-20%', '20-40%', '40-60%', '60-80%', '80-100%')
    _BIN_UPPER_BOUNDS = ((0.2, '0-20%'), (0.4, '20-40%'), (0.6, '40-60%'), (0.8, '60-80%'))

    def __init__(self):
        self.results = []

    def extract_major_class(self, code):
        """Return the allowed major class of an ICD-10 code, or ``None``.

        Args:
            code: A single ICD-10 code (e.g. ``'F32.100'``). ``None``, NaN-like
                scalars and non-scalar values yield ``None``.

        Returns:
            The leading ``F<digits>`` / ``Z71`` prefix when it is a member of
            ``ALLOWED_DISEASES``; otherwise ``None``.
        """
        if code is None:
            return None
        # pd.isna() returns an array for list-likes, whose truth value is
        # ambiguous (ValueError); only ask it about scalars.
        if pd.api.types.is_scalar(code) and pd.isna(code):
            return None

        major_match = self._MAJOR_CLASS_RE.match(str(code).strip())
        if major_match is None:
            return None
        major_code = major_match.group(1)
        return major_code if major_code in self.ALLOWED_DISEASES else None

    def extract_major_classes_from_list(self, diagnosis_codes):
        """Map diagnosis codes to their unique allowed major classes.

        Args:
            diagnosis_codes: A list/tuple of codes, a single code, or ``None``.

        Returns:
            Major classes in first-seen order, without duplicates.
        """
        if diagnosis_codes is None:
            diagnosis_codes = []
        elif not isinstance(diagnosis_codes, (list, tuple)):
            diagnosis_codes = [diagnosis_codes]

        majors = (self.extract_major_class(code) for code in diagnosis_codes)
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(major for major in majors if major))

    def extract_recommended_codes(self, response_content):
        """Pull the recommended ICD-10 codes out of a model response.

        The codes are expected inside the first ``<box>...</box>`` tag,
        separated by ``;``.

        Args:
            response_content: The raw response text. Empty or non-string
                values yield an empty list.

        Returns:
            The stripped, non-empty code strings; ``[]`` when no box is found.
        """
        if not response_content or not isinstance(response_content, str):
            return []

        box_match = self._BOX_RE.search(response_content)
        if box_match is None:
            return []

        pieces = (piece.strip() for piece in box_match.group(1).split(';'))
        return [piece for piece in pieces if piece]

    def _build_record(self, log_entry, file_name):
        """Turn one raw log entry (a dict) into the flat record used for scoring."""
        ground_truth = log_entry.get('ground_truth', [])

        # 'response' may be absent or null; treat both as an empty answer
        # instead of failing and losing the rest of the file.
        response = log_entry.get('response')
        response_content = response.get('content', '') if isinstance(response, dict) else ''
        recommended_codes = self.extract_recommended_codes(response_content)

        return {
            'visit_number': log_entry.get('visit_number'),
            'ground_truth_codes': ground_truth,
            'ground_truth_major': self.extract_major_classes_from_list(ground_truth),
            'recommended_codes': recommended_codes,
            'recommended_major': self.extract_major_classes_from_list(recommended_codes),
            'file': os.path.basename(json_file) if (json_file := file_name) else file_name,
        }

    def load_json_files(self, json_files):
        """Load rollout log files and flatten their ``logs`` entries into records.

        Files that cannot be read or parsed are reported and skipped; files
        without a ``logs`` list contribute nothing. Entries that are not
        dicts are skipped and counted in a warning.

        Args:
            json_files: Iterable of paths to JSON files.

        Returns:
            A list of dicts with keys ``visit_number``, ``ground_truth_codes``,
            ``ground_truth_major``, ``recommended_codes``,
            ``recommended_major`` and ``file``.
        """
        all_data = []

        for json_file in json_files:
            print(f"Loading {json_file}")
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                print(f"Error loading {json_file}: {e}")
                continue

            logs = data.get('logs') if isinstance(data, dict) else None
            if not isinstance(logs, list):
                continue

            skipped = 0
            for log_entry in logs:
                if not isinstance(log_entry, dict):
                    skipped += 1
                    continue
                all_data.append(self._build_record(log_entry, json_file))
            if skipped:
                print(f"Warning: skipped {skipped} malformed log entries in {json_file}")

        return all_data

    @classmethod
    def _rate_bin(cls, rate):
        """Return the histogram bucket label for a rate in [0, 1]."""
        if rate == 0:
            return '0%'
        for upper_bound, label in cls._BIN_UPPER_BOUNDS:
            if rate <= upper_bound:
                return label
        return '80-100%'

    def calculate_rollout_accuracy(self, data):
        """Aggregate rollouts per sample and compute accuracy statistics.

        Records are grouped by ``visit_number``; the ground truth of a sample
        is taken from its first record.

        Args:
            data: Records as produced by ``load_json_files``.

        Returns:
            A dict with ``total_samples``, ``exact_match_count``,
            ``exact_match_ratio``, ``disease_stats``, ``accuracy_bins``,
            ``individual_accuracies``, ``rollout_counts``,
            ``avg_rollout_count`` and ``total_rollouts``. ``macro_f1``,
            ``macro_precision`` and ``macro_recall`` are only present when at
            least one label was produced. Empty input yields zeroed ratios
            rather than raising.
        """
        sample_groups = defaultdict(list)
        for item in data:
            sample_groups[item['visit_number']].append(item)

        print(f"聚合后的唯一样本数: {len(sample_groups)}")

        results = {
            'total_samples': len(sample_groups),
            'exact_match_count': 0,
            'disease_stats': defaultdict(lambda: {
                'total_with_disease': 0,  # number of samples whose ground truth has the disease
                'rollout_bins': defaultdict(int)  # histogram of per-sample hit rates
            }),
            'accuracy_bins': defaultdict(int),
            'individual_accuracies': [],
            'rollout_counts': []
        }

        # Every rollout contributes one binary label per allowed disease.
        all_gt_labels = []
        all_pred_labels = []
        diseases = sorted(self.ALLOWED_DISEASES)

        for rollout_items in sample_groups.values():
            gt_major_set = set(rollout_items[0]['ground_truth_major'])
            # Build each rollout's prediction set once and reuse it below.
            rec_major_sets = [set(item['recommended_major']) for item in rollout_items]
            rollout_count = len(rollout_items)
            results['rollout_counts'].append(rollout_count)

            exact_matches = 0
            for rec_major_set in rec_major_sets:
                if rec_major_set == gt_major_set:
                    exact_matches += 1
                for disease in diseases:
                    all_gt_labels.append(1 if disease in gt_major_set else 0)
                    all_pred_labels.append(1 if disease in rec_major_set else 0)

            # A sample counts as an exact match when at least half of its
            # rollouts reproduce the ground-truth set exactly.
            sample_exact_match_rate = exact_matches / rollout_count
            if sample_exact_match_rate >= 0.5:
                results['exact_match_count'] += 1
            results['individual_accuracies'].append(sample_exact_match_rate)

            # Per-disease hit rate, only over samples that actually have the disease.
            for disease in diseases:
                if disease not in gt_major_set:
                    continue
                stats = results['disease_stats'][disease]
                stats['total_with_disease'] += 1
                hit_count = sum(1 for rec_major_set in rec_major_sets if disease in rec_major_set)
                stats['rollout_bins'][self._rate_bin(hit_count / rollout_count)] += 1

        total_samples = results['total_samples']
        results['exact_match_ratio'] = (
            results['exact_match_count'] / total_samples if total_samples else 0.0
        )

        print(f"用于Macro F1计算的标签总数: {len(all_gt_labels)}")

        # NOTE(review): labels of all diseases are pooled into one binary
        # vector, so average='macro' averages over the label values (0/1),
        # not over the 11 diseases - confirm this matches the reference
        # evaluation this analysis is meant to agree with.
        if len(all_gt_labels) > 0:
            results['macro_f1'] = f1_score(all_gt_labels, all_pred_labels, average='macro', zero_division=0)
            results['macro_precision'] = precision_score(all_gt_labels, all_pred_labels, average='macro', zero_division=0)
            results['macro_recall'] = recall_score(all_gt_labels, all_pred_labels, average='macro', zero_division=0)

        # Histogram of per-sample exact-match rates.
        for exact_match_rate in results['individual_accuracies']:
            results['accuracy_bins'][self._rate_bin(exact_match_rate)] += 1

        rollout_counts = results['rollout_counts']
        results['avg_rollout_count'] = np.mean(rollout_counts) if rollout_counts else 0.0
        results['total_rollouts'] = sum(rollout_counts)

        return results

    def print_results(self, results):
        """Print the statistics returned by ``calculate_rollout_accuracy``.

        Args:
            results: The result dict; missing disease or bucket entries are
                shown as zero / ``-`` and the dict is not modified.
        """
        print("=" * 60)
        print("ROLLOUT 正确率分析结果")
        print("=" * 60)

        print(f"\n总样本数: {results['total_samples']}")
        if 'total_rollouts' in results:
            print(f"总rollout次数: {results['total_rollouts']}")
            print(f"平均每样本rollout次数: {results['avg_rollout_count']:.1f}")

        print(f"Exact Match: {results['exact_match_ratio']:.4f} ({results['exact_match_count']}/{results['total_samples']})")

        if 'macro_f1' in results:
            print(f"Macro F1: {results['macro_f1']:.4f}")
            print(f"Macro Precision: {results['macro_precision']:.4f}")
            print(f"Macro Recall: {results['macro_recall']:.4f}")

        print("\n11种疾病的rollout正确率分布:")
        print("-" * 80)
        print(f"{'疾病':<6} {'样本数':<8} {'0%':<6} {'0-20%':<6} {'20-40%':<7} {'40-60%':<7} {'60-80%':<7} {'80-100%':<8}")
        print("-" * 80)

        for disease in sorted(self.ALLOWED_DISEASES):
            # .get() keeps this read-only and also works on plain dicts.
            stats = results['disease_stats'].get(disease) or {}
            total = stats.get('total_with_disease', 0)
            bins = stats.get('rollout_bins', {})

            if total > 0:
                c = [bins.get(label, 0) for label in self._BIN_LABELS]
                print(f"{disease:<6} {total:<8} {c[0]:<6} {c[1]:<6} {c[2]:<7} {c[3]:<7} {c[4]:<7} {c[5]:<8}")
            else:
                print(f"{disease:<6} {0:<8} {'-':<6} {'-':<6} {'-':<7} {'-':<7} {'-':<7} {'-':<8}")

        print("\n样本rollout正确率区间分布:")
        print("-" * 30)
        accuracy_bins = results['accuracy_bins']
        total_samples = sum(accuracy_bins.values())
        for bin_name in self._BIN_LABELS:
            count = accuracy_bins.get(bin_name, 0)
            percentage = count / total_samples * 100 if total_samples > 0 else 0
            print(f"{bin_name}: {count} 样本 ({percentage:.1f}%)")

        
# Example usage
analyzer = RolloutAnalyzer()

# Run configuration: which method's outputs to read and which model produced them
method_name = "MiroDiag-16K"
llm_name = "moonshotai/kimi-k2-0905"
llm_base_name = llm_name.rsplit("/", 1)[-1]

# Collect every completed result file across all rollout-* directories
json_path = f"/Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/{method_name}/"
json_files = glob.glob(os.path.join(json_path, f'rollout-*/results/*complete_{llm_base_name}*.json'))

print(f"找到 {len(json_files)} 个完整的JSON文件")
for f in json_files:
    print(f"  - {os.path.basename(f)}")

if not json_files:
    print("未找到完整的JSON文件")
else:
    # Load the raw records from every file
    print("\n开始加载数据...")
    data = analyzer.load_json_files(json_files)
    print(f"原始记录数: {len(data)} 条")

    # Score the records, aggregating the rollouts of each sample
    print("\n计算rollout平均正确率...")
    results = analyzer.calculate_rollout_accuracy(data)

    # Report
    analyzer.print_results(results)


找到 5 个完整的JSON文件
  - 20251007_moonshotai_kimi-k2-0905_recommendation_20251007_132733_132742_complete_kimi-k2-0905_recommendation_20251007_135835_logs.json
  - 20251007_moonshotai_kimi-k2-0905_recommendation_20251007_112753_112802_complete_kimi-k2-0905_recommendation_20251007_120025_logs.json
  - 20251007_moonshotai_kimi-k2-0905_recommendation_20251007_120028_120038_complete_kimi-k2-0905_recommendation_20251007_130352_logs.json
  - 20251006_moonshotai_kimi-k2-0905_recommendation_20251006_231527_231539_complete_kimi-k2-0905_recommendation_20251007_031219_logs.json
  - 20251006_moonshotai_kimi-k2-0905_recommendation_20251006_183244_183330_complete_kimi-k2-0905_recommendation_20251006_221511_logs.json

开始加载数据...
Loading /Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/MiroDiag-16K/rollout-3/results/20251007_moonshotai_kimi-k2-0905_recommendation_20251007_132733_132742_complete_kimi-k2-0905_recommendation_20251007_135835_logs.json
Loading /Users/shihaoxu/Desktop/work/docu

In [28]:
# 分析不同rollout的对比
def analyze_rollouts_comparison():
    """Score each rollout directory separately and print a comparison table.

    Looks at ``rollout-1`` through ``rollout-9`` and reads the module-level
    ``method_name``, ``llm_base_name`` and ``analyzer``. Directories with no
    matching files are skipped. Returns nothing.
    """
    print("\n" + "=" * 60)
    print("不同ROLLOUT对比分析")
    print("=" * 60)

    distill_root = "/Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill"
    rollout_results = {}

    for rollout_num in range(1, 10):
        rollout_key = f"rollout-{rollout_num}"
        rollout_path = f"{distill_root}/{method_name}/{rollout_key}/results/"
        rollout_files = glob.glob(os.path.join(rollout_path, f'*complete_{llm_base_name}*.json'))

        if not rollout_files:
            continue

        print(f"\nRollout-{rollout_num}: 找到 {len(rollout_files)} 个文件")
        rollout_data = analyzer.load_json_files(rollout_files)
        summary = analyzer.calculate_rollout_accuracy(rollout_data)
        rollout_results[rollout_key] = summary

        print(f"  样本数: {summary['total_samples']}")
        print(f"  Exact Match: {summary['exact_match_ratio']:.4f}")
        if 'macro_f1' in summary:
            print(f"  Macro F1: {summary['macro_f1']:.4f}")

    # Nothing was found in any rollout directory: no table to print.
    if not rollout_results:
        return

    # Side-by-side summary of every rollout that had data
    print(f"\n{'Rollout':<12} {'样本数':<8} {'Exact Match':<12} {'Macro F1':<10}")
    print("-" * 50)
    for rollout_name, results in rollout_results.items():
        macro_f1 = results.get('macro_f1', 0)
        print(f"{rollout_name:<12} {results['total_samples']:<8} {results['exact_match_ratio']:<12.4f} {macro_f1:<10.4f}")

# Run the per-rollout comparison analysis
analyze_rollouts_comparison()



不同ROLLOUT对比分析

Rollout-1: 找到 1 个文件
Loading /Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/MiroDiag-16K/rollout-1/results/20251006_moonshotai_kimi-k2-0905_recommendation_20251006_183244_183330_complete_kimi-k2-0905_recommendation_20251006_221511_logs.json
聚合后的唯一样本数: 14426
用于Macro F1计算的标签总数: 158686
  样本数: 14426
  Exact Match: 0.4420
  Macro F1: 0.7676

Rollout-2: 找到 1 个文件
Loading /Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/MiroDiag-16K/rollout-2/results/20251006_moonshotai_kimi-k2-0905_recommendation_20251006_231527_231539_complete_kimi-k2-0905_recommendation_20251007_031219_logs.json
聚合后的唯一样本数: 14426
用于Macro F1计算的标签总数: 158686
  样本数: 14426
  Exact Match: 0.4411
  Macro F1: 0.7682

Rollout-3: 找到 1 个文件
Loading /Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/MiroDiag-16K/rollout-3/results/20251007_moonshotai_kimi-k2-0905_recommendation_20251007_132733_132742_complete_kimi-k2-0905_recommendation_20251007_135835_logs.json
聚合

In [29]:
# 简化分析函数
def quick_rollout_analysis(method_name="MiroDiag-16K", llm_name="moonshotai/kimi-k2-0905"):
    """Load all rollouts of one method/model pair and print a compact summary.

    Args:
        method_name: Name of the method directory to read results from.
        llm_name: Model identifier; only the part after the last ``/`` is
            used to match result file names.

    Returns:
        The dict from ``RolloutAnalyzer.calculate_rollout_accuracy``, or
        ``None`` when no matching result file exists.
    """
    analyzer = RolloutAnalyzer()
    llm_base_name = llm_name.rsplit("/", 1)[-1]

    json_path = f"/Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/{method_name}/"
    json_files = glob.glob(os.path.join(json_path, f'rollout-*/results/*complete_{llm_base_name}*.json'))

    if len(json_files) == 0:
        print("未找到完整的JSON文件")
        return None

    results = analyzer.calculate_rollout_accuracy(analyzer.load_json_files(json_files))

    # Headline numbers
    print(f"\n📊 {method_name} - {llm_name} Rollout分析结果")
    print("=" * 60)
    print(f"样本数: {results['total_samples']}")
    print(f"平均rollout次数: {results['avg_rollout_count']:.1f}")
    print(f"Exact Match: {results['exact_match_ratio']:.4f}")
    print(f"Macro F1: {results.get('macro_f1', 0):.4f}")

    # Histogram of per-sample exact-match rates
    print("\n正确率分布:")
    accuracy_bins = results['accuracy_bins']
    total = sum(accuracy_bins.values())
    for bin_name in ('0%', '0-20%', '20-40%', '40-60%', '60-80%', '80-100%'):
        count = accuracy_bins[bin_name]
        if total > 0:
            pct = count / total * 100
        else:
            pct = 0
        print(f"  {bin_name}: {count:>4} ({pct:>5.1f}%)")

    return results

# Run the quick analysis with the default method and model
quick_results = quick_rollout_analysis()


Loading /Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/MiroDiag-16K/rollout-3/results/20251007_moonshotai_kimi-k2-0905_recommendation_20251007_132733_132742_complete_kimi-k2-0905_recommendation_20251007_135835_logs.json
Loading /Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/MiroDiag-16K/rollout-4/results/20251007_moonshotai_kimi-k2-0905_recommendation_20251007_112753_112802_complete_kimi-k2-0905_recommendation_20251007_120025_logs.json
Loading /Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/MiroDiag-16K/rollout-5/results/20251007_moonshotai_kimi-k2-0905_recommendation_20251007_120028_120038_complete_kimi-k2-0905_recommendation_20251007_130352_logs.json
Loading /Users/shihaoxu/Desktop/work/document/Dlab/code/baseline_llm/distill/MiroDiag-16K/rollout-2/results/20251006_moonshotai_kimi-k2-0905_recommendation_20251006_231527_231539_complete_kimi-k2-0905_recommendation_20251007_031219_logs.json
Loading /Users/shihaoxu/Desktop/