# 1. 分析数据集

In [4]:
import numpy as np
import pandas as pd
import torch
from collections import defaultdict
import pickle
import json

class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder with fallbacks for NumPy, pandas and torch objects.

    ``default`` is only called by :mod:`json` for objects it cannot encode
    natively, so plain ``dict``/``list``/``str``/``int``/``float`` values never
    reach it.
    """

    def default(self, obj):
        """Return a JSON-encodable replacement for ``obj``.

        NumPy scalars become the matching Python scalar, arrays and tensors
        become nested lists, ``pd.Timestamp`` becomes an ISO-8601 string,
        objects exposing ``__dict__`` are encoded through that mapping, and
        anything else is stringified, so this method never raises
        ``TypeError`` itself.
        """
        # np.bool_ is neither np.integer nor np.floating; without this branch
        # it would fall through to str() and be written as the string "True".
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, (np.ndarray, torch.Tensor)):
            return obj.tolist()
        if isinstance(obj, pd.Timestamp):
            return obj.isoformat()
        if hasattr(obj, '__dict__'):
            # The encoder re-enters default() for any value inside the mapping
            # that is itself not serializable.
            return obj.__dict__
        # Last resort (callables without __dict__, sets, ...): keep a readable
        # representation rather than failing the whole dump.
        return str(obj)

def _json_safe_key(key):
    """Return ``key`` in a form ``json`` accepts as an object key."""
    if isinstance(key, (np.bool_, np.integer, np.floating)):
        return key.item()
    if key is None or isinstance(key, (str, int, float, bool)):
        return key
    # json only accepts str/int/float/bool/None keys; stringify the rest.
    return str(key)


def safe_convert_to_json_serializable(obj):
    """Recursively convert ``obj`` into a structure ``json`` can serialize.

    Dicts (keys and values), lists and tuples are converted element by
    element; tuples come back as lists, which is how JSON represents them.
    NumPy scalars become Python scalars, arrays and tensors become nested
    lists, ``pd.Timestamp`` becomes an ISO-8601 string, and objects exposing
    ``__dict__`` are converted through that mapping. Anything that still
    cannot be serialized is replaced by ``str(obj)``.
    """
    if isinstance(obj, dict):
        return {
            _json_safe_key(key): safe_convert_to_json_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        # Tuples are walked as well: a tuple holding NumPy scalars would
        # otherwise fail the json.dumps probe below and be stringified whole.
        return [safe_convert_to_json_serializable(item) for item in obj]
    if isinstance(obj, (np.bool_, np.integer, np.floating)):
        return obj.item()
    if isinstance(obj, (np.ndarray, torch.Tensor)):
        return obj.tolist()
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat()
    if hasattr(obj, '__dict__'):
        try:
            return safe_convert_to_json_serializable(obj.__dict__)
        except Exception:
            # Best effort (e.g. RecursionError on self-referencing objects);
            # unlike a bare except this lets KeyboardInterrupt through.
            return str(obj)
    if callable(obj):
        return str(obj)
    try:
        json.dumps(obj)  # probe: is the value serializable as-is?
    except (TypeError, ValueError, RecursionError):
        return str(obj)
    return obj

def _summarize_column(series):
    """Print and return summary statistics for one dataset column.

    Object columns whose first value is a list get a length/preview summary,
    other object columns get a unique count plus their first three values,
    and every other dtype is summarized as numeric (min, max, mean, unique
    count). Failures are recorded under the ``"error"`` key instead of being
    raised, so one odd column does not abort the whole analysis.
    """
    col_info = {}
    try:
        if series.dtype == 'object':
            # Only the first value decides whether the column is list-typed.
            sample_val = series.iloc[0]
            if isinstance(sample_val, list):
                preview = sample_val[:3]
                col_info["type"] = "list"
                col_info["sample_length"] = len(sample_val)
                col_info["sample_content"] = safe_convert_to_json_serializable(preview)
                print("  类型: 列表")
                print(f"  示例长度: {len(sample_val)}")
                print(f"  示例内容: {preview}")
            else:
                col_info["type"] = "object"
                unique_count = series.nunique()
                head_values = series.iloc[:3].tolist()
                col_info["unique_count"] = int(unique_count)
                col_info["sample_values"] = safe_convert_to_json_serializable(head_values)
                print(f"  唯一值数量: {unique_count}")
                print(f"  示例值: {head_values}")
        else:
            col_info["type"] = "numeric"
            minimum = series.min()
            col_info["min"] = float(minimum)
            maximum = series.max()
            col_info["max"] = float(maximum)
            mean = series.mean()
            col_info["mean"] = float(mean)
            unique_count = series.nunique()
            col_info["unique_count"] = int(unique_count)
            print(f"  最小值: {minimum}")
            print(f"  最大值: {maximum}")
            print(f"  平均值: {mean:.4f}")
            print(f"  唯一值数量: {unique_count}")
    except Exception as e:
        # The handler must not fail itself: an empty column has no first row.
        first_value = series.iloc[0] if len(series) else None
        col_info["error"] = str(e)
        col_info["sample_value"] = str(first_value)
        print(f"  分析出错: {e}")
        print(f"  示例值: {first_value}")
    return col_info


def _describe_sample(sample, label):
    """Print and return type/shape information about one sample value.

    ``label`` is the prefix used in the printed lines ("Embedding", "Input").
    """
    has_shape = hasattr(sample, 'shape')
    info = {
        "type": str(type(sample)),
        "is_tensor": has_shape,
        "sample_value": str(sample)
    }
    print(f"{label} 类型: {type(sample)}")
    if has_shape:
        info["shape"] = list(sample.shape)
        print(f"{label} 维度: {sample.shape}")
    elif isinstance(sample, list):
        info["length"] = len(sample)
        print(f"{label} 长度: {len(sample)}")
    print(f"示例 {label.lower()}: {sample}")
    return info


def analyze_dataset_structure(data_path):
    """Analyze the structure and content of ``<data_path>/dataset.pkl``.

    Prints a human-readable report and collects the same information in a
    dictionary with the keys ``dataset_info``, ``column_analysis``,
    ``distribution_analysis``, ``student_problem_analysis``,
    ``timestep_analysis``, ``embedding_analysis``, ``code_analysis`` and
    ``input_analysis``.

    Args:
        data_path: Directory containing ``dataset.pkl``. The frame must have
            ``SubjectID`` and ``ProblemID`` columns; score, embedding, code
            and input columns are analyzed only when present.

    Returns:
        A ``(dataset, analysis_result)`` tuple: the loaded DataFrame and the
        analysis dictionary.
    """
    print("=" * 60)
    print("数据集结构分析")
    print("=" * 60)

    # 1. Load the main dataset.
    # NOTE: pickle files can execute arbitrary code when loaded; only point
    # this at trusted data.
    print("\n1. 主数据集 (dataset.pkl) 分析:")
    print("-" * 40)
    dataset = pd.read_pickle(data_path + '/dataset.pkl')
    total_records = len(dataset)

    print(f"数据集形状: {dataset.shape}")
    print(f"列名: {list(dataset.columns)}")
    print(f"数据类型:\n{dataset.dtypes}")

    # Container for everything collected below.
    analysis_result = {
        "dataset_info": {
            "shape": list(dataset.shape),
            "columns": list(dataset.columns),
            "dtypes": {str(k): str(v) for k, v in dataset.dtypes.items()}
        },
        "column_analysis": {},
        "distribution_analysis": {},
        "student_problem_analysis": {},
        "timestep_analysis": {},
        "embedding_analysis": {},
        "code_analysis": {},
        "input_analysis": {}
    }

    # 2. Per-column basic statistics.
    print("\n各列基本统计信息:")
    print("-" * 40)
    for col in dataset.columns:
        print(f"\n{col}:")
        analysis_result["column_analysis"][col] = _summarize_column(dataset[col])

    # 3. Student / problem distribution.
    print("\n学生和问题分布:")
    print("-" * 40)
    student_count = int(dataset['SubjectID'].nunique())
    problem_count = int(dataset['ProblemID'].nunique())
    # Guard against a dataset without any student to avoid ZeroDivisionError.
    avg_records_per_student = total_records / student_count if student_count else 0.0

    analysis_result["student_problem_analysis"] = {
        "student_count": student_count,
        "problem_count": problem_count,
        "total_records": total_records,
        "avg_records_per_student": float(avg_records_per_student)
    }

    print(f"学生数量: {student_count}")
    print(f"问题数量: {problem_count}")
    print(f"总记录数: {total_records}")
    print(f"平均每个学生的记录数: {avg_records_per_student:.2f}")

    # 4. Score distributions (value -> count, ordered by value).
    print("\n分数分布分析:")
    print("-" * 40)
    score_analysis = {}
    for score_col in ('Score_x', 'Score_y'):
        if score_col not in dataset.columns:
            continue
        counts = dataset[score_col].value_counts().sort_index()
        score_analysis[score_col] = {str(k): int(v) for k, v in counts.to_dict().items()}
        print(f"{score_col} 分布:")
        print(counts)

    analysis_result["distribution_analysis"] = score_analysis

    # 5. Number of records per student.
    print("\n时间步长分析:")
    print("-" * 40)
    student_lengths = dataset.groupby('SubjectID').size()
    if student_lengths.empty:
        # min()/max() of an empty Series are NaN, which int() cannot convert.
        timestep_info = {
            "min_length": 0,
            "max_length": 0,
            "mean_length": 0.0,
            "median_length": 0.0
        }
    else:
        timestep_info = {
            "min_length": int(student_lengths.min()),
            "max_length": int(student_lengths.max()),
            "mean_length": float(student_lengths.mean()),
            "median_length": float(student_lengths.median())
        }
    analysis_result["timestep_analysis"] = timestep_info

    print("学生记录长度统计:")
    print(f"  最短: {timestep_info['min_length']}")
    print(f"  最长: {timestep_info['max_length']}")
    print(f"  平均: {timestep_info['mean_length']:.2f}")
    print(f"  中位数: {timestep_info['median_length']:.2f}")

    # 6. Prompt embedding (first row only).
    print("\nPrompt Embedding 分析:")
    print("-" * 40)
    if 'prompt-embedding' in dataset.columns and total_records:
        analysis_result["embedding_analysis"]["prompt_embedding"] = _describe_sample(
            dataset['prompt-embedding'].iloc[0], "Embedding")

    # 7. Code-related columns: every column whose name contains "code",
    # case-insensitively.
    print("\n代码相关字段分析:")
    print("-" * 40)
    code_columns = [col for col in dataset.columns if 'code' in col.lower()]
    code_analysis = {}
    for col in code_columns:
        print(f"\n{col}:")
        previews = []
        for val in dataset[col].iloc[:3].tolist():
            text = str(val)
            # Append the ellipsis only when the value was actually truncated.
            previews.append(text[:100] + "..." if len(text) > 100 else text)
        code_analysis[col] = {"sample_values": previews}
        for i, preview in enumerate(previews):
            print(f"  示例 {i+1}: {preview}")

    analysis_result["code_analysis"] = code_analysis

    # 8. Model input column (first row only).
    if 'input' in dataset.columns:
        print("\nInput 字段分析:")
        print("-" * 40)
        if total_records:
            analysis_result["input_analysis"] = _describe_sample(
                dataset['input'].iloc[0], "Input")

    return dataset, analysis_result

def analyze_knowledge_components(data_path):
    """Analyze the knowledge-component sheet ``<data_path>/prompt_concept.xlsx``.

    Every column other than ``AssignmentID``, ``ProblemID`` and
    ``Requirement`` is treated as a knowledge component, and its coverage is
    the sum of the column over all rows.

    Args:
        data_path: Directory containing ``prompt_concept.xlsx``.

    Returns:
        A ``(kc_data, kc_analysis)`` tuple. On any failure ``kc_data`` is
        ``None`` and ``kc_analysis`` carries the message under ``"error"``
        (alongside whatever was collected before the failure).
    """
    print("\n" + "=" * 60)
    print("知识组件 (prompt_concept.xlsx) 分析")
    print("=" * 60)

    kc_analysis = {}

    try:
        kc_data = pd.read_excel(data_path + '/prompt_concept.xlsx')

        print(f"知识组件数据形状: {kc_data.shape}")
        print(f"列名: {list(kc_data.columns)}")

        kc_analysis["basic_info"] = {
            "shape": list(kc_data.shape),
            "columns": list(kc_data.columns)
        }

        # Everything that is not an identifier/description column is a
        # knowledge component.
        exclude_cols = ['AssignmentID', 'ProblemID', 'Requirement']
        kc_cols = [col for col in kc_data.columns if col not in exclude_cols]

        print(f"\n知识组件列数: {len(kc_cols)}")
        print(f"知识组件列名: {kc_cols}")

        kc_analysis["knowledge_components"] = {
            "count": len(kc_cols),
            "column_names": kc_cols
        }

        # Coverage of each knowledge component across all rows.
        print("\n各知识组件覆盖情况:")
        print("-" * 40)
        total = len(kc_data)
        coverage_info = {}
        for col in kc_cols:
            # Coerce stray text to NaN (ignored by sum) so a single
            # non-numeric cell cannot abort the whole analysis.
            coverage = int(pd.to_numeric(kc_data[col], errors='coerce').sum())
            # An empty sheet has nothing to cover; avoid dividing by zero.
            percentage = float(coverage / total * 100) if total else 0.0
            coverage_info[col] = {
                "coverage": coverage,
                "total": total,
                "percentage": percentage
            }
            print(f"{col}: {coverage}/{total} ({percentage:.1f}%)")

        kc_analysis["coverage"] = coverage_info

        # Relationship between problems and assignments.
        print("\n问题与知识组件关系:")
        print("-" * 40)
        problem_count = int(kc_data['ProblemID'].nunique())
        assignment_count = int(kc_data['AssignmentID'].nunique())

        kc_analysis["relationships"] = {
            "problem_count": problem_count,
            "assignment_count": assignment_count
        }

        print(f"问题数量: {problem_count}")
        print(f"作业数量: {assignment_count}")

        # Show a few example rows.
        print("\n示例数据:")
        print("-" * 40)
        sample_rows = kc_data.head()
        print(sample_rows)

        kc_analysis["sample_data"] = safe_convert_to_json_serializable(sample_rows.to_dict('records'))

        return kc_data, kc_analysis

    except Exception as e:
        # Deliberately best-effort: this sheet is optional for the callers,
        # so report the problem instead of aborting the whole run.
        kc_analysis["error"] = str(e)
        print(f"无法读取知识组件数据: {e}")
        return None, kc_analysis

def analyze_data_processing_simulation(configs):
    """Simulate the preprocessing pipeline and report how the data changes.

    The steps are: derive the ``Score`` label, cut each run of consecutive
    rows sharing a ``SubjectID`` into chunks of at most ``max_len`` records
    (each chunk becomes its own pseudo-student ``<SubjectID>_<n>``), and, for
    ``data_for == 'okt'``, drop every first timestep and optionally split by
    student into train/valid/test.

    Args:
        configs: Object exposing ``data_path``, ``label_type`` (``'binary'``,
            ``'ternery'`` or ``'raw'``), ``max_len``, ``data_for``,
            ``split_method``, ``test_size`` and ``seed``. Pass ``None`` to use
            the built-in defaults.

    Returns:
        A dictionary describing the configuration and the outcome of each
        step.
    """
    print("\n" + "=" * 60)
    print("数据处理过程模拟分析")
    print("=" * 60)

    # Default configuration, used when the caller does not supply one.
    class MockConfigs:
        def __init__(self):
            self.data_path = '/data2/liyu/KT/OKT/data'
            self.label_type = 'binary'  # alternatives: 'ternery' or 'raw'
            self.max_len = 50
            self.testing = False
            self.first_ast_convertible = False
            self.use_kc = True
            self.data_for = 'okt'  # alternative: 'lstm'
            self.split_method = 'student'  # alternative: 'entry'
            self.test_size = 0.2
            self.seed = 42

    # Honor the caller's configuration instead of silently replacing it.
    if configs is None:
        configs = MockConfigs()

    processing_analysis = {
        "config": {
            "label_type": configs.label_type,
            "max_len": configs.max_len,
            "data_for": configs.data_for,
            "split_method": configs.split_method,
            "test_size": configs.test_size,
            "seed": configs.seed
        }
    }

    # Load the data.
    dataset = pd.read_pickle(configs.data_path + '/dataset.pkl')
    print(f"原始数据形状: {dataset.shape}")
    processing_analysis["original_shape"] = list(dataset.shape)

    # Label processing.
    print(f"\n标签处理 (label_type={configs.label_type}):")
    print("-" * 40)
    score_counts = None
    headline = None
    if configs.label_type == 'binary':
        # Scores of 2 and above count as correct (1), everything else as 0.
        scores_y = [1 if item >= 2 else 0 for item in dataset['Score_y']]
        dataset['Score'] = scores_y
        score_counts = pd.Series(scores_y).value_counts().sort_index()
        headline = "二值化后 Score 分布"
    elif configs.label_type == 'ternery':
        dataset['Score'] = dataset['Score_y']
        score_counts = dataset['Score'].value_counts().sort_index()
        headline = "三元标签 Score 分布"
    elif configs.label_type == 'raw':
        dataset['Score'] = dataset['Score_x']
        score_counts = dataset['Score'].value_counts().sort_index()
        headline = "原始标签 Score 分布"

    # Any other label type is left unprocessed and is not reported.
    if score_counts is not None:
        processing_analysis["label_processing"] = {
            "type": configs.label_type,
            "score_distribution": {str(k): int(v) for k, v in score_counts.to_dict().items()}
        }
        print(f"{headline}: {score_counts}")

    # Split long student histories into chunks of at most max_len records.
    print(f"\n学生记录分割 (max_len={configs.max_len}):")
    print("-" * 40)
    # A unique sentinel cannot collide with a real SubjectID, so the first
    # row always initializes the counters.
    prev_subject_id = object()
    accumulated = 0
    id_appendix = 1
    subjectid_appedix = []
    timesteps = []

    # Iterate over the column itself; building a row object with
    # dataset.iloc[i] for every record is needlessly slow.
    for subject_id in dataset['SubjectID']:
        if prev_subject_id != subject_id:
            prev_subject_id = subject_id
            accumulated = 0
            id_appendix = 1
        else:
            accumulated += 1
            if accumulated >= configs.max_len:
                id_appendix += 1
                accumulated = 0
        timesteps.append(accumulated)
        subjectid_appedix.append(id_appendix)

    dataset['timestep'] = timesteps
    dataset['SubjectID_appendix'] = subjectid_appedix
    dataset['SubjectID'] = [
        '{}_{}'.format(subject_id, appendix)
        for subject_id, appendix in zip(dataset['SubjectID'], subjectid_appedix)
    ]

    student_count_after_split = int(dataset['SubjectID'].nunique())
    records_after_split = len(dataset)

    processing_analysis["student_split"] = {
        "max_len": configs.max_len,
        "student_count_after_split": student_count_after_split,
        "records_after_split": records_after_split
    }

    print(f"分割后学生数量: {student_count_after_split}")
    print(f"分割后记录数: {records_after_split}")

    # Distribution of timestep indices (first ten values only).
    timestep_dist = pd.Series(timesteps).value_counts().sort_index()
    processing_analysis["timestep_distribution"] = {str(k): int(v) for k, v in timestep_dist.head(10).items()}
    print(f"时间步分布: {timestep_dist.head(10)}")

    # Simulate the dataset split. Only data_for == 'okt' with
    # split_method == 'student' is simulated here.
    if configs.data_for == 'okt':
        # Remove the first timestep of every chunk. A boolean mask is used
        # rather than drop-by-label so that duplicate index labels cannot
        # remove unrelated rows.
        dropped_dataset = dataset[dataset['timestep'] != 0].reset_index(drop=True)
        print(f"\n移除第一个时间步后数据形状: {dropped_dataset.shape}")

        processing_analysis["after_drop_first_timestep"] = {
            "shape": list(dropped_dataset.shape)
        }

        if configs.split_method == "student":
            # Imported here so scikit-learn is only needed when a split is
            # actually performed.
            from sklearn.model_selection import train_test_split

            students = dataset['SubjectID'].unique()
            train_students, test_students = train_test_split(students, test_size=configs.test_size, random_state=configs.seed)
            # The held-out students are halved into validation and test.
            valid_students, test_students = train_test_split(test_students, test_size=0.5, random_state=configs.seed)
            trainset = dropped_dataset[dropped_dataset['SubjectID'].isin(train_students)]
            validset = dropped_dataset[dropped_dataset['SubjectID'].isin(valid_students)]
            testset = dropped_dataset[dropped_dataset['SubjectID'].isin(test_students)]

            processing_analysis["data_split"] = {
                "method": "student",
                "train_shape": list(trainset.shape),
                "valid_shape": list(validset.shape),
                "test_shape": list(testset.shape)
            }

            print(f"训练集: {trainset.shape}")
            print(f"验证集: {validset.shape}")
            print(f"测试集: {testset.shape}")

    return processing_analysis

def main():
    """Run every analysis step and write the combined report to JSON."""
    data_path = '/data2/liyu/KT/OKT/data'

    # Capture the timestamp before the (potentially slow) analyses start.
    started_at = pd.Timestamp.now().isoformat()

    # 1. Main dataset, 2. knowledge components, 3. preprocessing simulation.
    # Only the analysis dictionaries are needed here, not the loaded frames.
    _, structure_report = analyze_dataset_structure(data_path)
    _, kc_report = analyze_knowledge_components(data_path)
    processing_report = analyze_data_processing_simulation(None)

    # Normalize everything to JSON-serializable values before dumping.
    report = safe_convert_to_json_serializable({
        "analysis_timestamp": started_at,
        "data_path": data_path,
        "dataset_analysis": structure_report,
        "knowledge_component_analysis": kc_report,
        "data_processing_analysis": processing_report,
    })

    # Persist the report next to the data.
    output_file = '/data2/liyu/KT/OKT/data/data_info.json'
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(report, handle, ensure_ascii=False, indent=2, cls=CustomJSONEncoder)

    separator = "=" * 60
    print(f"\n分析结果已保存到: {output_file}")
    print("\n" + separator)
    print("分析完成")
    print(separator)


if __name__ == "__main__":
    main()

数据集结构分析

1. 主数据集 (dataset.pkl) 分析:
----------------------------------------
数据集形状: (39796, 15)
列名: ['SubjectID', 'AssignmentID', 'ProblemID', 'CodeStateID', 'Score_x', 'Code', 'Code-ast', 'code-astnn', 'code-embedding', 'Score_y', 'embedding', 'astnn', 'prompt', 'prompt-embedding', 'input']
数据类型:
SubjectID            object
AssignmentID        float64
ProblemID             int64
CodeStateID          object
Score_x             float64
Code                 object
Code-ast             object
code-astnn           object
code-embedding       object
Score_y               int64
embedding            object
astnn                object
prompt               object
prompt-embedding     object
input                object
dtype: object

各列基本统计信息:
----------------------------------------

SubjectID:
  唯一值数量: 246
  示例值: ['04c32d4d95425f73b3a1d6502aed4d48', '04c32d4d95425f73b3a1d6502aed4d48', '04c32d4d95425f73b3a1d6502aed4d48']

AssignmentID:
  最小值: 439.0
  最大值: 502.0
  平均值: 483.5757
  唯一值数量: 5

Proble

# 2. 分析测试结果

In [2]:
import pickle
# Load the pickled evaluation logs of one training run.
with open('checkpoints/20251004_115017/eval_logs.pkl', 'rb') as log_file:
    results = pickle.load(log_file)

# Print only compact entries: plain numbers and lists shorter than five items.
print('=== 评估结果摘要 ===')
for key, value in results.items():
    is_number = isinstance(value, (int, float))
    is_short_list = isinstance(value, list) and len(value) < 5
    if is_number or is_short_list:
        print(f'{key}: {value}')


=== 评估结果摘要 ===
codebleu: 0.6186319446387262
dist_1: 0.416297102975318
dist_2: 0.7223517921205589
dist_3: 0.8246806665340114


# 3. 数据集格式转换

In [None]:
import pandas as pd
import numpy as np
import json
import pickle
import os
from datetime import datetime

def convert_pkl_to_readable_formats(data_path):
    """Export ``<data_path>/dataset.pkl`` to a readable CSV file.

    Columns holding complex objects (ASTs, embeddings, model inputs) are
    stringified first; ``None`` becomes an empty string.

    Args:
        data_path: Directory containing ``dataset.pkl``; the CSV is written
            to the same directory as ``dataset.csv``.

    Returns:
        A dict mapping each produced format name to its file path
        (currently only ``{"csv": ...}``).
    """
    print("开始转换数据格式...")

    # 1. Load the original data.
    dataset = pd.read_pickle(data_path + '/dataset.pkl')
    print(f"原始数据形状: {dataset.shape}")

    # 2. Convert to CSV, stringifying the complex-object columns.
    print("转换为CSV格式...")
    csv_dataset = dataset.copy()

    complex_columns = ['Code-ast', 'code-astnn', 'code-embedding', 'embedding', 'astnn', 'prompt-embedding', 'input']
    for col in complex_columns:
        if col in csv_dataset.columns:
            csv_dataset[col] = csv_dataset[col].apply(lambda x: str(x) if x is not None else '')

    csv_file = data_path + '/dataset.csv'
    csv_dataset.to_csv(csv_file, index=False, encoding='utf-8')
    print(f"CSV文件已保存: {csv_file}")

    # Callers iterate over the generated files, so report them explicitly.
    return {"csv": csv_file}

   
def _doc_lookup(tree, *keys):
    """Follow ``keys`` through nested dicts/lists; return ``None`` if any step is missing."""
    node = tree
    for key in keys:
        try:
            node = node[key]
        except (KeyError, IndexError, TypeError):
            return None
    return node


def _doc_format(value, spec=''):
    """Format ``value`` with ``spec``; ``'N/A'`` when missing, ``str(value)`` if the spec does not fit."""
    if value is None:
        return 'N/A'
    try:
        return format(value, spec)
    except (TypeError, ValueError):
        return str(value)


def create_dataset_documentation(data_path):
    """Generate the Markdown documentation ``<data_path>/datasets.md``.

    All figures come from ``<data_path>/data_info.json``. Statistics that are
    absent from that file (for example ``unique_count`` for list-typed
    columns, or the knowledge-component section after a failed read) are
    rendered as ``N/A`` instead of raising ``KeyError``. The dataset pickle
    itself is not needed and is therefore not loaded.

    Args:
        data_path: Directory containing ``data_info.json``; the document is
            written to the same directory.

    Returns:
        Path of the written Markdown file.
    """
    print("创建数据集文档...")

    # Load the previously saved analysis results.
    with open(data_path + '/data_info.json', 'r', encoding='utf-8') as f:
        analysis_data = json.load(f)

    def ds(*keys, spec=''):
        """Formatted value from the ``dataset_analysis`` section."""
        return _doc_format(_doc_lookup(analysis_data, 'dataset_analysis', *keys), spec)

    def col(column, field, spec=''):
        """Formatted per-column statistic."""
        return ds('column_analysis', column, field, spec=spec)

    def kc(*keys, spec=''):
        """Formatted value from the ``knowledge_component_analysis`` section."""
        return _doc_format(_doc_lookup(analysis_data, 'knowledge_component_analysis', *keys), spec)

    # Build the Markdown document.
    md_content = f"""# OKT数据集分析文档

## 数据集概述

本数据集是一个编程教育知识追踪数据集，包含学生编程练习的详细记录。

### 基本信息
- **总记录数**: {ds('dataset_info', 'shape', 0, spec=',')}
- **特征数**: {ds('dataset_info', 'shape', 1)}
- **学生数量**: {ds('student_problem_analysis', 'student_count', spec=',')}
- **问题数量**: {ds('student_problem_analysis', 'problem_count', spec=',')}
- **平均每学生记录数**: {ds('student_problem_analysis', 'avg_records_per_student', spec='.2f')}

## 数据字段说明

### 学生标识字段
- **SubjectID**: 学生唯一标识符 (字符串)
  - 唯一值数量: {col('SubjectID', 'unique_count', ',')}

### 问题标识字段
- **AssignmentID**: 作业标识符 (数值)
  - 范围: {col('AssignmentID', 'min', '.0f')} - {col('AssignmentID', 'max', '.0f')}
  - 唯一值数量: {col('AssignmentID', 'unique_count')}

- **ProblemID**: 问题标识符 (数值)
  - 范围: {col('ProblemID', 'min', '.0f')} - {col('ProblemID', 'max', '.0f')}
  - 唯一值数量: {col('ProblemID', 'unique_count')}

- **CodeStateID**: 代码状态唯一标识符 (字符串)
  - 唯一值数量: {col('CodeStateID', 'unique_count', ',')}

### 评分字段
- **Score_x**: 原始评分 (0-1之间的浮点数)
  - 范围: {col('Score_x', 'min', '.3f')} - {col('Score_x', 'max', '.3f')}
  - 平均值: {col('Score_x', 'mean', '.3f')}
  - 唯一值数量: {col('Score_x', 'unique_count')}

- **Score_y**: 二值化评分 (整数)
  - 范围: {col('Score_y', 'min', '.0f')} - {col('Score_y', 'max', '.0f')}
  - 平均值: {col('Score_y', 'mean', '.3f')}
  - 唯一值数量: {col('Score_y', 'unique_count')}

### 代码相关字段
- **Code**: 学生提交的代码 (字符串)
  - 唯一值数量: {col('Code', 'unique_count', ',')}

- **Code-ast**: 代码的抽象语法树 (复杂对象)
  - 唯一值数量: {col('Code-ast', 'unique_count', ',')}

- **code-astnn**: AST神经网络表示 (复杂对象)
  - 唯一值数量: {col('code-astnn', 'unique_count', ',')}

- **code-embedding**: 代码嵌入向量 (复杂对象)
  - 唯一值数量: {col('code-embedding', 'unique_count', ',')}

- **embedding**: 通用嵌入向量 (复杂对象)
  - 唯一值数量: {col('embedding', 'unique_count', ',')}

- **astnn**: AST神经网络特征 (复杂对象)
  - 唯一值数量: {col('astnn', 'unique_count', ',')}

- **input**: 模型输入特征 (复杂对象)
  - 唯一值数量: {col('input', 'unique_count', ',')}

### 提示相关字段
- **prompt**: 问题描述/提示文本 (字符串)
  - 唯一值数量: {col('prompt', 'unique_count', ',')}

- **prompt-embedding**: 提示嵌入向量 (复杂对象)
  - 唯一值数量: {col('prompt-embedding', 'unique_count', ',')}

## 数据分布分析

### 评分分布
- **Score_x分布**: {ds('distribution_analysis', 'Score_x')}
- **Score_y分布**: {ds('distribution_analysis', 'Score_y')}

### 学生记录长度分布
- **最短记录**: {ds('timestep_analysis', 'min_length')} 条
- **最长记录**: {ds('timestep_analysis', 'max_length')} 条
- **平均记录**: {ds('timestep_analysis', 'mean_length', spec='.2f')} 条
- **中位数记录**: {ds('timestep_analysis', 'median_length', spec='.2f')} 条

## 知识组件分析

### 知识组件基本信息
- **知识组件数量**: {kc('knowledge_components', 'count')}
- **问题数量**: {kc('relationships', 'problem_count')}
- **作业数量**: {kc('relationships', 'assignment_count')}

### 知识组件覆盖情况
"""

    # Append the knowledge-component coverage table when it is available.
    coverage = _doc_lookup(analysis_data, 'knowledge_component_analysis', 'coverage')
    if isinstance(coverage, dict):
        md_content += "\n| 知识组件 | 覆盖数量 | 总数量 | 覆盖率 |\n"
        md_content += "|----------|----------|--------|--------|\n"
        for name, info in coverage.items():
            covered = _doc_format(_doc_lookup(info, 'coverage'))
            total = _doc_format(_doc_lookup(info, 'total'))
            percentage = _doc_format(_doc_lookup(info, 'percentage'), '.1f')
            md_content += f"| {name} | {covered} | {total} | {percentage}% |\n"

    md_content += f"""

## 数据处理配置

### 标签处理选项
- **binary**: 将Score_y >= 2的值设为1，否则为0
- **ternery**: 直接使用Score_y作为三元标签
- **raw**: 使用Score_x作为原始连续标签

### 数据分割选项
- **student**: 按学生ID分割数据
- **entry**: 按记录条目分割数据

### 时间步长限制
- **max_len**: 每个学生最大记录数，超过则分割为多个学生

## 数据格式转换

本数据集已转换为以下可读格式：

1. **CSV格式** (`dataset.csv`): 包含所有字段，复杂对象转换为字符串
2. **Excel格式** (`dataset_basic.xlsx`): 包含基本字段，便于查看
3. **JSON格式** (`dataset.json`): 包含所有字段，保持原始结构
4. **数据摘要** (`dataset_summary.json`): 包含数据集统计信息

## 使用建议

1. **机器学习任务**: 建议使用`input`字段作为特征，`Score_y`作为标签
2. **代码分析**: 使用`Code`字段进行代码质量分析
3. **知识追踪**: 结合`prompt-embedding`和`code-embedding`进行知识状态建模
4. **序列建模**: 按`SubjectID`分组，按时间顺序排列进行序列分析

## 注意事项

1. 复杂对象字段（如embedding、AST等）在CSV中已转换为字符串格式
2. 建议使用原始pkl文件进行深度学习任务
3. 数据已按学生ID排序，便于序列建模
4. 部分字段可能包含空值，使用前请检查

---
*文档生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
"""

    # Save the document.
    doc_file = data_path + '/datasets.md'
    with open(doc_file, 'w', encoding='utf-8') as f:
        f.write(md_content)

    print(f"数据集文档已保存: {doc_file}")
    return doc_file

def main():
    """Convert the dataset to readable formats and generate its documentation."""
    data_path = '/data2/liyu/KT/OKT/data'

    print("=" * 60)
    print("OKT数据集格式转换和文档生成")
    print("=" * 60)

    # 1. Convert the data format.
    converted_files = convert_pkl_to_readable_formats(data_path)

    # 2. Create the documentation.
    doc_file = create_dataset_documentation(data_path)

    print("\n" + "=" * 60)
    print("转换和文档生成完成")
    print("=" * 60)
    print("生成的文件:")
    # The converter may not report any files (it may return None); treat
    # that as "nothing to list" instead of crashing after the work is done.
    for file_type, file_path in (converted_files or {}).items():
        print(f"  {file_type}: {file_path}")
    print(f"  文档: {doc_file}")


if __name__ == "__main__":
    main()

OKT数据集格式转换和文档生成
开始转换数据格式...
原始数据形状: (39796, 15)
转换为CSV格式...
CSV文件已保存: /data2/liyu/KT/OKT/data/dataset.csv
转换为Excel格式...


ModuleNotFoundError: No module named 'openpyxl'