In [1]:
import json
import tiktoken
import pandas as pd
import os
import re
from typing import Dict, Any

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens in `string` under the named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to `tiktoken.get_encoding`
            (this file uses "cl100k_base").

    Returns:
        The length of the token sequence produced by encoding `string`.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

def clean_string(s):
    """Sanitize a cell value so it can be written safely to an Excel sheet.

    Non-string values are returned unchanged. For strings:

    * Control characters that XLSX cannot store (0x00-0x08, 0x0B, 0x0C,
      0x0E-0x1F) are removed. Tab, newline and carriage return are legal in
      a cell and are kept, so multi-line text is not fused together.
    * Double quotes are left as-is: the value is written through a
      spreadsheet writer, not a CSV writer, so CSV-style quote doubling
      would only corrupt the text.
    * If the first visible character could start a formula
      ('=', '+', '-', '@'), an apostrophe is prepended so the value is
      treated as text rather than evaluated.

    Args:
        s: Any cell value.

    Returns:
        The sanitized string, or `s` itself when it is not a string.
    """
    if not isinstance(s, str):
        return s

    # Strip only the characters that are illegal in an XLSX cell.
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)

    # Leading tab/CR/LF must not hide a formula trigger from the check.
    if s.lstrip('\t\r\n').startswith(('=', '+', '-', '@')):
        s = "'" + s
    return s

def flatten_dict(d: Dict[str, Any], parent_key: str = '', sep: str = '_') -> Dict[str, Any]:
    """Collapse a nested dictionary into a single-level dictionary.

    Keys of nested dictionaries are joined to their ancestors' keys with
    `sep`; non-dict values are copied through unchanged.

    Args:
        d: The (possibly nested) dictionary to flatten.
        parent_key: Prefix for every key of `d`; empty at the top level.
        sep: Separator placed between a parent key and a child key.

    Returns:
        A new flat dictionary. When two paths produce the same key, the
        later value wins.
    """
    flat = {}
    for key, value in d.items():
        path = key if not parent_key else f"{parent_key}{sep}{key}"
        if not isinstance(value, dict):
            flat[path] = value
            continue
        # Recurse into the nested mapping, carrying the accumulated prefix.
        flat.update(flatten_dict(value, path, sep=sep))
    return flat

def results_json_to_excel(results_json, output_excel):
    """Convert pretest results JSON data to Excel format.

    One row is written per (student, question) pair, built from the
    student's config and the first entry of the question's `responses`.

    Args:
        results_json: Mapping of student name to a dict with a `config`
            dict and a `results` dict. `results` maps question id to a
            dict with `category` and a `responses` list. A top-level
            'config' key is skipped and not treated as a student.
        output_excel: Destination path of the .xlsx file.

    Raises:
        KeyError: If a student entry lacks `config`/`results`, or a
            question entry lacks `category`/`responses`.
    """
    rows = []

    for student_name, student_data in results_json.items():
        if student_name == 'config':
            continue
        student_config = student_data['config']
        for question_id, question_data in student_data['results'].items():
            row = {
                'student_name': student_name,
                'student_config_model': student_config.get('model', 'N/A'),
                'student_config_temperature': student_config.get('temperature', 'N/A'),
                'student_config_answer_max_tokens': student_config.get('answer_max_tokens', 'N/A'),
                'student_config_test_max_tokens': student_config.get('test_max_tokens', 'N/A'),
                'student_use_few_shot': student_config.get('use_few_shot', 'N/A'),
                'question_id': question_id,
                'category': question_data['category'],
            }

            # Only the first response is exported. A question with no
            # responses still gets a row (filled with defaults) instead of
            # aborting the whole export with an IndexError.
            responses = question_data['responses']
            response = responses[0] if responses else {}

            correct_answer = response.get('correct_answer')
            model_prediction = response.get('model_prediction')
            model_response = response.get('model_response')

            # A missing correct answer must never count as a correct
            # prediction (previously '' == '' scored 1).
            is_correct = correct_answer is not None and correct_answer == model_prediction

            row.update({
                'question': response.get('question', 'N/A'),
                # Options may be non-strings or None; stringify before joining.
                'options': ', '.join(str(option) for option in (response.get('options') or [])),
                'correct_answer': response.get('correct_answer', 'N/A'),
                'model_response': response.get('model_response', 'N/A'),
                'model_prediction': response.get('model_prediction', 'N/A'),
                'model_accuracy': int(is_correct),
                # A missing/None response is counted as zero tokens.
                'model_response_tokens': num_tokens_from_string(model_response or '', "cl100k_base"),
            })

            rows.append(flatten_dict(row))

    df = pd.DataFrame(rows)

    # Sanitize every cell before handing the frame to the Excel writer.
    for column in df.columns:
        df[column] = df[column].apply(clean_string)

    df.to_excel(output_excel, index=False, engine='openpyxl')
    print(f"Excel file saved as: {output_excel}")

def main(
    file_path='D:/Workspace/EducationQ_Benchmark/src/data/output/EduQ-Bench_Student-mistral-nemo/GPQA/',
    file_name="pretest_results_1.0.0_20241018_170901",
):
    """Load a pretest results JSON file and export it as an Excel workbook.

    Reads `<file_path>/<file_name>.json` and writes
    `<file_path>/<file_name>.xlsx` next to it.

    Args:
        file_path: Directory holding the input JSON; the workbook is
            written to the same directory.
        file_name: Base name of the results file, without extension.
    """
    print("Converting to Excel...")
    json_file = os.path.join(file_path, file_name + ".json")
    output_excel_file = os.path.join(file_path, file_name + '.xlsx')

    # Decode explicitly as UTF-8: relying on the platform default (e.g. a
    # legacy code page on Windows) can fail or garble non-ASCII text.
    with open(json_file, "r", encoding="utf-8") as f:
        pretest_results = json.load(f)

    results_json_to_excel(pretest_results, output_excel_file)

    print("Process completed successfully!")

# Run the conversion only when this file is the entry point, not when it is imported.
if __name__ == "__main__":
    main()

Converting to Excel...
Excel file saved as: D:/Workspace/EducationQ_Benchmark/src/data/output/EduQ-Bench_Student-mistral-nemo/GPQA/pretest_results_1.0.0_20241018_170901.xlsx
Process completed successfully!
