In [37]:
import json

# JSONL file of per-query evaluation scores; loaded with read_jsonl() in later cells.
evals_path = "./evaluation_metrics/with_crossover_strategy/full_evals.jsonl"
# NOTE(review): presumably the benchmark query metadata (id / topic / language) — confirm the schema.
evaluation_data_path = "./evaluation_stats/query.jsonl"

def read_jsonl(file_path: str) -> list:
    """Read a JSON Lines file and return its parsed records in file order.

    Blank or whitespace-only lines are skipped. A line that is not valid JSON
    is reported on stdout and skipped, so one bad record does not abort the
    whole load.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one parsed JSON value per valid line.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    records = []
    # Decode explicitly as UTF-8: the records contain non-ASCII text, and the
    # default encoding of open() depends on the platform locale.
    with open(file_path, "r", encoding="utf-8") as file:
        for raw_line in file:
            stripped = raw_line.strip()
            if not stripped:
                continue  # nothing to parse on an empty line
            try:
                records.append(json.loads(stripped))
            except json.JSONDecodeError as e:
                # Best-effort load: report the bad line and keep going.
                print(f"Error parsing line: {e}")
    return records


In [38]:
# Load every evaluation record (one JSON object per line) from evals_path.
eval_data = read_jsonl(evals_path)

In [44]:
# Accumulate the 'overall_score' field over all evaluation records.
# After the loop `overall_score` holds the running total; the mean is only
# displayed by the final expression, it is not stored.
overall_score = 0.0
for record in eval_data:
    overall_score = overall_score + record['overall_score']

overall_score / len(eval_data)

0.46215009481436575

In [None]:
# Query id -> topic lookup.
# Built from the query file rather than from eval_data: eval_data is loaded
# from evals_path, and the evaluation record displayed in this notebook has no
# 'topic' field, so indexing it would raise KeyError on a fresh run.
# NOTE(review): assumes every record in evaluation_data_path has 'id' and 'topic' — confirm.
id_by_topic = {query['id']: query['topic'] for query in read_jsonl(evaluation_data_path)}
id_by_topic

In [None]:
# Query id -> language lookup.
# Built from the query file rather than from eval_data: eval_data is loaded
# from evals_path, and the evaluation record displayed in this notebook has no
# 'language' field, so indexing it would raise KeyError on a fresh run.
# NOTE(review): assumes every record in evaluation_data_path has 'id' and 'language' — confirm.
id_by_lang = {query['id']: query['language'] for query in read_jsonl(evaluation_data_path)}
id_by_lang

In [5]:
# Number of queries per topic, used as the denominator when averaging scores.
# Counted from the query file because the evaluation record displayed in this
# notebook has no 'topic' field.
# NOTE(review): these counts equal the number of summed evaluation records only
# if every query has exactly one evaluation — confirm.
count_by_topic = {}
for query in read_jsonl(evaluation_data_path):
    topic = query['topic']
    count_by_topic[topic] = count_by_topic.get(topic, 0) + 1

In [6]:
# Load the evaluation records from evals_path and display the first one
# as a sample of the record layout.
evals_data = read_jsonl(evals_path)
evals_data[0]

{'id': 1,
 'prompt': '收集整理目前中国9阶层实际收入和财务状况，特别研究得出中国的中产有哪些特点，实际中产人数，财力等等',
 'comprehensiveness': 0.4263322884012539,
 'insight': 0.4805194805194805,
 'instruction_following': 0.43452380952380953,
 'readability': 0.45739910313901344,
 'overall_score': 0.4525368493437378}

In [7]:
# Per-topic running totals, one dict per metric (topic -> summed score).
compr_by_topic = {}
insight_by_topic = {}
instr_foll_by_topic = {}
read_by_topic = {}
overall_by_topic = {}
items_by_topic = {}  # not filled in this cell

# The loop variable is `record`, not `eval`, so the builtin eval() is not
# shadowed in the kernel namespace.
for record in evals_data:
    # The topic is looked up by query id via id_by_topic.
    topic = id_by_topic[record['id']]
    for metric, totals in (
        ('comprehensiveness', compr_by_topic),
        ('insight', insight_by_topic),
        ('instruction_following', instr_foll_by_topic),
        ('readability', read_by_topic),
        ('overall_score', overall_by_topic),
    ):
        totals[topic] = totals.get(topic, 0.0) + record[metric]

In [27]:
# Per-language running totals, one dict per metric (language -> summed score).
compr_by_lang = {}
insight_by_lang = {}
instr_foll_by_lang = {}
read_by_lang = {}
overall_by_lang = {}
items_by_lang = {}  # not filled in this cell

# The loop variable is `record`, not `eval`, so the builtin eval() is not
# shadowed in the kernel namespace.
for record in evals_data:
    # The language is looked up by query id via id_by_lang.
    lang = id_by_lang[record['id']]
    for metric, totals in (
        ('comprehensiveness', compr_by_lang),
        ('insight', insight_by_lang),
        ('instruction_following', instr_foll_by_lang),
        ('readability', read_by_lang),
        ('overall_score', overall_by_lang),
    ):
        totals[lang] = totals.get(lang, 0.0) + record[metric]

In [8]:
def average_score_by_topic(score_by_topic, count_by_topic):
    """Turn per-topic score totals into per-topic averages.

    The input mapping is left untouched and a new dict is returned, so calling
    this twice on the same totals gives the same averages.

    Args:
        score_by_topic: Mapping of topic -> summed score.
        count_by_topic: Mapping of topic -> number of items behind that sum.
            It may contain extra topics; only those present in
            ``score_by_topic`` are used.

    Returns:
        A new dict mapping each topic in ``score_by_topic`` to total / count.

    Raises:
        KeyError: If a topic in ``score_by_topic`` has no entry in
            ``count_by_topic``.
        ZeroDivisionError: If a topic's count is zero.
    """
    return {
        topic: total / count_by_topic[topic]
        for topic, total in score_by_topic.items()
    }

# Replace each per-topic total with its average, metric by metric.
# The names are rebound to the averaged values, so re-running this cell
# without rebuilding the totals first divides by the counts a second time.
(
    compr_by_topic,
    insight_by_topic,
    instr_foll_by_topic,
    read_by_topic,
    overall_by_topic,
) = (
    average_score_by_topic(metric_totals, count_by_topic)
    for metric_totals in (
        compr_by_topic,
        insight_by_topic,
        instr_foll_by_topic,
        read_by_topic,
        overall_by_topic,
    )
)

In [28]:
def average_score_by_lang(score_by_lang, count_by_lang):
    """Turn per-language score totals into per-language averages.

    The input mapping is left untouched and a new dict is returned, so calling
    this twice on the same totals gives the same averages.

    Args:
        score_by_lang: Mapping of language -> summed score.
        count_by_lang: Mapping of language -> number of items behind that sum.
            It may contain extra languages; only those present in
            ``score_by_lang`` are used.

    Returns:
        A new dict mapping each language in ``score_by_lang`` to total / count.

    Raises:
        KeyError: If a language in ``score_by_lang`` has no entry in
            ``count_by_lang``.
        ZeroDivisionError: If a language's count is zero.
    """
    return {
        lang: total / count_by_lang[lang]
        for lang, total in score_by_lang.items()
    }

# Denominators: how many evaluation records were summed for each language.
# Counted from the same records and lookup that produced the totals, so the
# averages stay correct even if the evaluation file is not an even split.
count_by_lang = {}
for record in evals_data:
    lang = id_by_lang[record['id']]
    count_by_lang[lang] = count_by_lang.get(lang, 0) + 1

# The names are rebound to the averaged values, so re-running this cell
# without rebuilding the totals first divides by the counts a second time.
compr_by_lang = average_score_by_lang(compr_by_lang, count_by_lang)
insight_by_lang = average_score_by_lang(insight_by_lang, count_by_lang)
instr_foll_by_lang = average_score_by_lang(instr_foll_by_lang, count_by_lang)
read_by_lang = average_score_by_lang(read_by_lang, count_by_lang)
overall_by_lang = average_score_by_lang(overall_by_lang, count_by_lang)

In [30]:
# Display the per-language comprehensiveness averages.
compr_by_lang

{'zh': 0.4384264190740016, 'en': 0.43039581172805647}

In [36]:
import pandas as pd

# Two-column table: one row per language with its comprehensiveness value
# taken from compr_by_lang.
df = pd.DataFrame({
    'Language': list(compr_by_lang.keys()),
    'Average Comprehensiveness Score': list(compr_by_lang.values()),
})

# Write the table without the index column; the relative path resolves
# against the current working directory.
csv_filename = 'comprehensiveness_score_by_lang.csv'
df.to_csv(csv_filename, index=False)

print(df.head())
print(f"\nSaved to {csv_filename}")

  Language  Average Comprehensiveness Score
0       zh                         0.438426
1       en                         0.430396

Saved to comprehensiveness_score_by_lang.csv
