In [None]:
# Judge prompt for reference-free scoring. It carries two placeholders,
# `{prompt}` (the user's transcribed instruction) and `{response}` (the model's
# transcribed reply), which `generate` fills in before sending the text to the
# scoring model. The prompt asks the judge to answer with a bare 1-5 score.
meta_prompt_open = """
I need your help to evaluate the performance of several models in the speech interaction scenario. The models will receive a speech input from the user, which they need to understand and respond to with a speech output.
Your task is to rate the model’s responses based on the provided user input transcription [Instruction] and the model’s output transcription [Response].

Please evaluate the response on a scale of 1 to 5:
1 point: The response is largely irrelevant, incorrect, or fails to address the user’s query. It may be off-topic or provide incorrect information.
2 points: The response is somewhat relevant but lacks accuracy or completeness. It may only partially answer the user’s question or include extraneous information.
3 points: The response is relevant and mostly accurate, but it may lack conciseness or include unnecessary details that don’t contribute to the main point.
4 points: The response is relevant, accurate, and concise, providing a clear answer to the user’s question without unnecessary elaboration.
5 points: The response is exceptionally relevant, accurate, and to the point. It directly addresses the user’s query in a highly effective and efficient manner, providing exactly the information needed.

Below are the transcription of user’s instruction and models’ response:
### [Instruction]: {prompt}
### [Response]: {response}

After evaluating, please output the score only without anything else.
You don’t need to provide any explanations.
"""


In [None]:
# Judge prompt for reference-based scoring. In addition to `{prompt}` and
# `{response}` it carries a `{reference}` placeholder for the reference answer;
# `generate` uses this template when reference scoring is enabled and the item
# has a non-empty `next` entry. The judge is asked for a bare 1-5 score.
meta_prompt_ref = """
I need your help to evaluate the performance of several models in the speech interaction scenario. The models will receive a speech input from the user, which they need to understand and respond to with a speech output.
Your task is to rate the model’s responses based on the provided user input transcription [Instruction], the model’s output transcription [Response], and the basic reference answer [Reference].

Please evaluate the response on a scale of 1 to 5:
1 point: The response is largely irrelevant, incorrect, or fails to address the user’s query. It may be off-topic or provide incorrect information.
2 points: The response is somewhat relevant but lacks accuracy, completeness, or alignment with the user’s query. It may only partially answer the question or include extraneous content.
3 points: The response is generally accurate and relevant, providing information consistent with the reference answer but may lack depth, clarity, or contextual adaptation.
4 points: The response is accurate, relevant, and contextually appropriate, not only matching the reference but also providing a clear and well-structured answer that aligns well with the user’s query.
5 points: The response is exceptionally accurate, relevant, and informative. It goes beyond the basic reference answer, offering richer, clearer, or more contextually appropriate information while fully addressing the user’s query.

Below are the transcription of user’s instruction, reference answer, and model’s response:
### [Instruction]: {prompt}
### [Reference]: {reference}
### [Response]: {response}

After evaluating, please output the score only without anything else.
You don’t need to provide any explanations.
"""


In [None]:
def clean_text(context_text):
    """Return `context_text` with every ～, ⁇, ✎ and ↻ marker removed."""
    stripped = context_text
    # Drop the markers one after another, in the same order as before.
    for marker in ("～", "⁇", "✎", "↻"):
        stripped = stripped.replace(marker, "")
    return stripped

In [None]:
import glob
import http.client
import json
import multiprocessing
import os
import re
import time
from collections import defaultdict

import pandas as pd
from tqdm import tqdm

def getConn(message):
    """Send one chat-completion request and return the assistant's reply text.

    The request goes to ``api.chatfire.cn`` (``/v1/chat/completions``) with the
    ``gpt-4o-mini`` model and temperature 0. The ``Authorization`` header is an
    empty placeholder in this file and must be filled in before use.

    Args:
        message: The chat message list, sent verbatim as the ``messages`` field.

    Returns:
        The ``content`` string of the first choice in the JSON response.

    Raises:
        RuntimeError: If the API answers with a non-200 HTTP status; the
            message carries the status code and the raw response body.
        Exception: Network errors from ``http.client``, ``json.JSONDecodeError``
            for a non-JSON 200 body, or ``KeyError``/``IndexError`` when the
            body lacks the expected ``choices`` structure, all propagate.
    """
    # Create the connection and prepare the request.
    conn = http.client.HTTPSConnection("api.chatfire.cn")
    try:
        payload = json.dumps({
            "model": "gpt-4o-mini",
            "messages": message,
            "temperature": 0
        }, ensure_ascii=False).encode('utf-8')

        headers = {
            'Content-Type': 'application/json; charset=utf-8',
            'Authorization': ''  # Your API key here
        }

        # Send the request.
        conn.request("POST", "/v1/chat/completions", payload, headers)
        res = conn.getresponse()

        # The body can only be consumed once, so read it before looking at the
        # status; a second read() would return empty bytes.
        data = res.read().decode("utf-8")
    finally:
        # Always release the socket, including when the request itself fails.
        conn.close()

    # Check the HTTP status code and fail with the server's own explanation.
    if res.status != 200:
        raise RuntimeError(f"API返回错误状态码：{res.status} {data}")

    json_data = json.loads(data)
    return json_data['choices'][0]['message']['content']

In [None]:
# `task` and `is_reference` are module-level names that `generate` reads; the
# evaluation cells assign them. At module scope these `global` statements have
# no effect and only serve as a declaration of intent.
global task
global is_reference
def chat_completions3(query):
    """Forward the chat message list `query` to `getConn` and return its reply."""
    return getConn(query)

def _fill_template(template, fields):
    """Replace each `{name}` placeholder in `template` with `fields[name]` in one pass.

    Doing the substitution in a single regex pass means placeholder-looking
    text inside an inserted value (for example a user question that contains
    the literal "{response}") is left untouched; chained str.replace calls
    would substitute into it.
    """
    pattern = re.compile("|".join(re.escape("{" + name + "}") for name in fields))
    return pattern.sub(lambda match: fields[match.group(0)[1:-1]], template)


def _parse_score(raw):
    """Return the first standalone digit 1-5 in `raw` as a string, or "0" if there is none."""
    match = re.search(r"(?<!\d)[1-5](?!\d)", str(raw))
    return match.group(0) if match else "0"


def generate(item):
    """Score one evaluation item with the LLM judge and store the result on it.

    Reads the module-level names `task` (selects which dialogue key of `item`
    holds the prompt and predicted reply) and `is_reference` (selects the
    scoring mode), so both must be assigned before this function runs.

    Args:
        item: Dict for one sample. It must contain the dialogue entry for the
            current task with 'prompt' and '预测回复' keys. In reference mode a
            non-empty 'next' entry supplies the reference via its "text" key;
            when 'next' is missing or empty the reference-free prompt is used.

    Returns:
        The same `item`, mutated in place. Without reference: `item['score']`
        is set. With reference: `item['score_ref']` is set and
        `item['total_score']` is the sum of the earlier `item['score']`
        (treated as 0 when absent or unparseable) and the new score. Scores
        are stored as digit strings; "0" marks a failed or unparseable reply.
    """
    dialogue_key = '混合模态对话' if task == 'multimodality_chat' else '对话'
    question = item[dialogue_key]['prompt']
    response = item[dialogue_key]['预测回复']

    # Only consult the reference in reference mode; a missing or empty 'next'
    # falls back to the reference-free prompt instead of raising KeyError.
    reference_entry = item.get('next') if is_reference else None
    if reference_entry:
        prompt = _fill_template(meta_prompt_ref, {
            "prompt": question,
            "response": response,
            "reference": clean_text(reference_entry["text"]),
        })
    else:
        prompt = _fill_template(meta_prompt_open, {
            "prompt": question,
            "response": response,
        })

    # "0" is the fallback when every attempt fails.
    score = "0"
    for _ in range(25):
        try:
            reply = chat_completions3(
                    query=[{"role": "system",
                            "content": "You are a helpful assistant who tries to help answer the user's question."},
                            {"role": "user", "content": prompt}]
                )
            # Normalise replies such as "Score: 4" so later int() calls cannot fail.
            score = _parse_score(reply)
            break
        except Exception as e:
            print(f"发生错误: {e}")
            time.sleep(5)
    # Fixed pause after every item, kept from the original implementation.
    time.sleep(10)
    if not is_reference:
        item['score'] = score
    else:
        item['score_ref'] = score
        previous_score = _parse_score(item.get('score', "0"))
        item['total_score'] = str(int(previous_score) + int(score))
    return item

# Evaluation Without Reference

In [None]:
# Pass 1: score every prediction without a reference answer and write
# <subset>-fix_metrics.json next to each <subset>-fix.json input.
# NOTE(review): `generate` runs in pool workers and reads the module-level
# `is_reference` and `task`; this presumably relies on a fork-style start
# method so the workers see the values assigned here. Confirm on the target
# platform.
is_reference = False

for model_dir in os.listdir('results'):
    model_path = os.path.join('results', model_dir)
    if not os.path.isdir(model_path) or model_dir.startswith('.'):
        continue

    for subset_dir in os.listdir(model_path):
        subset_path = os.path.join(model_path, subset_dir)
        if not os.path.isdir(subset_path):
            continue

        for json_path in glob.glob(os.path.join(subset_path, '*-fix.json')):
            # The path is results/<model_dir>/<subset_dir>/<file>, so take the
            # components from the loop variables instead of splitting on '/',
            # which only works with POSIX separators.
            model = model_dir
            task = subset_dir
            subset = os.path.basename(json_path).split('-')[0]
            scores = []
            if task == 'multimodality_chat' or task == 'chat':
                with open(json_path, 'r', encoding='utf-8') as f:
                    pre_items = json.load(f)

                with multiprocessing.Pool(128) as pool:
                    results = list(tqdm(pool.imap(generate, pre_items), total=len(pre_items)))

                for scored_item in results:
                    # A missing or non-numeric score counts as 0 so one odd
                    # judge reply cannot discard a whole finished run.
                    try:
                        scores.append(int(scored_item.get('score', 0)))
                    except (TypeError, ValueError):
                        print(f"Non-numeric score {scored_item.get('score')!r}; counted as 0")
                        scores.append(0)
                avg_score = sum(scores) / len(scores) if scores else 0
                results.insert(0, {"avg_score": avg_score})

                tgt_file = json_path.replace('-fix.json', '-fix_metrics.json')
                # Written with ensure_ascii=False and read back as UTF-8 by the
                # reference pass, so the encoding must be explicit.
                with open(tgt_file, "w", encoding='utf-8') as file:
                    json.dump(results, file, ensure_ascii=False, indent=4)
                print(f'avg_score: {avg_score}')
                print(f"Processed {json_path} and saved to {tgt_file}")

# Evaluation With Reference

In [None]:
# Pass 2: re-score every item against its reference answer and update each
# <subset>-fix_metrics.json in place. The first element of that file is the
# summary dict holding "avg_score"; the remaining elements are the items.
# NOTE(review): `generate` runs in pool workers and reads the module-level
# `is_reference` and `task`; this presumably relies on a fork-style start
# method so the workers see the values assigned here. Confirm on the target
# platform.
is_reference = True

for model_dir in os.listdir('results'):
    model_path = os.path.join('results', model_dir)
    if not os.path.isdir(model_path) or model_dir.startswith('.'):
        continue

    for subset_dir in os.listdir(model_path):
        subset_path = os.path.join(model_path, subset_dir)
        if not os.path.isdir(subset_path):
            continue

        for json_path in glob.glob(os.path.join(subset_path, '*-fix_metrics.json')):
            # The path is results/<model_dir>/<subset_dir>/<file>, so take the
            # components from the loop variables instead of splitting on '/',
            # which only works with POSIX separators.
            model = model_dir
            task = subset_dir
            subset = os.path.basename(json_path).split('-')[0]
            scores = []
            if task == 'multimodality_chat' or task == 'chat':
                with open(json_path, 'r', encoding='utf-8') as f:
                    all_data = json.load(f)
                avg = all_data[0]
                pre_items = all_data[1:]

                with multiprocessing.Pool(256) as pool:
                    results = list(tqdm(pool.imap(generate, pre_items), total=len(pre_items)))

                for scored_item in results:
                    # A missing or non-numeric score counts as 0 so one odd
                    # judge reply cannot discard a whole finished run.
                    try:
                        scores.append(int(scored_item.get('score_ref', 0)))
                    except (TypeError, ValueError):
                        print(f"Non-numeric score_ref {scored_item.get('score_ref')!r}; counted as 0")
                        scores.append(0)

                avg_score = sum(scores) / len(scores) if scores else 0
                avg["avg_score_ref"] = avg_score
                avg["total_score"] = avg_score + avg["avg_score"]

                print(f'Avg_score {avg["avg_score"]}')
                print(f'Avg_score_ref {avg_score}')
                print(f'total_score {avg["total_score"]}')
                results.insert(0, avg)
                tgt_file = json_path
                # The file is read as UTF-8 above and dumped with
                # ensure_ascii=False, so write it with the same encoding.
                with open(tgt_file, "w", encoding='utf-8') as file:
                    json.dump(results, file, ensure_ascii=False, indent=4)
                print(f"Processed {json_path} and saved to {tgt_file}")