# Data Analysis 
MRBench_V1: The original dataset with 192 dialogues as deatiled in the paper.

MRBench_V2: An updated version with additional 8 dialogues, bringing the total to 200 examples.


conduct DAMR / Annotation correlation (AC) scores for original result.

In [4]:
import json, os
from collections import defaultdict
import pandas as pd

In [43]:
def print_MRBench_response(data, model_name):
    print(f"Model: {model_name}")
    for i, dialogue in enumerate(data):
        print("*"*100)
        print(f"Dialogue {i+1}:")
        print(f"  Data: {dialogue['Data']}")
        print(f"  Topic: {dialogue['Topic']}")
        # print(f"  Conversation History: {dialogue['conversation_history']}")
        print(f"  >>>>Ground Truth Solution: {dialogue['Ground_Truth_Solution']}")
        llm_model_response = dialogue['anno_llm_responses'][model_name]['response']
        print(f"  >>>>LLM Response: {llm_model_response}")
        print("*"*100)

def print_MRBench_label(data, model_name, annotation_name):
    print(f"Model: {model_name}")
    from collections import defaultdict
    label_dict = defaultdict(int)
    for i, dialogue in enumerate(data):
        print("*"*100)
        print(f"Dialogue {i+1}:")
        print(f"  Data: {dialogue['Data']}")
        # if dianuelogue['Data'] == "MathDial":
        #     conti
        # print(f"  Topic: {dialogue['Topic']}")
        # print(f"  Conversation History: {dialogue['conversation_history']}")
        # print(f"  >>>>Ground Truth Solution: {dialogue['Ground_Truth_Solution']}")
        annotation = dialogue['anno_llm_responses'][model_name]['annotation'][annotation_name]
        label_dict[annotation] += 1
        print(f"  >>>>{annotation_name}: {annotation}")
    print(label_dict)
    # label_dict  Yes / All
    for k, v in label_dict.items():
        print(f"{k}: {v/len(data)*100}%")
MRBenchv1_data = json.load(open("../data/MRBench/MRBench_V2.json"))
# print_MRBench_data(MRBenchv1_data, "Mistral")
print_MRBench_label(MRBenchv1_data, "Gemini", "Tutor_Tone")

Model: Gemini
****************************************************************************************************
Dialogue 1:
  Data: MathDial
  >>>>Tutor_Tone: Neutral
****************************************************************************************************
Dialogue 2:
  Data: MathDial
  >>>>Tutor_Tone: Neutral
****************************************************************************************************
Dialogue 3:
  Data: Bridge
  >>>>Tutor_Tone: Encouraging
****************************************************************************************************
Dialogue 4:
  Data: MathDial
  >>>>Tutor_Tone: Neutral
****************************************************************************************************
Dialogue 5:
  Data: MathDial
  >>>>Tutor_Tone: Encouraging
****************************************************************************************************
Dialogue 6:
  Data: Bridge
  >>>>Tutor_Tone: Neutral
**********************************************

In [9]:
def map_annotation_label(key, label):
    label = label.lower().strip()
    Tutor_tone_mapping = {
        "encouraging": 1,
        "neutral": 2,
        "offensive": 3
    }
    
    Other_rule_mapping = {
        "yes": 1,
        "to some extent": 2,
        "no": 3
    }
    def map_revealing_of_the_answer(label):
        label = label.lower().strip()
        if label.startswith("yes") and "correct" in label:
            return 1
        elif label.startswith("yes") and "incorrect" in label:
            return 2
        elif label.startswith("no"):
            return 3
        else:
            return None
    if key == "Revealing_of_the_Answer":
        return map_revealing_of_the_answer(label)
    else:
        map_dict = Tutor_tone_mapping if key == "Tutor_Tone" else Other_rule_mapping
        for key, value in map_dict.items():
            if label.startswith(key):
                return value
        print(label)
        return None


desiderata = {
    "Mistake_Identification": 1,  # Yes
    "Mistake_Location": 1,        # Yes
    "Revealing_of_the_Answer": 3,        # No
    "Providing_Guidance": 1,      # Yes
    "Actionability": 1,           # Yes
    "Coherence": 1,                # Yes
    "Tutor_Tone": 1,              # Encouraging
    "humanlikeness": 1,               # Yes
}

# new_annotation
def evaluate_ordinary_desiderata(data, data_type="All", verbose=False):
    from collections import defaultdict
    evaluation_result = defaultdict(dict)
    for data in MRBenchv1_data:
        d_type= data['Data']
        if data_type !="All" and data_type != d_type:
            print(f"Skip {d_type}")
            continue
        for model, value in data['anno_llm_responses'].items():
            annotation_point = value['annotation_point']
            for k, v in annotation_point.items(): 
                if v is None:
                    if verbose:
                        print(model, k, v)
                    continue
                if v == desiderata[k]:
                    if k not in evaluation_result[model]:
                        evaluation_result[model][k] = [0,0]
                    evaluation_result[model][k][0] += 1
                else:
                    if k not in evaluation_result[model] and v is not None:
                        evaluation_result[model][k] = [0,0]
                    evaluation_result[model][k][1] += 1
    return evaluation_result
def print_evaluation_result(evaluation_result):
    import pandas as pd
    pd_result = []
    columns = []
    for model, value in evaluation_result.items():
        model_result = []
        value = sorted(value.items(), key=lambda x: x[0])
        columns = [k for k, v in value]
        for k, v in value:
            model_result.append((v[0]/(v[0]+v[1] )* 100.0))
        pd_result.append([model] + model_result)  
    columns = ['Tutor'] + columns
    pd_result = pd.DataFrame(pd_result, columns=columns)
    columns_mapping = {
        'Mistake_Identification': 'Mistake_Identification',
        'Mistake_Location': 'Mistake_Location',
        'Revealing_of_the_Answer': 'Revealing_of_the_Answer',
        'Providing_Guidance': 'Providing_Guidance',
        'Actionability': 'Actionability',
        'Coherence': 'Coherence',
        'Tutor_Tone': 'Tutor_Tone',
        'humanlikeness': 'Human-likeness'
    }
    pd_result.rename(columns=columns_mapping, inplace=True)
    pd_result = pd_result[['Tutor', 'Mistake_Identification', 'Mistake_Location', 'Revealing_of_the_Answer', 'Providing_Guidance', 'Actionability', 'Coherence', 'Tutor_Tone', 'Human-likeness']].round(2)
    return pd_result


# Data Analysis 
MRBench_V1: The original dataset with 192 dialogues as deatiled in the paper.

MRBench_V2: An updated version with additional 8 dialogues, bringing the total to 200 examples.


conduct DAMR score for original result.

In [10]:
from collections import defaultdict

root_dir = "../data"
MRBenchv1 = os.path.join(root_dir, "MRBench/MRBench_V1.json")
MRBenchv2 = os.path.join(root_dir, "MRBench/MRBench_V2.json")
MRBenchv1_data = json.load(open(MRBenchv1))
MRBenchv2_data = json.load(open(MRBenchv2))
print("Number of dialogues in MRBenchv1 is ", len(MRBenchv1_data))
print("Number of dialogues in MRBenchv2 is ", len(MRBenchv2_data))
# MRBenchv1_data[0]
# count the length of the ground truth solution
length_list = [len(data['Ground_Truth_Solution']) for data in MRBenchv1_data]
print("The maximum length of the ground truth solution in MRBenchv1 is ", max(length_list))

model_count = defaultdict(int)
model_data_count = defaultdict(int)
for data in MRBenchv1_data:
    for key, value in data['anno_llm_responses'].items():
        model_count[key] += 1
        model_data_count[key + "_" + data['Data']] += 1
for k, v in model_count.items():
    print("MRBenchv1 - The number of dialogues annotated by ", k, "is ", v)

print("-"*100)
for k, v in model_data_count.items():
    print("MRBenchv1 - The number of dialogues annotated by ", k, "is ", v)
print("-"*100)
model_count = defaultdict(int)
for data in MRBenchv2_data:
    for key, value in data['anno_llm_responses'].items():
        model_count[key] += 1
for k, v in model_count.items():
    print("MRBenchv2 - The number of dialogues annotated by ", k, "is ", v)


Number of dialogues in MRBenchv1 is  192
Number of dialogues in MRBenchv2 is  200
The maximum length of the ground truth solution in MRBenchv1 is  579
MRBenchv1 - The number of dialogues annotated by  Gemini is  192
MRBenchv1 - The number of dialogues annotated by  Phi3 is  192
MRBenchv1 - The number of dialogues annotated by  Llama318B is  192
MRBenchv1 - The number of dialogues annotated by  Llama31405B is  192
MRBenchv1 - The number of dialogues annotated by  Mistral is  192
MRBenchv1 - The number of dialogues annotated by  Expert is  192
MRBenchv1 - The number of dialogues annotated by  GPT4 is  192
MRBenchv1 - The number of dialogues annotated by  Sonnet is  192
MRBenchv1 - The number of dialogues annotated by  Novice is  53
----------------------------------------------------------------------------------------------------
MRBenchv1 - The number of dialogues annotated by  Gemini_MathDial is  139
MRBenchv1 - The number of dialogues annotated by  Phi3_MathDial is  139
MRBenchv1 - T

In [11]:
# map the annotation label to the desiderata point
MRBenchv1_data_mapped = []
for data in MRBenchv1_data:
    for key, value in data['anno_llm_responses'].items():
        annotation = value['annotation']
        new_annotation = {}
        for k, v in annotation.items():
            new_annotation[k] = map_annotation_label(k,v)
        value['annotation_point'] = new_annotation
    MRBenchv1_data_mapped.append(data)
# print(MRBenchv1_data_mapped[0])
# evaluate the desiderata point
evaluation_result = evaluate_ordinary_desiderata(MRBenchv1_data)
# print the evaluation result
pd_result=print_evaluation_result(evaluation_result)

import pandas as pd
ss = pd.read_csv('../paper/paper_result.csv',sep='\t')
ss['Tutor'] = ss['Tutor'].apply(lambda x: x.replace("*","")+"_paper")
concat_result = pd.concat([ss, pd_result], axis=0)
concat_result = concat_result.sort_values(by='Tutor')
concat_result

Unnamed: 0,Tutor,Mistake_Identification,Mistake_Location,Revealing_of_the_Answer,Providing_Guidance,Actionability,Coherence,Tutor_Tone,Human-likeness
5,Expert,81.25,68.75,97.92,72.92,81.77,84.9,17.19,94.79
1,Expert_paper,76.04,63.02,90.62,67.19,76.04,79.17,92.19,87.5
7,GPT-4_paper,94.27,84.38,53.12,76.04,46.35,90.17,37.5,89.62
6,GPT4,94.27,85.42,54.69,77.08,46.88,92.71,36.98,93.23
0,Gemini,87.5,62.5,92.71,58.85,61.98,82.29,39.58,95.31
4,Gemini_paper,63.02,39.58,67.71,37.5,42.71,56.77,21.88,68.23
3,Llama31405B,95.31,84.9,81.77,77.6,75.52,94.27,17.71,93.23
8,Llama31405B_paper,94.27,84.38,80.73,77.08,74.48,91.67,16.15,90.62
2,Llama318B,81.25,56.25,76.56,46.88,42.71,82.81,19.79,96.35
2,Llama318B_paper,80.21,54.69,73.96,45.31,42.71,80.73,19.79,93.75


# Compute DAMR Score For LLAMA & MISTRAL inference


In [12]:
#Compute DAMR Score For LLAMA & MISTRAL inference

llama_eval = json.load(open("../data/MRBench/MRBench_V1_Meta-Llama-3.1-8B-Instruct_llama_eval.json"))
llama_eval = json.load(open("../data/MRBench/MRBench_V1_Mistral-7B-Instruct-v0.1_llama_eval.json"))

desiderata = {
    "eval_mistake_identification_result": 1,  # Yes
    "eval_mistake_location_result": 1,        # Yes
    "eval_revealing_answer_result": 3,        # No
    "eval_providing_guidance_result": 1,      # Yes
    "eval_actionability_result": 1,           # Yes
    "eval_coherent_result": 1,                # Yes
    "eval_tutor_tone_result": 1,              # Encouraging
    "eval_humanness_result": 1,               # Yes
}

# new_annotation
def evaluate_ordinary_desiderata(MRBenchv1_eval_data,verbose=False):
    from collections import defaultdict
    evaluation_result = defaultdict(list)
    for data in MRBenchv1_eval_data:
        for sub, score in desiderata.items():
            eval_score = data[sub]['number']
            if eval_score is None:
                continue
            if eval_score == score:
                evaluation_result[sub].append(1)
            else:
                evaluation_result[sub].append(0)
    for k, v in evaluation_result.items():
        print(k, round(sum(v) / len(v) * 100, 2))

evaluate_ordinary_desiderata(llama_eval)

eval_mistake_identification_result 41.67
eval_mistake_location_result 38.54
eval_revealing_answer_result 43.75
eval_providing_guidance_result 22.4
eval_actionability_result 43.23
eval_coherent_result 62.5
eval_tutor_tone_result 46.35
eval_humanness_result 69.79


# Compute AC Score

In [20]:
import numpy as np
from sklearn.metrics import make_scorer
from scipy.stats import pearsonr
from scipy.stats import pearsonr, spearmanr
def pearson_corr(y_true, y_pred, method="pearson"):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    mask = ~np.isnan(y_true) & ~np.isnan(y_pred)
    y_true, y_pred = y_true[mask], y_pred[mask]
    if method == "pearson":
        r = pearsonr(y_true, y_pred)
    elif method == "spearman":
        r= spearmanr(y_true, y_pred)
    return round(r.correlation, 3), round(r.pvalue, 3)


In [34]:
# map the annotation label to the desiderata point
from collections import defaultdict
MRBenchv1_data = json.load(open("../data/MRBench/MRBench_V1.json"))
MRBenchv1_model_point = defaultdict(dict)
for data in MRBenchv1_data:
    for key, value in data['anno_llm_responses'].items():
        annotation = value['annotation']
        # new_annotation = {}
        for k, v in annotation.items():
            annotation_point = map_annotation_label(k,v)
            if k not in MRBenchv1_model_point[key]:
                MRBenchv1_model_point[key][k] = []
            MRBenchv1_model_point[key][k].append([data['conversation_id']+data['Split'], annotation_point])
temp="../data/MRBench/MRBench_V1_llama_eval.json"
from collections import defaultdict
temp=json.load(open(temp))
model_point = defaultdict(dict)
for data in temp:
    for key, value in data['anno_llm_responses'].items():
        annotation_eval = value['annotation_eval']
        for k, v in annotation_eval.items():
            if k not in model_point[key]:
                model_point[key][k] = []
            model_point[key][k].append([data['conversation_id']+data['Split'], v['number']])

In [35]:
# Mistake_Identification
result= defaultdict(dict)
for k, v in MRBenchv1_model_point.items():
    for kk, vv in v.items():
        # if kk == "Mistake_Identification":
        compare_model_result = model_point[k][kk.lower()]
        group1 = pd.DataFrame(vv, columns=['conversation_id', 'annotation_point'])
        group2 = pd.DataFrame(compare_model_result, columns=['conversation_id', 'annotation_point'])
        merge_result = pd.merge(group1, group2, on='conversation_id', how='inner')
        merge_result = merge_result.dropna().drop_duplicates(subset=['conversation_id'])
        merge_result
        y_true = merge_result['annotation_point_x']
        y_pred = merge_result['annotation_point_y']
        p_corr, p_pvalue = pearson_corr(y_true, y_pred, method="pearson")
        s_corr, s_pvalue = pearson_corr(y_true, y_pred, method="spearman")
        # result[kk][k] = [p_corr, p_pvalue, s_corr, s_pvalue, len(merge_result)]
        result[kk][k] = p_corr
            # break

result = pd.DataFrame(result)
result


Unnamed: 0,Mistake_Identification,Mistake_Location,Revealing_of_the_Answer,Providing_Guidance,Actionability,humanlikeness,Coherence,Tutor_Tone
Gemini,0.222,0.127,0.265,0.048,0.131,-0.08,-0.033,0.322
Phi3,0.632,0.648,0.225,0.567,0.288,0.252,0.432,0.441
Llama318B,0.083,0.112,0.103,0.19,0.081,0.133,-0.03,0.393
Llama31405B,-0.033,0.124,0.32,0.069,-0.114,-0.006,0.125,0.233
Mistral,0.263,0.137,0.219,0.116,0.151,0.008,-0.017,0.148
Expert,0.176,0.213,0.131,0.064,0.064,0.051,0.071,0.058
GPT4,0.36,0.203,0.325,0.25,0.149,-0.034,0.119,0.248
Sonnet,0.157,0.177,0.327,0.097,0.267,-0.037,0.175,-0.041
Novice,0.523,0.169,0.499,0.52,0.293,0.075,-0.019,0.566


# Note
DAMR: We can observe that there are differences in the results.