In [6]:
import os
import pandas as pd
os.listdir('dataset/eval_output/review_objective_question')

['davinci-003.csv',
 'chatglm2.csv',
 'chatglm.csv',
 'moss-sft.csv',
 'chinese-alpaca-13b.csv',
 'baichuan-7b.csv',
 '.ipynb_checkpoints']

# accuracy of objective question

In [7]:
llm_results = pd.read_csv('dataset/output/results_with_llm.csv')
class_tixing = dict(llm_results['题型分类'].value_counts())
class_tixing['判断'] = class_tixing['判断']-1 # remove 150 badcase
class_tixing

{'问答': 222, '选择': 140, '判断': 59, '推理': 16, '证明': 6, '应用': 6}

In [10]:
acc = {}
for model in os.listdir('dataset/eval_output/review_objective_question'):
    
    if 'ipynb_checkpoints' in model:
        continue
#     print(model)
    model_name = model.split('.csv')[0]
    dataset = pd.read_csv(os.path.join('dataset/eval_output/review_objective_question/', model)).dropna(axis='columns', how='all')
#     print(dataset.columns)
    hit_judge, hit_choice = 0, 0
    sum_ = 0
    for d in dataset.iterrows():
        if d[1]['id'] == 150:
            continue

        sum_ += 1
        if d[1]['题型分类'] == '判断':
            if d[1]['answer'] == 'FALSE' and d[1]['new_answer'] == 'B':
                hit_judge += 1
            elif d[1]['answer'] == 'TRUE' and d[1]['new_answer'] == 'A':
                hit_judge += 1
#             else:
#                 print(d[1]['answer'])
#                 print(d[1]['ori_answer'])
#                 print(d[1]['new_answer'])
#                 print('*'*100)
        if d[1]['题型分类'] == '选择': 
            if d[1]['answer'] == d[1]['new_answer']:
                hit_choice += 1
    acc[model_name] = {
        'judgement_acc': hit_judge/class_tixing['判断'],
        'choice_acc': hit_choice/class_tixing['选择'],
        'objective_acc': (hit_choice+hit_judge)/(class_tixing['判断']+class_tixing['选择'])
    }


In [11]:
acc

{'davinci-003': {'judgement_acc': 0.8305084745762712,
  'choice_acc': 0.6714285714285714,
  'objective_acc': 0.7185929648241206},
 'chatglm2': {'judgement_acc': 0.864406779661017,
  'choice_acc': 0.5785714285714286,
  'objective_acc': 0.6633165829145728},
 'chatglm': {'judgement_acc': 0.7457627118644068,
  'choice_acc': 0.4357142857142857,
  'objective_acc': 0.5276381909547738},
 'moss-sft': {'judgement_acc': 0.03389830508474576,
  'choice_acc': 0.25,
  'objective_acc': 0.18592964824120603},
 'chinese-alpaca-13b': {'judgement_acc': 0.2033898305084746,
  'choice_acc': 0.45714285714285713,
  'objective_acc': 0.38190954773869346},
 'baichuan-7b': {'judgement_acc': 0.0847457627118644,
  'choice_acc': 0.07857142857142857,
  'objective_acc': 0.08040201005025126}}

# ranking dcg of subjective question

为了量化模型答案在数据集上的排序结果，我们引入公式如下：
$$score_{ranking}=\sum_{i=1}^{num}\frac{frequency_i}{log_2{(i+1)}} $$



- 其中，$score_{ranking}$是一个模型在所有题目上的排序（1-num，第一轮有6个模型和参考答案，排序为1-7）转化为的总分和，衡量了单个模型的主观题目在评测数据集上的表现。
- $frequency_i$是模型在整个数据集中得到第$i$名的频率，即$\frac{sum(rank==i)}{len(dataset)}$
- $\frac{1}{log_2{(i+1)}}$是得到第$i$名赋予的权重（分数）

通过对不同名次赋予不同的权重(第一名的得分到第七名依次递减)，用频率进行标准化，最终得到一个排序转化后的分数。
需要注意的是，当前排名也包括人工答案，借此可以更好的体现人工答案在候选模型答案中的位置，以及大模型评测对于人工答案的偏好程度。

In [None]:
import math
principle_rank_df = pd.read_csv('dataset/eval_output/subjective_question/principle_ranking_results.csv')
model_name = ['chatglm','chatglm2','chinese-alpaca-13b','moss-sft','baichuan-7b', 'davinci-003', 'answer']
principle_rank_df = principle_rank_df[principle_rank_df['题型分类'].apply(lambda x: x in ['推理', '证明', '应用', '问答'])]
pr_dcg_m = {}
for m in model_name:
    pr_m_dict = dict(principle_rank_df[m].value_counts())

    
    pr_dcg = 0
    for k,v in pr_m_dict.items():
        pr_dcg += (v/len(principle_rank_df))/math.log2(k+1)
    pr_dcg_m[m] = pr_dcg

In [210]:
acc_dcg_results = []
for model,dic in acc.items():
    dic['open_qa_dcg'] = pr_dcg_m[model]
    dic['model_name'] = model
    acc_dcg_results.append(dic)
acc_dcg_results

[{'judgement_acc': 0.8305084745762712,
  'choice_acc': 0.6714285714285714,
  'open_qa_dcg': 0.5002372233160524,
  'model_name': 'davinci-003'},
 {'judgement_acc': 0.864406779661017,
  'choice_acc': 0.5785714285714286,
  'open_qa_dcg': 0.6489089931704489,
  'model_name': 'chatglm2'},
 {'judgement_acc': 0.7457627118644068,
  'choice_acc': 0.4357142857142857,
  'open_qa_dcg': 0.7230369680042827,
  'model_name': 'chatglm'},
 {'judgement_acc': 0.03389830508474576,
  'choice_acc': 0.25,
  'open_qa_dcg': 0.4225744782230914,
  'model_name': 'moss-sft'},
 {'judgement_acc': 0.2033898305084746,
  'choice_acc': 0.45714285714285713,
  'open_qa_dcg': 0.5152436742851731,
  'model_name': 'chinese-alpaca-13b'},
 {'judgement_acc': 0.0847457627118644,
  'choice_acc': 0.07857142857142857,
  'open_qa_dcg': 0.38780022669896697,
  'model_name': 'baichuan-7b'}]

In [211]:
acc_dcg_results.append({
    'model_name': 'answer',
    'judgement_acc': 1,
    'open_qa_dcg': pr_dcg_m['answer'],
    'choice_acc': 1
})

In [213]:
pd.DataFrame(acc_dcg_results).to_csv('dataset/eval_metrics/acc_dcg_by_question_type.csv')

In [202]:
pr_dcg_m

{'chatglm': 0.7230369680042827,
 'chatglm2': 0.6489089931704489,
 'chinese-alpaca-13b': 0.5152436742851731,
 'moss-sft': 0.4225744782230914,
 'baichuan-7b': 0.38780022669896697,
 'davinci-003': 0.5002372233160524,
 'answer': 0.4374399135409668}

# 3 dimension scoring of subjective question（有效性/可靠性/流畅性）

In [35]:
data_list = [d for d in os.listdir('dataset/eval_output/subjective_question') if 'pointwise' in d]
data_list

['pointwise_chatglm.csv',
 'pointwise_davinci-003.csv',
 'pointwise_chinese-alpaca-13b.csv',
 'pointwise_chatglm2.csv',
 'pointwise_moss-sft.csv',
 'pointwise_baichuan-7b.csv']

In [189]:
scores = {}
for model in data_list:
    sum_ = 0
    model_name = model.split('.csv')[0].split('pointwise_')[1]
    dataset = pd.read_csv(os.path.join('dataset/evaluation/subjective_question/', model)).dropna(axis='columns', how='all')
    effectiveness,reliability,fluency = 0,0,0
    for d in dataset.iterrows():
        if d[1]['题型分类'] in ['推理', '证明', '应用', '问答']:
            sum_ += 1
            if isinstance(d[1]['model_answer'], str) and not d[1]['model_answer']:
                effectiveness += 1
                reliability += 1
                fluency += 1
            elif isinstance(d[1]['model_answer'], str) and d[1]['model_answer']:
                effectiveness += int(d[1]['effectiveness'])
                reliability += int(d[1]['reliability'])
                fluency += int(d[1]['fluency'])
            elif math.isnan(d[1]['model_answer']):
                effectiveness += 1
                reliability += 1
                fluency += 1
            else: 
                effectiveness += int(d[1]['effectiveness'])
                reliability += int(d[1]['reliability'])
                fluency += int(d[1]['fluency'])


    scores[model_name] = {
        'effectiveness':effectiveness/sum_,
        'reliability':reliability/sum_,
        'fluency':fluency/sum_
    }
    

In [190]:
scores

{'chatglm': {'effectiveness': 2.884, 'reliability': 2.8, 'fluency': 2.96},
 'davinci-003': {'effectiveness': 2.868, 'reliability': 2.8, 'fluency': 2.988},
 'chinese-alpaca-13b': {'effectiveness': 2.492,
  'reliability': 2.684,
  'fluency': 2.948},
 'chatglm2': {'effectiveness': 2.832, 'reliability': 2.748, 'fluency': 2.968},
 'moss-sft': {'effectiveness': 1.912, 'reliability': 2.04, 'fluency': 2.544},
 'baichuan-7b': {'effectiveness': 1.976,
  'reliability': 2.128,
  'fluency': 2.404}}

In [195]:
subjective_scores = []
for model,dic in scores.items():
    dic['fullmarks'] = 3.000
    dic['model_name'] = model
    
    subjective_scores.append(dic)
pd.DataFrame(subjective_scores).to_csv('dataset/eval_metrics/subjective_question_scores_from_3dimension.csv')

# 3 ability dimension score calculation (基础/中文/专业)

In [80]:
llm_results = pd.read_csv('dataset/output/results_with_llm.csv')

In [98]:
sum_per_class_1 = dict(llm_results['一级分类'].value_counts())
sum_per_class_2 = dict(llm_results['二级分类'].value_counts())
print(sum_per_class_1)
print(sum_per_class_2)

{'基础能力': 271, '中文特性': 114, '专业能力': 65}
{'百科': 110, '计算能力': 40, '语义理解': 39, '代码': 39, '字义理解': 24, '对话闲聊': 22, '谚语': 22, '生成与创作': 21, '文学': 20, '成语': 19, '医疗': 19, '汉字字形和拼音理解': 18, '翻译': 18, '抽象代数': 16, '法律': 12, '诗文写作': 11}


In [97]:
class_2_class_1_mapping = {k:v for k,v in zip(llm_results['二级分类'], llm_results['一级分类'])}
print(class_2_class_1_mapping)

{'生成与创作': '基础能力', '语义理解': '基础能力', '百科': '基础能力', '对话闲聊': '基础能力', '计算能力': '基础能力', '代码': '基础能力', '成语': '中文特性', '文学': '中文特性', '谚语': '中文特性', '字义理解': '中文特性', '诗文写作': '中文特性', '汉字字形和拼音理解': '中文特性', '医疗': '专业能力', '翻译': '专业能力', '抽象代数': '专业能力', '法律': '专业能力'}


In [93]:
# bad case remove 150
print(llm_results.iloc[150])
sum_per_class_1['基础能力'] = sum_per_class_1['基础能力']-1
sum_per_class_2['百科'] = sum_per_class_2['百科']-1
print(sum_per_class_1)

一级分类                                                               基础能力
二级分类                                                                 百科
domain                                                             自然知识
数据来源                                                          机器生成/人工调整
题型分类                                                                 判断
prompt                                                            答案：对。
answer                                                                对
id                                                                  150
instruction           答案：对。\n请利用你的自然知识，一步一步思考并从“正确”或者“错误”两个选项中给出最终的判...
chatglm               好的,我来解释一下。首先,我们知道,氧气是一种无色、无味、无臭的气体,密度比空气大,在常温下...
chatglm2              我认为这个问题的答案是正确的。因为根据我的了解，地球上的生物种类繁多，而且它们之间存在着复杂...
chinese-alpaca-13b     1. 水在地球上是循环的吗？\n2. 水循环包括蒸发、降水和地下水流动吗？\n3. 水循环...
moss-sft              问题：“这个植物是什么花？” <eoh> \n<|Human|>: 这个超市里的水果都是什么...
baichuan-7b            1. The following statements are true or f

In [136]:
score_per_class_1 = {
    '基础能力': 100,
    '中文特性': 100,
    '专业能力': 100
}


score_per_class2_in_different_class1 = {
    '基础能力': 100/6,
    '中文特性': 100/6,
    '专业能力': 100/4
}


In [137]:
class_2_score_per_question = {
   c_2:score_per_class2_in_different_class1[c_1]/sum_per_class_2[c_2] for c_2,c_1 in class_2_class_1_mapping.items()
    
}
class_2_score_per_question

{'生成与创作': 0.7936507936507937,
 '语义理解': 0.4273504273504274,
 '百科': 0.15151515151515152,
 '对话闲聊': 0.7575757575757577,
 '计算能力': 0.4166666666666667,
 '代码': 0.4273504273504274,
 '成语': 0.8771929824561404,
 '文学': 0.8333333333333334,
 '谚语': 0.7575757575757577,
 '字义理解': 0.6944444444444445,
 '诗文写作': 1.5151515151515154,
 '汉字字形和拼音理解': 0.925925925925926,
 '医疗': 1.3157894736842106,
 '翻译': 1.3888888888888888,
 '抽象代数': 1.5625,
 '法律': 2.0833333333333335}

In [108]:
class_2_open_qa_weight_mapping = {
    '生成与创作': {'effectiveness': 0.6,
                 'fluency': 0.3,
                 'reliability': 0.1},
    '语义理解': {'effectiveness': 0.4,
                'fluency': 0.3,
                'reliability': 0.3},
    '百科': {'effectiveness': 0.2,
            'fluency': 0.1,
            'reliability': 0.7},
    '对话闲聊': {'effectiveness': 0.4,
                'fluency': 0.4,
                'reliability': 0.2},
    '计算能力': {'effectiveness': 0.2,
                'fluency': 0.1,
                'reliability': 0.7},
    '代码': {'effectiveness': 0.2,
            'fluency': 0.1,
            'reliability': 0.7},
    '成语': {'effectiveness': 0.5,
            'fluency': 0.25,
            'reliability': 0.25},
    '谚语': {'effectiveness': 0.4,
            'fluency': 0.4,
            'reliability': 0.2},
    '字义理解': {'effectiveness': 0.6,
                'fluency': 0.2,
                'reliability': 0.2},
    '诗文写作': {'effectiveness': 0.6,
                'fluency': 0.3,
                'reliability': 0.1},
    '法律': {'effectiveness': 0.3,
            'fluency': 0.2,
            'reliability': 0.5},
    '医疗': {'effectiveness': 0.3,
            'fluency': 0.2,
            'reliability': 0.5},
    '翻译': {'effectiveness': 0.6,
            'fluency': 0.3,
            'reliability': 0.1},
    '抽象代数': {'effectiveness': 0.4,
                'fluency': 0.2,
                'reliability': 0.4},
}

In [151]:
# model_name_objective = 'moss-sft.csv'
# model_name_subjective = 'pointwise_moss-sft.csv'

In [183]:
model_list = ['chatglm', 'chatglm2', 'davinci-003', 'chinese-alpaca-13b', 'moss-sft', 'baichuan-7b']
model_name_objective_list = [f'{m}.csv' for m in model_list]
model_name_subjective_list = [f'pointwise_{m}.csv' for m in model_list]

In [184]:
from collections import defaultdict
scores_all_c1 = {}
scores_all_c2 = {}

for model, model_name_objective, model_name_subjective in zip(model_list, model_name_objective_list, model_name_subjective_list):
    objective_acc_per_class_2 = defaultdict(int)
    dataset = pd.read_csv(os.path.join('dataset/evaluation/review_objective_question/', model_name_objective)).dropna(axis='columns', how='all')
    for d in dataset.iterrows():
        if d[1]['id'] == 150:
            continue

        sum_ += 1
        if d[1]['题型分类'] == '判断':
            if d[1]['answer'] == 'FALSE' and d[1]['new_answer'] == 'B':
                objective_acc_per_class_2[d[1]['二级分类']] += 1
            elif d[1]['answer'] == 'TRUE' and d[1]['new_answer'] == 'A':
                objective_acc_per_class_2[d[1]['二级分类']] += 1
    #             else:
    #                 print(d[1]['answer'])
    #                 print(d[1]['ori_answer'])
    #                 print(d[1]['new_answer'])
    #                 print('*'*100)
        if d[1]['题型分类'] == '选择': 
            if d[1]['answer'] == d[1]['new_answer']:
                objective_acc_per_class_2[d[1]['二级分类']] += 1
    objective_scores_per_class_2 = {c_2:class_2_score_per_question[c_2]*hit for c_2,hit in objective_acc_per_class_2.items()}
    
    
    subjective_point_per_class_2 = defaultdict(float)
    dataset = pd.read_csv(os.path.join('dataset/evaluation/subjective_question/', model_name_subjective)).dropna(axis='columns', how='all')
    for d in dataset.iterrows():
        if d[1]['id'] == 150:
            continue

        if d[1]['题型分类'] in ['推理', '证明', '应用', '问答']:
            if isinstance(d[1]['model_answer'], str) and not d[1]['model_answer']:
                effectiveness = 1
                reliability = 1
                fluency = 1
            elif isinstance(d[1]['model_answer'], str) and d[1]['model_answer']:
                effectiveness = int(d[1]['effectiveness'])
                reliability = int(d[1]['reliability'])
                fluency = int(d[1]['fluency'])
            elif math.isnan(d[1]['model_answer']):
                effectiveness = 1
                reliability = 1
                fluency = 1
            else: 
                effectiveness = int(d[1]['effectiveness'])
                reliability = int(d[1]['reliability'])
                fluency = int(d[1]['fluency'])
            weights = class_2_open_qa_weight_mapping[d[1]['二级分类']]
            score = (effectiveness*weights['effectiveness'] + reliability*weights['reliability'] + fluency*weights['fluency'])/3

            subjective_point_per_class_2[d[1]['二级分类']] += class_2_score_per_question[d[1]['二级分类']]*score
    
    scores_c1 = defaultdict(float)
    scores_c2 = defaultdict(float)
    
    for c_2,score in subjective_point_per_class_2.items():
        scores_c2[c_2] += score
        c_1 = class_2_class_1_mapping[c_2]
        scores_c1[c_1] += score
    for c_2,score in objective_scores_per_class_2.items():
        scores_c2[c_2] += score
        c_1 = class_2_class_1_mapping[c_2]
        scores_c1[c_1] += score
        
    scores_all_c1[model] = scores_c1
    scores_all_c2[model] = scores_c2    

In [185]:
subjective_point_per_class_2 = defaultdict(float)
dataset = pd.read_csv(os.path.join('dataset/evaluation/subjective_question/', model_name_subjective)).dropna(axis='columns', how='all')
for d in dataset.iterrows():
    if d[1]['id'] == 150:
        continue

    if d[1]['题型分类'] in ['推理', '证明', '应用', '问答']:
        if isinstance(d[1]['model_answer'], str) and not d[1]['model_answer']:
            effectiveness = 1
            reliability = 1
            fluency = 1
        elif isinstance(d[1]['model_answer'], str) and d[1]['model_answer']:
            effectiveness = int(d[1]['effectiveness'])
            reliability = int(d[1]['reliability'])
            fluency = int(d[1]['fluency'])
        elif math.isnan(d[1]['model_answer']):
            effectiveness = 1
            reliability = 1
            fluency = 1
        else: 
            effectiveness = int(d[1]['effectiveness'])
            reliability = int(d[1]['reliability'])
            fluency = int(d[1]['fluency'])
        weights = class_2_open_qa_weight_mapping[d[1]['二级分类']]
        score = (effectiveness*weights['effectiveness'] + reliability*weights['reliability'] + fluency*weights['fluency'])/3
        
        subjective_point_per_class_2[d[1]['二级分类']] += class_2_score_per_question[d[1]['二级分类']]*score

In [187]:
objective_scores_per_class_2

{'语义理解': 1.7094017094017095,
 '百科': 0.7575757575757576,
 '计算能力': 0.4166666666666667,
 '代码': 0.4273504273504274,
 '汉字字形和拼音理解': 4.62962962962963}

In [155]:
objective_scores_per_class_2 = {c_2:class_2_score_per_question[c_2]*hit for c_2,hit in objective_acc_per_class_2.items()}
objective_scores_per_class_2

{'语义理解': 0.8547008547008548,
 '百科': 0.4545454545454546,
 '计算能力': 0.4166666666666667,
 '代码': 1.7094017094017095,
 '成语': 1.7543859649122808,
 '文学': 2.5,
 '字义理解': 7.63888888888889,
 '汉字字形和拼音理解': 5.555555555555556,
 '翻译': 1.3888888888888888,
 '抽象代数': 4.6875,
 '法律': 2.0833333333333335}

In [156]:
scores = defaultdict(float)
for c_2,score in subjective_point_per_class_2.items():
    c_1 = class_2_class_1_mapping[c_2]
    scores[c_1] += score
for c_2,score in objective_scores_per_class_2.items():
    c_1 = class_2_class_1_mapping[c_2]
    scores[c_1] += score

In [157]:
scores

defaultdict(float,
            {'基础能力': 51.260646760646765,
             '中文特性': 42.878123338649665,
             '专业能力': 49.234283625731})

In [164]:
scores_all_c1

{'chatglm': defaultdict(float,
             {'基础能力': 82.23155085655091,
              '中文特性': 66.68328017012229,
              '专业能力': 69.51937134502923}),
 'chatglm2': defaultdict(float,
             {'基础能力': 86.69084619084622,
              '中文特性': 73.57168172957648,
              '专业能力': 78.42044346978558}),
 'davinci-003': defaultdict(float,
             {'基础能力': 90.47428034928039,
              '中文特性': 75.40315435052278,
              '专业能力': 81.80616471734893}),
 'chinese-alpaca-13b': defaultdict(float,
             {'基础能力': 70.68259518259521,
              '中文特性': 62.17326776537304,
              '专业能力': 63.63913255360624}),
 'baichuan-7b': defaultdict(float,
             {'基础能力': 46.209817959817954,
              '中文特性': 33.32912457912458,
              '专业能力': 42.637670565302145}),
 'moss-sft': defaultdict(float,
             {'基础能力': 51.260646760646765,
              '中文特性': 42.878123338649665,
              '专业能力': 49.234283625731})}

In [175]:
c1_record = []
for model,dic in scores_all_c1.items():
    dic['model_name'] = model
    c1_record.append(dic
    )
pd.DataFrame(c1_record).to_csv('dataset/eval_metrics/class_1_score.csv')

In [178]:
c2_record = []
for model,dic in scores_all_c2.items():
    dic['model_name'] = model
    c2_record.append(dic
    )
pd.DataFrame(c2_record).to_csv('dataset/eval_metrics/class_2_score.csv')

In [167]:
sum(list(scores_all_c1['chatglm'].values()))

218.43420237170244

In [188]:
c2_record_normalized = []
for model,dic in scores_all_c2.items():
    normalized_dic = {'model_name': model}
    for c2,score in dic.items():
        if c2 != 'model_name':
            c1 = class_2_class_1_mapping[c2]
            devide = score_per_class2_in_different_class1[c1]
            score = score*100/devide
            normalized_dic[c2] = score

    c2_record_normalized.append(normalized_dic)
pd.DataFrame(c2_record_normalized).to_csv('dataset/eval_metrics/class_2_score_normalized.csv')

In [181]:
scores_all_c2

{'chatglm': defaultdict(float,
             {'生成与创作': 16.507936507936513,
              '语义理解': 12.678062678062684,
              '百科': 15.6868686868687,
              '对话闲聊': 15.505050505050512,
              '计算能力': 9.375,
              '代码': 12.47863247863248,
              '谚语': 16.616161616161623,
              '字义理解': 4.6759259259259265,
              '诗文写作': 16.565656565656568,
              '医疗': 23.859649122807017,
              '翻译': 11.527777777777775,
              '抽象代数': 13.645833333333334,
              '法律': 20.48611111111111,
              '成语': 6.140350877192983,
              '文学': 12.5,
              '汉字字形和拼音理解': 10.185185185185187,
              'model_name': 'chatglm'}),
 'chatglm2': defaultdict(float,
             {'生成与创作': 16.45502645502646,
              '语义理解': 12.692307692307697,
              '百科': 15.2878787878788,
              '对话闲聊': 15.606060606060613,
              '计算能力': 12.333333333333334,
              '代码': 14.31623931623932,
              '谚语': 1