# Elo Rating

In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
import os
import plotly.express as px
from tqdm import tqdm
import json
import glob
pd.options.display.float_format = '{:.2f}'.format

In [8]:
# Directory holding one "<Metric>_*.csv" result file per metric.
cache_dir = "results/gre"
metrics_names = ['Completeness', 'Factualness', 'Granularity', 'Topical', 'Uniqueness']

# glob returns matches in arbitrary order; sort the base names so the grouped
# lists (and whichever file ends up first in each list) are identical on every run.
results_files = sorted(
    os.path.basename(path) for path in glob.glob(os.path.join(cache_dir, "*.csv"))
)

# Group the file names by the metric whose name they start with.
results_files_grouped = defaultdict(list)
for f in results_files:
    for m in metrics_names:
        if f.startswith(m):
            results_files_grouped[m].append(f)


In [9]:
# Inspect which result files were matched to each metric.
results_files_grouped

defaultdict(list,
            {'Completeness': ['Completeness_GREScores.csv'],
             'Factualness': ['Factualness_GREScores.csv'],
             'Granularity': ['Granularity_GREScores.csv'],
             'Topical': ['Topical_GREScores.csv'],
             'Uniqueness': ['Uniqueness_GREScores.csv']})

In [10]:
# Load one table per metric. Only the first file listed for a metric is read;
# any further files in that metric's list are ignored.
data_merged = {
    metric_name: pd.read_csv(os.path.join(cache_dir, files[0]))
    for metric_name, files in results_files_grouped.items()
}

In [11]:
def compute_elo(battles, K, SCALE, BASE, INIT_RATING):
    """Apply sequential Elo updates over a table of pairwise battles.

    Args:
        battles: DataFrame with the columns ``model_A_name``, ``model_B_name``
            and ``win``. Rows are processed in their existing order.
        K: Step size multiplying each rating adjustment.
        SCALE: Divisor applied to the rating difference in the exponent.
        BASE: Base of the exponent in the expected-score formula.
        INIT_RATING: Rating given to a model the first time it is looked up.

    Returns:
        A ``defaultdict`` mapping model name to its rating after all rows.

    Raises:
        Exception: If a ``win`` value is not ``"model_A_win"``,
            ``"model_B_win"``, ``"tie"`` or ``"tie (bothbad)"``.
    """
    # Score credited to model A for each recognised vote label.
    score_for_a = {
        "model_A_win": 1,
        "model_B_win": 0,
        "tie": 0.5,
        "tie (bothbad)": 0.5,
    }
    rating = defaultdict(lambda: INIT_RATING)

    votes = battles[['model_A_name', 'model_B_name', 'win']]
    for name_a, name_b, outcome in votes.itertuples(index=False):
        current_a = rating[name_a]
        current_b = rating[name_b]
        # Expected scores of each side given the current rating gap.
        expected_a = 1 / (1 + BASE ** ((current_b - current_a) / SCALE))
        expected_b = 1 / (1 + BASE ** ((current_a - current_b) / SCALE))

        if outcome not in score_for_a:
            raise Exception(f"unexpected vote {outcome}")
        actual_a = score_for_a[outcome]

        # Model B's actual score is the complement of model A's.
        rating[name_a] += K * (actual_a - expected_a)
        rating[name_b] += K * (1 - actual_a - expected_b)

    return rating

In [12]:
def preety_print_elo_ratings(elo_ratings):
    """Turn a model -> rating mapping into a ranked table.

    Args:
        elo_ratings: Mapping from model name to numeric Elo rating.

    Returns:
        DataFrame with columns ``Model`` and ``Elo rating``, ordered from the
        highest rating to the lowest, with ratings converted to integers and
        the index starting at 1.
    """
    rows = [[name, score] for name, score in elo_ratings.items()]
    table = pd.DataFrame(rows, columns=["Model", "Elo rating"])
    table = table.sort_values("Elo rating", ascending=False)
    table = table.reset_index(drop=True)
    # Adding 0.5 before the integer cast rounds half up for positive ratings.
    table["Elo rating"] = (table["Elo rating"] + 0.5).astype(int)
    # Shift the index so the top-ranked model is row 1 rather than row 0.
    table.index = table.index + 1
    return table

# Elo hyper-parameters applied to every metric.
param_K = 16
param_SCALE = 400
param_BASE = 10
param_INIT_RATING = 1000

# Directory receiving one plain-text ranking table per metric. Create it up
# front so a fresh checkout does not fail inside open() below.
output_dir = 'output/gre'
os.makedirs(output_dir, exist_ok=True)

elo_ratings_dict = {}

for metric_name in metrics_names:
    elo_ratings = compute_elo(
        data_merged[metric_name],
        K=param_K,
        SCALE=param_SCALE,
        BASE=param_BASE,
        INIT_RATING=param_INIT_RATING,
    )
    elo_ratings_dict[metric_name] = elo_ratings

    # Build the ranking table once and reuse it for the printout and the file.
    elo_table = preety_print_elo_ratings(elo_ratings)
    print(metric_name)
    print(elo_table)

    # save to txt
    with open(f'{output_dir}/{metric_name}_elo_ratings.txt', 'w') as f:
        f.write(elo_table.to_string())

Completeness
         Model  Elo rating
1  Groundtruth        1198
2     OpenChat         963
3  GPT-4-Turbo         949
4   LLaMA2-70b         891
Factualness
         Model  Elo rating
1     OpenChat        1123
2   LLaMA2-70b        1081
3  GPT-4-Turbo        1079
4  Groundtruth         717
Granularity
         Model  Elo rating
1   LLaMA2-70b        1021
2  GPT-4-Turbo        1021
3     OpenChat         995
4  Groundtruth         962
Topical
         Model  Elo rating
1     OpenChat        1219
2  GPT-4-Turbo        1160
3   LLaMA2-70b        1006
4  Groundtruth         615
Uniqueness
         Model  Elo rating
1     OpenChat        1061
2   LLaMA2-70b        1037
3  GPT-4-Turbo        1036
4  Groundtruth         866


# Agreement

In [22]:
data_merged_human = {}

# Directory searched for the per-metric CSV files; it is the same for every metric.
file_dir = 'results/human'

for metric_name in metrics_names:
    # glob's ordering is arbitrary, so sort to make the "first" match stable
    # when more than one file shares the metric prefix.
    file_list = sorted(glob.glob(os.path.join(file_dir, f"{metric_name}_*.csv")))
    if not file_list:
        # Fail with the offending metric and directory instead of a bare IndexError.
        raise FileNotFoundError(f"no '{metric_name}_*.csv' file found in {file_dir}")
    file = file_list[0]

    df = pd.read_csv(file)
    data_merged_human[metric_name] = df


Completeness
Factualness
Granularity
Topical
Uniqueness


In [23]:
# Preview the table loaded from results/human for the 'Completeness' metric.
data_merged_human['Completeness']

Unnamed: 0,sample_id,model_A_name,model_B_name,win
0,0,Groundtruth,LLaMA2-70b,model_A_win
1,0,Groundtruth,Openchat,model_A_win
2,0,Groundtruth,GPT-4-Turbo,model_A_win
3,0,LLaMA2-70b,Openchat,model_A_win
4,0,LLaMA2-70b,GPT-4-Turbo,tie
...,...,...,...,...
595,69,Groundtruth,Openchat,model_A_win
596,69,Groundtruth,GPT-4-Turbo,model_A_win
597,69,LLaMA2-70b,Openchat,tie
598,69,LLaMA2-70b,GPT-4-Turbo,model_B_win


In [24]:
def cal_agreement(human_1, human_2, metric_name):
    """Print and return the agreement score between two label tables.

    Rows are paired by position, and only the first ``len(human_1)`` rows of
    ``human_2`` are used. A pair scores 1 when both ``win`` labels are equal,
    0.5 when they differ but either one is exactly ``"tie"``, and 0 otherwise.
    The score is the mean over all pairs.

    NOTE(review): ``"tie (bothbad)"`` gets no half credit here, although
    ``compute_elo`` treats it as a tie. Confirm whether that is intended.

    Args:
        human_1: DataFrame with a ``win`` column; its length sets the pair count.
        human_2: DataFrame with a ``win`` column and at least as many rows.
        metric_name: Label used as the prefix of the printed line.

    Returns:
        The agreement score as a float, or ``nan`` when ``human_1`` has no rows.

    Raises:
        IndexError: If ``human_2`` has fewer rows than ``human_1``.
    """
    # Pull each column out once instead of rebuilding a row Series per iteration.
    labels_1 = human_1["win"].tolist()
    labels_2 = human_2["win"].tolist()
    if len(labels_2) < len(labels_1):
        raise IndexError(
            f"human_2 has {len(labels_2)} rows but human_1 has {len(labels_1)}"
        )

    agree = 0.
    # zip stops at the end of labels_1, so extra rows in human_2 are ignored.
    for label_1, label_2 in zip(labels_1, labels_2):
        if label_1 == label_2:
            agree += 1
        elif label_1 == "tie" or label_2 == "tie":
            agree += 0.5

    total = len(labels_1)
    # With no rows there is nothing to average; report nan instead of dividing by zero.
    score = agree / total if total else float("nan")

    print(f'{metric_name}: human aggreement score is: {score}')
    return score

# For each metric, compare the table loaded from results/human with the one
# loaded from cache_dir; rows are paired by position inside cal_agreement.
for metric_name in metrics_names:
    cal_agreement(data_merged_human[metric_name], data_merged[metric_name], metric_name)

Completeness: human aggreement score is: 0.7958333333333333
Factualness: human aggreement score is: 0.6225
Granularity: human aggreement score is: 0.6516666666666666
Topical: human aggreement score is: 0.7191666666666666
Uniqueness: human aggreement score is: 0.6608333333333334
