In [None]:
# common

# common imports
import sys
import os
from pathlib import Path
from typing import Any, Callable, Dict, Literal, Optional, Protocol, Tuple, List
from functools import cmp_to_key

import json

# Add our code to import path
sys.path.insert(0, "/workspace/code")

from otgpt_hft.data_model.serial.store import Store
from otgpt_hft.data_model.serial.entry import SerializedEntry
from otgpt_hft.data_model.source import OAnnoSource
from otgpt_hft.data_model.dialogue.graph import DialogueGraph


In [5]:
# Load the dev split: choose the dataset/split, open its annotation store,
# and select the annotator whose comparisons are evaluated by later cells.
dataset_name = "Thaweewat-oasst1_th"
split = "dev-2024-02-12--2"

# NOTE: replace with the path to your Human Feedback Tool data store
data_split_dir = Path(f"/workspace/data/store/{dataset_name}/{split}")
print("data_split_dir", data_split_dir)

# NOTE(review): the directory is created when it does not exist, so a
#       mistyped path is not reported here; presumably the store is then
#       simply empty -- verify the printed path above
os.makedirs(data_split_dir, exist_ok=True)
store = Store(SerializedEntry, data_split_dir)

# load the annotation storage
# (top-level `await`: this cell must run in a notebook / async-aware REPL)
await store.load_chunks()

# annotator whose comparison data is used for computing scores
# (placeholder value -- presumably to be replaced with a real annotator id)
# NOTE: must be used with a complete annotation;
#       this script does not work with partial annotations
user = "user/<username>"

data_split_dir /workspace/data/store/Thaweewat-oasst1_th/dev-2024-02-12--2


In [7]:
# get all data entries
# (requests the range from 0 to `len(store)`; presumably this covers every
#  entry in the store -- confirm the end-bound semantics of `get_entries`)
entries = await store.get_entries(0, len(store))

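Show the annotation of a given user: for every entry, the responses ranked from best to worst, with the annotator's comparison between each pair of neighbours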

In [22]:
for idx, entry in enumerate(entries):

    # build the dialogue graph of this entry
    graph = DialogueGraph(entry)

    # comparison data given by the annotator (user) for the
    # responses to the initial prompt (root node)
    cmp = graph.root.get_cmp(user)

    # ids of all responses attached to the root
    res_ids = graph.root._next

    def compare(res_id_1, res_id_2):
        """Turn the annotator's comparison symbol into the sign `cmp_to_key` expects."""
        c = cmp.get_cmp(res_id_1, res_id_2)
        sign = {">": 1, "<": -1, "=": 0}.get(c)
        # anything other than the three known symbols is a data error
        assert sign is not None, f"c {c}"
        return sign

    # copy, then order in place from most to least preferred
    sorted_res_ids = list(res_ids)
    sorted_res_ids.sort(key=cmp_to_key(compare), reverse=True)

    print("entry", idx, graph.root.unit.id)
    for pos, res_id in enumerate(sorted_res_ids):
        # between two neighbouring responses, show how the annotator compared them
        if pos > 0:
            print(cmp.get_cmp(sorted_res_ids[pos - 1], res_id))
        print(graph.nodes[res_id].unit.source.name)
    print("================================================================")

entry 0 b2ea0598-3bdc-4d91-a9c4-ea2dd8db47fd
openai/chatgpt-3.5/2024-02-06
=
openai/chatgpt-4/2024-02-06
>
Thaweewat-oasst1_th
=
SeaLLMs/SeaLLM-7B-v2
>
Thaweewat-oasst1_th
>
aisingapore/sealion7b-instruct-nc
=
pythainlp/wangchanglm-7.5B-sft-en
=
pythainlp/wangchanglm-7.5B-sft-enth
>
llama2-7b-finetune-hf
=
llama2-7b-finetune-hf/dpo/002-001-dpo-temp-0_3-v-all-ref/checkpoint-4
>
mistralai/Mistral-7B-Instruct-v0.2
>
scb10x/typhoon-7b
entry 1 3e0fc4b0-ac79-4cd3-842c-aebb81231eb5
Thaweewat-oasst1_th
>
openai/chatgpt-3.5/2024-02-06
=
openai/chatgpt-4/2024-02-06
>
llama2-7b-finetune-hf
=
llama2-7b-finetune-hf/dpo/002-001-dpo-temp-0_3-v-all-ref/checkpoint-4
=
pythainlp/wangchanglm-7.5B-sft-en
>
Thaweewat-oasst1_th
>
Thaweewat-oasst1_th
>
aisingapore/sealion7b-instruct-nc
=
pythainlp/wangchanglm-7.5B-sft-enth
>
mistralai/Mistral-7B-Instruct-v0.2
>
scb10x/typhoon-7b
>
SeaLLMs/SeaLLM-7B-v2
entry 2 fcd988b4-141c-4919-aa90-b0d7484f7833
Thaweewat-oasst1_th
=
llama2-7b-finetune-hf
=
llama2-7b-finetun

In [16]:
# List of models (response source names) to evaluate.
# It also serves as the order in which the models are processed, and as a
# filter: responses whose source name is not listed here are left out of
# the ranking. Commented-out entries are therefore excluded from evaluation.
order = [
    "Thaweewat-oasst1_th",
    "openai/chatgpt-3.5/2024-02-06",
    "openai/chatgpt-4/2024-02-06",
    "llama2-7b-finetune-hf",
    "llama2-7b-finetune-hf/dpo/002-001-dpo-temp-0_3-v-all-ref/checkpoint-4",
    "SeaLLMs/SeaLLM-7B-v2",
    "aisingapore/sealion7b-instruct-nc",
    # "pythainlp/wangchanglm-7.5B-sft-en",
    "pythainlp/wangchanglm-7.5B-sft-enth",
    "mistralai/Mistral-7B-Instruct-v0.2",
    "scb10x/typhoon-7b",
]

In [34]:
# percentile scores of each model: one score per data entry, in entry order
entries_score = {
    model: []
    for model in order
}


def _true_ranks(relations) -> List[float]:
    """Convert comparisons between neighbouring sorted responses into tie-averaged ranks.

    `relations[i]` is the comparison result between sorted response `i` and
    sorted response `i + 1` (best response first). A `">"` closes the current
    tie group; any other value keeps the next response in the same group.
    Every member of a tie group receives the mean of the 1-based positions
    the group occupies, e.g. for A > B = C > D the result is
    [1.0, 2.5, 2.5, 4.0].

    Returns a list of `len(relations) + 1` ranks.
    """
    ranks: List[float] = []
    group_start = 1  # 1-based position of the first response of the group
    group_size = 1
    for relation in relations:
        if relation == ">":
            ranks.extend([group_start + (group_size - 1) / 2] * group_size)
            group_start += group_size
            group_size = 1
        else:
            group_size += 1
    # close the last group
    ranks.extend([group_start + (group_size - 1) / 2] * group_size)
    return ranks


# iterate over each data entry
for idx, entry in enumerate(entries):
    # compute graph
    graph = DialogueGraph(entry)

    # get comparison data of annotator (user)
    cmp = graph.root.get_cmp(user)

    # ids of the responses to rank (only sources listed in `order`)
    res_ids = [
        res_id
        for res_id in graph.root._next
        if graph.nodes[res_id].unit.source.name in order
    ]

    # compare function for sorting based on comparison data `cmp`
    def compare(res_id_1, res_id_2):
        c = cmp.get_cmp(res_id_1, res_id_2)
        if c == ">":
            return 1
        if c == "<":
            return -1
        if c == "=":
            return 0
        # explicit raise: unlike `assert`, this is not stripped under `-O`
        raise ValueError(
            f"entry {idx}: unexpected comparison result {c!r} "
            f"between responses {res_id_1} and {res_id_2}"
        )

    # sort response ids from best to worst based on comparison data `cmp`
    sorted_res_ids = sorted(res_ids, key=cmp_to_key(compare), reverse=True)
    total = len(sorted_res_ids)

    # comparison between each pair of neighbours in the sorted list
    # NOTE: any result other than ">" is treated as a tie by `_true_ranks`
    relations = [
        cmp.get_cmp(prev_res_id, res_id)
        for prev_res_id, res_id in zip(sorted_res_ids, sorted_res_ids[1:])
    ]

    # tie-averaged ("true") rank of each response of this entry, per model.
    # there can be several ranks per model since there are multiple
    # reference human annotations (Thaweewat-oasst1_th) per prompt
    entry_rankings = {
        model: []
        for model in order
    }
    for res_id, true_rank in zip(sorted_res_ids, _true_ranks(relations)):
        entry_rankings[graph.nodes[res_id].unit.source.name].append(true_rank)

    # fail with context instead of a bare ZeroDivisionError below
    missing = [name for name in order if not entry_rankings[name]]
    if missing:
        raise ValueError(
            f"entry {idx} ({graph.root.unit.id}): no ranked response from {missing}; "
            "every model in `order` needs at least one response per entry"
        )
    if total < 2:
        raise ValueError(
            f"entry {idx} ({graph.root.unit.id}): percentile is undefined "
            f"with {total} ranked response(s)"
        )

    # go over each model
    for name in order:
        rankings = entry_rankings[name]

        # average over all responses of this model
        avg_rank = sum(rankings) / len(rankings)

        # percentile: 1.0 for rank 1, 0.0 for rank `total`
        score = 1 - (avg_rank - 1) / (total - 1)

        entries_score[name].append(score)

In [35]:
# average the per-entry percentiles of every model, scaled to 0-100
model_scores = []
for name in order:
    per_entry = entries_score[name]
    # each model must have exactly one score per entry
    assert len(per_entry) == len(entries)
    model_scores.append((sum(per_entry) / len(entries) * 100, name))

# best model first (ties fall back to the model name, also descending)
model_scores.sort(reverse=True)
for mean_score, model in model_scores:
    print(f"{mean_score:.2f}\t", model)

84.73	 openai/chatgpt-4/2024-02-06
72.23	 openai/chatgpt-3.5/2024-02-06
62.57	 Thaweewat-oasst1_th
53.36	 llama2-7b-finetune-hf
52.45	 aisingapore/sealion7b-instruct-nc
52.36	 llama2-7b-finetune-hf/dpo/002-001-dpo-temp-0_3-v-all-ref/checkpoint-4
45.55	 pythainlp/wangchanglm-7.5B-sft-enth
41.73	 SeaLLMs/SeaLLM-7B-v2
17.23	 mistralai/Mistral-7B-Instruct-v0.2
3.41	 scb10x/typhoon-7b
