In [1]:
import os
import ast
import json
import torch
import pickle
import numpy as np
from tqdm import tqdm
from collections import defaultdict, Counter, OrderedDict
from pathlib import Path
from datasets import load_dataset, Dataset
import pandas as pd
import random
import collections
from scipy.stats import pearsonr
from pathlib import Path
from typing import Dict, List, Sequence
import copy
from typing import Dict, Any, List
random.seed(9001)

In [2]:
import mteb
from mteb.task_selection import results_to_dataframe
from mteb.abstasks.AbsTask import AbsTask
from mteb.load_results.mteb_results import MTEBResults
from mteb.model_meta import ModelMeta

### Environmental Variables

In [17]:
RESULT_DIR = 
DATASET_DIR = 

In [3]:
def load_results(result_dir: str,
                    models: List[str],
                    tasks: List[str],
                    score_name: str='main_score') -> Dict[str, Dict[str, Any]]:
    """
    Load PIR results and compute the mean score across tasks for each model.

    Args:
        result_dir (str): Directory containing the results.
        models (List[str]): List of model names.
        tasks (List[str]): List of task names.

    Returns:
        Dict[str, Dict[str, Any]]: Nested dictionary with model names as keys,
            each containing a dictionary of task scores and a 'mean' key for the average score.
    """
    
    results = defaultdict(dict)
    for _model in models:
        _model_path = os.path.join(result_dir, _model.replace('/', '__'))
        assert os.path.isdir(_model_path), f"Model {_model} not found in {result_dir}"
        
        # Only one directory under _model_path
        directories = [d for d in os.listdir(_model_path) if os.path.isdir(os.path.join(_model_path, d))]
        assert len(directories) == 1, f"Model {_model} has more than one directory"
        
        sub_directories = [
            d for d in os.listdir(os.path.join(_model_path, directories[0]))
            if os.path.isdir(os.path.join(_model_path, directories[0], d))
        ]
        assert len(sub_directories) == 1, f"Model {_model} has more than one sub-directory"
        
        scores = []  # List to store scores for computing the mean
        for _task in tasks:
            _task_path = os.path.join(
                _model_path, directories[0], sub_directories[0], f'{_task}.json'
            )
            if os.path.isfile(_task_path):
                with open(_task_path, 'r') as f:
                    score = np.mean([_score[score_name] for _score in json.load(f)['scores']['test']])
                    results[_model][_task] = score
                    scores.append(score)
            else:
                print(f"Task {_task} not found in {result_dir}/{_model}")
                results[_model][_task] = 0
                scores.append(0)
        
        # Compute and store the mean score for the current model
        if scores:
            mean_score = sum(scores) / len(scores)
            results[_model]['mean'] = mean_score
        else:
            results[_model]['mean'] = None  # Handle case with no scores

    return results

def highlight_max_in_column(s):
    """
    Highlights the maximum value in a Series with bold text.

    Args:
        s (pd.Series): A pandas Series representing a column in the DataFrame.

    Returns:
        List[str]: A list of styles for each element in the Series.
    """
    is_max = s == s.max()
    return ['font-weight: bold' if v else '' for v in is_max]

In [4]:
def load_finegrained_results(result_dir: str,
                    models: List[str],
                    tasks: List[str],
                    score_name: str = 'main_score') -> Dict[str, Dict[str, Any]]:
    """
    Load PIR results and compute the mean score across tasks for each model.

    Args:
        result_dir (str): Directory containing the results.
        models (List[str]): List of model names.
        tasks (List[str]): List of task names.

    Returns:
        Dict[str, Dict[str, Any]]: Nested dictionary with model names as keys,
            each containing a dictionary of task scores and a 'mean' key for the average score.
    """
    
    results = defaultdict(dict)
    for _model in models:
        _model_path = os.path.join(result_dir, _model.replace('/', '__'))
        assert os.path.isdir(_model_path), f"Model {_model} not found in {result_dir}"
        
        # Only one directory under _model_path
        directories = [d for d in os.listdir(_model_path) if os.path.isdir(os.path.join(_model_path, d))]
        assert len(directories) == 1, f"Model {_model} has more than one directory"
        
        sub_directories = [
            d for d in os.listdir(os.path.join(_model_path, directories[0]))
            if os.path.isdir(os.path.join(_model_path, directories[0], d))
        ]
        assert len(sub_directories) == 1, f"Model {_model} has more than one sub-directory"
        
        scores = []  # List to store scores for computing the mean
        for _task in tasks:
            _task_path = os.path.join(
                _model_path, directories[0], sub_directories[0], f'{_task}.json'
            )
            if os.path.isfile(_task_path):
                with open(_task_path, 'r') as f:
                    for _hf_subset, _score in [(_score['hf_subset'], _score[score_name]) for _score in json.load(f)['scores']['test']]:
                        results[_model][_task + '-' + _hf_subset] = _score
                        scores.append(_score)
            else:
                print(f"Task {_task} not found in {result_dir}/{_model}")
                results[_model][_task] = 0
                scores.append(0)
        
        # Compute and store the mean score for the current model
        if scores:
            mean_score = sum(scores) / len(scores)
            results[_model]['mean'] = mean_score
        else:
            results[_model]['mean'] = None  # Handle case with no scores

    return results

## 1. Stardard Code Retrieval

In [18]:
tasks = [
    'CodeNetBugPreferenceRetrieval',
    'CodeNetEfficiencyPreferenceRetrieval',
    'CVEFixesPreferenceRetrieval',
    'Defects4JPreferenceRetrieval',
    'DeprecatedCodePreferenceRetrieval',
    'SaferCodePreferenceRetrieval',
    'SQLR2PreferenceRetrieval',
]

models = [
    'bm25s',
    'facebook__contriever',
    'Alibaba-NLP__gte-base-en-v1.5',
    'sentence-transformers__gtr-t5-base',
    'sentence-transformers__gtr-t5-large',
    'intfloat__e5-base-v2',
    'intfloat__e5-large-v2',
    'Alibaba-NLP__gte-Qwen2-1.5B-instruct',
    'intfloat__e5-mistral-7b-instruct',
    'hkunlp__instructor-base',
    'hkunlp__instructor-large',
    'hkunlp__instructor-xl',
    'samaya-ai__promptriever-llama2-7b-v1',
    'samaya-ai__promptriever-llama3.1-8b-v1',
    'samaya-ai__promptriever-llama3.1-8b-instruct-v1',
    'samaya-ai__promptriever-mistral-v0.1-7b-v1',
    'openai__text-embedding-ada-002',
    'openai__text-embedding-3-small',
    'openai__text-embedding-3-large',
    'voyageai__voyage-code-2',
    'voyageai__voyage-code-3',
    'microsoft__codebert-base',
    'microsoft__graphcodebert-base',
    'nomic-ai__CodeRankEmbed',
    'local-repllama-llama31-8b-lora-64',
    'local-repllama-llama31-8b-lora-64-quality',
    'local-repllama-llama32-3b-lora-256',
    'local-repllama-llama32-3b-lora-256-quality',
    'codesage__codesage-small',
    'codesage__codesage-base',
]

### nDCG@10

In [19]:
data = load_results(RESULT_DIR, models, tasks)
df = pd.DataFrame.from_dict(data, orient='index').style.apply(highlight_max_in_column, axis=0)

df

Unnamed: 0,CodeNetBugPreferenceRetrieval,CodeNetEfficiencyPreferenceRetrieval,CVEFixesPreferenceRetrieval,Defects4JPreferenceRetrieval,DeprecatedCodePreferenceRetrieval,SaferCodePreferenceRetrieval,SQLR2PreferenceRetrieval,mean
bm25s,0.02369,0.016018,0.686708,0.64952,0.513111,0.562674,0.40744,0.408452
facebook__contriever,0.042602,0.03556,0.577292,0.4965,0.373674,0.461494,0.18176,0.30984
Alibaba-NLP__gte-base-en-v1.5,0.050217,0.040711,0.75677,0.7412,0.553405,0.706559,0.13802,0.426697
sentence-transformers__gtr-t5-base,0.037531,0.031757,0.755962,0.74681,0.535365,0.729153,0.10686,0.420491
sentence-transformers__gtr-t5-large,0.072997,0.062108,0.79696,0.78222,0.595779,0.75348,0.16223,0.460825
intfloat__e5-base-v2,0.078253,0.062887,0.815572,0.80384,0.67052,0.77015,0.4279,0.518446
intfloat__e5-large-v2,0.15084,0.13421,0.816638,0.81365,0.67969,0.759033,0.42472,0.539826
Alibaba-NLP__gte-Qwen2-1.5B-instruct,0.122938,0.114149,0.843964,0.80937,0.735735,0.786633,0.19978,0.516081
intfloat__e5-mistral-7b-instruct,0.329696,0.296498,0.818562,0.82422,0.72486,0.764849,0.30834,0.581004
hkunlp__instructor-base,0.044007,0.036884,0.802858,0.80085,0.613234,0.743686,0.12445,0.452281


### MRR@10

In [20]:
data = load_results(RESULT_DIR, models, tasks, 'mrr_at_10')
df = pd.DataFrame.from_dict(data, orient='index').style.apply(highlight_max_in_column, axis=0)

df

Unnamed: 0,CodeNetBugPreferenceRetrieval,CodeNetEfficiencyPreferenceRetrieval,CVEFixesPreferenceRetrieval,Defects4JPreferenceRetrieval,DeprecatedCodePreferenceRetrieval,SaferCodePreferenceRetrieval,SQLR2PreferenceRetrieval,mean
bm25s,0.027623,0.017121,0.620801,0.571994,0.345391,0.480747,0.367719,0.347343
facebook__contriever,0.058414,0.047684,0.498378,0.430755,0.299047,0.362177,0.165324,0.265968
Alibaba-NLP__gte-base-en-v1.5,0.067853,0.055278,0.675761,0.671603,0.458251,0.615709,0.120718,0.380739
sentence-transformers__gtr-t5-base,0.054817,0.043855,0.670361,0.66468,0.440766,0.634615,0.084533,0.370518
sentence-transformers__gtr-t5-large,0.100716,0.08156,0.715084,0.707569,0.493739,0.654435,0.127128,0.411461
intfloat__e5-base-v2,0.108133,0.085871,0.740699,0.734029,0.554018,0.682599,0.385947,0.470185
intfloat__e5-large-v2,0.195422,0.17444,0.741656,0.745225,0.561858,0.670862,0.381011,0.495782
Alibaba-NLP__gte-Qwen2-1.5B-instruct,0.15797,0.148821,0.785788,0.740444,0.617184,0.70308,0.154275,0.472509
intfloat__e5-mistral-7b-instruct,0.399009,0.356252,0.749444,0.758945,0.61143,0.676966,0.2609,0.544706
hkunlp__instructor-base,0.060527,0.048069,0.724202,0.725136,0.510993,0.644316,0.106088,0.402761


## 2. Preference-guided Code Retrieval

#### Load golden labels

In [29]:
def load_jsonl(filepath):
    data = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():  # skip empty lines
                data.append(json.loads(line))
    return data

In [30]:
# the map between tasks an subtasks
task_sub_task_mapping = {
    "CodeNetBugPreferenceRetrieval": ["c","cpp", "go", "java", "javascript", "python", "ruby", "rust", "swift", "typescript"],
    "CodeNetEfficiencyPreferenceRetrieval": ["c","cpp", "go", "java", "javascript", "python", "ruby", "rust", "swift", "typescript"],
    "CVEFixesPreferenceRetrieval": ['c', 'go', 'java', 'python', 'ruby'],
    "Defects4JPreferenceRetrieval": [None],
    "DeprecatedCodePreferenceRetrieval": ['numpy', 'pandas', 'pytorch', 'scipy', 'seaborn', 'sklearn', 'tensorflow', 'transformers'],
    "SaferCodePreferenceRetrieval": ["c", "cpp", "python", "java", "javascript", "go", "ruby"],
    "SQLR2PreferenceRetrieval": [None],
}

In [31]:
# load the golden qrels
datadir_mapping = {
    "CodeNetBugPreferenceRetrieval": "codenet_bug",
    "CodeNetEfficiencyPreferenceRetrieval": "codenet_effi",
    "CVEFixesPreferenceRetrieval": "CVEFixes",
    "Defects4JPreferenceRetrieval": "Defects4J",
    "DeprecatedCodePreferenceRetrieval": "DeprecatedCode",
    "SaferCodePreferenceRetrieval": "SafeCoder",
    'SQLR2PreferenceRetrieval': "sqlr2",
}

In [32]:
def get_lang(_doc_str):
    return _doc_str.split("-")[1]

def get_lang_codenet(_doc_str):
    return _doc_str.split("-")[-2].lower()

def load_qrels(task, lang):
    if task=='DeprecatedCodePreferenceRetrieval':
        qrels_path = os.path.join(DATASET_DIR, datadir_mapping[task], f'qrels-{lang}.jsonl')
    else:
        qrels_path = os.path.join(DATASET_DIR, datadir_mapping[task], 'qrels.jsonl')

    lines = load_jsonl(qrels_path)

    # filtering only for CVEFixesPreferenceRetrieval
    if task == 'CVEFixesPreferenceRetrieval' or task == 'SaferCodePreferenceRetrieval':
        tmp_lines = []
        for _line in lines:
            filtered_pos_docids = [_id for _id in _line['pos-docids'] if get_lang(_id) == lang]
            filtered_neg_docids = [_id for _id in _line['neg-docids'] if get_lang(_id) == lang]
            _line['pos-docids'] = filtered_pos_docids
            _line['neg-docids'] = filtered_neg_docids
            if len(filtered_neg_docids) != 0 and len(filtered_neg_docids) != 0:
                tmp_lines.append(_line)

        lines = tmp_lines

    if task.startswith("CodeNet"):
        tmp_lines = []
        for _line in lines:
            filtered_pos_docids = [_id for _id in _line['pos-docids'] if get_lang_codenet(_id) == lang]
            filtered_neg_docids = [_id for _id in _line['neg-docids'] if get_lang_codenet(_id) == lang]
            _line['pos-docids'] = filtered_pos_docids
            _line['neg-docids'] = filtered_neg_docids
            if len(filtered_neg_docids) != 0 and len(filtered_neg_docids) != 0:
                tmp_lines.append(_line)

        lines = tmp_lines

    qrels_dict = {_line['qid']: _line for _line in lines}
    return qrels_dict

#### Obtain the prediction

In [33]:
def load_predictions(model, task, subtask):
    prediction_file = f'{task}_{subtask}_predictions.json' if subtask is not None else f'{task}_default_predictions.json'
    prediction_path = os.path.join(result_dir, model, prediction_file)

    with open(prediction_path, 'r') as f:
        json_dict = json.loads(f.read())

    json_sorted_dict = {}
    for _k, _v in json_dict.items():
        sorted_doc_values = sorted([(_docid, _score) for _docid, _score in _v.items()], key=lambda x: x[1], reverse=True)
        json_sorted_dict[_k] = [_item[0] for _item in sorted_doc_values]

    return json_sorted_dict

## Metrics for Quality-aware Code Information Retrieval

Assume for each query, we have:
- A set of positive samples: $\mathcal{P} = \{p_1, p_2, \dots, p_m\}$
- A set of negative samples: $\mathcal{N} = \{n_1, n_2, \dots, n_n\}$
- Each sample has a model score: $s(x)$
- Total number of candidates: $C = |\mathcal{P}| + |\mathcal{N}|$

### 2.1: Pairwise Preference Accuracy (PPA)

Measures the proportion of positive-negative pairs where the positive is scored higher:

$$
\text{PPA} = \frac{1}{|\mathcal{P}| \cdot |\mathcal{N}|} \sum_{p \in \mathcal{P}} \sum_{n \in \mathcal{N}} \mathbb{1}(s(p) > s(n))
$$

- **Upper bound (best case)**: $\text{PPA} = 1$, meaning all positives are scored above all negatives.
- **Lower bound (worst case)**: $\text{PPA} = 0$, meaning all negatives are scored above all positives.

In [34]:
def compute_ppa(_qrels_dict, _predictions):
    overall_ppa = []
    for _qrels_key in list(_qrels_dict.keys()):
        pos_docids = _qrels_dict[_qrels_key]['pos-docids']
        neg_docids = _qrels_dict[_qrels_key]['neg-docids']

        query_ppa = []
        for _pos_docid in pos_docids:
            _pos_docid_rank = _predictions[_qrels_key].index(_pos_docid) if _pos_docid in _predictions[_qrels_key] else len(_predictions[_qrels_key])
            for _neg_docid in neg_docids:
                _neg_docid_rank = _predictions[_qrels_key].index(_neg_docid) if _neg_docid in _predictions[_qrels_key] else len(_predictions[_qrels_key])
                if _pos_docid_rank < _neg_docid_rank:
                    query_ppa.append(1)
                else:
                    query_ppa.append(0)
        overall_ppa.append(np.mean(query_ppa))

    assert len(overall_ppa) > 0
    return np.mean(overall_ppa)

In [36]:
ppa_dict = defaultdict(dict)

for _model in tqdm(models):
    for _task in tasks:
        ppa_values = []
        for _subtask in task_sub_task_mapping[_task]:
            _qrels_dict = load_qrels(_task, _subtask)
            _predictions = load_predictions(_model, _task, _subtask)
            _ppa_value = compute_ppa(_qrels_dict, _predictions)
            ppa_values.append(_ppa_value)
            ppa_dict[_model][_task + '-' + _subtask if _subtask is not None else _task] = _ppa_value
        
        # Compute mean PPA over subtasks
        if _task not in ppa_dict[_model]:
            mean_ppa = sum(ppa_values) / len(ppa_values) if ppa_values else None
            ppa_dict[_model][_task] = mean_ppa

df = pd.DataFrame.from_dict(ppa_dict, orient='index').style.apply(highlight_max_in_column, axis=0)

100%|██████████| 30/30 [30:34<00:00, 61.14s/it]


In [37]:
# original_df is already your DataFrame
ppa_df = pd.DataFrame.from_dict(ppa_dict, orient='index')
mask = ~ppa_df.columns.astype(str).str.contains(r'-')
ppa_mean_df = ppa_df.loc[:, mask]

# inspect the result
ppa_mean_df

Unnamed: 0,CodeNetBugPreferenceRetrieval,CodeNetEfficiencyPreferenceRetrieval,CVEFixesPreferenceRetrieval,Defects4JPreferenceRetrieval,DeprecatedCodePreferenceRetrieval,SaferCodePreferenceRetrieval,SQLR2PreferenceRetrieval
bm25s,0.460592,0.3717,0.648998,0.567452,0.177335,0.505294,0.698311
facebook__contriever,0.408573,0.385453,0.531247,0.468951,0.473865,0.410161,0.593825
Alibaba-NLP__gte-base-en-v1.5,0.442425,0.386383,0.553256,0.595289,0.506067,0.477051,0.603781
sentence-transformers__gtr-t5-base,0.407176,0.339693,0.572111,0.571734,0.474838,0.496209,0.649839
sentence-transformers__gtr-t5-large,0.455356,0.391341,0.572961,0.576017,0.467169,0.488318,0.681315
intfloat__e5-base-v2,0.476349,0.432534,0.605025,0.591006,0.479891,0.527586,0.695093
intfloat__e5-large-v2,0.462021,0.467265,0.610142,0.620985,0.46782,0.516071,0.730491
Alibaba-NLP__gte-Qwen2-1.5B-instruct,0.443527,0.441737,0.647909,0.603854,0.468762,0.51674,0.711484
intfloat__e5-mistral-7b-instruct,0.49272,0.482654,0.607143,0.64454,0.524875,0.503371,0.735318
hkunlp__instructor-base,0.422701,0.394064,0.6077,0.608137,0.490619,0.495242,0.713998


### 2.2: Margin-based Ranking Score (MRS)

Measures the average margin between positive and negative scores - in order to normalize the score across different datasets and retrievers, we use rank reciprocal function $r$

$$
\text{MRS} = \frac{1}{|\mathcal{P}| \cdot |\mathcal{N}|} \sum_{p \in \mathcal{P}} \sum_{n \in \mathcal{N}} \left( r(p) - r(n) \right)
$$

- **Upper bound**: $\text{MRS} \to 1$ if positives are scored much higher than negatives.
- **Lower bound**: $\text{MRS} \to -1$ if negatives are scored much higher than positives.

In [43]:
def compute_mrs(_qrels_dict, _predictions):
    overall_mrs = []
    for _qrels_key in list(_qrels_dict.keys()):
        pos_docids = _qrels_dict[_qrels_key]['pos-docids']
        neg_docids = _qrels_dict[_qrels_key]['neg-docids']
        query_mrs = []
        for _pos_docid in pos_docids:
            _pos_docid_rank = _predictions[_qrels_key].index(_pos_docid) if _pos_docid in _predictions[_qrels_key] else len(_predictions[_qrels_key])
            for _neg_docid in neg_docids:
                _neg_docid_rank = _predictions[_qrels_key].index(_neg_docid) if _neg_docid in _predictions[_qrels_key] else len(_predictions[_qrels_key])
                query_mrs.append(1./(1+_pos_docid_rank) - 1./(1+_neg_docid_rank))
        overall_mrs.append(np.mean(query_mrs))
    return np.mean(overall_mrs)

In [45]:
mrs_dict = defaultdict(dict)
for _model in tqdm(models):
    mrs_values = []
    for _task in tasks:
        for _subtask in task_sub_task_mapping[_task]:
            _qrels_dict = load_qrels(_task, _subtask)
            _predictions = load_predictions(_model, _task, _subtask)
            _mrs_value = compute_mrs(_qrels_dict, _predictions)
            mrs_values.append(_mrs_value)
            mrs_dict[_model][_task + '-' + _subtask if _subtask is not None else _task] = _mrs_value
        
        # Compute mean MRS over subtasks
        if _task not in mrs_dict[_model]:
            mean_mrs = sum(mrs_values) / len(mrs_values) if mrs_values else None
            mrs_dict[_model][_task] = mean_mrs

df = pd.DataFrame.from_dict(mrs_dict, orient='index').style.apply(highlight_max_in_column, axis=0)

100%|██████████| 30/30 [27:38<00:00, 55.29s/it]


In [46]:
mrs_df = pd.DataFrame.from_dict(mrs_dict, orient='index')
# original_df is already your DataFrame
mask = ~mrs_df.columns.astype(str).str.contains(r'-')
mrs_mean_df = mrs_df.loc[:, mask]

# inspect the result
mrs_mean_df

Unnamed: 0,CodeNetBugPreferenceRetrieval,CodeNetEfficiencyPreferenceRetrieval,CVEFixesPreferenceRetrieval,Defects4JPreferenceRetrieval,DeprecatedCodePreferenceRetrieval,SaferCodePreferenceRetrieval,SQLR2PreferenceRetrieval
bm25s,0.001361,-0.00044,0.026623,0.074746,-0.019841,-0.016983,0.217158
facebook__contriever,-0.001578,-0.001291,0.006353,-0.017175,0.003271,-0.010169,0.080737
Alibaba-NLP__gte-base-en-v1.5,-0.004377,-0.002676,0.010859,0.104495,0.011133,0.00502,0.065668
sentence-transformers__gtr-t5-base,-0.00083,-0.002046,0.012063,0.060104,0.007494,0.004252,0.035975
sentence-transformers__gtr-t5-large,-0.003788,-0.005891,0.010156,0.077666,0.005331,0.002426,0.04857
intfloat__e5-base-v2,0.003403,0.002342,0.0221,0.090219,0.015921,0.016638,0.271137
intfloat__e5-large-v2,-0.004649,-0.001405,0.021521,0.119311,0.014561,0.013517,0.2355
Alibaba-NLP__gte-Qwen2-1.5B-instruct,-0.008265,-0.004938,0.025908,0.10364,0.018533,0.016389,0.087614
intfloat__e5-mistral-7b-instruct,0.014154,0.007928,0.026452,0.148796,0.0286,0.022714,0.15483
hkunlp__instructor-base,-0.003342,-0.0028,0.019617,0.107377,0.016457,0.012225,0.080215


### 3. Analysis

#### 3.1 PPA across multiple languages

In [47]:
ppa_df

Unnamed: 0,CodeNetBugPreferenceRetrieval-c,CodeNetBugPreferenceRetrieval-cpp,CodeNetBugPreferenceRetrieval-go,CodeNetBugPreferenceRetrieval-java,CodeNetBugPreferenceRetrieval-javascript,CodeNetBugPreferenceRetrieval-python,CodeNetBugPreferenceRetrieval-ruby,CodeNetBugPreferenceRetrieval-rust,CodeNetBugPreferenceRetrieval-swift,CodeNetBugPreferenceRetrieval-typescript,...,DeprecatedCodePreferenceRetrieval,SaferCodePreferenceRetrieval-c,SaferCodePreferenceRetrieval-cpp,SaferCodePreferenceRetrieval-python,SaferCodePreferenceRetrieval-java,SaferCodePreferenceRetrieval-javascript,SaferCodePreferenceRetrieval-go,SaferCodePreferenceRetrieval-ruby,SaferCodePreferenceRetrieval,SQLR2PreferenceRetrieval
bm25s,0.45083,0.447957,0.421137,0.550447,0.492656,0.428161,0.449553,0.477331,0.433509,0.454342,...,0.177335,0.318072,0.533333,0.585551,0.5,0.477876,0.622222,0.5,0.505294,0.698311
facebook__contriever,0.408365,0.360153,0.371488,0.327267,0.445083,0.43742,0.440294,0.441411,0.426165,0.428081,...,0.473865,0.383133,0.45,0.456274,0.307692,0.380531,0.466667,0.426829,0.410161,0.593825
Alibaba-NLP__gte-base-en-v1.5,0.471584,0.433908,0.446679,0.439017,0.479885,0.426245,0.429757,0.390964,0.457375,0.448835,...,0.506067,0.484337,0.383333,0.477186,0.5,0.469027,0.488889,0.536585,0.477051,0.603781
sentence-transformers__gtr-t5-base,0.40198,0.386654,0.382184,0.42848,0.444125,0.411239,0.409323,0.409004,0.373164,0.425607,...,0.474838,0.573494,0.433333,0.465779,0.461538,0.433628,0.666667,0.439024,0.496209,0.649839
sentence-transformers__gtr-t5-large,0.462324,0.408046,0.402937,0.535441,0.503512,0.461367,0.477011,0.410281,0.439017,0.453624,...,0.467169,0.607229,0.416667,0.545627,0.346154,0.451327,0.6,0.45122,0.488318,0.681315
intfloat__e5-base-v2,0.477011,0.369413,0.42433,0.6606,0.534163,0.475734,0.501916,0.367178,0.457216,0.495929,...,0.479891,0.638554,0.583333,0.480989,0.384615,0.469027,0.6,0.536585,0.527586,0.695093
intfloat__e5-large-v2,0.484355,0.458493,0.435504,0.472222,0.467752,0.482759,0.496488,0.338442,0.481003,0.503193,...,0.46782,0.626506,0.6,0.5,0.384615,0.433628,0.555556,0.512195,0.516071,0.730491
Alibaba-NLP__gte-Qwen2-1.5B-instruct,0.412197,0.360153,0.431354,0.305236,0.529693,0.531609,0.505109,0.371328,0.461606,0.52698,...,0.468762,0.628916,0.5,0.551331,0.384615,0.460177,0.555556,0.536585,0.51674,0.711484
intfloat__e5-mistral-7b-instruct,0.458174,0.447318,0.463442,0.554278,0.52235,0.504789,0.490741,0.441571,0.505268,0.539272,...,0.524875,0.633735,0.5,0.528517,0.5,0.477876,0.444444,0.439024,0.503371,0.735318
hkunlp__instructor-base,0.42848,0.430077,0.365741,0.409962,0.458812,0.475734,0.473819,0.303959,0.433589,0.446839,...,0.490619,0.556627,0.483333,0.477186,0.461538,0.442478,0.533333,0.512195,0.495242,0.713998


In [48]:
mrs_df

Unnamed: 0,CodeNetBugPreferenceRetrieval-c,CodeNetBugPreferenceRetrieval-cpp,CodeNetBugPreferenceRetrieval-go,CodeNetBugPreferenceRetrieval-java,CodeNetBugPreferenceRetrieval-javascript,CodeNetBugPreferenceRetrieval-python,CodeNetBugPreferenceRetrieval-ruby,CodeNetBugPreferenceRetrieval-rust,CodeNetBugPreferenceRetrieval-swift,CodeNetBugPreferenceRetrieval-typescript,...,DeprecatedCodePreferenceRetrieval,SaferCodePreferenceRetrieval-c,SaferCodePreferenceRetrieval-cpp,SaferCodePreferenceRetrieval-python,SaferCodePreferenceRetrieval-java,SaferCodePreferenceRetrieval-javascript,SaferCodePreferenceRetrieval-go,SaferCodePreferenceRetrieval-ruby,SaferCodePreferenceRetrieval,SQLR2PreferenceRetrieval
bm25s,-0.002919,0.002914,0.007742,-0.003668,0.005517,-0.000565,-0.000755,0.00307,0.00336,-0.001084,...,-0.019841,-0.096946,0.027103,0.05432,-0.058136,-0.004915,0.049174,0.007689,-0.016983,0.217158
facebook__contriever,0.002186,-0.012131,0.0003,-0.006725,-0.002834,0.003648,0.001101,-3e-05,0.007334,-0.00863,...,0.003271,-0.058199,-0.071245,-0.034089,-0.204822,-0.075825,-0.066463,-0.017507,-0.010169,0.080737
Alibaba-NLP__gte-base-en-v1.5,-0.008158,-0.010288,-0.006254,-0.012888,0.007311,-0.002694,0.001372,-0.00943,-0.001345,-0.001401,...,0.011133,-0.023081,-0.129503,-0.025942,-0.028601,-0.023205,0.014303,0.043305,0.00502,0.065668
sentence-transformers__gtr-t5-base,0.000786,0.000958,-0.010511,-0.004831,0.009383,-0.008854,0.00385,0.001556,0.002468,-0.003109,...,0.007494,0.064598,-0.083612,-0.024438,-0.095589,-0.039694,0.169179,-0.070909,0.004252,0.035975
sentence-transformers__gtr-t5-large,-3.2e-05,-0.020296,-0.020596,0.008037,0.022215,-0.014564,0.002596,-0.010922,-0.000413,-0.003906,...,0.005331,0.083239,-0.083333,0.046931,-0.164161,-0.052511,0.127963,-0.039933,0.002426,0.04857
intfloat__e5-base-v2,0.003085,-0.012237,-0.01924,0.016877,0.028326,0.01561,0.000516,-0.011941,0.012312,0.000722,...,0.015921,0.123194,0.081389,-0.009569,-0.125321,-0.045834,0.080714,0.036258,0.016638,0.271137
intfloat__e5-large-v2,0.007445,4.7e-05,-0.026434,-0.012663,0.010167,0.003101,-0.009282,-0.027497,0.010095,-0.001469,...,0.014561,0.115895,0.085556,0.000564,-0.132508,-0.058784,0.040344,0.008048,0.013517,0.2355
Alibaba-NLP__gte-Qwen2-1.5B-instruct,-0.011842,-0.039221,-0.011362,-0.063745,0.027574,0.02905,0.015526,-0.039837,0.010528,0.000678,...,0.018533,0.121807,-0.004722,0.054292,-0.151877,-0.064545,0.048889,0.037989,0.016389,0.087614
intfloat__e5-mistral-7b-instruct,-0.017348,-0.03019,-0.002506,0.063765,0.052003,-0.013993,-0.002778,0.004873,0.040267,0.04745,...,0.0286,0.127216,-0.0125,0.018248,-0.031352,-0.040216,-0.053704,-0.048801,0.022714,0.15483
hkunlp__instructor-base,-0.008103,-0.001483,-0.009735,-0.008916,0.007215,-0.003972,-0.000919,-0.006903,0.00129,-0.001891,...,0.016457,0.043094,-0.039491,-0.019073,0.008288,-0.059155,0.016622,-0.008613,0.012225,0.080215


#### 3.2 Instruction Ablation

In [50]:
ablation_models = [
    'hkunlp__instructor-base_wo_instruction',
    'hkunlp__instructor-large_wo_instruction',
    'hkunlp__instructor-xl_wo_instruction',
    'samaya-ai__promptriever-llama2-7b-v1_wo_instruction',
    'samaya-ai__promptriever-llama3.1-8b-v1_wo_instruction',
    'samaya-ai__promptriever-llama3.1-8b-instruct-v1_wo_instruction',
    'samaya-ai__promptriever-mistral-v0.1-7b-v1_wo_instruction',
    'hkunlp__instructor-base_neg_instruction',
    'hkunlp__instructor-large_neg_instruction',
    'hkunlp__instructor-xl_neg_instruction',
    'samaya-ai__promptriever-llama2-7b-v1_neg_instruction',
    'samaya-ai__promptriever-llama3.1-8b-v1_neg_instruction',
    'samaya-ai__promptriever-llama3.1-8b-instruct-v1_neg_instruction',
    'samaya-ai__promptriever-mistral-v0.1-7b-v1_neg_instruction'
]

##### PPA

In [51]:
ppa_dict = defaultdict(dict)

for _model in tqdm(ablation_models):
    for _task in tasks:
        ppa_values = []
        for _subtask in task_sub_task_mapping[_task]:
            _qrels_dict = load_qrels(_task, _subtask)
            _predictions = load_predictions(_model, _task, _subtask)
            _ppa_value = compute_ppa(_qrels_dict, _predictions)
            ppa_values.append(_ppa_value)
            ppa_dict[_model][_task + '-' + _subtask if _subtask is not None else _task] = _ppa_value
        
        # Compute mean PPA over subtasks
        if _task not in ppa_dict[_model]:
            mean_ppa = sum(ppa_values) / len(ppa_values) if ppa_values else None
            ppa_dict[_model][_task] = mean_ppa

df = pd.DataFrame.from_dict(ppa_dict, orient='index').style.apply(highlight_max_in_column, axis=0)

100%|██████████| 14/14 [12:55<00:00, 55.39s/it]


In [52]:
# original_df is already your DataFrame
ppa_df = pd.DataFrame.from_dict(ppa_dict, orient='index')
mask = ~ppa_df.columns.astype(str).str.contains(r'-')
ppa_instruct_mean_df = ppa_df.loc[:, mask]

# inspect the result
ppa_instruct_mean_df

Unnamed: 0,CodeNetBugPreferenceRetrieval,CodeNetEfficiencyPreferenceRetrieval,CVEFixesPreferenceRetrieval,Defects4JPreferenceRetrieval,DeprecatedCodePreferenceRetrieval,SaferCodePreferenceRetrieval,SQLR2PreferenceRetrieval
hkunlp__instructor-base_wo_instruction,0.422701,0.394064,0.598573,0.616702,0.49165,0.495242,0.713998
hkunlp__instructor-large_wo_instruction,0.43295,0.426712,0.620426,0.599572,0.525746,0.495357,0.701227
hkunlp__instructor-xl_wo_instruction,0.445227,0.424331,0.603786,0.623126,0.48497,0.475076,0.731798
samaya-ai__promptriever-llama2-7b-v1_wo_instruction,0.475894,0.45991,0.653528,0.69379,0.51863,0.536373,0.81969
samaya-ai__promptriever-llama3.1-8b-v1_wo_instruction,0.480715,0.468401,0.644419,0.678801,0.55639,0.520745,0.795253
samaya-ai__promptriever-llama3.1-8b-instruct-v1_wo_instruction,0.509171,0.490612,0.635323,0.670236,0.543952,0.516486,0.826529
samaya-ai__promptriever-mistral-v0.1-7b-v1_wo_instruction,0.467433,0.467063,0.637727,0.683084,0.51354,0.511811,0.813355
hkunlp__instructor-base_neg_instruction,0.422733,0.394602,0.598079,0.616702,0.491877,0.495242,0.713898
hkunlp__instructor-large_neg_instruction,0.43295,0.427212,0.619736,0.599572,0.525952,0.495357,0.701227
hkunlp__instructor-xl_neg_instruction,0.445227,0.424217,0.603786,0.623126,0.48558,0.475076,0.731798


##### MRS

In [54]:
mrs_dict = defaultdict(dict)
for _model in tqdm(ablation_models):
    mrs_values = []
    for _task in tasks:
        for _subtask in task_sub_task_mapping[_task]:
            _qrels_dict = load_qrels(_task, _subtask)
            _predictions = load_predictions(_model, _task, _subtask)
            _mrs_value = compute_mrs(_qrels_dict, _predictions)
            mrs_values.append(_mrs_value)
            mrs_dict[_model][_task + '-' + _subtask if _subtask is not None else _task] = _mrs_value
        
        # Compute mean MRS over subtasks
        if _task not in mrs_dict[_model]:
            mean_mrs = sum(mrs_values) / len(mrs_values) if mrs_values else None
            mrs_dict[_model][_task] = mean_mrs

df = pd.DataFrame.from_dict(mrs_dict, orient='index').style.apply(highlight_max_in_column, axis=0)

100%|██████████| 14/14 [12:06<00:00, 51.87s/it]


In [55]:
mrs_df = pd.DataFrame.from_dict(mrs_dict, orient='index')
# original_df is already your DataFrame
mask = ~mrs_df.columns.astype(str).str.contains(r'-')
mrs_mean_df = mrs_df.loc[:, mask]

# inspect the result
mrs_mean_df

Unnamed: 0,CodeNetBugPreferenceRetrieval,CodeNetEfficiencyPreferenceRetrieval,CVEFixesPreferenceRetrieval,Defects4JPreferenceRetrieval,DeprecatedCodePreferenceRetrieval,SaferCodePreferenceRetrieval,SQLR2PreferenceRetrieval
hkunlp__instructor-base_wo_instruction,-0.003342,-0.0028,0.017941,0.118422,0.015424,0.011368,0.079925
hkunlp__instructor-large_wo_instruction,-0.007316,-0.005174,0.019751,0.100129,0.023168,0.018353,0.126495
hkunlp__instructor-xl_wo_instruction,-0.00097,-0.001686,0.019793,0.126674,0.016776,0.011269,0.151634
samaya-ai__promptriever-llama2-7b-v1_wo_instruction,0.005882,0.001766,0.033034,0.202358,0.035563,0.036332,0.237587
samaya-ai__promptriever-llama3.1-8b-v1_wo_instruction,0.005519,-0.00278,0.027356,0.182466,0.037843,0.034728,0.229442
samaya-ai__promptriever-llama3.1-8b-instruct-v1_wo_instruction,0.018661,0.009019,0.035227,0.18039,0.041404,0.037095,0.300318
samaya-ai__promptriever-mistral-v0.1-7b-v1_wo_instruction,-0.001368,-0.004661,0.025296,0.196042,0.027307,0.0255,0.233568
hkunlp__instructor-base_neg_instruction,-0.003344,-0.002798,0.017868,0.118422,0.015415,0.01136,0.079756
hkunlp__instructor-large_neg_instruction,-0.007316,-0.005173,0.019613,0.100129,0.023115,0.018313,0.126767
hkunlp__instructor-xl_neg_instruction,-0.00097,-0.001691,0.019783,0.126674,0.016864,0.011341,0.151697
