<a href="https://colab.research.google.com/github/Shun0212/CodeBERTPretrained/blob/main/EvalCodeMor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
import torch
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer, RobertaForMaskedLM, BertForMaskedLM
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import random

# Device selection: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"使用デバイス: {device}")

def get_cls_embedding(model, tokenizer, text, device, max_length=256):
    """
    Return the hidden state of the first token ([CLS] / <s>) for ``text``.

    Only the inner encoder of the masked-LM model is executed
    (``model.bert`` for BERT models, ``model.roberta`` for RoBERTa models).

    Args:
        model: A ``BertForMaskedLM`` or ``RobertaForMaskedLM`` instance.
        tokenizer: Tokenizer callable matching ``model``; it is invoked with
            ``return_tensors="pt"`` and truncation enabled.
        text: The input text to embed.
        device: Device the tokenized tensors are moved to before the forward
            pass (the model is expected to already be on it).
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        A NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. one
        ``hidden_size`` vector per batch row.

    Raises:
        ValueError: If ``model`` is neither of the two supported classes.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Pick the inner encoder that matches the model family.
    if isinstance(model, BertForMaskedLM):
        encoder = model.bert
    elif isinstance(model, RobertaForMaskedLM):
        encoder = model.roberta
    else:
        raise ValueError("Model must be BertForMaskedLM or RobertaForMaskedLM.")

    # Inference only: without no_grad() every call records an autograd graph
    # that is thrown away immediately, wasting memory and time.
    with torch.no_grad():
        outputs = encoder(**inputs)

    # outputs.last_hidden_state: (batch_size, sequence_length, hidden_size)
    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.detach().cpu().numpy()


def calculate_metrics(sim_matrix, k_values=(1, 5, 10, 50, 100)):
    """
    Compute retrieval metrics from a query-by-candidate similarity matrix.

    Each row holds the similarities between one query and its candidate pool.
    Column 0 of every row must be the single correct candidate; all other
    columns are distractors. Candidates are ranked by descending similarity.

    Because exactly one candidate is relevant per query, the metrics reduce
    to closed forms of the 1-based rank of that candidate:

    * reciprocal rank (MRR) and average precision (MAP) are ``1 / rank``;
    * R-precision is 1 when ``rank == 1`` and 0 otherwise;
    * for a cutoff ``k`` with ``rank <= k``: recall, success rate and query
      coverage are 1, precision is ``1 / k`` (one hit among ``k`` retrieved
      items), F1 is the harmonic mean of those two, and NDCG is
      ``1 / log2(rank + 1)`` (the ideal DCG is 1);
    * for ``rank > k`` every ``@k`` metric is 0.

    Args:
        sim_matrix: 2-D array-like of shape (num_queries, pool_size).
        k_values: Iterable of cutoffs; duplicates are ignored.

    Returns:
        A dict with scalar entries ``"mrr"``, ``"map"``, ``"r_precision"`` and
        per-cutoff dicts ``"recall@k"``, ``"precision@k"``, ``"ndcg@k"``,
        ``"f1@k"``, ``"success_rate@k"``, ``"query_coverage@k"``, all averaged
        over the queries. For an empty matrix every value is 0.0.
    """
    sim_matrix = np.asarray(sim_matrix)
    # De-duplicate while keeping order so no cutoff is accumulated twice.
    k_values = list(dict.fromkeys(k_values))
    num_queries = sim_matrix.shape[0]
    metrics = {
        "mrr": 0.0,
        "recall@k": {k: 0.0 for k in k_values},
        "precision@k": {k: 0.0 for k in k_values},
        "ndcg@k": {k: 0.0 for k in k_values},
        "map": 0.0,
        "f1@k": {k: 0.0 for k in k_values},
        "r_precision": 0.0,
        "success_rate@k": {k: 0.0 for k in k_values},
        "query_coverage@k": {k: 0.0 for k in k_values},
    }

    # Nothing to average: return the zero-initialised metrics instead of
    # dividing by zero below.
    if num_queries == 0:
        return metrics

    for sims in sim_matrix:
        ranked_indices = np.argsort(-sims)
        # 1-based position of the correct candidate (index 0) in the ranking.
        correct_rank = int(np.where(ranked_indices == 0)[0][0]) + 1
        reciprocal_rank = 1.0 / correct_rank

        metrics["mrr"] += reciprocal_rank
        metrics["map"] += reciprocal_rank
        metrics["r_precision"] += 1.0 if correct_rank == 1 else 0.0

        discounted_gain = 1.0 / float(np.log2(correct_rank + 1))
        for k in k_values:
            if correct_rank > k:
                continue  # a miss contributes 0 to every @k metric

            # Precision@k is hits / k, not the reciprocal rank.
            prec = 1.0 / k
            rec = 1.0
            metrics["precision@k"][k] += prec
            metrics["recall@k"][k] += rec
            metrics["f1@k"][k] += 2 * (prec * rec) / (prec + rec)
            metrics["ndcg@k"][k] += discounted_gain
            metrics["success_rate@k"][k] += 1.0
            metrics["query_coverage@k"][k] += 1.0

    # Turn the per-query sums into means.
    for name, value in metrics.items():
        if isinstance(value, dict):
            for k in value:
                value[k] /= num_queries
        else:
            metrics[name] = value / num_queries

    return metrics


def calculate_average_precision(ranked_indices):
    """
    Return the Average Precision (AP) of a ranking with one relevant item.

    The relevant candidate is the entry equal to ``0`` in ``ranked_indices``.
    With a single relevant item, AP is the reciprocal of that item's 1-based
    rank, so no scan over the whole ranking is needed.

    Args:
        ranked_indices: 1-D NumPy array of candidate indices, best first.

    Returns:
        ``1.0 / rank`` of the relevant candidate as a float.

    Raises:
        IndexError: If ``ranked_indices`` does not contain ``0``.
    """
    correct_rank = int(np.where(ranked_indices == 0)[0][0]) + 1
    return 1.0 / correct_rank


def calculate_f1(precision, recall):
    """
    Return the F1 score, the harmonic mean of ``precision`` and ``recall``.

    When both inputs sum to zero the score is defined as 0.0 so that no
    division by zero occurs.
    """
    denominator = precision + recall
    return 0.0 if denominator == 0 else 2 * (precision * recall) / denominator


def calculate_r_precision(ranked_indices):
    """
    Return R-Precision with R fixed to 1 (one relevant candidate per pool).

    The relevant candidate is the entry equal to ``0``; the result is the
    fraction of the top R ranked entries that are relevant.
    """
    relevant_total = 1  # number of relevant candidates in the pool
    hits = sum(
        1 for position in range(relevant_total) if ranked_indices[position] == 0
    )
    return hits / relevant_total

def calculate_dcg(ranking):
    """
    Return the Discounted Cumulative Gain (DCG) of a list of relevance scores.

    The score at 0-based position ``p`` (rank ``p + 1``) is discounted by
    ``1 / log2(p + 2)``; an empty ranking yields 0.0.
    """
    discounted_gains = (
        relevance / np.log2(position + 2)
        for position, relevance in enumerate(ranking)
    )
    return sum(discounted_gains, 0.0)


import re

def remove_multiline_strings(code):
    """
    Remove every triple-quoted string from Python source text.

    Both delimiters are handled: three double quotes (\"\"\") and three single
    quotes ('''). Matching is purely textual and non-greedy, and may span
    several lines. Every triple-quoted span is removed, not only docstrings.

    Args:
        code: Python source code as a string.

    Returns:
        ``code`` with each triple-quoted span replaced by the empty string.
    """
    # The delimiters are written escaped in the docstring above: a bare
    # triple double quote there would terminate the docstring early.
    triple_quoted = re.compile(r'""".*?"""' r"|'''.*?'''", re.DOTALL)
    return triple_quoted.sub('', code)

def evaluate_code_search(model, tokenizer, dataset, device, max_examples=100, pool_size=100,
                         query_field="func_documentation_string", code_field="func_code_string",
                         seed=None):
    """
    Evaluate query-to-code retrieval with randomly sampled candidate pools.

    The first ``num_examples = min(len(dataset), max_examples)`` examples are
    used. Each example's code (with triple-quoted strings removed) and its
    query text are embedded with ``get_cls_embedding``. For every query a pool
    is built with the matching code at position 0 followed by
    ``pool_size - 1`` distinct distractors drawn from the other evaluated
    examples. Cosine similarities between the query and its pool are passed to
    ``calculate_metrics``, which expects the correct candidate at index 0.

    Args:
        model: Masked-LM model handed to ``get_cls_embedding``.
        tokenizer: Tokenizer handed to ``get_cls_embedding``.
        dataset: Indexable collection of mappings (``dataset[i][field]``).
        device: Device handed to ``get_cls_embedding``.
        max_examples: Upper bound on the number of evaluated examples.
        pool_size: Number of candidates per query, including the correct one.
        query_field: Key of the natural-language query in each example.
        code_field: Key of the source code in each example.
        seed: Optional seed for reproducible distractor sampling. When None,
            the module-level ``random`` state is used.

    Returns:
        The metrics dictionary produced by ``calculate_metrics``.

    Raises:
        ValueError: If there is nothing to evaluate, or if ``pool_size``
            exceeds the number of evaluated examples (there would not be
            enough distinct distractors).
    """
    num_examples = min(len(dataset), max_examples)
    if num_examples <= 0:
        raise ValueError("No examples to evaluate: dataset is empty or max_examples <= 0.")
    if pool_size > num_examples:
        # Rejection sampling could never find enough distinct distractors here
        # and would loop forever, so fail fast.
        raise ValueError(
            f"pool_size ({pool_size}) must not exceed the number of evaluated "
            f"examples ({num_examples})."
        )

    # Read each evaluated example once; examples past num_examples are never
    # used, so they are not preprocessed.
    queries = []
    codes = []
    for i in range(num_examples):
        ex = dataset[i]
        queries.append(ex[query_field])
        codes.append(remove_multiline_strings(ex[code_field]))

    print("データセット全体のコード埋め込みを計算中...")
    all_code_embeddings = np.concatenate(
        [get_cls_embedding(model, tokenizer, code, device) for code in codes],
        axis=0,
    )
    print("コード埋め込み計算完了.")

    rng = random if seed is None else random.Random(seed)
    num_negatives = max(pool_size - 1, 0)

    sim_matrices = []
    print("候補コードプールを作成し、類似度計算中...")
    for i in range(num_examples):
        query_embedding = get_cls_embedding(model, tokenizer, queries[i], device).reshape(1, -1)
        # Sample distinct distractors without replacement from the other
        # num_examples - 1 entries: draw from a range one element shorter and
        # shift every index >= i up by one so that i itself is skipped.
        sampled = rng.sample(range(num_examples - 1), num_negatives)
        negatives = [j if j < i else j + 1 for j in sampled]
        # The correct code goes first.
        pool_indices = [i] + negatives
        candidate_pool_embeddings = all_code_embeddings[pool_indices]
        sims = cosine_similarity(query_embedding, candidate_pool_embeddings)[0]
        sim_matrices.append(sims)
    sim_matrix = np.array(sim_matrices)
    print("類似度計算完了.")
    return calculate_metrics(sim_matrix)




if __name__ == "__main__":
    def _print_metrics(metrics):
        """Print scalar metrics as ``name`` and per-cutoff metrics as ``name@<k>``."""
        for name, value in metrics.items():
            if isinstance(value, dict):
                for k, v in value.items():
                    # Per-cutoff names already end in the "@k" placeholder;
                    # fill it in instead of appending the cutoff, which used
                    # to print labels such as "recall@k@5".
                    label = name.replace("@k", f"@{k}")
                    print(f"  {label}: {v:.4f}")
            else:
                print(f"  {name}: {value:.4f}")

    # Load the test split of the Python portion of CodeSearchNet.
    print("CodeSearchNet データセットをロードします...")
    # NOTE(review): trust_remote_code=True lets the dataset repository's
    # loading script run locally; keep it only for sources you trust.
    dataset = load_dataset("code_search_net", "python", split="test", trust_remote_code=True)
    # For the demo, evaluate only the first 1000 examples (fewer if the split
    # is smaller, so select() cannot index out of range).
    subset = dataset.select(range(min(1000, len(dataset))))

    ### Evaluate CodeMorph-BERT ###
    model_name1 = "Shuu12121/CodeMorph-BERT"
    print(f"\n{model_name1} を評価します (候補プールサイズ: 100)...")
    tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
    model1 = AutoModelForMaskedLM.from_pretrained(model_name1)
    model1.to(device)
    metrics1 = evaluate_code_search(model1, tokenizer1, subset, device, max_examples=1000, pool_size=100)
    print("CodeMorph-BERT の評価指標 (候補プールサイズ: 100):")
    _print_metrics(metrics1)

    ### Evaluate Microsoft CodeBERT (microsoft/codebert-base-mlm) ###
    model_name2 = "microsoft/codebert-base-mlm"
    print(f"\n{model_name2} を評価します (候補プールサイズ: 100)...")
    tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
    model2 = AutoModelForMaskedLM.from_pretrained(model_name2)
    model2.to(device)
    metrics2 = evaluate_code_search(model2, tokenizer2, subset, device, max_examples=1000, pool_size=100)
    print("CodeBERT (microsoft/codebert-base-mlm) の評価指標 (候補プールサイズ: 100):")
    _print_metrics(metrics2)

使用デバイス: cuda
CodeSearchNet データセットをロードします...

Shuu12121/CodeMorph-BERT を評価します (候補プールサイズ: 100)...
データセット全体のコード埋め込みを計算中...
コード埋め込み計算完了.
候補コードプールを作成し、類似度計算中...
類似度計算完了.


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CodeMorph-BERT の評価指標 (候補プールサイズ: 100):
  mrr: 0.6678
  recall@k@1: 0.5650
  recall@k@5: 0.7970
  recall@k@10: 0.8600
  recall@k@50: 0.9770
  recall@k@100: 1.0000
  precision@k@1: 0.5650
  precision@k@5: 0.6526
  precision@k@10: 0.6610
  precision@k@50: 0.6675
  precision@k@100: 0.6678
  ndcg@k@1: 0.5650
  ndcg@k@5: 0.6887
  ndcg@k@10: 0.7091
  ndcg@k@50: 0.7363
  ndcg@k@100: 0.7401
  map: 0.6678
  f1@k@1: 0.5650
  f1@k@5: 0.6899
  f1@k@10: 0.7046
  f1@k@50: 0.7169
  f1@k@100: 0.7175
  r_precision: 0.5650
  success_rate@k@1: 0.5650
  success_rate@k@5: 0.7970
  success_rate@k@10: 0.8600
  success_rate@k@50: 0.9770
  success_rate@k@100: 1.0000
  query_coverage@k@1: 0.5650
  query_coverage@k@5: 0.7970
  query_coverage@k@10: 0.8600
  query_coverage@k@50: 0.9770
  query_coverage@k@100: 1.0000

microsoft/codebert-base-mlm を評価します (候補プールサイズ: 100)...
データセット全体のコード埋め込みを計算中...
コード埋め込み計算完了.
候補コードプールを作成し、類似度計算中...
類似度計算完了.
CodeBERT (microsoft/codebert-base-mlm) の評価指標 (候補プールサイズ: 100):
  mrr: 0.5598
  r

In [None]:
import torch
import numpy as np
from transformers import AutoModelForMaskedLM, AutoTokenizer, RobertaForMaskedLM, BertForMaskedLM
from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
import random

# Device selection: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"使用デバイス: {device}")


import re

def remove_multiline_strings(code):
    """
    Remove every triple-quoted string from Python source text.

    Both delimiters are handled: three double quotes (\"\"\") and three single
    quotes ('''). Matching is purely textual and non-greedy, and may span
    several lines. Every triple-quoted span is removed, not only docstrings.

    Args:
        code: Python source code as a string.

    Returns:
        ``code`` with each triple-quoted span replaced by the empty string.
    """
    # The delimiters are written escaped in the docstring above: a bare
    # triple double quote there would terminate the docstring early.
    triple_quoted = re.compile(r'""".*?"""' r"|'''.*?'''", re.DOTALL)
    return triple_quoted.sub('', code)

def evaluate_code_search(model, tokenizer, dataset, device, max_examples=100, pool_size=100,
                         query_field="func_documentation_string", code_field="func_code_string",
                         seed=None):
    """
    Evaluate query-to-code retrieval with randomly sampled candidate pools.

    The first ``num_examples = min(len(dataset), max_examples)`` examples are
    used. Each example's code (with triple-quoted strings removed) and its
    query text are embedded with ``get_cls_embedding``. For every query a pool
    is built with the matching code at position 0 followed by
    ``pool_size - 1`` distinct distractors drawn from the other evaluated
    examples. Cosine similarities between the query and its pool are passed to
    ``calculate_metrics``, which expects the correct candidate at index 0.

    Args:
        model: Masked-LM model handed to ``get_cls_embedding``.
        tokenizer: Tokenizer handed to ``get_cls_embedding``.
        dataset: Indexable collection of mappings (``dataset[i][field]``).
        device: Device handed to ``get_cls_embedding``.
        max_examples: Upper bound on the number of evaluated examples.
        pool_size: Number of candidates per query, including the correct one.
        query_field: Key of the natural-language query in each example.
        code_field: Key of the source code in each example.
        seed: Optional seed for reproducible distractor sampling. When None,
            the module-level ``random`` state is used.

    Returns:
        The metrics dictionary produced by ``calculate_metrics``.

    Raises:
        ValueError: If there is nothing to evaluate, or if ``pool_size``
            exceeds the number of evaluated examples (there would not be
            enough distinct distractors).
    """
    num_examples = min(len(dataset), max_examples)
    if num_examples <= 0:
        raise ValueError("No examples to evaluate: dataset is empty or max_examples <= 0.")
    if pool_size > num_examples:
        # Rejection sampling could never find enough distinct distractors here
        # and would loop forever, so fail fast.
        raise ValueError(
            f"pool_size ({pool_size}) must not exceed the number of evaluated "
            f"examples ({num_examples})."
        )

    # Read each evaluated example once; examples past num_examples are never
    # used, so they are not preprocessed.
    queries = []
    codes = []
    for i in range(num_examples):
        ex = dataset[i]
        queries.append(ex[query_field])
        codes.append(remove_multiline_strings(ex[code_field]))

    print("データセット全体のコード埋め込みを計算中...")
    all_code_embeddings = np.concatenate(
        [get_cls_embedding(model, tokenizer, code, device) for code in codes],
        axis=0,
    )
    print("コード埋め込み計算完了.")

    rng = random if seed is None else random.Random(seed)
    num_negatives = max(pool_size - 1, 0)

    sim_matrices = []
    print("候補コードプールを作成し、類似度計算中...")
    for i in range(num_examples):
        query_embedding = get_cls_embedding(model, tokenizer, queries[i], device).reshape(1, -1)
        # Sample distinct distractors without replacement from the other
        # num_examples - 1 entries: draw from a range one element shorter and
        # shift every index >= i up by one so that i itself is skipped.
        sampled = rng.sample(range(num_examples - 1), num_negatives)
        negatives = [j if j < i else j + 1 for j in sampled]
        # The correct code goes first.
        pool_indices = [i] + negatives
        candidate_pool_embeddings = all_code_embeddings[pool_indices]
        sims = cosine_similarity(query_embedding, candidate_pool_embeddings)[0]
        sim_matrices.append(sims)
    sim_matrix = np.array(sim_matrices)
    print("類似度計算完了.")
    return calculate_metrics(sim_matrix)


def get_cls_embedding(model, tokenizer, text, device, max_length=256):
    """
    Return the hidden state of the first token ([CLS] / <s>) for ``text``.

    Only the inner encoder of the masked-LM model is executed
    (``model.bert`` for BERT models, ``model.roberta`` for RoBERTa models).

    Args:
        model: A ``BertForMaskedLM`` or ``RobertaForMaskedLM`` instance.
        tokenizer: Tokenizer callable matching ``model``; it is invoked with
            ``return_tensors="pt"`` and truncation enabled.
        text: The input text to embed.
        device: Device the tokenized tensors are moved to before the forward
            pass (the model is expected to already be on it).
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        A NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. one
        ``hidden_size`` vector per batch row.

    Raises:
        ValueError: If ``model`` is neither of the two supported classes.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=max_length)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Pick the inner encoder that matches the model family.
    if isinstance(model, BertForMaskedLM):
        encoder = model.bert
    elif isinstance(model, RobertaForMaskedLM):
        encoder = model.roberta
    else:
        raise ValueError("Model must be BertForMaskedLM or RobertaForMaskedLM.")

    # Inference only: without no_grad() every call records an autograd graph
    # that is thrown away immediately, wasting memory and time.
    with torch.no_grad():
        outputs = encoder(**inputs)

    cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.detach().cpu().numpy()

def calculate_metrics(sim_matrix, k_values=(1, 5, 10, 50, 100)):
    """
    Compute retrieval metrics from a query-by-candidate similarity matrix.

    Each row holds the similarities between one query and its candidate pool.
    Column 0 of every row must be the single correct candidate; all other
    columns are distractors. Candidates are ranked by descending similarity.

    Because exactly one candidate is relevant per query, the metrics reduce
    to closed forms of the 1-based rank of that candidate:

    * reciprocal rank (MRR) and average precision (MAP) are ``1 / rank``;
    * R-precision is 1 when ``rank == 1`` and 0 otherwise;
    * for a cutoff ``k`` with ``rank <= k``: recall, success rate and query
      coverage are 1, precision is ``1 / k`` (one hit among ``k`` retrieved
      items), F1 is the harmonic mean of those two, and NDCG is
      ``1 / log2(rank + 1)`` (the ideal DCG is 1);
    * for ``rank > k`` every ``@k`` metric is 0.

    Args:
        sim_matrix: 2-D array-like of shape (num_queries, pool_size).
        k_values: Iterable of cutoffs; duplicates are ignored.

    Returns:
        A dict with scalar entries ``"mrr"``, ``"map"``, ``"r_precision"`` and
        per-cutoff dicts ``"recall@k"``, ``"precision@k"``, ``"ndcg@k"``,
        ``"f1@k"``, ``"success_rate@k"``, ``"query_coverage@k"``, all averaged
        over the queries. For an empty matrix every value is 0.0.
    """
    sim_matrix = np.asarray(sim_matrix)
    # De-duplicate while keeping order so no cutoff is accumulated twice.
    k_values = list(dict.fromkeys(k_values))
    num_queries = sim_matrix.shape[0]
    metrics = {
        "mrr": 0.0,
        "recall@k": {k: 0.0 for k in k_values},
        "precision@k": {k: 0.0 for k in k_values},
        "ndcg@k": {k: 0.0 for k in k_values},
        "map": 0.0,
        "f1@k": {k: 0.0 for k in k_values},
        "r_precision": 0.0,
        "success_rate@k": {k: 0.0 for k in k_values},
        "query_coverage@k": {k: 0.0 for k in k_values},
    }

    # Nothing to average: return the zero-initialised metrics instead of
    # dividing by zero below.
    if num_queries == 0:
        return metrics

    for sims in sim_matrix:
        ranked_indices = np.argsort(-sims)
        # 1-based position of the correct candidate (index 0) in the ranking.
        correct_rank = int(np.where(ranked_indices == 0)[0][0]) + 1
        reciprocal_rank = 1.0 / correct_rank

        metrics["mrr"] += reciprocal_rank
        metrics["map"] += reciprocal_rank
        metrics["r_precision"] += 1.0 if correct_rank == 1 else 0.0

        discounted_gain = 1.0 / float(np.log2(correct_rank + 1))
        for k in k_values:
            if correct_rank > k:
                continue  # a miss contributes 0 to every @k metric

            # Precision@k is hits / k, not the reciprocal rank.
            prec = 1.0 / k
            rec = 1.0
            metrics["precision@k"][k] += prec
            metrics["recall@k"][k] += rec
            metrics["f1@k"][k] += 2 * (prec * rec) / (prec + rec)
            metrics["ndcg@k"][k] += discounted_gain
            metrics["success_rate@k"][k] += 1.0
            metrics["query_coverage@k"][k] += 1.0

    # Turn the per-query sums into means.
    for name, value in metrics.items():
        if isinstance(value, dict):
            for k in value:
                value[k] /= num_queries
        else:
            metrics[name] = value / num_queries

    return metrics

def calculate_average_precision(ranked_indices):
    """
    Return the Average Precision (AP) of a ranking with one relevant item.

    The relevant candidate is the entry equal to ``0`` in ``ranked_indices``.
    With a single relevant item, AP is the reciprocal of that item's 1-based
    rank, so no scan over the whole ranking is needed.

    Args:
        ranked_indices: 1-D NumPy array of candidate indices, best first.

    Returns:
        ``1.0 / rank`` of the relevant candidate as a float.

    Raises:
        IndexError: If ``ranked_indices`` does not contain ``0``.
    """
    correct_rank = int(np.where(ranked_indices == 0)[0][0]) + 1
    return 1.0 / correct_rank

def calculate_f1(precision, recall):
    """
    Return the F1 score, the harmonic mean of ``precision`` and ``recall``.

    When both inputs sum to zero the score is defined as 0.0 so that no
    division by zero occurs.
    """
    denominator = precision + recall
    return 0.0 if denominator == 0 else 2 * (precision * recall) / denominator

def calculate_r_precision(ranked_indices):
    """
    Return R-Precision with R fixed to 1 (one relevant candidate per pool).

    The relevant candidate is the entry equal to ``0``; the result is the
    fraction of the top R ranked entries that are relevant.
    """
    relevant_total = 1  # number of relevant candidates in the pool
    hits = sum(
        1 for position in range(relevant_total) if ranked_indices[position] == 0
    )
    return hits / relevant_total

def calculate_dcg(ranking):
    """
    Return the Discounted Cumulative Gain (DCG) of a list of relevance scores.

    The score at 0-based position ``p`` (rank ``p + 1``) is discounted by
    ``1 / log2(p + 2)``; an empty ranking yields 0.0.
    """
    discounted_gains = (
        relevance / np.log2(position + 2)
        for position, relevance in enumerate(ranking)
    )
    return sum(discounted_gains, 0.0)



if __name__ == "__main__":
    def _print_metrics(metrics):
        """Print scalar metrics as ``name`` and per-cutoff metrics as ``name@<k>``."""
        for name, value in metrics.items():
            if isinstance(value, dict):
                for k, v in value.items():
                    # Per-cutoff names already end in the "@k" placeholder;
                    # fill it in instead of appending the cutoff, which used
                    # to print labels such as "recall@k@5".
                    label = name.replace("@k", f"@{k}")
                    print(f"  {label}: {v:.4f}")
            else:
                print(f"  {name}: {value:.4f}")

    print("\ngoogle/code_x_glue_tc_nl_code_search_adv データセット (Test) をロードします...")
    tc_dataset = load_dataset("google/code_x_glue_tc_nl_code_search_adv", split="test")
    # Evaluate at most the first 10000 examples (fewer if the split is
    # smaller, so select() cannot index out of range).
    subset_tc = tc_dataset.select(range(min(10000, len(tc_dataset))))

    # First model: CodeMorph-BERT.
    model_name1 = "Shuu12121/CodeMorph-BERT"
    print(f"\n{model_name1} を google/code_x_glue_tc_nl_code_search_adv で評価します (候補プールサイズ: 100)...")
    tokenizer1 = AutoTokenizer.from_pretrained(model_name1)
    model1 = AutoModelForMaskedLM.from_pretrained(model_name1)
    model1.to(device)
    metrics_tc1 = evaluate_code_search(model1, tokenizer1, subset_tc, device, max_examples=10000, pool_size=100, query_field="docstring", code_field="code")
    print(f"{model_name1} の評価指標 (google/code_x_glue_tc_nl_code_search_adv):")
    _print_metrics(metrics_tc1)

    # Second model: Microsoft CodeBERT (MLM checkpoint).
    model_name2 = "microsoft/codebert-base-mlm"
    print(f"\n{model_name2} を google/code_x_glue_tc_nl_code_search_adv で評価します (候補プールサイズ: 100)...")
    tokenizer2 = AutoTokenizer.from_pretrained(model_name2)
    model2 = AutoModelForMaskedLM.from_pretrained(model_name2)
    model2.to(device)
    metrics_tc2 = evaluate_code_search(model2, tokenizer2, subset_tc, device, max_examples=10000, pool_size=100, query_field="docstring", code_field="code")
    print(f"{model_name2} の評価指標 (google/code_x_glue_tc_nl_code_search_adv):")
    _print_metrics(metrics_tc2)


使用デバイス: cuda

google/code_x_glue_tc_nl_code_search_adv データセット (Test) をロードします...

Shuu12121/CodeMorph-BERT を google/code_x_glue_tc_nl_code_search_adv で評価します (候補プールサイズ: 100)...
データセット全体のコード埋め込みを計算中...
コード埋め込み計算完了.
候補コードプールを作成し、類似度計算中...
類似度計算完了.


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Shuu12121/CodeMorph-BERT の評価指標 (google/code_x_glue_tc_nl_code_search_adv):
  mrr: 0.2997
  recall@k@1: 0.1995
  recall@k@5: 0.3928
  recall@k@10: 0.4899
  recall@k@50: 0.8080
  recall@k@100: 1.0000
  precision@k@1: 0.1995
  precision@k@5: 0.2693
  precision@k@10: 0.2821
  precision@k@50: 0.2969
  precision@k@100: 0.2997
  ndcg@k@1: 0.1995
  ndcg@k@5: 0.2999
  ndcg@k@10: 0.3312
  ndcg@k@50: 0.4011
  ndcg@k@100: 0.4323
  map: 0.2997
  f1@k@1: 0.1995
  f1@k@5: 0.2998
  f1@k@10: 0.3223
  f1@k@50: 0.3504
  f1@k@100: 0.3558
  r_precision: 0.1995
  success_rate@k@1: 0.1995
  success_rate@k@5: 0.3928
  success_rate@k@10: 0.4899
  success_rate@k@50: 0.8080
  success_rate@k@100: 1.0000
  query_coverage@k@1: 0.1995
  query_coverage@k@5: 0.3928
  query_coverage@k@10: 0.4899
  query_coverage@k@50: 0.8080
  query_coverage@k@100: 1.0000

microsoft/codebert-base-mlm を google/code_x_glue_tc_nl_code_search_adv で評価します (候補プールサイズ: 100)...
データセット全体のコード埋め込みを計算中...
コード埋め込み計算完了.
候補コードプールを作成し、類似度計算中...
類似度計算完了.