In [7]:
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from tqdm.auto import tqdm

# --- 1. 모델 로드 ---
# 사용할 GPU 설정 (가능하면 GPU 사용)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 1-A. 임베딩 모델 (Bi-Encoder) 로드
# 한국어 임베딩 모델 중 성능이 좋은 모델을 선택합니다.
embedding_model = SentenceTransformer("intfloat/multilingual-e5-base", device=device)

# 1-B. 리랭커 모델 (Cross-Encoder) 로드
reranker_model = CrossEncoder('BAAI/bge-reranker-v2-m3', device=device)

# --- 2. 데이터셋 로드 ---
# Ko-MIRACL 데이터셋의 한국어 테스트셋 로드
# streaming=True를 사용하면 전체 데이터를 다운로드하지 않고 일부만 스트리밍하여 메모리를 절약할 수 있습니다.
dataset = load_dataset("miracl/miracl", "ko", use_auth_token=True, split="testA", streaming=True)

# 평가를 위해 데이터셋에서 일부만 샘플링 (예: 100개)
num_samples = 100
eval_samples = list(dataset.take(num_samples))

# --- 3. 평가 진행 ---
# 결과 저장을 위한 변수 초기화
baseline_mrr_scores = []
reranked_mrr_scores = []
baseline_hit_rate_at_10 = 0
reranked_hit_rate_at_10 = 0

IterableDataset({
    features: ['query_id', 'query', 'positive_passages', 'negative_passages'],
    n_shards: 1
})
[{'query_id': '1602', 'query': '유비 출생일은 언제인가요?', 'positive_passages': [], 'negative_passages': []}, {'query_id': '1603', 'query': '목탁수구리는 대형종인가요?', 'positive_passages': [], 'negative_passages': []}, {'query_id': '1606', 'query': '아브라함 링컨은 몇 대 대통령인가?', 'positive_passages': [], 'negative_passages': []}, {'query_id': '1607', 'query': '상자 속 소년의 매장지는 어디인가요?', 'positive_passages': [], 'negative_passages': []}, {'query_id': '1608', 'query': '현대 자동차는 언제 처음으로 자동차를 수출하기 시작했나요?', 'positive_passages': [], 'negative_passages': []}, {'query_id': '1609', 'query': '화의군은 세종과 영빈 강씨의 아들인가요?', 'positive_passages': [], 'negative_passages': []}, {'query_id': '1610', 'query': '한나라 건국일은 언제인가요?', 'positive_passages': [], 'negative_passages': []}, {'query_id': '1613', 'query': '성모 마리아 축일은 언제인가요?', 'positive_passages': [], 'negative_passages': []}, {'query_id': '1614', 'query': '노무라 가쓰야의 마지막 야구 경기는

In [13]:
from datasets import load_dataset

# 한국어 query + qrels
miracl = load_dataset("miracl/miracl", "ko")
# 한국어 corpus
corpus = load_dataset("miracl/miracl-corpus", "ko", split="train")

corpus_dict = {c["docid"]: c for c in corpus}  # 빠른 조회용 dict


Downloading data:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/55.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/53.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/233k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.7k [00:00<?, ?B/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating testB split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating testA split: 0 examples [00:00, ? examples/s]

In [17]:
eval_pairs = []
eval_labels = []
queries = []

for q in miracl["dev"]:
    query = q["query"]
    # positive pairs
    for p in q["positive_passages"]:
        text = corpus_dict[p["docid"]]["text"]
        eval_pairs.append([str(query), str(text)])  # <-- 리스트로, str로 강제 변환
        eval_labels.append(1)
        queries.append(q["query_id"])
    # negative pairs
    for n in q["negative_passages"]:
        text = corpus_dict[n["docid"]]["text"]
        eval_pairs.append([str(query), str(text)])  # <-- 리스트로, str로 강제 변환
        eval_labels.append(0)
        queries.append(q["query_id"])


In [19]:
from collections import defaultdict
# 4. 점수 계산
scores = reranker_model.predict(eval_pairs, convert_to_numpy=True, show_progress_bar=True)

# 5. 쿼리별로 결과 묶기
results_by_query = defaultdict(list)
for qid, score, label in zip(queries, scores, eval_labels):
    results_by_query[qid].append((score, label))

# 6. nDCG@10, MRR@10 계산
def ndcg_at_k(scores, labels, k=10):
    idx = np.argsort(scores)[::-1][:k]
    gains = (2 ** np.array(labels)[idx] - 1)
    discounts = 1 / np.log2(np.arange(2, len(gains) + 2))
    dcg = np.sum(gains * discounts)
    ideal_idx = np.argsort(labels)[::-1][:k]
    ideal_gains = (2 ** np.array(labels)[ideal_idx] - 1)
    ideal_dcg = np.sum(ideal_gains * discounts)
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def mrr_at_k(scores, labels, k=10):
    idx = np.argsort(scores)[::-1][:k]
    for rank, i in enumerate(idx, start=1):
        if labels[i] == 1:
            return 1 / rank
    return 0.0

ndcgs, mrrs = [], []
for qid, pairs in results_by_query.items():
    s, l = zip(*pairs)
    ndcgs.append(ndcg_at_k(np.array(s), np.array(l)))
    mrrs.append(mrr_at_k(np.array(s), np.array(l)))

print(f"Mean nDCG@10: {np.mean(ndcgs):.4f}")
print(f"Mean MRR@10: {np.mean(mrrs):.4f}")

Batches:   0%|          | 0/96 [00:00<?, ?it/s]

NameError: name 'np' is not defined

In [None]:
reranker_model2 = CrossEncoder('Dongjin-kr/ko-reranker', device=device)

In [None]:
# 4. 점수 계산
scores = reranker_model2.predict(eval_pairs, convert_to_numpy=True, show_progress_bar=True)

# 5. 쿼리별로 결과 묶기
results_by_query = defaultdict(list)
for qid, score, label in zip(queries, scores, eval_labels):
    results_by_query[qid].append((score, label))

# 6. nDCG@10, MRR@10 계산
def ndcg_at_k(scores, labels, k=10):
    idx = np.argsort(scores)[::-1][:k]
    gains = (2 ** np.array(labels)[idx] - 1)
    discounts = 1 / np.log2(np.arange(2, len(gains) + 2))
    dcg = np.sum(gains * discounts)
    ideal_idx = np.argsort(labels)[::-1][:k]
    ideal_gains = (2 ** np.array(labels)[ideal_idx] - 1)
    ideal_dcg = np.sum(ideal_gains * discounts)
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def mrr_at_k(scores, labels, k=10):
    idx = np.argsort(scores)[::-1][:k]
    for rank, i in enumerate(idx, start=1):
        if labels[i] == 1:
            return 1 / rank
    return 0.0

ndcgs, mrrs = [], []
for qid, pairs in results_by_query.items():
    s, l = zip(*pairs)
    ndcgs.append(ndcg_at_k(np.array(s), np.array(l)))
    mrrs.append(mrr_at_k(np.array(s), np.array(l)))

print(f"Mean nDCG@10: {np.mean(ndcgs):.4f}")
print(f"Mean MRR@10: {np.mean(mrrs):.4f}")

In [None]:
reranker_model3 = CrossEncoder('dragonkue/bge-reranker-v2-m3-ko', device=device)

In [None]:
# 4. 점수 계산
scores = reranker_model3.predict(eval_pairs, convert_to_numpy=True, show_progress_bar=True)

# 5. 쿼리별로 결과 묶기
results_by_query = defaultdict(list)
for qid, score, label in zip(queries, scores, eval_labels):
    results_by_query[qid].append((score, label))

# 6. nDCG@10, MRR@10 계산
def ndcg_at_k(scores, labels, k=10):
    idx = np.argsort(scores)[::-1][:k]
    gains = (2 ** np.array(labels)[idx] - 1)
    discounts = 1 / np.log2(np.arange(2, len(gains) + 2))
    dcg = np.sum(gains * discounts)
    ideal_idx = np.argsort(labels)[::-1][:k]
    ideal_gains = (2 ** np.array(labels)[ideal_idx] - 1)
    ideal_dcg = np.sum(ideal_gains * discounts)
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

def mrr_at_k(scores, labels, k=10):
    idx = np.argsort(scores)[::-1][:k]
    for rank, i in enumerate(idx, start=1):
        if labels[i] == 1:
            return 1 / rank
    return 0.0

ndcgs, mrrs = [], []
for qid, pairs in results_by_query.items():
    s, l = zip(*pairs)
    ndcgs.append(ndcg_at_k(np.array(s), np.array(l)))
    mrrs.append(mrr_at_k(np.array(s), np.array(l)))

print(f"Mean nDCG@10: {np.mean(ndcgs):.4f}")
print(f"Mean MRR@10: {np.mean(mrrs):.4f}")