#### 검색성능비교

실험 조건
- k: {3, 5, 7}
- threshold: {0.6, 0.7, 0.8}
- 카테고리 필터 유무

평가 지표
- Recall@k: top-k 결과에 gold_ids 중 하나라도 포함되면 1, 아니면 0
- (질문 거를때 사용할 기준 점수)avg_max_score: 검색 결과 중 최고 유사도 점수의 평균


In [1]:
from langchain_openai import OpenAIEmbeddings
from collections import defaultdict
from pinecone import Pinecone
from tqdm import tqdm
import pandas as pd
import numpy as np
import json
import os

from dotenv import load_dotenv
load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

INDEX_V1 = "plant-qna"      
INDEX_V3 = "plant-qna-v3"   

with open('augmented_eval_queries_for_rag.json', 'r', encoding='utf-8') as f:
    eval_queries = json.load(f)

print(f"평가셋 로드: {len(eval_queries)}개 쿼리")
print(f"인덱스: {INDEX_V1} (baseline), {INDEX_V3} (candidate)")

  from .autonotebook import tqdm as notebook_tqdm


평가셋 로드: 294개 쿼리
인덱스: plant-qna (baseline), plant-qna-v3 (candidate)


In [2]:
# 검색 함수 정의
def search_with_filter(index_name, query, k, category_main=None, namespace=None):
    """
    Pinecone 인덱스에서 검색 수행
    
    Args:
        index_name: 인덱스 이름
        query: 검색 쿼리
        k: 반환할 결과 수
        category_main: 카테고리 필터 (None이면 필터 없음)
        namespace: 네임스페이스
    
    Returns:
        list of dict: [{doc_id, score}, ...]
    """
    try:
        index = pc.Index(index_name)
        
        # 쿼리 임베딩 생성
        query_embedding = embeddings.embed_query(query)
        
        # 메타데이터 필터 구성
        filter_dict = None
        if category_main:
            filter_dict = {"category_main": {"$eq": category_main}}
        
        # 네임스페이스 설정
        if namespace is None:
            namespace = f"{index_name}-openai"
        
        # 검색 수행
        results = index.query(
            vector=query_embedding,
            top_k=k,
            include_metadata=True,
            filter=filter_dict,
            namespace=namespace
        )
        
        # 결과 파싱
        search_results = []
        for match in results.matches:
            doc_id = match.metadata.get('ids', match.id)
            search_results.append({
                'doc_id': doc_id,
                'score': match.score
            })
        
        return search_results
    
    except Exception as e:
        print(f"검색 오류 ({index_name}): {e}")
        return []

In [3]:
k_values = [3, 5, 7]
index_configs = [
    ('v1', INDEX_V1, ['off']),       
    ('v3', INDEX_V3, ['on', 'off'])   
]

all_results = []

print("="*80)
print("통합 평가 시작")
print("="*80)

total_experiments = sum(len(k_values) * len(filters) for _, _, filters in index_configs)
exp_count = 0

for index_label, index_name, filter_modes in index_configs:
    for k in k_values:
        for filter_mode in filter_modes:
            exp_count += 1
            print(f"\n[{exp_count}/{total_experiments}] 평가 중: index={index_label}, k={k}, filter={filter_mode}")
            
            hits = 0
            max_scores = []
            
            for sample in tqdm(eval_queries, desc="검색 중", leave=False):
                query = sample['query']
                gold_ids = sample['gold_ids']
                # v1은 항상 None, v3는 filter_mode에 따라
                category_main = sample['category_main'] if filter_mode == 'on' else None
                
                # 검색 수행
                results = search_with_filter(index_name, query, k, category_main)
                
                # 결과 분석
                retrieved_ids = [r['doc_id'] for r in results]
                scores = [r['score'] for r in results]
                max_score = max(scores) if scores else 0.0
                hit = any(gold_id in retrieved_ids for gold_id in gold_ids)
                
                if hit:
                    hits += 1
                max_scores.append(max_score)
                
                # 상세 결과 저장
                all_results.append({
                    'index': index_label,
                    'k': k,
                    'filter_mode': filter_mode,
                    'query': query[:50],
                    'category_main': sample['category_main'],
                    'gold_ids': gold_ids,
                    'retrieved_ids': retrieved_ids,
                    'max_score': max_score,
                    'hit': 1 if hit else 0
                })
            
            # 실험 결과 출력
            recall = hits / len(eval_queries)
            avg_max_score = np.mean(max_scores)
            print(f"  → Recall@{k}: {recall:.3f}, avg_max_score: {avg_max_score:.3f}")

print("\n" + "="*80)
print(f"평가 완료! 총 {len(all_results)}개 결과 생성")
print("="*80)

통합 평가 시작

[1/9] 평가 중: index=v1, k=3, filter=off


                                                          

  → Recall@3: 0.000, avg_max_score: 0.000

[2/9] 평가 중: index=v1, k=5, filter=off


                                                          

  → Recall@5: 0.000, avg_max_score: 0.000

[3/9] 평가 중: index=v1, k=7, filter=off


                                                          

  → Recall@7: 0.000, avg_max_score: 0.000

[4/9] 평가 중: index=v3, k=3, filter=on


                                                          

  → Recall@3: 0.963, avg_max_score: 0.615

[5/9] 평가 중: index=v3, k=3, filter=off


                                                          

  → Recall@3: 0.935, avg_max_score: 0.617

[6/9] 평가 중: index=v3, k=5, filter=on


                                                          

  → Recall@5: 0.969, avg_max_score: 0.615

[7/9] 평가 중: index=v3, k=5, filter=off


                                                          

  → Recall@5: 0.952, avg_max_score: 0.617

[8/9] 평가 중: index=v3, k=7, filter=on


                                                          

  → Recall@7: 0.983, avg_max_score: 0.615

[9/9] 평가 중: index=v3, k=7, filter=off


                                                          

  → Recall@7: 0.976, avg_max_score: 0.617

평가 완료! 총 2646개 결과 생성




In [4]:
# 결과 분석 - v1 vs v3 비교
print("="*80)
print("인덱스 성능 비교")
print("="*80)

# 조건별 집계
comparison_data = []
for r in all_results:
    key = (r['index'], r['k'], r['filter_mode'])
    existing = next((c for c in comparison_data 
                     if (c['index'], c['k'], c['filter']) == key), None)
    if not existing:
        subset = [x for x in all_results 
                  if x['index'] == r['index'] and x['k'] == r['k'] and x['filter_mode'] == r['filter_mode']]
        comparison_data.append({
            'index': r['index'],
            'k': r['k'],
            'filter': r['filter_mode'],
            'recall': np.mean([x['hit'] for x in subset]),
            'avg_max_score': np.mean([x['max_score'] for x in subset]),
            'std_max_score': np.std([x['max_score'] for x in subset])
        })

df_comparison = pd.DataFrame(comparison_data)
print("\n전체 결과:")
print(df_comparison.to_string(index=False))

# v1(off) vs v3(off) 비교 - 동일 조건 비교
print("\n" + "="*80)
print("v1(off) vs v3(off) 비교 - 필터 없이 순수 인덱스 성능")
print("="*80)

v1_off = df_comparison[df_comparison['index'] == 'v1']
v3_off = df_comparison[(df_comparison['index'] == 'v3') & (df_comparison['filter'] == 'off')]

for k in k_values:
    v1_row = v1_off[v1_off['k'] == k].iloc[0] if len(v1_off[v1_off['k'] == k]) > 0 else None
    v3_row = v3_off[v3_off['k'] == k].iloc[0] if len(v3_off[v3_off['k'] == k]) > 0 else None
    
    if v1_row is not None and v3_row is not None:
        recall_diff = v3_row['recall'] - v1_row['recall']
        score_diff = v3_row['avg_max_score'] - v1_row['avg_max_score']
        print(f"\nk={k}:")
        print(f"  v1: Recall={v1_row['recall']:.3f}, avg_score={v1_row['avg_max_score']:.4f}")
        print(f"  v3: Recall={v3_row['recall']:.3f}, avg_score={v3_row['avg_max_score']:.4f}")
        print(f"  차이: Recall {recall_diff:+.3f}, avg_score {score_diff:+.4f}")

# v3 필터 on vs off 비교
print("\n" + "="*80)
print("v3 필터 효과 비교 (on vs off)")
print("="*80)

v3_on = df_comparison[(df_comparison['index'] == 'v3') & (df_comparison['filter'] == 'on')]

for k in k_values:
    on_row = v3_on[v3_on['k'] == k].iloc[0] if len(v3_on[v3_on['k'] == k]) > 0 else None
    off_row = v3_off[v3_off['k'] == k].iloc[0] if len(v3_off[v3_off['k'] == k]) > 0 else None
    
    if on_row is not None and off_row is not None:
        recall_diff = on_row['recall'] - off_row['recall']
        score_diff = on_row['avg_max_score'] - off_row['avg_max_score']
        print(f"\nk={k}:")
        print(f"  filter=off: Recall={off_row['recall']:.3f}, avg_score={off_row['avg_max_score']:.4f}")
        print(f"  filter=on:  Recall={on_row['recall']:.3f}, avg_score={on_row['avg_max_score']:.4f}")
        print(f"  필터 효과: Recall {recall_diff:+.3f}, avg_score {score_diff:+.4f}")


인덱스 성능 비교

전체 결과:
index  k filter   recall  avg_max_score  std_max_score
   v1  3    off 0.000000       0.000000       0.000000
   v1  5    off 0.000000       0.000000       0.000000
   v1  7    off 0.000000       0.000000       0.000000
   v3  3     on 0.962585       0.615163       0.090549
   v3  3    off 0.935374       0.617436       0.087933
   v3  5     on 0.969388       0.615184       0.090572
   v3  5    off 0.952381       0.617451       0.087988
   v3  7     on 0.982993       0.615176       0.090556
   v3  7    off 0.976190       0.617441       0.087959

v1(off) vs v3(off) 비교 - 필터 없이 순수 인덱스 성능

k=3:
  v1: Recall=0.000, avg_score=0.0000
  v3: Recall=0.935, avg_score=0.6174
  차이: Recall +0.935, avg_score +0.6174

k=5:
  v1: Recall=0.000, avg_score=0.0000
  v3: Recall=0.952, avg_score=0.6175
  차이: Recall +0.952, avg_score +0.6175

k=7:
  v1: Recall=0.000, avg_score=0.0000
  v3: Recall=0.976, avg_score=0.6174
  차이: Recall +0.976, avg_score +0.6174

v3 필터 효과 비교 (on vs off)

k=3:
  f

In [6]:
# # score_threshold 분석
# print("="*80)
# print("score_threshold 분석 (k=5, filter=on 기준)")
# print("="*80)

# # k=5, filter=on인 v3 결과만 분석
# target_results = [r for r in all_results 
#                   if r['index'] == 'v3' and r['k'] == 5 and r['filter_mode'] == 'on']

# hit_scores = [r['max_score'] for r in target_results if r['hit'] == 1]
# miss_scores = [r['max_score'] for r in target_results if r['hit'] == 0]

# print(f"\nHit (정답 포함): {len(hit_scores)}건")
# print(f"  - 평균 max_score: {np.mean(hit_scores):.4f}")
# print(f"  - 최소 max_score: {np.min(hit_scores):.4f}")
# print(f"  - 25% 분위: {np.percentile(hit_scores, 25):.4f}")

# if miss_scores:
#     print(f"\nMiss (정답 미포함): {len(miss_scores)}건")
#     print(f"  - 평균 max_score: {np.mean(miss_scores):.4f}")
#     print(f"  - 최대 max_score: {np.max(miss_scores):.4f}")
# else:
#     print(f"\nMiss: 0건 (모든 쿼리에서 정답 포함!)")

# # threshold 후보별 영향
# print("\nthreshold 후보별 영향:")
# print(f"{'Threshold':<12} {'Hit 포함률':<15} {'전체 필터링률':<15}")
# print("-"*45)

# thresholds = [0.6, 0.65, 0.7, 0.75, 0.8]
# for t in thresholds:
#     hit_above = sum(1 for s in hit_scores if s >= t)
#     total_above = sum(1 for r in target_results if r['max_score'] >= t)
    
#     hit_rate = hit_above / len(hit_scores) * 100 if hit_scores else 0
#     total_rate = total_above / len(target_results) * 100
    
#     print(f"{t:<12} {hit_rate:.1f}%{'':<10} {total_rate:.1f}%")


In [5]:
# 최종 결론 및 권장 설정
print("="*80)
print("최종 결론")
print("="*80)

# 최적 조건 찾기
best_config = max(comparison_data, key=lambda x: (x['recall'], x['avg_max_score']))

print(f"\n최적 설정:")
print(f"   - 인덱스: plant-qna-{best_config['index']}")
print(f"   - k: {best_config['k']}")
print(f"   - 카테고리 필터: {best_config['filter']}")
print(f"   - Recall@k: {best_config['recall']:.3f}")
print(f"   - avg_max_score: {best_config['avg_max_score']:.4f}")

# v1(off) vs v3(off) 비교 - 동일 조건(필터 없음)
v1_data = [r for r in comparison_data if r['index'] == 'v1']
v3_off_data = [r for r in comparison_data if r['index'] == 'v3' and r['filter'] == 'off']
v3_on_data = [r for r in comparison_data if r['index'] == 'v3' and r['filter'] == 'on']

if v1_data and v3_off_data:
    v1_avg_recall = np.mean([r['recall'] for r in v1_data])
    v3_off_avg_recall = np.mean([r['recall'] for r in v3_off_data])
    v1_avg_score = np.mean([r['avg_max_score'] for r in v1_data])
    v3_off_avg_score = np.mean([r['avg_max_score'] for r in v3_off_data])
    
    print(f"\nv1 vs v3 비교 (필터 off, 동일 조건):")
    print(f"   - v1(off): Recall={v1_avg_recall:.3f}, avg_score={v1_avg_score:.4f}")
    print(f"   - v3(off): Recall={v3_off_avg_recall:.3f}, avg_score={v3_off_avg_score:.4f}")
    
    if v3_off_avg_recall >= v1_avg_recall:
        print(f"   v3가 v1 대비 Recall 유지/향상")
    else:
        print(f"   v3가 v1 대비 Recall 감소")

# v3 필터 효과
if v3_on_data and v3_off_data:
    v3_on_avg_recall = np.mean([r['recall'] for r in v3_on_data])
    v3_on_avg_score = np.mean([r['avg_max_score'] for r in v3_on_data])
    
    print(f"\nv3 필터 효과 (on vs off):")
    print(f"   - v3(off): Recall={v3_off_avg_recall:.3f}, avg_score={v3_off_avg_score:.4f}")
    print(f"   - v3(on):  Recall={v3_on_avg_recall:.3f}, avg_score={v3_on_avg_score:.4f}")
    
    filter_recall_effect = v3_on_avg_recall - v3_off_avg_recall
    filter_score_effect = v3_on_avg_score - v3_off_avg_score
    
    if filter_recall_effect >= 0:
        print(f"   필터가 Recall 유지/향상 ({filter_recall_effect:+.3f})")
    else:
        print(f"   필터가 Recall 감소 ({filter_recall_effect:+.3f})")
    
    if filter_score_effect > 0:
        print(f"   필터가 avg_score 향상 ({filter_score_effect:+.4f})")

# 권장 threshold
print(f"\n권장 score_threshold: 0.65 ~ 0.70")
print("   (Hit 포함률 90% 이상 유지하면서 동문서답 필터링)")

print("\n" + "="*80)


최종 결론

최적 설정:
   - 인덱스: plant-qna-v3
   - k: 7
   - 카테고리 필터: on
   - Recall@k: 0.983
   - avg_max_score: 0.6152

v1 vs v3 비교 (필터 off, 동일 조건):
   - v1(off): Recall=0.000, avg_score=0.0000
   - v3(off): Recall=0.955, avg_score=0.6174
   v3가 v1 대비 Recall 유지/향상

v3 필터 효과 (on vs off):
   - v3(off): Recall=0.955, avg_score=0.6174
   - v3(on):  Recall=0.972, avg_score=0.6152
   필터가 Recall 유지/향상 (+0.017)

권장 score_threshold: 0.65 ~ 0.70
   (Hit 포함률 90% 이상 유지하면서 동문서답 필터링)

