In [1]:
# ============ 라이브러리 임포트 1 ============
import os
from dotenv import load_dotenv, find_dotenv
import warnings
from urllib3.exceptions import InsecureRequestWarning
warnings.filterwarnings('ignore', category=InsecureRequestWarning)
import pandas as pd
import numpy as np

In [2]:
# ============ 라이브러리 임포트 2 ============
from elastic_helpers import ESSearch
from utils import InputProcessor, convert_to_excel_name

In [3]:
# ============ Load environment variables and check the Google auth key path ============
load_dotenv()
dotenv_file = find_dotenv()
google_credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
print(dotenv_file)
print(google_credentials_path)

/home/a543979/hdegis-chat-backend/.env
/home/a543979/hdegis-chat-backend/key/pjt-dev-hdegis-app-454401-bd4fac2d452b.json


In [4]:
# ============ Elasticsearch connection settings (read from the environment / .env) ============
# The password is a secret and must not be stored in the notebook: it is read from
# the process environment, which load_dotenv() populates from .env in an earlier cell.
# Host and user name are not secrets, so the previous values remain as defaults.
ES_HOST = os.getenv("ES_HOST", "https://node.hd-aic.com:30692")
ES_USER = os.getenv("ES_USER", "genai_hde")
ES_PASSWORD = os.getenv("ES_PASSWORD")
if not ES_PASSWORD:
    # Surface the misconfiguration here instead of as an opaque authentication
    # failure on the first search request.
    warnings.warn("ES_PASSWORD is not set; add it to .env before creating the Elasticsearch client.")

In [5]:
# ============ Create the search client ============
es_credentials = (ES_USER, ES_PASSWORD)
es = ESSearch(hosts=ES_HOST, credentials=es_credentials)

  _transport = transport_class(


In [None]:
# ============ Load the QA dataset ============
qa_dataset_path = "QA 성능테스트 v4.xlsx"
df = pd.read_excel(qa_dataset_path)
print(df.shape)

In [None]:
# ============ Search settings ============
# (The production script is expected to load these from .env or a config file.)

# Index / embedding-model pairs; keep exactly one pair active.
# INDEX_NAME, EMBEDDING_MODEL_NAME = "hde_hvcb_text_004", "text-embedding-004"
# INDEX_NAME, EMBEDDING_MODEL_NAME = "hde_hvcb_text_0815", "text-embedding-005"
INDEX_NAME, EMBEDDING_MODEL_NAME = "hde_hvcb_text_m_002", "text-multilingual-embedding-002"

# Document fields handed to the search helpers.
VECTOR_FIELD = "embedding"
TEXT_FIELDS = ["content"]

# Passed as top_k to every search call.
TOP_K = 10

# Selects which search helper the evaluation cell calls.
SEARCH_METHOD = "hyde_hybrid"

## Tolerance 0

In [None]:
# ============ Retrieval evaluation (Tolerance 0) ============
# For every QA row: collect the ground-truth (path, filename, page) references,
# run the configured search, flag each retrieved page as hit/miss by exact match,
# then derive the per-query hit flag and reciprocal rank. All retrieved rows are
# written to tolerance_{SEARCH_METHOD}_0.csv and Hit Rate / MRR are printed.

# Fail fast on a misspelled method: without this check no branch below would
# match and `response` would be unbound (or stale from the previous query).
SUPPORTED_SEARCH_METHODS = ("vector", "keyword", "hybrid", "hyde", "hyde_hybrid")
if SEARCH_METHOD not in SUPPORTED_SEARCH_METHODS:
    raise ValueError(
        f"Unknown SEARCH_METHOD {SEARCH_METHOD!r}; expected one of {SUPPORTED_SEARCH_METHODS}"
    )

# Ground-truth references are read from path_n / filename_n / page_n for n = 1..13.
MAX_GT_REFS = 13

results = []   # one record per (query, retrieved page)
hit_list = []  # per-query hit flag
rr_list = []   # per-query reciprocal rank

# enumerate() gives a positional counter that does not depend on the DataFrame index labels.
for query_no, (_, row) in enumerate(df.iterrows(), start=1):
    # --- 1. User input ---
    user_query = row['query']
    user_filter = row['폴더 선택 (검색 범위 지정)']
    print(f"[{query_no:02d}/{len(df)}] User Query:\n{user_query} (Filter: {user_filter})")
    print("=================================")

    # --- 2. Ground-truth references ---
    gt_answer = row['gt_answer']
    gt_refs = []
    print("gt_ref:")
    for n in range(1, MAX_GT_REFS + 1):
        page_col = f"page_{n}"
        # Stop at the first empty slot, or when the sheet has no further reference columns.
        if page_col not in row.index or pd.isna(row[page_col]):
            break
        gt_ref = (row[f"path_{n}"], row[f"filename_{n}"], int(row[page_col]))
        gt_refs.append(gt_ref)
        print(gt_ref)
    gt_set = set(gt_refs)
    print("=================================")

    # --- 3. Search ---
    # Keyword arguments shared by every search helper.
    shared_kwargs = dict(
        index_name=INDEX_NAME, top_k=TOP_K,
        user_query=user_query, user_filter=user_filter,
    )
    if SEARCH_METHOD == "vector":
        response = es.vector_search_w_filter(
            embedding_model_name=EMBEDDING_MODEL_NAME,
            vector_field=VECTOR_FIELD,
            **shared_kwargs,
        )
    elif SEARCH_METHOD == "keyword":
        response = es.keyword_text_search_w_filter(
            text_fields=TEXT_FIELDS,
            **shared_kwargs,
        )
    elif SEARCH_METHOD == "hybrid":
        response = es.hybrid_search_w_filter(
            embedding_model_name=EMBEDDING_MODEL_NAME,
            vector_field=VECTOR_FIELD, text_fields=TEXT_FIELDS,
            **shared_kwargs,
        )
    elif SEARCH_METHOD == "hyde":
        response = es.hyde_vector_search_w_filter(
            embedding_model_name=EMBEDDING_MODEL_NAME,
            vector_field=VECTOR_FIELD,
            **shared_kwargs,
        )
    else:  # "hyde_hybrid" (the only remaining value after the validation above)
        response = es.hyde_hybrid_search_w_filter(
            embedding_model_name=EMBEDDING_MODEL_NAME,
            vector_field=VECTOR_FIELD, text_fields=TEXT_FIELDS,
            **shared_kwargs,
        )

    hits = response['hits']['hits']

    # --- 4. Per-result records for this query ---
    # NOTE(review): a query that returns zero hits adds no rows to `results`, so it
    # will be absent from the CSV that the tolerance@k cell groups on — confirm
    # whether that is intended.
    print("=================================")
    search_refs = []
    for es_hit in hits:
        source = es_hit['_source']
        search_path = convert_to_excel_name('/'.join(source['folder_levels']))
        search_filename = source['pdf_name']
        search_page = int(source['page'])
        score = es_hit['_score']
        # Exact (path, filename, page) match against the ground truth.
        is_hit = (search_path, search_filename, search_page) in gt_set
        search_refs.append((search_path, search_filename, search_page, score, is_hit))

        results.append({
            "user_filter": user_filter,
            "user_query": user_query,
            "search_path": search_path,
            "search_filename": search_filename,
            "search_page": search_page,
            "is_hit": is_hit,
            "score": score,
            "gt_refs": gt_refs,
            "gt_answer": gt_answer,
        })

        print(f"[{is_hit}] {(search_path, search_filename, search_page)} (Score: {score})")
    print("=================================")

    # --- 5. Query-level hit: true when at least one retrieved page matched ---
    hit = any(ref_is_hit for *_, ref_is_hit in search_refs)
    hit_list.append(hit)

    # --- 6. Reciprocal rank: 1 / rank of the first matching result, 0 when none matched ---
    rr = next(
        (1 / rank for rank, ref in enumerate(search_refs, start=1) if ref[-1]),
        0,
    )
    rr_list.append(rr)

    print(f"Hit: {hit}, RR: {rr}")
    print("=================================")


# --- 1. Save the per-result records for all queries ---
df_0 = pd.DataFrame(results)
df_0.to_csv(f'tolerance_{SEARCH_METHOD}_0.csv', index=False)
print(f"tolerance_{SEARCH_METHOD}_0.csv 저장 완료!  (행 수: {len(df_0)}) ")

# --- 2. Hit Rate and MRR over all queries ---
hit_rate = np.mean(hit_list)
mrr = np.mean(rr_list)
print("=================================")
print(f"[Tolerance ±0]")
print(f"Hit Rate: {hit_rate * 100:.2f}%")
print(f"Mean Reciprocal Rank (MRR): {mrr * 100:.2f}%")
print("=================================")

## Tolerance@1, 3, 5

In [None]:
# ============ 설정값 및 필요한 util 함수 및 데이터 불러오기 ============
import ast

# Page tolerances (± pages) to evaluate
tolerances = [1, 3, 5]

# Reload the tolerance-0 result file for the current search method
tolerance0_csv = f"tolerance_{SEARCH_METHOD}_0.csv"
df_0 = pd.read_csv(tolerance0_csv)
print(f"tolerance_{SEARCH_METHOD}_0.csv: {df_0.shape}")

# gt_refs parsing helper
def parse_gt_refs(gt_refs_str):
    """Convert a ``gt_refs`` cell value into a list of reference tuples.

    Args:
        gt_refs_str: Either the string form of the list (as produced when the
            column is written to and read back from CSV) or an already-parsed
            sequence of references (as held by an in-memory DataFrame).

    Returns:
        list: The parsed references. Strings are parsed with
        ``ast.literal_eval``; any other iterable is copied into a new list.

    Raises:
        ValueError / SyntaxError: If a string is not a valid Python literal.
        TypeError: If the value is neither a string nor iterable.
    """
    if isinstance(gt_refs_str, str):
        # literal_eval only evaluates Python literals, never arbitrary code.
        return ast.literal_eval(gt_refs_str)
    # Already parsed (no CSV round-trip): literal_eval would reject a list.
    return list(gt_refs_str)


def is_hit_with_tolerance(search_item, gt_refs, tolerance):
    """Tell whether a retrieved item matches any ground-truth reference within a page tolerance.

    Args:
        search_item: 4-tuple ``(path, filename, page, score)``; the score is ignored.
        gt_refs: Iterable of ``(path, filename, page)`` ground-truth tuples.
        tolerance: Maximum allowed absolute page difference.

    Returns:
        bool: True when some reference has the same path and filename and a page
        within ``tolerance`` of the retrieved page; False otherwise.
    """
    found_path, found_filename, found_page, _score = search_item
    return any(
        found_path == gt_path
        and found_filename == gt_filename
        and abs(found_page - gt_page) <= tolerance
        for gt_path, gt_filename, gt_page in gt_refs
    )

In [None]:
# ============ Retrieval evaluation (Tolerance @ 1, 3, 5) ============
# For each tolerance, every retrieved page is expanded to its neighbouring pages
# (page ± 1..tol). The expanded set is matched exactly against the ground truth,
# the per-query rows are saved to tolerance_{SEARCH_METHOD}_{tol}.csv, and
# Hit Rate / MRR are printed.
for tol in tolerances:
    results = []
    hit_list = []
    rr_list = []

    # One group per query. dropna=False keeps queries whose folder filter is
    # empty (NaN); the default would silently drop those rows from the metrics.
    grouped = df_0.groupby(['user_filter', 'user_query'], dropna=False)

    for (user_filter, user_query), group in grouped:
        # --- 0. Ground truth (identical on every row of the group) ---
        first_row = group.iloc[0]
        gt_refs = parse_gt_refs(first_row['gt_refs'])
        gt_answer = first_row['gt_answer']
        gt_set = set(gt_refs)

        # --- 1. Original search results, in the row order of the tolerance-0 file ---
        search_ref = [
            (ref_path, ref_filename, ref_page, ref_score)
            for ref_path, ref_filename, ref_page, ref_score in zip(
                group['search_path'].tolist(),
                group['search_filename'].tolist(),
                group['search_page'].tolist(),
                group['score'].tolist(),
            )
        ]
        # Keys already present, so that expansion never duplicates a page.
        used_keys = {(ref_path, ref_filename, ref_page) for ref_path, ref_filename, ref_page, _ in search_ref}

        # --- 2. Tolerance expansion: neighbouring pages not already present (score -1) ---
        # NOTE(review): page + delta is not bounds-checked, so page numbers below the
        # first page can appear in the output file — confirm this is acceptable.
        expanded_ref = []
        for ref_path, ref_filename, ref_page, _ in search_ref:
            for delta in range(-tol, tol + 1):
                if delta == 0:
                    continue
                neighbour_key = (ref_path, ref_filename, ref_page + delta)
                if neighbour_key not in used_keys:
                    expanded_ref.append((*neighbour_key, -1))
                    used_keys.add(neighbour_key)

        # --- 3. Per-result records for this query ---
        total_refs = search_ref + expanded_ref
        for path, filename, page, score in total_refs:
            is_hit = (path, filename, page) in gt_set
            # Record the reference being evaluated in this iteration (not variables
            # left over from an earlier loop).
            results.append({
                "user_filter": user_filter,
                "user_query": user_query,
                "search_path": path,
                "search_filename": filename,
                "search_page": page,
                "is_hit": is_hit,
                "score": score,
                "gt_refs": gt_refs,
                "gt_answer": gt_answer,
            })

        # --- 4. Query-level hit: any original or expanded page is in the ground truth ---
        hit = any((path, filename, page) in gt_set for (path, filename, page, score) in total_refs)
        hit_list.append(hit)

        # --- 5. Reciprocal rank: rank of the first original result within tolerance, else 0 ---
        rr = 0
        for rank, search_item in enumerate(search_ref, start=1):
            if is_hit_with_tolerance(search_item, gt_refs, tol):
                rr = 1 / rank
                break
        rr_list.append(rr)

    # --- 1. Save the results for this tolerance ---
    df_tol = pd.DataFrame(results)
    df_tol.to_csv(f'tolerance_{SEARCH_METHOD}_{tol}.csv', index=False)
    print(f'tolerance_{SEARCH_METHOD}_{tol}.csv 저장 완료! (행 수: {len(df_tol)}) ')

    # --- 2. Hit Rate and MRR for this tolerance ---
    hit_rate = np.mean(hit_list)
    mrr = np.mean(rr_list)
    print("=================================")
    print(f"[Tolerance ±{tol}]")
    print(f"Hit Rate: {hit_rate * 100:.2f}%")
    print(f"Mean Reciprocal Rank (MRR): {mrr * 100:.2f}%")
    print("=================================")