In [None]:
import os
import json
import faiss
import numpy as np
import openai
import pandas as pd
import re
from pathlib import Path
from dotenv import load_dotenv
from rapidfuzz import fuzz, process

# ✅ Environment setup
# load_dotenv() must run before the getenv() call below so that a key defined
# only in a local .env file is visible in the process environment.
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')  # None when the variable is unset

# ✅ Path settings
CHUNKS_DIR = "output_jsonl_chunks"  # directory listed and read for per-document chunk files
VECTOR_INDEX = "vector.index"  # file passed to faiss.read_index()
VECTOR_METADATA = "vector_metadata.json"  # JSON file loaded into vector_metadatas
DATA_LIST = "data_list.csv"  # CSV file loaded into data_list via pandas

# ✅ 파일명 정규화 함수
def sanitize_filename(filename: str) -> str:
    """Reduce a file name to a compact key used for comparing names.

    The directory part and the final extension are dropped (``Path.stem``),
    then every run of the following characters is deleted: backslash, slash,
    colon, asterisk, question mark, double quote, angle brackets, pipe,
    parentheses, the ideographic space (U+3000) and any whitespace.

    Args:
        filename: A file name or path.

    Returns:
        The normalized stem.
    """
    return re.sub(r'[\\/:*?"<>|()\u3000\s]+', '', Path(filename).stem).strip()

# ✅ 가장 유사한 파일명 찾기
def find_closest_filename(target, candidates):
    """Return the candidate most similar to ``target``, if it is similar enough.

    Candidates are scored with rapidfuzz's ``fuzz.ratio``; the best one is
    returned only when its score is strictly greater than 90.

    Args:
        target: The name to look for.
        candidates: Collection of names to compare against.

    Returns:
        The best-matching candidate, or ``None`` when nothing qualifies.
    """
    best = process.extractOne(target, candidates, scorer=fuzz.ratio)
    if not best:
        return None

    name, score = best[0], best[1]
    return name if score > 90 else None

# ✅ 메타데이터 enrich (data_list.csv 연동)
def enrich_metadata(meta: dict, data_df: pd.DataFrame) -> dict:
    """Copy announcement attributes from ``data_df`` into ``meta``.

    The row is selected by comparing ``meta["filename"]`` with the ``파일명``
    column (both stripped of surrounding whitespace). When several rows match,
    the first one is used; when none match, ``meta`` is returned untouched.

    Args:
        meta: Metadata dict containing at least a ``"filename"`` string.
            It is updated in place.
        data_df: Table with a ``파일명`` column and, optionally, the attribute
            columns listed below.

    Returns:
        The same ``meta`` object, for call-site convenience.

    Raises:
        KeyError: If ``meta`` has no ``"filename"`` key or ``data_df`` has no
            ``파일명`` column.
    """
    fname = meta["filename"].strip()
    matches = data_df[data_df["파일명"].str.strip() == fname]
    if matches.empty:
        return meta

    row = matches.iloc[0]
    for col in ("공고 번호", "사업명", "사업 금액", "발주 기관", "입찰 참여 마감일"):
        value = row.get(col, "")
        # Empty CSV cells surface as NaN/None. Normalize them to the same ""
        # used for absent columns so downstream text never shows "nan".
        if pd.api.types.is_scalar(value) and pd.isna(value):
            value = ""
        meta[col] = value
    return meta

# ✅ Data loading
# FAISS index that search_similar_chunks() queries.
index = faiss.read_index(VECTOR_INDEX)

# JSON metadata: entries are looked up by the ids FAISS returns, and each
# entry is read for its "filename" and "index" keys.
with open(VECTOR_METADATA, "r", encoding="utf-8") as f:
    vector_metadatas = json.load(f)

# Table of announcement attributes; enrich_metadata() matches rows on its
# "파일명" column.
data_list = pd.read_csv(DATA_LIST)

# ✅ Load chunk files (chunk records are later read for text/title/subtitle/index)
all_chunks = {}  # sanitized metadata filename -> list of parsed chunk dicts
loaded_files = set()  # sanitized names whose chunk file has been read
missing_files = set()  # sanitized names with no usable file on disk
# Index the directory once by sanitized name so lookups ignore punctuation,
# whitespace and the extension.
actual_files = {sanitize_filename(f): f for f in os.listdir(CHUNKS_DIR)}

for meta in vector_metadatas:
    filename = meta["filename"]
    sanitized = sanitize_filename(filename)

    # Many metadata entries share one document; resolve each document once,
    # whether it was found or not, so the fuzzy search and the warning below
    # are not repeated for every chunk of a missing file.
    if sanitized in loaded_files or sanitized in missing_files:
        continue

    file_match = actual_files.get(sanitized)
    if not file_match:
        # Fall back to a near-identical name when there is no exact match.
        closest = find_closest_filename(sanitized, actual_files.keys())
        if closest:
            file_match = actual_files[closest]
    if not file_match:
        missing_files.add(sanitized)
        print(f"❌ 파일 없음: {sanitized}.jsonl")
        continue

    with open(os.path.join(CHUNKS_DIR, file_match), "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of letting
        # json.loads() fail on them.
        all_chunks[sanitized] = [json.loads(line) for line in f if line.strip()]
    loaded_files.add(sanitized)

# ✅ 유사 청크 검색 함수 (context window + subtitle-aware)
def search_similar_chunks(query, top_k, context_window=1):
    """Retrieve the chunks nearest to ``query`` together with their neighbours.

    The query is embedded with OpenAI's ``text-embedding-3-small`` model and
    searched against the module-level FAISS ``index``. Every hit is mapped to
    its ``vector_metadatas`` entry, enriched from ``data_list``, and expanded
    to the chunks located ``context_window`` positions before and after it in
    the same document's chunk list.

    Args:
        query: Text to embed and search for.
        top_k: Number of nearest vectors requested from the index.
        context_window: Number of neighbouring chunks to include on each side
            of a hit.

    Returns:
        A list of ``{"text": ..., "metadata": ...}`` dicts in hit order. A
        chunk reachable from several hits is returned only once. Hits whose
        document has no loaded chunks are reported on stdout and skipped.
    """
    embedding_reply = openai.embeddings.create(input=[query], model="text-embedding-3-small")
    query_vector = np.array(embedding_reply.data[0].embedding).astype("float32")

    _scores, neighbor_rows = index.search(query_vector.reshape(1, -1), top_k)

    collected = []
    emitted = set()  # (document key, chunk position) pairs already returned
    for row_id in neighbor_rows[0]:
        # Ignore ids that do not correspond to a metadata entry.
        if not 0 <= row_id < len(vector_metadatas):
            continue

        meta = enrich_metadata(vector_metadatas[row_id], data_list)
        sanitized = sanitize_filename(meta["filename"])
        base_idx = meta["index"]

        if sanitized not in all_chunks:
            print(f"❌ 청크 로딩 실패: {sanitized}")
            continue

        chunk_list = all_chunks[sanitized]

        for delta in range(-context_window, context_window + 1):
            position = base_idx + delta
            if position < 0 or position >= len(chunk_list):
                continue
            key = (sanitized, position)
            if key in emitted:
                continue
            emitted.add(key)

            chunk = chunk_list[position]
            text = chunk["text"]
            # Chunk-level fields override the same-named keys from the hit.
            merged = dict(meta)
            merged.update(
                title=chunk["title"],
                subtitle=chunk["subtitle"],
                index=chunk["index"],
            )
            collected.append({"text": text, "metadata": merged})
    return collected

# ✅ GPT 답변 생성 (title/subtitle + 메타데이터 포함)
def generate_answer(query, chunks):
    """Ask the chat model to answer ``query`` using the retrieved ``chunks``.

    Each chunk is rendered as a document-information header followed by its
    body text; the rendered chunks are concatenated and embedded in a single
    user message sent to ``gpt-4o`` at temperature 0.2.

    Args:
        query: The user's question; inserted verbatim into the prompt.
        chunks: Iterable of dicts with a ``"text"`` value and a ``"metadata"``
            dict. Metadata keys that are absent are rendered as empty strings.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the API returns no text content.
    """
    sections = []
    for c in chunks:
        m = c["metadata"]
        section = f"""
📄 [문서 정보]
- 제목: {m.get('title', '')}
- 소제목: {m.get('subtitle', '')}
- 공고번호: {m.get('공고 번호', '')}
- 발주기관: {m.get('발주 기관', '')}
- 사업명: {m.get('사업명', '')}
- 예산: {m.get('사업 금액', '')}
- 마감일: {m.get('입찰 참여 마감일', '')}

📑 [본문]
{c['text']}

""".strip()
        sections.append(section + "\n\n")
    # Join once instead of growing a string inside the loop.
    context = "".join(sections)

    prompt = f"""다음은 사용자의 질문과 관련된 문서 내용입니다.

[질문]
{query}

[문서]
{context}

위 문서들을 참고하여 질문에 대해 명확하고 간결하게 답변하세요."""

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2
    )
    # message.content is optional in the SDK; treat a missing body as empty
    # text rather than failing on None.strip().
    content = response.choices[0].message.content
    return (content or "").strip()

# ✅ Example run
if __name__ == "__main__":
    query = "한국연구재단에서는 어떤 내용을 보냈어?"
    # Ask for the 20 nearest vectors and include one neighbouring chunk on
    # each side of every hit.
    results = search_similar_chunks(query, top_k=20, context_window=1)

    print(f"🔍 검색된 청크 수: {len(results)}")
    for i, r in enumerate(results):
        m = r["metadata"]
        print(f"\n🔹 {i+1}. {m['filename']} | {m['title']} > {m['subtitle']} | idx: {m['index']}")
        # NOTE(review): this prints the full chunk text followed by "...";
        # if a shortened preview was intended, the text is not truncated here.
        print(r['text'], "...")

    # Send the question and every retrieved chunk to the chat model.
    print("\n🧠 GPT 응답:")
    print(generate_answer(query, results))
