In [1]:
# function.py
import requests
import re
import os
from dotenv import load_dotenv
from langchain.tools import tool
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Optional, Tuple
from langchain_openai import ChatOpenAI


# === Load .env ===
# Read variables from a local .env file into the environment, then fetch the
# Naver API credentials (os.getenv yields None for any variable that is unset).
load_dotenv()
naver_client_id = os.getenv("NAVER_CLIENT_ID")
naver_client_secret = os.getenv("NAVER_CLIENT_SECRET")


  from .autonotebook import tqdm as notebook_tqdm


# 네이버 쇼핑 API 함수

In [2]:
def price_tool(user_query: str) -> str:
    """A tool that uses the Naver Shopping API to look up perfume prices (results are returned as formatted strings)

    Args:
        user_query: Search text sent as the ``query`` parameter of the API call.

    Returns:
        A formatted string listing up to three products (title, price when a
        positive price is available, seller, link). When the request fails,
        the HTTP status is not 200, the body is not valid JSON, or there are
        no items, a short message string is returned instead; these failures
        are reported through the return value rather than raised.
    """
    url = "https://openapi.naver.com/v1/search/shop.json"
    headers = {
        "X-Naver-Client-Id": naver_client_id,
        "X-Naver-Client-Secret": naver_client_secret
    }
    params = {"query": user_query, "display": 5, "sort": "sim"}

    try:
        # Without a timeout a stalled connection would block the caller indefinitely.
        response = requests.get(url, headers=headers, params=params, timeout=10)
    except Exception as e:
        return f"❌ 요청 오류: {e}"

    if response.status_code != 200:
        return f"❌ API 오류: {response.status_code}"

    try:
        data = response.json()
    except ValueError as e:
        # A 200 response with a non-JSON body should not crash the tool.
        return f"❌ 응답 파싱 오류: {e}"

    # Treat a missing, null, or empty "items" field (or a non-dict payload) as "no results".
    items = data.get("items") if isinstance(data, dict) else None
    if not items:
        return f"😔 '{user_query}'에 대한 검색 결과가 없습니다."

    # Strip HTML tags from product titles.
    def remove_html_tags(text: str) -> str:
        return re.sub(r"<[^>]+>", "", text)

    # Only the first three items are reported.
    parts = [f"🔍 '{user_query}' 검색 결과:\n\n"]
    for i, item in enumerate(items[:3], 1):
        title = remove_html_tags(item.get("title", ""))
        mall = item.get("mallName", "정보 없음")
        link = item.get("link", "정보 없음")

        # The price is parsed defensively: an empty or non-numeric value is
        # handled like "0" (the price line is simply omitted).
        try:
            price = int(item.get("lprice", "0"))
        except (TypeError, ValueError):
            price = 0

        parts.append(f"📦 {i}. {title}\n")
        if price > 0:
            parts.append(f"   💰 가격: {price:,}원\n")
        parts.append(f"   🏪 판매처: {mall}\n")
        parts.append(f"   🔗 링크: {link}\n\n")

    return "".join(parts)


In [3]:
price_tool("샤넬 NO5 향수 가격 ")  # function test (performs a live API request)

"🔍 '샤넬 NO5 향수 가격 ' 검색 결과:\n\n📦 1. [국내백화점/선물포장] 샤넬 넘버5 NO5 오드빠르펭 여성 향수 35ml\n   💰 가격: 141,000원\n   🏪 판매처: 라이크컴퍼니\n   🔗 링크: https://smartstore.naver.com/main/products/11549340601\n\n📦 2. CHANEL No.5 오 드 퍼퓸 플로럴향, 50ml, 1개\n   💰 가격: 130,000원\n   🏪 판매처: 네이버\n   🔗 링크: https://search.shopping.naver.com/catalog/53015716331\n\n📦 3. [국내백화점/선물포장] 샤넬 넘버5 NO5 LEAU 로 오 드 뚜왈렛 여성 향수 50ml\n   💰 가격: 206,000원\n   🏪 판매처: 라이크컴퍼니\n   🔗 링크: https://smartstore.naver.com/main/products/11549357996\n\n"

# 메타데이터 필터링 함수

In [5]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from pinecone import Pinecone

# -------------------------------
# 1. Load environment variables & initialise clients
# -------------------------------
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

client = OpenAI(api_key=OPENAI_API_KEY)       # ✅ OpenAI client
pc = Pinecone(api_key=PINECONE_API_KEY)       # ✅ Pinecone client

# The host below is a placeholder ("xxxxx"); it must be replaced with the value shown in the Pinecone console.
index = pc.Index(
    "perfume-vectordb",
    host="https://perfume-vectordb-xxxxx.svc.us-east1-gcp.pinecone.io"
)


In [6]:
# -------------------------------
def get_embedding(text: str):
    """Return the embedding of ``text`` produced by the OpenAI embeddings endpoint.

    The request goes through the module-level ``client``; the ``embedding``
    attribute of the first item in the response is returned.
    """
    embedding_response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002",  # dimension = 1536
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

# -------------------------------
# 3. 브랜드 필터 함수
# -------------------------------
# Brand names recognised in user queries; earlier entries win when several match.
BRAND_LIST = [
    '겔랑', '구찌', '끌로에', '나르시소 로드리게즈', '니샤네', '도르세', '디올', '딥티크', '랑콤',
    '로라 메르시에', '로에베', '록시땅', '르 라보', '메모', '메종 마르지엘라', '메종 프란시스 커정',
    '멜린앤게츠', '미우미우', '바이레도', '반클리프 아펠', '버버리', '베르사체', '불가리', '비디케이',
    '산타 마리아 노벨라', '샤넬', '세르주 루텐', '시슬리 코스메틱', '아쿠아 디 파르마', '에따 리브르 도량쥬',
    '에르메스', '에스티 로더', '엑스 니힐로', '이니시오 퍼퓸', '이솝', '입생로랑', '제르조프', '조 말론',
    '조르지오 아르마니', '줄리엣 헤즈 어 건', '지방시', '질 스튜어트', '크리드', '킬리안', '톰 포드',
    '티파니앤코', '퍼퓸 드 말리', '펜할리곤스', '프라다', '프레데릭 말'
]


def brand_tool(user_query: str):
    """Build a metadata filter for the first known brand found in the query.

    Brands are checked in ``BRAND_LIST`` order using plain substring matching.

    Returns:
        ``{"brand": {"$eq": <brand>}}`` for the first match, or ``None`` when
        no listed brand occurs in ``user_query``.
    """
    matched_brand = next(
        (candidate for candidate in BRAND_LIST if candidate in user_query),
        None,
    )
    if matched_brand is None:
        return None
    return {"brand": {"$eq": matched_brand}}

# -------------------------------
# 4. Run the query
# -------------------------------
query = "샤넬 플로럴 향수 추천해줘"

# (1) Query -> embedding
vector = get_embedding(query)

# (2) Extract the brand filter (None when no listed brand appears in the query)
brand_filter = brand_tool(query)

# (3) Pinecone search
result = index.query(
    vector=vector,
    top_k=5,
    include_metadata=True,
    filter=brand_filter
)

# (4) Print the results
print("=== 검색 결과 ===")
for match in result["matches"]:
    meta = match["metadata"]
    print(f"브랜드: {meta.get('brand')} | 이름: {meta.get('name')} | 점수: {match['score']:.4f}")

ProtocolError: Failed to connect to /query

# 판다스로 메타데이터 값 목록 추출

In [13]:
import pandas as pd

# Load the CSV
df = pd.read_csv("perfume_final.csv")

# Sorted unique values of one column (duplicates removed).
# NOTE(review): the variable name and the printed labels say "brand", but the
# column read here is "sizes" — presumably this cell was reused to inspect
# several metadata columns; confirm which column is intended.
brands = sorted(df["sizes"].unique().tolist())

print(f"브랜드 개수: {len(brands)}")
print("all:", brands[:])


브랜드 개수: 80
all: ['[10, 100]', '[10, 50]', '[100, 10]', '[100, 15]', '[100, 20]', '[100, 30]', '[100, 50]', '[100]', '[10]', '[125]', '[15, 50, 100, 30]', '[150]', '[15]', '[20, 100]', '[20]', '[25, 50]', '[25]', '[30, 100, 10]', '[30, 100]', '[30, 150]', '[30, 15]', '[30, 50, 100, 10]', '[30, 50, 100, 15]', '[30, 50, 100]', '[30, 50, 10]', '[30, 50, 15]', '[30, 50, 75]', '[30, 50, 85, 15]', '[30, 50, 85]', '[30, 50, 90]', '[30, 50]', '[30, 75, 100, 150]', '[30, 75, 100]', '[30, 75]', '[30, 85, 15]', '[30, 85]', '[30]', '[35, 50, 100, 150]', '[35, 50, 100]', '[35, 50, 200]', '[35, 50]', '[35, 70]', '[35]', '[3]', '[40, 60]', '[50, 100, 10, 30]', '[50, 100, 10]', '[50, 100, 15, 30]', '[50, 100, 150]', '[50, 100, 15]', '[50, 100, 30, 15]', '[50, 100, 30]', '[50, 100, 8]', '[50, 100]', '[50, 10]', '[50, 150]', '[50, 15]', '[50, 200]', '[50, 25]', '[50, 30]', '[50, 75, 125]', '[50, 75]', '[50, 80]', '[50, 90, 150]', '[50, 90]', '[50]', '[60, 100]', '[60]', '[67]', '[70, 35]', '[70]', '[75, 

In [3]:

# -------------------------------
def get_embedding(text: str):
    """Return the embedding of ``text`` produced by the OpenAI embeddings endpoint.

    The request goes through the module-level ``client``; the ``embedding``
    attribute of the first item in the response is returned.
    """
    embedding_response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002",  # dimension = 1536
    )
    first_item = embedding_response.data[0]
    return first_item.embedding

# -------------------------------
# 3. 브랜드 필터 함수
# -------------------------------
# Brand names recognised in user queries; earlier entries win when several match.
BRAND_LIST = [
    '겔랑', '구찌', '끌로에', '나르시소 로드리게즈', '니샤네', '도르세', '디올', '딥티크', '랑콤',
    '로라 메르시에', '로에베', '록시땅', '르 라보', '메모', '메종 마르지엘라', '메종 프란시스 커정',
    '멜린앤게츠', '미우미우', '바이레도', '반클리프 아펠', '버버리', '베르사체', '불가리', '비디케이',
    '산타 마리아 노벨라', '샤넬', '세르주 루텐', '시슬리 코스메틱', '아쿠아 디 파르마', '에따 리브르 도량쥬',
    '에르메스', '에스티 로더', '엑스 니힐로', '이니시오 퍼퓸', '이솝', '입생로랑', '제르조프', '조 말론',
    '조르지오 아르마니', '줄리엣 헤즈 어 건', '지방시', '질 스튜어트', '크리드', '킬리안', '톰 포드',
    '티파니앤코', '퍼퓸 드 말리', '펜할리곤스', '프라다', '프레데릭 말'
]


def brand_tool(user_query: str):
    """Build a metadata filter for the first known brand found in the query.

    Brands are checked in ``BRAND_LIST`` order using plain substring matching.

    Returns:
        ``{"brand": {"$eq": <brand>}}`` for the first match, or ``None`` when
        no listed brand occurs in ``user_query``.
    """
    matched_brand = next(
        (candidate for candidate in BRAND_LIST if candidate in user_query),
        None,
    )
    if matched_brand is None:
        return None
    return {"brand": {"$eq": matched_brand}}

# -------------------------------
# 4. Run the query
# -------------------------------
query = "샤넬 플로럴 향수 추천해줘"

# (1) Query -> embedding (get_embedding reads the module-level `client`, so it must already exist)
vector = get_embedding(query)

# (2) Extract the brand filter (None when no listed brand appears in the query)
brand_filter = brand_tool(query)

# (3) Pinecone search
result = index.query(
    vector=vector,
    top_k=5,
    include_metadata=True,
    filter=brand_filter
)

# (4) Print the results
print("=== 검색 결과 ===")
for match in result["matches"]:
    meta = match["metadata"]
    print(f"브랜드: {meta.get('brand')} | 이름: {meta.get('name')} | 점수: {match['score']:.4f}")


NameError: name 'client' is not defined

In [None]:

# -------------------------------
# 1) best_season 필터 툴 함수
# -------------------------------
def best_season_tool(user_query: str):
    """Build a metadata filter on ``best_season`` from season words in the query.

    This function was originally defined under the name ``brand_tool``. That
    name overwrote the brand filter of the same name and left
    ``best_season_tool`` — the name ``search_perfumes`` calls — undefined, so
    it is defined here under the name its caller uses.

    Args:
        user_query: Free-text user query; matching is case-insensitive.

    Returns:
        ``{"best_season": {"$eq": <season>}}`` for the first keyword (in the
        order listed below) found in the query, or ``None`` when the query
        mentions no season.
    """
    # keyword found in the query -> value stored in the "best_season" metadata field
    season_map = {
        "봄": "봄",
        "여름": "여름",
        "가을": "가을",
        "autumn": "가을",
        "겨울": "겨울"
    }
    lowered_query = user_query.lower()  # lower-case once instead of once per keyword
    for keyword, season_val in season_map.items():
        if keyword in lowered_query:
            return {"best_season": {"$eq": season_val}}
    return None

# -------------------------------
# 2) 검색 함수
# -------------------------------
def search_perfumes(user_query: str, top_k: int = 5):
    """Search the Pinecone index for perfumes matching a free-text query.

    The query is embedded with the module-level ``model`` and sent to the
    module-level ``index``; when ``best_season_tool`` finds a season in the
    query, the search is restricted by that metadata filter.

    NOTE(review): neither ``model`` nor ``best_season_tool`` is defined in the
    visible notebook cells — confirm both exist before this cell is run.

    Args:
        user_query: Free-text user query.
        top_k: Number of matches requested from the index.

    Returns:
        A list of dicts, one per match, holding the match score and selected
        metadata fields (missing fields default to ``""``; ``price`` to ``0``).
    """
    # (1) Optional season filter derived from the query text
    season_filter = best_season_tool(user_query)

    # (2) Embed the query
    query_vector = model.encode(user_query).tolist()

    # (3) Pinecone search
    response = index.query(
        vector=query_vector,
        top_k=top_k,
        include_metadata=True,
        filter=season_filter or None,
    )

    # (4) Flatten every match into a plain dict
    def _as_record(match):
        meta = match["metadata"]
        return {
            "score": match["score"],
            "brand": meta.get("brand", ""),
            "name": meta.get("name", ""),
            "eng_name": meta.get("eng_name", ""),
            "best_season": meta.get("best_season", ""),
            "best_time": meta.get("best_time", ""),
            "gender": meta.get("gender", ""),
            "concentration": meta.get("concentration", ""),
            "price": meta.get("price_krw", 0),
            "detail_url": meta.get("detail_url", ""),
        }

    return [_as_record(match) for match in response["matches"]]

# -------------------------------
# 3) Test run
# -------------------------------
if __name__ == "__main__":
    queries = [
        "겨울용 향수 추천해줘",
        "여름에 어울리는 향수 보여줘",
        "남성용 향수 추천해줘"  # no season keyword -> search without a filter
    ]
    
    # Print the top three matches for each sample query.
    for q in queries:
        print(f"\n🔍 Query: {q}")
        results = search_perfumes(q, top_k=3)
        for r in results:
            print(f"- {r['brand']} {r['name']} ({r['eng_name']}) "
                  f"[시즌:{r['best_season']} | 시간:{r['best_time']} | 성별:{r['gender']}] "
                  f"💰 {r['price']}원")
            print(f"  🔗 {r['detail_url']}")


# ML로직 함수

In [4]:
# ============================================
# 저장된 VotingClassifier (.pkl) 불러오기 + 예측
# ============================================

import joblib
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# -------------------------------
# Settings
# -------------------------------
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
MAX_LEN = 256
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[Device] {device}")

# -------------------------------
# 1) Load the saved pkl
# -------------------------------
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
SAVE_PKL = "./models.pkl"
data = joblib.load(SAVE_PKL)

# The pickle is a dict holding the classifier, the label binarizer and per-label thresholds.
clf = data["classifier"]
mlb = data["mlb"]
thresholds = data["thresholds"]

print(f"[Loaded model from {SAVE_PKL}]")
print(f"Labels: {list(mlb.classes_)}")

# -------------------------------
# 2) Load MiniLM (used to extract embeddings)
# -------------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
base_model.eval()

def encode_texts(texts, batch_size=32):
    """Convert texts into MiniLM embeddings.

    Texts are tokenized in slices of ``batch_size`` (padded, truncated to
    ``MAX_LEN``), run through ``base_model`` without gradients, and each
    text's embedding is the plain mean of ``last_hidden_state`` over dim 1
    (no attention-mask weighting is applied).

    Returns:
        The per-batch NumPy arrays stacked vertically with ``np.vstack``.
    """
    batch_vectors = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            hidden_states = base_model(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1)
        batch_vectors.append(pooled.cpu().numpy())
    return np.vstack(batch_vectors)

# -------------------------------
# 3) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    """Predict fragrance labels for a single text.

    Args:
        text: Input text to classify.
        topk: Number of highest-probability labels used when no threshold
            applies.
        thresholds: Optional mapping of label -> minimum probability (labels
            missing from it use 0.5). When given, every label at or above its
            threshold is returned; if none qualifies, the ``topk`` fallback
            is used instead.

    Returns:
        A list of entries from ``mlb.classes_``.
    """
    features = encode_texts([text], batch_size=1)
    probabilities = clf.predict_proba(features)[0]

    if thresholds is None:
        chosen = np.argsort(-probabilities)[:topk]
    else:
        chosen = [
            idx
            for idx, prob in enumerate(probabilities)
            if prob >= thresholds.get(mlb.classes_[idx], 0.5)
        ]
        if not chosen:  # nothing cleared its threshold -> fall back to top-k
            chosen = np.argsort(-probabilities)[:topk]

    return [mlb.classes_[idx] for idx in chosen]


[Device] cpu


  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.
  setstate(state)
  setstate(state)
  setstate(state)
  setstate(state)


[Loaded model from ./models.pkl]
Labels: ['Amber', 'Aromatic', 'Blossom', 'Bouquet', 'Citrus', 'Classical', 'Crisp', 'Dry', 'Floral', 'Flower', 'Fougère', 'Fresh', 'Fresher', 'Fruity', 'Gourmand', 'Green', 'Iris', 'Jasmine', 'Lily', 'Mossy', 'Musk', 'Orange', 'Rich', 'Richer', 'Rose', 'Soft', 'Spicy', 'Tuberose', 'Valley', 'Violet', 'Water', 'White', 'Woods', 'Woody']


In [6]:
# Check the labels stored with the saved model
print("=== 전체 라벨 목록 ===")
for i, label in enumerate(mlb.classes_):
    print(f"{i}: {label}")


=== 전체 라벨 목록 ===
0: Amber
1: Aromatic
2: Blossom
3: Bouquet
4: Citrus
5: Classical
6: Crisp
7: Dry
8: Floral
9: Flower
10: Fougère
11: Fresh
12: Fresher
13: Fruity
14: Gourmand
15: Green
16: Iris
17: Jasmine
18: Lily
19: Mossy
20: Musk
21: Orange
22: Rich
23: Richer
24: Rose
25: Soft
26: Spicy
27: Tuberose
28: Valley
29: Violet
30: Water
31: White
32: Woods
33: Woody


In [5]:

# -------------------------------
# 4) Run a prediction
# -------------------------------
example_text = "여자친구 달달한향좋아하는데 추천좀"
print("\n[Example Prediction]")
print(predict_multilingual(example_text, topk=3, thresholds=thresholds))


[Example Prediction]
['Amber', 'Floral', 'Fresher', 'Fruity']


# BM25 서치

In [9]:
import json
import joblib
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi

# -------------------------------
# 1) Load the model / labels
# -------------------------------
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
SAVE_PKL = "./models.pkl"
MAX_LEN = 256
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files from a trusted source.
data = joblib.load(SAVE_PKL)
clf = data["classifier"]
mlb = data["mlb"]
thresholds = data["thresholds"]

# Tokenizer and encoder used to turn text into embeddings for the classifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModel.from_pretrained(MODEL_NAME).to(device)
base_model.eval()

# -------------------------------
# 2) 텍스트 인코딩
# -------------------------------
def encode_texts(texts, batch_size=32):
    """Embed texts with the MiniLM encoder.

    Texts are tokenized in slices of ``batch_size`` (padded, truncated to
    ``MAX_LEN``), run through ``base_model`` without gradients, and each
    text's embedding is the plain mean of ``last_hidden_state`` over dim 1
    (no attention-mask weighting is applied).

    Returns:
        The per-batch NumPy arrays stacked vertically with ``np.vstack``.
    """
    batch_vectors = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(
            chunk,
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt",
        ).to(device)
        with torch.no_grad():
            hidden_states = base_model(**encoded).last_hidden_state
            pooled = hidden_states.mean(dim=1)
        batch_vectors.append(pooled.cpu().numpy())
    return np.vstack(batch_vectors)

# -------------------------------
# 3) 예측 함수
# -------------------------------
def predict_multilingual(text: str, topk=3, thresholds=None):
    """Predict fragrance labels for a single text.

    Args:
        text: Input text to classify.
        topk: Number of highest-probability labels used when no threshold
            applies.
        thresholds: Optional mapping of label -> minimum probability (labels
            missing from it use 0.5). When given, every label at or above its
            threshold is returned; if none qualifies, the ``topk`` fallback
            is used instead.

    Returns:
        A list of entries from ``mlb.classes_``.
    """
    features = encode_texts([text], batch_size=1)
    probabilities = clf.predict_proba(features)[0]

    if thresholds is None:
        chosen = np.argsort(-probabilities)[:topk]
    else:
        chosen = [
            idx
            for idx, prob in enumerate(probabilities)
            if prob >= thresholds.get(mlb.classes_[idx], 0.5)
        ]
        if not chosen:  # nothing cleared its threshold -> fall back to top-k
            chosen = np.argsort(-probabilities)[:topk]

    return [mlb.classes_[idx] for idx in chosen]

# -------------------------------
# 4) Prepare the BM25 index
# -------------------------------
with open("perfumes.json", "r", encoding="utf-8") as f:
    perfumes = json.load(f)

# One document per perfume: its "fragrances" text, lower-cased and split on whitespace.
corpus = [item.get("fragrances", "") for item in perfumes]
tokenized_corpus = [doc.lower().split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# -------------------------------
# 5) ML output -> BM25 search
# -------------------------------
user_text = "시트러스하고 프루티한 향수 추천해줘"  # example user input
ml_output = predict_multilingual(user_text, topk=4, thresholds=thresholds)

print("=== ML 예측 라벨 ===")
print(ml_output)

# Turn the predicted labels into a BM25 query
query = " ".join(ml_output)
tokenized_query = query.lower().split()
scores = bm25.get_scores(tokenized_query)

# -------------------------------
# 6) Print the top-N perfumes
# -------------------------------
top_n = 5
# argsort is ascending: take the last top_n indices and reverse them for best-first order.
top_idx = np.argsort(scores)[-top_n:][::-1]

print("\n=== BM25 Top-N 결과 ===")
for rank, idx in enumerate(top_idx, 1):
    perfume = perfumes[idx]
    print(f"[Rank {rank}] Score: {scores[idx]:.2f}")
    print(f"  Brand      : {perfume['brand']}")
    print(f"  Name       : {perfume['name_perfume']}")
    print(f"  Fragrances : {perfume['fragrances']}")
    print()


  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.
  setstate(state)
  setstate(state)
  setstate(state)
  setstate(state)


=== ML 예측 라벨 ===
['Amber', 'Fresher']

=== BM25 Top-N 결과 ===
[Rank 1] Score: 2.45
  Brand      : Tabac
  Name       : Tabac Man Fire Power
  Fragrances : Amber Fresher

[Rank 2] Score: 2.45
  Brand      : Zadig & Voltaire
  Name       : This Is Him! Undressed
  Fragrances : Amber Fresher

[Rank 3] Score: 2.45
  Brand      : Guerlain
  Name       : Habit Rouge Sport
  Fragrances : Amber Fresher

[Rank 4] Score: 2.45
  Brand      : Jardin De France
  Name       : Urban Tweed
  Fragrances : Amber Fresher

[Rank 5] Score: 2.45
  Brand      : O Boticário
  Name       : Egeo Beat
  Fragrances : Amber Fresher



In [14]:
import json
import joblib
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from rank_bm25 import BM25Okapi
from typing import List, Dict, Tuple, Optional


class PerfumeRecommender:
    """Perfume recommendation pipeline: label classifier followed by BM25 lookup.

    A user text is embedded with a transformer encoder, a saved classifier
    predicts fragrance labels from the embedding, and the labels are used as a
    BM25 query over the ``fragrances`` text of every perfume in a JSON file.
    """

    def __init__(self,
                 model_pkl_path: str = "./models.pkl",
                 perfume_json_path: str = "perfumes.json",
                 model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
                 max_len: int = 256):
        """Load the classifier, the encoder and the perfume data, then build the index.

        Args:
            model_pkl_path: Path of the joblib file holding the classifier bundle.
            perfume_json_path: Path of the JSON file with the perfume records.
            model_name: Hugging Face model id of the text encoder.
            max_len: Maximum token length used when tokenizing.
        """
        self.model_name = model_name
        self.max_len = max_len
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"[Device] {self.device}")

        # Load models and data
        self._load_ml_model(model_pkl_path)
        self._load_transformer_model()
        self._load_perfume_data(perfume_json_path)
        self._build_bm25_index()

    def _load_ml_model(self, pkl_path: str):
        """Load the saved classifier, label binarizer and thresholds."""
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code — only load model files from a trusted source.
        data = joblib.load(pkl_path)
        self.clf = data["classifier"]
        self.mlb = data["mlb"]
        self.thresholds = data["thresholds"]

        print(f"[Loaded model from {pkl_path}]")
        print(f"Labels: {list(self.mlb.classes_)}")

    def _load_transformer_model(self):
        """Load the tokenizer and encoder and switch the encoder to eval mode."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.base_model = AutoModel.from_pretrained(self.model_name).to(self.device)
        self.base_model.eval()

    def _load_perfume_data(self, json_path: str):
        """Load the perfume records from a UTF-8 JSON file."""
        with open(json_path, "r", encoding="utf-8") as f:
            self.perfumes = json.load(f)
        print(f"[Loaded {len(self.perfumes)} perfumes from {json_path}]")

    def _build_bm25_index(self):
        """Build the BM25 index over each perfume's ``fragrances`` text."""
        # `or ""` also covers records whose "fragrances" value is null, which
        # would otherwise fail on `.lower()`.
        self.corpus = [item.get("fragrances") or "" for item in self.perfumes]
        tokenized_corpus = [doc.lower().split() for doc in self.corpus]
        self.bm25 = BM25Okapi(tokenized_corpus)
        print("[BM25 index built]")

    def encode_texts(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Convert texts into embeddings.

        Each batch is tokenized (padded, truncated to ``max_len``) and run
        through the encoder without gradients; an embedding is the plain mean
        of ``last_hidden_state`` over dim 1 (no attention-mask weighting).

        Returns:
            The per-batch arrays stacked vertically with ``np.vstack``.
        """
        all_embeddings = []

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            enc = self.tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=self.max_len,
                return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                model_out = self.base_model(**enc)
                emb = model_out.last_hidden_state.mean(dim=1)

            all_embeddings.append(emb.cpu().numpy())

        return np.vstack(all_embeddings)

    def predict_labels(self, text: str, topk: int = 3, use_thresholds: bool = True) -> List[str]:
        """Predict fragrance labels for a text.

        With ``use_thresholds`` (and stored thresholds), every label whose
        probability reaches its threshold (0.5 when the label has none) is
        returned; if no label qualifies, or thresholds are not used, the
        ``topk`` most probable labels are returned.
        """
        emb = self.encode_texts([text], batch_size=1)
        proba = self.clf.predict_proba(emb)[0]

        if use_thresholds and self.thresholds:
            # Threshold-based selection
            pick = [
                i for i, p in enumerate(proba)
                if p >= self.thresholds.get(self.mlb.classes_[i], 0.5)
            ]
            # Nothing cleared its threshold -> fall back to top-k
            if not pick:
                pick = np.argsort(-proba)[:topk]
        else:
            # Top-k selection
            pick = np.argsort(-proba)[:topk]

        return [self.mlb.classes_[i] for i in pick]

    def search_perfumes(self, labels: List[str], top_n: int = 5) -> List[Tuple[int, float, Dict]]:
        """Search perfumes with BM25 using the labels as the query.

        Args:
            labels: Labels joined with spaces, lower-cased and split into
                query tokens.
            top_n: Maximum number of results; values <= 0 yield an empty list.

        Returns:
            ``(index, score, perfume)`` tuples, best score first, where
            ``index`` and ``score`` are built-in ``int``/``float`` values.
        """
        # A slice of [-0:] would select the whole corpus (and a negative top_n
        # an arbitrary tail), so non-positive requests return nothing.
        if top_n <= 0:
            return []

        query = " ".join(labels)
        tokenized_query = query.lower().split()
        scores = self.bm25.get_scores(tokenized_query)

        # argsort is ascending: keep the last top_n indices, best first.
        top_idx = np.argsort(scores)[-top_n:][::-1]

        results = []
        for raw_idx in top_idx:
            idx = int(raw_idx)  # built-in types match the annotation and serialize cleanly
            results.append((idx, float(scores[idx]), self.perfumes[idx]))

        return results

    def recommend(self,
                  user_text: str,
                  topk_labels: int = 4,
                  top_n_perfumes: int = 5,
                  use_thresholds: bool = True,
                  verbose: bool = True) -> Dict:
        """Run the full pipeline: predict labels, then search perfumes.

        Args:
            user_text: Free-text user request.
            topk_labels: ``topk`` passed to ``predict_labels``.
            top_n_perfumes: ``top_n`` passed to ``search_perfumes``.
            use_thresholds: Whether label thresholds are applied.
            verbose: When true, print the labels and the ranked results.

        Returns:
            A dict with ``user_input``, ``predicted_labels`` and
            ``recommendations`` (rank, score, brand, name, fragrances and the
            full perfume record for each hit).
        """
        # 1. Predict labels with the ML model
        predicted_labels = self.predict_labels(
            user_text,
            topk=topk_labels,
            use_thresholds=use_thresholds
        )

        # 2. Search perfumes with BM25
        search_results = self.search_perfumes(predicted_labels, top_n=top_n_perfumes)

        if verbose:
            print("=== ML 예측 라벨 ===")
            print(predicted_labels)
            print(f"\n=== BM25 Top-{top_n_perfumes} 결과 ===")

            for rank, (_, score, perfume) in enumerate(search_results, 1):
                print(f"[Rank {rank}] Score: {score:.2f}")
                print(f"  Brand      : {perfume.get('brand', 'N/A')}")
                print(f"  Name       : {perfume.get('name_perfume', 'N/A')}")
                print(f"  Fragrances : {perfume.get('fragrances', 'N/A')}")
                print()

        return {
            "user_input": user_text,
            "predicted_labels": predicted_labels,
            "recommendations": [
                {
                    "rank": rank,
                    "score": score,
                    "brand": perfume.get('brand', 'N/A'),
                    "name": perfume.get('name_perfume', 'N/A'),
                    "fragrances": perfume.get('fragrances', 'N/A'),
                    "perfume_data": perfume
                }
                for rank, (_, score, perfume) in enumerate(search_results, 1)
            ]
        }


# 사용 예시
def main():
    """Demo: build a recommender and print recommendations for sample inputs."""
    # Initialise the recommendation system
    recommender = PerfumeRecommender()

    # Sample user inputs
    sample_inputs = (
        "시트러스하고 프루티한 향수 추천해줘",
        "로맨틱하고 플로랄한 향 원해",
        "우디하고 스파이시한 향수",
        "깔끔하고 상쾌한 향",
    )
    divider = "=" * 50

    for user_input in sample_inputs:
        print(f"\n{divider}")
        print(f"사용자 입력: {user_input}")
        print(divider)

        # Run the recommendation; verbose=True makes recommend() print the results
        recommender.recommend(
            user_text=user_input,
            topk_labels=4,
            top_n_perfumes=3,
            verbose=True,
        )


# Run the demo only when this code executes as the top-level module.
if __name__ == "__main__":
    main()

[Device] cpu


  Loading from a raw memory buffer (like pickle in Python, RDS in R) on a CPU-only
  machine. Consider using `save_model/load_model` instead. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.  Changing `tree_method` to `hist`.
  setstate(state)
  setstate(state)
  setstate(state)
  setstate(state)


[Loaded model from ./models.pkl]
Labels: ['Amber', 'Aromatic', 'Blossom', 'Bouquet', 'Citrus', 'Classical', 'Crisp', 'Dry', 'Floral', 'Flower', 'Fougère', 'Fresh', 'Fresher', 'Fruity', 'Gourmand', 'Green', 'Iris', 'Jasmine', 'Lily', 'Mossy', 'Musk', 'Orange', 'Rich', 'Richer', 'Rose', 'Soft', 'Spicy', 'Tuberose', 'Valley', 'Violet', 'Water', 'White', 'Woods', 'Woody']
[Loaded 26319 perfumes from perfumes.json]
[BM25 index built]

사용자 입력: 시트러스하고 프루티한 향수 추천해줘
=== ML 예측 라벨 ===
['Amber', 'Floral', 'Fresher']

=== BM25 Top-3 결과 ===
[Rank 1] Score: 2.51
  Brand      : French Connection
  Name       : Fcuk Forever Him
  Fragrances : Floral Amber Fresher

[Rank 2] Score: 2.51
  Brand      : Fiorucci
  Name       : Wallstreet
  Fragrances : Floral Amber Fresher

[Rank 3] Score: 2.51
  Brand      : Victorio & Lucchino
  Name       : Hombre
  Fragrances : Floral Amber Fresher


사용자 입력: 로맨틱하고 플로랄한 향 원해
=== ML 예측 라벨 ===
['Amber', 'Floral', 'Fresher']

=== BM25 Top-3 결과 ===
[Rank 1] Score: 2.51
  Br

# 휴먼인더루프 노드 [추후 확장성을 고려해서 노드하나 만들어둠]

In [None]:
def human_fallback(state: dict) -> str:
    """Default reply asking the user to restate an unclear question.

    Args:
        state: Mapping whose ``"input"`` entry holds the user's question
            (an empty string is used when the key is absent).

    Returns:
        A three-line message quoting the question and asking for a more
        specific one.
    """
    user_question = state.get("input", "")
    message_lines = [
        f"❓ '{user_question}' 더 명확한 설명이 필요합니다.",
        "👉 질문을 구체적으로 다시 작성해 주세요.",
        "💡 또는 향수에 관한 멋진 질문을 해보시는 건 어떨까요?",
    ]
    return "\n".join(message_lines)

# FAQ or QNA 노드 [추후 확장성을 고려해서 노드하나 만들어둠]

In [None]:
def faq_or_qna_node(state: dict) -> str:
    """FAQ/QnA node: answer a perfume question with the LLM.

    The user's question (``state["input"]``, empty string when absent) is
    appended to a fixed perfume-expert instruction and sent through the
    module-level ``llm``; the ``content`` of the reply is returned.

    NOTE(review): ``llm`` is not defined in the visible notebook cells —
    confirm it is created before this node runs.
    """
    question = state.get("input", "")
    instruction = (
        "너는 세계적인 향수 전문가야. "
        "사용자가 묻는 향수 관련 질문에 대해 "
        "전문적이고 친절하게 설명해줘."
    )
    prompt = f"{instruction}\n\n질문: {question}"
    reply = llm.invoke(prompt)
    return reply.content