In [1]:
from tqdm import tqdm

import pandas as pd
import numpy as np

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import UnstructuredExcelLoader
from langchain_community.document_loaders import DataFrameLoader
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [2]:
class CFG:
    """Notebook-wide constants for building and querying the menu retrieval index."""

    # --- run identity ---
    # Seed value; not referenced by any of the visible cells.
    seed = 42
    # Version tag interpolated into the dataset file name and the saved artifact names.
    version = '5'

    # --- storage ---
    # store = "프랭크버거"
    # Directory that receives the FAISS index folder and the pickled documents.
    output_path = "/home/user09/beaver/data/db"
    # Left empty; a commented-out assignment after the class would derive it.
    save_path = ""

    # --- retrieval ---
    # Hugging Face model name passed to the embedding wrapper.
    embedding_model = "BAAI/bge-m3"
    # Number of hits requested from both the dense and the BM25 retriever.
    retriever_k = 5
    # Ensemble weight of the dense retriever; BM25 gets the remainder (1 - weight).
    retriever_bert_weight = 0.7
    
# CFG.save_path = CFG.output_path + CFG.store + "_temp.csv"

In [3]:
#### Turn the Excel menu file into a retrieval DB (FAISS index + pickled documents) ####
store = "홍콩반점"

# Imports used by this cell, grouped up front: stdlib first, then third-party.
import pickle

import pandas as pd
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever

# Derive the artifact locations once so that saving and loading can never
# drift apart when CFG.version or CFG.output_path is changed.
faiss_dir = f"{CFG.output_path}/{store}_faiss{CFG.version}"
docs_path = f"{CFG.output_path}/{store}_docs{CFG.version}.pkl"

# Load the Excel dataset for the configured version.
df_2 = pd.read_excel(f'/home/user09/beaver/data/dataset_v{CFG.version}.xlsx')

# Fail early with a readable message instead of a bare KeyError raised from
# inside the comprehension below.
required_columns = ['상품명', '종류', '옵션', '가격', '특징']
missing_columns = [col for col in required_columns if col not in df_2.columns]
if missing_columns:
    raise KeyError(f"dataset is missing required columns: {missing_columns}")

# Build one Document per row: only the product name ('상품명') is used as
# page_content (the text that gets embedded / indexed); the remaining
# columns are carried along as metadata.
docs = [
    Document(
        page_content=row['상품명'],
        metadata={
            '종류': row['종류'],
            '옵션': row['옵션'],
            '가격': row['가격'],
            '특징': row['특징']
        }
    )
    for _, row in df_2.iterrows()
]

# Embedding model settings: CPU inference, normalized output vectors.
encode_kwargs = {'normalize_embeddings': True}
model_kwargs = {'device': 'cpu'}

hf = HuggingFaceEmbeddings(
    model_name=CFG.embedding_model,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Build the FAISS vector store from the documents.
db = FAISS.from_documents(
    documents=docs,
    embedding=hf
)

# Persist the vector store.
db.save_local(faiss_dir)

# Persist the Document list next to it (BM25 is rebuilt from these documents).
with open(docs_path, "wb") as f:
    pickle.dump(docs, f)

# Reload the vector store from the exact folder written above.
# NOTE(security): allow_dangerous_deserialization opts in to pickle-based
# loading. That is acceptable here only because the folder was produced by
# this very cell; do not point this at an index from an untrusted source.
db = FAISS.load_local(
    folder_path=faiss_dir,
    embeddings=hf,
    allow_dangerous_deserialization=True
)

# Dense (embedding similarity) retriever.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": CFG.retriever_k}
)

# Sparse (BM25) retriever over the same documents, returning the same k.
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = CFG.retriever_k

# Ensemble of both retrievers: the dense retriever is weighted by
# CFG.retriever_bert_weight and BM25 receives the remaining weight.
ensemble_retriever = EnsembleRetriever(
    retrievers=[retriever, bm25_retriever],
    weights=[CFG.retriever_bert_weight, 1 - CFG.retriever_bert_weight],
)


  hf = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange
