In [1]:
!pip install langchain faiss-cpu sentence-transformers



In [3]:
!pip install python-dotenv



In [None]:
import os
import json
import torch
from pathlib import Path
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from dotenv import load_dotenv

# Load the .env file
load_dotenv()
# Value of HUGGINGFACEHUB_API_TOKEN, or None when the variable is not set.
# Nothing else in this cell reads `token`.
token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# 1. Load the data
def load_game_data(filepath: str) -> list:
    """Read a UTF-8 encoded JSON file and return the parsed content.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever the JSON document decodes to. The callers in this cell
        iterate over it as a list of game records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(filepath, mode="r", encoding="utf-8") as source:
        raw_text = source.read()
    return json.loads(raw_text)

# 2. Convert the records into a list of Document objects
def create_documents(game_data: list) -> list:
    """Wrap every game record in a ``Document``.

    Args:
        game_data: Iterable of records. Each record is indexed with the keys
            ``"text"``, ``"id"`` and ``"game_name"``.

    Returns:
        A list with one ``Document`` per record, in input order. The record's
        ``"text"`` becomes ``page_content``; ``"id"`` and ``"game_name"`` are
        stored in the metadata under ``"id"`` and ``"name"``.
    """
    documents = []
    for record in game_data:
        # Look up "text" first so key lookups happen in the same order as the
        # keyword arguments are written.
        text = record["text"]
        meta = {"id": record["id"], "name": record["game_name"]}
        documents.append(Document(page_content=text, metadata=meta))
    return documents

# 3. Load the embedding model (GPU/CPU selected automatically)
def create_embedding_model(device: str = None):
    """Build a ``HuggingFaceBgeEmbeddings`` instance for ``BAAI/bge-m3``.

    Args:
        device: Device string passed to the model. When falsy (``None`` or
            an empty string), ``"cuda"`` is used if
            ``torch.cuda.is_available()`` reports a GPU, otherwise ``"cpu"``.

    Returns:
        The embeddings object, configured with ``normalize_embeddings=True``.

    NOTE(review): the output recorded for this cell shows a 401 "Invalid
    credentials in Authorization header" while fetching BAAI/bge-m3. This
    function passes no token itself, so the credentials presumably come from
    the environment or a cached login. Confirm and refresh them.
    """
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    model_kwargs = {"device": device}
    encode_kwargs = {"normalize_embeddings": True}
    return HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-m3",
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

# 4. Build the FAISS index and save it
def build_faiss_index(documents: list, embedding_model, save_path: str):
    """Create a FAISS vector store from ``documents`` and save it to disk.

    Args:
        documents: Documents handed to ``FAISS.from_documents``.
        embedding_model: Embedding model handed to ``FAISS.from_documents``.
        save_path: Directory the index is saved to. It is created, together
            with any missing parents, if it does not exist yet.

    Returns:
        None. A confirmation line is printed after the index is saved.
    """
    index = FAISS.from_documents(documents, embedding_model)

    # Create the target directory only after the index was built.
    target_dir = Path(save_path)
    target_dir.mkdir(parents=True, exist_ok=True)

    index.save_local(save_path)
    print(f"✅ FAISS index 저장 완료: {save_path}")

# 5. Run the full pipeline
# The recorded output of this cell shows that the guarded block was executed
# (the model load was attempted). The names assigned here stay defined at
# module level after the block has run.
if __name__ == "__main__":
    # "game.json" is a relative path, so it is resolved against the current
    # working directory.
    game_data = load_game_data("game.json")
    docs = create_documents(game_data)
    # No device argument: create_embedding_model() picks "cuda" or "cpu" itself.
    embedding_model = create_embedding_model()
    # The index is saved to the relative directory "game_faiss_index".
    build_faiss_index(docs, embedding_model, "game_faiss_index")


No sentence-transformers model found with name BAAI/bge-m3. Creating a new one with mean pooling.


OSError: There was a specific connection error when trying to load BAAI/bge-m3:
401 Client Error: Unauthorized for url: https://huggingface.co/BAAI/bge-m3/resolve/main/config.json (Request ID: Root=1-68232f4f-4eefcd801cdba10b67074f78;15ba5b5a-9f68-43c8-9b85-fe78f8e0f771)

Invalid credentials in Authorization header