In [None]:
import hashlib
import os
import re
from glob import glob

import chromadb
from chromadb.utils import embedding_functions
from huggingface_hub import hf_hub_download
from langchain import hub
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_chroma.vectorstores import Chroma
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.runnables.history import BaseChatMessageHistory, RunnableWithMessageHistory
from langchain_experimental.text_splitter import SemanticChunker
from langchain_ollama import ChatOllama
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI

In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment, then
# read the Hugging Face token from it (None when the variable is not set).
load_dotenv()
HF_TOKEN = os.environ.get('HF_TOKEN')

In [None]:
# Embedding model shared by the semantic splitter and the vector store.
emb = OpenAIEmbeddings(model="text-embedding-3-small")

# Plain OpenAI API client, constructed with its default settings.
client = OpenAI()

In [None]:
# Character-based splitter: 400-character chunks with an 80-character overlap,
# trying paragraph breaks first, then line breaks, then spaces, then any position.
fallback_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=400,
    chunk_overlap=80,
)

# Embedding-driven splitter configured with a percentile breakpoint threshold of 50.
semantic_splitter = SemanticChunker(
    emb,
    breakpoint_threshold_amount=50,
    breakpoint_threshold_type="percentile",
)

In [None]:
# Pre-process the .docx files: collect the input paths and create the list
# that will hold every chunk produced from them.

# glob() gives no ordering guarantee, so sort the matches to make the
# processing order reproducible from run to run.
# NOTE(review): 'noticification' looks like a misspelling of 'notification' —
# confirm it matches the directory name on disk before renaming anything.
docx_files = sorted(glob('./noticification/*.docx'))
documents = []

In [None]:
# Semantic chunks longer than this many characters are re-split by the
# character-based fallback splitter.
MAX_SEMANTIC_CHUNK_CHARS = 800

# Rebuild the list from scratch on every execution of this cell; appending to
# a list created in another cell would duplicate every chunk on a re-run.
documents = []

for docx_file in docx_files:
    loader = Docx2txtLoader(docx_file)
    # NOTE(review): Docx2txtLoader is expected to return the file's text as a
    # single Document whose metadata carries `source`; .docx has no per-page
    # split, so do not rely on a `page` metadata key here.
    docs = loader.load()

    sem_chunks = semantic_splitter.split_documents(docs)
    chunks = []
    for d in sem_chunks:
        # An empty .docx produces blank chunks; they carry no information and
        # would be sent to the embedding model for nothing, so drop them.
        if not d.page_content.strip():
            continue
        if len(d.page_content) > MAX_SEMANTIC_CHUNK_CHARS:
            # Too long for one chunk: break it down with the fallback splitter.
            chunks.extend(fallback_splitter.split_documents([d]))
        else:
            chunks.append(d)

    documents.extend(chunks)

In [None]:
# Resolve the store location to an absolute path to avoid confusion about
# which directory a relative path refers to.
PERSIST_DIR = os.path.abspath("./db")

# Open the Chroma collection. Per the original author's notes, the collection
# name, the directory and the embedding model/dimension must stay the same as
# when the store was first created.
vector_store = Chroma(
    embedding_function=emb,
    persist_directory=PERSIST_DIR,
    collection_name="database",
)

In [None]:
# Derive a deterministic id for every chunk from its source path, its position
# among the chunks of that source, and its text. Without explicit ids the store
# assigns fresh random ones, so each execution of this cell against the
# persistent directory would insert the whole corpus again as duplicates.
# NOTE(review): relies on the Chroma wrapper writing by id (upsert) — confirm
# against the installed langchain_chroma version.
chunk_ids = []
chunks_seen_per_source = {}
for doc in documents:
    source = str(doc.metadata.get("source", ""))
    # The per-source position keeps ids unique inside one batch even when two
    # chunks of the same file have identical text.
    position = chunks_seen_per_source.get(source, 0)
    chunks_seen_per_source[source] = position + 1
    id_material = f"{source}\x00{position}\x00{doc.page_content}"
    chunk_ids.append(hashlib.sha256(id_material.encode("utf-8")).hexdigest())

# Nothing to add when no chunks were produced; skip the call instead of
# handing the store an empty batch.
added_ids = vector_store.add_documents(documents=documents, ids=chunk_ids) if documents else []

# Show how many chunks were written rather than dumping the full id list.
len(added_ids)