In [1]:
import os
from langchain_openai.embeddings import OpenAIEmbeddings
from openai import OpenAI
from langchain_chroma.vectorstores import Chroma
from langchain_community.document_loaders import Docx2txtLoader
from glob import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker

In [2]:
from dotenv import load_dotenv

# Load variables from a .env file (if one is found) into the process
# environment before anything below reads configuration from it.
load_dotenv()

# None when HF_TOKEN is not set. Presumably a Hugging Face access token,
# judging by the name -- nothing in the visible cells consumes it.
HF_TOKEN = os.environ.get('HF_TOKEN')

In [3]:
# OpenAI client created with default arguments; presumably it picks up its
# credentials from the environment populated by load_dotenv() -- confirm.
client = OpenAI()

# Embedding model used by both the semantic splitter and every Chroma
# collection in this notebook.
emb = OpenAIEmbeddings(model="text-embedding-3-small")

In [4]:
# Primary splitter: chunk boundaries are chosen from embedding similarity,
# using the "percentile" breakpoint rule with an amount of 50.
semantic_splitter = SemanticChunker(
    emb,
    breakpoint_threshold_amount=50,
    breakpoint_threshold_type="percentile",
)

# Secondary splitter applied to semantic chunks that come out too long:
# fixed-size character windows with overlap, trying paragraph, line and
# word separators before falling back to a split between any characters.
fallback_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=400,
    chunk_overlap=80,
)

In [5]:
# Docx file preprocessing

# Collect the announcement .docx files. glob() makes no ordering promise
# (it depends on the file system), so sort the result to make the ingestion
# order -- and therefore the order of `documents` -- reproducible.
docx_files = sorted(glob('./noticification/*.docx'))

# Running list of every chunk produced by the ingestion loop.
documents = []

In [6]:
# On-disk location of the Chroma database. A relative path depends on the
# kernel's working directory; an absolute path is recommended to avoid
# confusion about which database is being opened.
PERSIST_DIR = "./db"

# Handle to the "YOUTH" collection. Collection name, persist directory and
# embedding model (hence vector dimension) must match the values used when
# the collection was first created.
vector_store_youth = Chroma(
    embedding_function=emb,
    persist_directory=PERSIST_DIR,
    collection_name="YOUTH",
)

In [7]:
# Handle to the "MARRY1" collection. Collection name, persist directory and
# embedding model (hence vector dimension) must match the values used when
# the collection was first created.
vector_store_marry1 = Chroma(
    embedding_function=emb,
    persist_directory=PERSIST_DIR,
    collection_name="MARRY1",
)

In [8]:
# Handle to the "MARRY2" collection. Collection name, persist directory and
# embedding model (hence vector dimension) must match the values used when
# the collection was first created.
vector_store_marry2 = Chroma(
    embedding_function=emb,
    persist_directory=PERSIST_DIR,
    collection_name="MARRY2",
)

In [9]:
# Route each announcement to its own collection, keyed by bare file name so
# the match does not depend on the path separator glob() emits (the previous
# hard-coded '\\' only ever matched on Windows).
store_by_filename = {
    '서울지역본부 신혼·신생아 매입임대주택Ⅰ 예비입주자 모집공고.docx': vector_store_marry1,
    '서울지역본부 청년매입임대주택 예비입주자 모집공고.docx': vector_store_youth,
    '서울지역본부_신혼·신생아_매입임대주택Ⅱ_전세형_예비입주자_모집공고.docx': vector_store_marry2,
}

for docx_file in docx_files:
    loader = Docx2txtLoader(docx_file)
    # Presumably yields the file's text as Document(s) carrying `source`
    # metadata; a .docx text loader has no per-page split -- TODO confirm.
    docs = loader.load()

    # Semantic split first; any chunk still longer than 800 characters is
    # re-split by the fixed-size fallback splitter.
    sem_chunks = semantic_splitter.split_documents(docs)
    chunks = []
    for d in sem_chunks:
        if len(d.page_content) > 800:
            chunks.extend(fallback_splitter.split_documents([d]))
        else:
            chunks.append(d)

    # Keep the notebook-wide list of every chunk across all files.
    documents.extend(chunks)

    # Store ONLY this file's chunks. Passing the cumulative `documents` list
    # would copy every previously processed file into this collection too.
    # Files without a mapping are chunked but not stored, as before.
    # NOTE(review): re-running this cell appends the same chunks again to the
    # persisted collections -- confirm whether that is acceptable.
    target_store = store_by_filename.get(os.path.basename(docx_file))
    if target_store is not None and chunks:
        target_store.add_documents(chunks)