In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from openai import OpenAI
from langchain_chroma.vectorstores import Chroma
import chromadb
import re
from langchain_community.document_loaders import Docx2txtLoader
from glob import glob
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from huggingface_hub import hf_hub_download
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.history import BaseChatMessageHistory, RunnableWithMessageHistory
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_ollama import ChatOllama
from langchain import hub
from chromadb.utils import embedding_functions
import json, hashlib

In [2]:
from dotenv import load_dotenv
load_dotenv()
HF_TOKEN = os.getenv('HF_TOKEN')

In [3]:
client = OpenAI()
emb = OpenAIEmbeddings( model="text-embedding-3-small" )

In [24]:
text_files = glob('./data/*.txt')
texts, metadatas = [], []

In [6]:
print(text_files)

['./data\\25년3차_신혼·신생아매입임대Ⅰ_공급주택목록(서울지역본부).txt', './data\\25년3차_신혼·신생아매입임대Ⅱ(전세형)_공급주택목록(서울지역본부).txt', './data\\25년_3차_청년매입임대_공급주택목록(서울지역본부).txt']


In [25]:
for file in text_files:
    with open(file, 'r', encoding = 'utf-8') as f:
        s = f.read()
        data = json.loads(s)
        for item in data:
            info = item["metadata"]
            text = ", ".join(f"{k}: {v}" for k, v in info.items())
            info['source'] = file
            metadatas.append(info)
            texts.append(text)

In [23]:
print(type(info))

<class 'dict'>


In [26]:
PERSIST_DIR = os.path.abspath("./db")  # 경로 혼동 방지(절대경로 추천)

vector_store = Chroma(
    collection_name="database",     # 처음과 동일
    persist_directory=PERSIST_DIR,  # 처음과 동일
    embedding_function=emb,         # 동일 모델/차원
)

In [27]:
vector_store.add_texts(texts, metadatas = metadatas)

['9b9c2d71-a5f4-40bc-8c80-a89c8b64e270',
 '06575231-41f2-446f-921c-ab2842d3d9e0',
 'f1028a9b-a8ab-46c8-9ace-4fb3c64a16ad',
 '23310670-833e-4a26-8c84-2c7a1b9bfc1d',
 'fc5e0947-b660-4c1d-a474-b20c513cc9cd',
 'd395cf54-f8a0-4d24-ae4f-5487815aa53e',
 'b212dcc4-1793-49ed-9c83-098062ca433d',
 'fcfabb87-1b05-4568-9a06-f1d142097547',
 '5dc5fdc5-62a3-4d33-85d5-abed6673587b',
 '30444ae3-7e74-4d69-9ef1-a89956c0d098',
 'e0d0c172-d004-4b71-8b22-8709bfe266f3',
 'cbdda2a6-2312-4f3a-98e7-615b49212ee4',
 'e8739e12-c392-4523-b4be-b83bb1b16f4a',
 '085c8b54-3c3a-49b0-99f4-a5396fec72f0',
 '43be755e-5362-44af-bf78-310a009f3d14',
 'b2c65292-790f-49a4-8e1a-20dcc451bc42',
 '42874b57-d485-45cd-851e-88e0eadb4789',
 '50f9fa09-d45f-49ce-a23a-0a73888ca95b',
 '9c9ace1a-18b3-40ae-9afe-69c8658072f5',
 '1cd17e59-7f57-4821-a46a-843c88439991',
 'ce1ff61e-e51a-4639-9bdd-5cb8c2a33400',
 '4786de55-6eb7-410c-a15e-90df24bec42f',
 '2cb63f67-583e-4d9d-9c42-577c500753e7',
 '337592e3-32ab-4c96-8fa3-05b7e9dc034f',
 '51a6ff40-62cb-