In [31]:
import os
import glob
import json
import re

import chromadb
from chromadb.config import Settings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma

In [None]:

!pip install "langchain==0.2.6"
!pip install "ibm-watsonx-ai==1.0.10"
!pip install "langchain_ibm==0.1.8"
!pip install "langchain_community==0.2.6"
!pip install "sentence-transformers==3.0.1"
!pip install "chromadb==0.5.3"
!pip install "pydantic==2.8.2"
!pip install "langchain-huggingface==0.0.3"
!pip install "python-dotenv==1.0.1"

Collecting chromadb==0.5.3
  Using cached chromadb-0.5.3-py3-none-any.whl.metadata (6.8 kB)
Collecting chroma-hnswlib==0.7.3 (from chromadb==0.5.3)
  Using cached chroma-hnswlib-0.7.3.tar.gz (31 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Using cached chromadb-0.5.3-py3-none-any.whl (559 kB)
Building wheels for collected packages: chroma-hnswlib
  Building wheel for chroma-hnswlib (pyproject.toml) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mBuilding wheel for chroma-hnswlib [0m[1;32m([0m[32mpyproject.toml[0m[1;32m)[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[63 lines of output][0m
  [31m   [0m running bdist_wheel
  [31m   [0m running build
  [31m   [0m running build_ext
  [31m   [0m creating tmp
  [31m   [0m g++ -fno-strict-overflow -Wsign-compare -DDYNA

In [33]:
import os
from dotenv import load_dotenv
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA

from ibm_watsonx_ai.foundation_models.utils.enums import ModelTypes
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.foundation_models.utils.enums import DecodingMethods
from langchain.llms import WatsonxLLM

In [36]:
# !pip install chromadb|
!pip install pysqlite3-binary

__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')



In [34]:
# 함수 정의

def clean_text(text):
    """
    Normalize an arbitrary JSON value into one whitespace-collapsed string.

    Args:
        text: Usually a ``str``. ``None`` yields ``""``. A ``list`` is cleaned
            element by element (recursively, so nested lists are flattened
            instead of being embedded as a Python repr) and the pieces are
            joined with single spaces. Any other value goes through ``str()``.

    Returns:
        The text with every run of whitespace (carriage returns, tabs and
        newlines included) collapsed to a single space and both ends stripped.
    """
    if text is None:
        # A JSON ``null`` must not turn into the literal string "None".
        return ""

    if isinstance(text, list):
        # Empty pieces only add extra spaces, which the regex below collapses.
        text = " ".join(clean_text(item) for item in text)
    elif not isinstance(text, str):
        text = str(text)

    # The whitespace class already covers carriage returns and tabs, so a
    # single substitution replaces the separate replace() calls.
    return re.sub(r"\s+", " ", text).strip()

def _extract_table_item(table_dict: dict, results: list):
    """
    Append the text of one '별표단위' entry to ``results``, one line per item.

    A header line built from the cleaned '별표번호' and '별표제목' values is
    appended first when at least one of them is non-empty. Then every string
    found inside the nested lists of '별표내용' is cleaned and appended,
    skipping lines that are empty after cleaning.

    Args:
        table_dict: One annex entry; the keys read are '별표번호', '별표제목'
            and '별표내용'.
        results: Output list, extended in place.
    """
    number = clean_text(table_dict.get("별표번호", ""))
    title = clean_text(table_dict.get("별표제목", ""))
    if number or title:
        results.append(f"(별표번호) {number} (별표제목) {title}")

    body = table_dict.get("별표내용", [])
    if not isinstance(body, list):
        return

    for paragraph in body:
        # Only list-shaped paragraphs are walked; anything else is ignored.
        if not isinstance(paragraph, list):
            continue
        cleaned_lines = (clean_text(raw) for raw in paragraph if isinstance(raw, str))
        results.extend(line for line in cleaned_lines if line)

def _law_dict_list(value) -> list:
    """Return the dict entries of ``value``, wrapping a lone dict in a list."""
    if isinstance(value, dict):
        return [value]
    if isinstance(value, list):
        return [entry for entry in value if isinstance(entry, dict)]
    return []


def _law_line_list(value) -> list:
    """Return ``value`` as a list of lines, wrapping a lone string in a list."""
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return [value]
    return []


def _law_units(law: dict, section: str, unit_key: str) -> list:
    """Return the dict entries stored at ``law[section][unit_key]`` (or [])."""
    container = law.get(section)
    if not isinstance(container, dict):
        return []
    return _law_dict_list(container.get(unit_key))


def _law_append_clean(results: list, raw) -> None:
    """Append the cleaned form of ``raw`` to ``results`` unless it is empty."""
    if raw is None:
        return
    cleaned = clean_text(raw)
    if cleaned:
        results.append(cleaned)


def extract_text_from_law_json(json_data: dict) -> list:
    """
    Flatten one law JSON document into an ordered list of text lines.

    The output contains, in this order:
      1. the law name ('법령' > '기본정보' > '법령명_한글') wrapped in brackets,
      2. each paragraph of every '부칙단위' entry, with its lines concatenated,
      3. for every '조문단위' entry: '조문내용', then each '항내용' followed by
         the '호내용' of its items,
      4. the lines produced by ``_extract_table_item`` for every '별표단위' entry.

    Containers that hold either a single dict or a list of dicts are handled
    uniformly, so a lone '부칙단위', '조문단위', '항', '호' or '별표단위' is
    processed like a one-element list, and entries of an unexpected type are
    skipped instead of raising.

    NOTE(review): sub-items below '호' are not extracted - confirm whether the
    source data carries such a level and whether it should be indexed.

    Args:
        json_data: Parsed JSON of one law file.

    Returns:
        A list of strings. The first element is always the bracketed law name
        (``"[]"`` when the name is missing).
    """
    results = []

    law = json_data.get("법령", {})
    if not isinstance(law, dict):
        law = {}

    # (1) Law name
    basic_info = law.get("기본정보", {})
    law_name = basic_info.get("법령명_한글", "") if isinstance(basic_info, dict) else ""
    results.append(f"[{clean_text(law_name) if law_name is not None else ''}]")

    # (2) Supplementary provisions: each paragraph becomes one merged line.
    for unit in _law_units(law, "부칙", "부칙단위"):
        for paragraph in _law_line_list(unit.get("부칙내용")):
            # A paragraph is normally a list of lines; a bare string is treated
            # as a single line rather than being iterated character by character.
            pieces = [
                clean_text(line)
                for line in _law_line_list(paragraph)
                if line is not None
            ]
            merged = "".join(piece for piece in pieces if piece)
            if merged:
                results.append(merged)

    # (3) Articles, their paragraphs and the items under each paragraph.
    for provision in _law_units(law, "조문", "조문단위"):
        _law_append_clean(results, provision.get("조문내용", ""))

        # '항' may be a single dict or a list of dicts; both shapes now emit
        # the paragraph text before the item texts.
        for paragraph_item in _law_dict_list(provision.get("항")):
            _law_append_clean(results, paragraph_item.get("항내용", ""))
            for ho_item in _law_dict_list(paragraph_item.get("호")):
                _law_append_clean(results, ho_item.get("호내용", ""))

    # (4) Annex tables
    for table_item in _law_units(law, "별표", "별표단위"):
        _extract_table_item(table_item, results)

    return results

def chunk_text(text_list: list, max_chunk_size: int = 500) -> list:
    """
    Split long texts into consecutive pieces of at most ``max_chunk_size`` characters.

    Texts no longer than ``max_chunk_size`` (empty strings included) are kept
    as they are. Longer texts are cut at fixed character offsets with no
    overlap, so concatenating the pieces of one text reproduces it exactly.

    Args:
        text_list: Strings to split.
        max_chunk_size: Maximum length of a chunk, in characters. Must be > 0.

    Returns:
        A flat list of chunks in input order.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive. A non-positive size
            can never advance through a text and would loop forever.
    """
    if max_chunk_size <= 0:
        raise ValueError(f"max_chunk_size must be positive, got {max_chunk_size}")

    chunks = []
    for text in text_list:
        if len(text) <= max_chunk_size:
            chunks.append(text)
        else:
            chunks.extend(
                text[start:start + max_chunk_size]
                for start in range(0, len(text), max_chunk_size)
            )
    return chunks

In [35]:
# (1) Load the JSON files and extract their text
all_docs = []
all_metadatas = []
all_ids = []

# glob returns paths in an unspecified order; sorting keeps the chunk order
# (and therefore the ids below) identical from run to run.
json_files = sorted(glob.glob(os.path.join('laws', "*.json")))
print(f"Found {len(json_files)} JSON files")

for file_path in json_files:
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    source_file = os.path.basename(file_path)

    # Preprocess the document into text lines
    extracted_texts = extract_text_from_law_json(data)
    # Split long lines into chunks
    chunks = chunk_text(extracted_texts, max_chunk_size=500)

    # Collect documents, metadata and ids
    for chunk_index, c in enumerate(chunks):
        all_docs.append(c)
        # Metadata: only the source file name is recorded for each chunk
        all_metadatas.append({"source_file": source_file})
        # Deterministic id per (file, position). Without explicit ids every
        # run stores the same chunks again under fresh ids.
        # NOTE(review): relies on the Chroma wrapper upserting by id - confirm
        # for the installed version.
        all_ids.append(f"{source_file}:{chunk_index}")

print(f"Total {len(all_docs)} chunks extracted.")

# (2) Embedding model & Chroma index
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"  # example embedding model
)

# Build the Chroma vector store from the collected texts.
persist_directory = 'chroma_db'
vectorstore = Chroma.from_texts(
    texts=all_docs,
    embedding=embedding_model,
    metadatas=all_metadatas,
    ids=all_ids,
    persist_directory=persist_directory
)
# persist() is deprecated in recent LangChain releases and missing from newer
# wrappers, so it is called only where it still exists.
if hasattr(vectorstore, "persist"):
    vectorstore.persist()
print(f"Chroma DB stored at: {persist_directory}")


Found 13 JSON files


Total 13620 chunks extracted.


AttributeError: module 'chromadb' has no attribute 'config'

In [None]:
# Import from the maintained packages. The notebook already imports
# HuggingFaceEmbeddings from langchain_huggingface in an earlier cell;
# re-importing it from the deprecated langchain.embeddings path would shadow
# that class with a different one.
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Queries must be embedded with the same model that was used to build the
# index, otherwise the similarity scores are meaningless.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# Re-open the collection stored on disk instead of rebuilding it.
vectorstore = Chroma(
    embedding_function=embedding_model,
    persist_directory="chroma_db"
)

query = "어린이 보호구역 속도 위반 과태료는 얼마인가?"
docs = vectorstore.similarity_search(query, k=3)

# Show each hit with its metadata so the source file can be checked.
for i, doc in enumerate(docs, start=1):
    print(f"[DOC {i}] {doc.page_content}")
    print(f"Meta: {doc.metadata}")
    print("------------------")