In [None]:
# 1. 데이터 로드
# pdf ver.
from langchain_community.document_loaders import PyMuPDFLoader
import fitz  # PyMuPDF
import pandas as pd
import os

# PDF files to ingest; the relative paths resolve against the working directory.
pdf_paths = [
    "저작권법(법률)(제20358호)(20240828).pdf",
    "저작권상담사례집2024 (3).pdf",
    "인공지능과 저작권 제1-2부.pdf",
    "최진원_알기 쉬운 저작권 계약 가이드북(제2판)_2024.pdf",
    "naver.pdf",
    "1인 미디어 창작자를 위한 저작권 안내서(2019).pdf",
    "US_copyright.pdf",
    "wipo_copyright.pdf",
    "공공저작물 저작권 관리 및 이용 지침 해설서(개정20240101업로드용).pdf",
    "네이버 블로그.pdf",
    "카카오 서비스 약관20230109.pdf",
    "하버드)해외 저작권, 공정이용 가이드라인.pdf",
    "생성형AI 저작권 가이드라인.pdf",
]

# CSV files to ingest alongside the PDFs.
csv_paths = [
    "kakao_page.csv",
    "kakao_policy_full_text.csv",
    "kakao_rights_info.csv",
    "youtube_copyright_tools.csv",
]

# ✅ PDF에서 블록 추출
def extract_pdf_blocks(pdf_paths: list[str]) -> list[dict]:
    """Extract text blocks from PDFs and tag each as a section, title or paragraph.

    Every block returned by ``page.get_text("dict")`` that has a ``"lines"``
    entry is flattened into one string (span texts concatenated with no
    separator, then stripped) and classified:

    * ``section``   - text starts with "제" and contains "조"; stored as ``## <text>``.
    * ``title``     - text ends with "가이드" or is shorter than 20 characters;
      stored as ``# <text>``.
    * ``paragraph`` - anything else; stored unchanged.

    Args:
        pdf_paths: Paths of the PDF files to read.

    Returns:
        A list of dicts with the keys ``type``, ``text`` and ``source`` (the
        base file name of the PDF), in file / page / block order.
    """
    all_blocks = []
    for pdf_path in pdf_paths:
        source = os.path.basename(pdf_path)  # same for every block of this file
        doc = fitz.open(pdf_path)
        try:
            for page in doc:
                for block in page.get_text("dict")["blocks"]:
                    # Blocks without "lines" have no text spans to read
                    # (presumably image blocks).
                    if "lines" not in block:
                        continue
                    text = "".join(
                        span["text"]
                        for line in block["lines"]
                        for span in line["spans"]
                    ).strip()
                    if not text:
                        continue
                    if text.startswith("제") and "조" in text:
                        all_blocks.append({"type": "section", "text": f"## {text}", "source": source})
                    elif text.endswith("가이드") or len(text) < 20:
                        all_blocks.append({"type": "title", "text": f"# {text}", "source": source})
                    else:
                        all_blocks.append({"type": "paragraph", "text": text, "source": source})
        finally:
            # Release the file handle even if a page fails to parse.
            doc.close()
    return all_blocks

# ✅ CSV에서 블록 추출 (content 컬럼을 기준으로)
def extract_csv_blocks(csv_paths: list[str], content_column: str = None) -> list[dict]:
    """Turn one column of each CSV file into paragraph blocks.

    Args:
        csv_paths: Paths of the CSV files to read.
        content_column: Name of the column holding the text. When omitted
            (or empty) the file is read without a header row and the first
            column is used, so a header line, if present, is treated as data.

    Returns:
        A list of ``{"type": "paragraph", "text": ..., "source": ...}`` dicts,
        where ``source`` is the base file name. Missing cells and cells that
        are empty after stripping whitespace are skipped.

    A file that cannot be read or lacks the requested column is reported with
    a printed message and skipped; the remaining files are still processed.
    """
    all_blocks = []
    for csv_path in csv_paths:
        try:
            if content_column:  # an explicit column name was given
                column = pd.read_csv(csv_path)[content_column]
            else:  # no column name: read headerless and take the first column
                column = pd.read_csv(csv_path, header=None)[0]

            source = os.path.basename(csv_path)
            # Read the column directly instead of going through iterrows(),
            # which upcasts mixed-dtype rows. dropna() matters because
            # str(NaN) is the non-empty string "nan", which would otherwise
            # be stored as real content.
            for value in column.dropna():
                text = str(value).strip()
                if text:
                    all_blocks.append({"type": "paragraph", "text": text, "source": source})
        except Exception as e:
            print(f"[오류] {csv_path} 처리 중 에러 발생: {e}")
    return all_blocks


# ✅ PDF + CSV 통합
def extract_all_blocks(pdf_paths: list[str], csv_paths: list[str]) -> list[dict]:
    """Extract blocks from every PDF and CSV file and return them as one list.

    Args:
        pdf_paths: Paths passed to ``extract_pdf_blocks``.
        csv_paths: Paths passed to ``extract_csv_blocks`` (called without a
            column name, so the first column of each file is used).

    Returns:
        All PDF blocks followed by all CSV blocks.
    """
    pdf_blocks = extract_pdf_blocks(pdf_paths)
    csv_blocks = extract_csv_blocks(csv_paths)
    return pdf_blocks + csv_blocks

In [None]:
# Run the PDF + CSV extraction over every configured source file.
all_blocks = extract_all_blocks(pdf_paths, csv_paths)

In [None]:
# Spot-check one extracted block (index 8 is presumably just an arbitrary sample).
all_blocks[8]

{'type': 'section',
 'text': '## 제2조(정의) 이 법에서 사용하는 용어의 뜻은 다음과 같다. <개정 2009. 4. 22., 2011. 6. 30., 2011. 12. 2., 2016. 3. 22.,',
 'source': '저작권법(법률)(제20358호)(20240828).pdf'}

In [None]:
from langchain.text_splitter import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# 1. Splitter that cuts Markdown text on "#" (title) and "##" (section) headers
header_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "title"),
        ("##", "section"),
    ]
)

# 2. 블록 → Markdown 텍스트로 변환
def blocks_to_markdown_text(blocks):
    """Concatenate the ``text`` of every block, separated by one blank line."""
    texts = [block["text"] for block in blocks]
    return "\n\n".join(texts)

# Join every extracted block into a single Markdown string.
markdown_text = blocks_to_markdown_text(all_blocks)

# 3. Split on the Markdown headers; the result is a list of Document objects
#    whose metadata holds the matched header values.
structured_chunks = header_splitter.split_text(markdown_text)

# 4. Character-level splitter applied to each header-delimited section
recursive_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)

# 5. Re-split every section into smaller chunks while keeping its metadata.
#    split_documents() gives each chunk its own copy of the section metadata,
#    so a later in-place metadata change on one chunk cannot leak into its
#    siblings (a hand-written loop passing the same dict would share it).
final_split_docs = recursive_splitter.split_documents(structured_chunks)


In [None]:
import pandas as pd
import json
from langchain_core.documents import Document


def load_json_to_documents(json_path: str, text_key: str, metadata_keys: list[str] = []) -> list[Document]:
    """Load a JSON file containing a list of records and build one Document per record.

    Args:
        json_path: Path of a UTF-8 encoded JSON file whose top level is
            iterated record by record (each record must support ``.get``).
        text_key: Record key whose value becomes ``page_content``.
        metadata_keys: Record keys copied into the Document metadata. The
            default list is only read, never modified.

    Returns:
        One Document per record, in file order. A missing key or an explicit
        JSON ``null`` is stored as an empty string, for both the content and
        the metadata values.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    def _blank_if_missing(record, key):
        # dict.get(key, "") only covers absent keys; an explicit JSON null
        # would still come back as None, so normalize that case as well.
        value = record.get(key)
        return "" if value is None else value

    documents = []
    for item in data:
        content = _blank_if_missing(item, text_key)
        metadata = {k: _blank_if_missing(item, k) for k in metadata_keys}
        documents.append(Document(page_content=content, metadata=metadata))
    return documents

from typing import List


def load_multiple_json_to_documents(json_paths: List[str], text_key: str, metadata_keys: List[str] = []) -> List[Document]:
    """Load several JSON files and return all of their Documents as one flat list.

    Args:
        json_paths: JSON files to read, processed in the given order.
        text_key: Record key used for each Document's text.
        metadata_keys: Record keys copied into each Document's metadata.

    Returns:
        The Documents of every file, concatenated in ``json_paths`` order.
    """
    return [
        document
        for path in json_paths
        for document in load_json_to_documents(path, text_key, metadata_keys)
    ]

In [None]:
# JSON files to load; each record's "content" value becomes the document text.
json_paths = ["instagram_faq_answers.json", "youtube_support_faq.json"]

# "title" and "source" are copied from each record into the document metadata.
json_docs = load_multiple_json_to_documents(  
    json_paths, 
    text_key="content", 
    metadata_keys=["title", "source"]
)


In [None]:
def assign_metadata(docs: list[Document]) -> list[Document]:
    """Tag each document with keyword-derived metadata.

    The lower-cased ``page_content`` is scanned for fixed keywords and the
    following metadata keys are set when something matches:

    * ``platform``  - comma-joined, sorted platform names found in the text.
    * ``law_scope`` - "해외" or "국내" (the overseas keywords are checked first).
    * ``doc_type``  - "사례집", "가이드" or "법령" (first matching group wins).
    * ``source``    - "저작권법", "DMCA", "KOGL" or "CC"; this overwrites any
      ``source`` value already present in the metadata.
    * ``topic``     - comma-joined, sorted topic tags.

    Args:
        docs: Documents to tag. Each document's ``metadata`` attribute is
            replaced with an updated copy, so the input objects are modified.

    Returns:
        The same document objects, in the same order.
    """
    import re  # local import: only the topic matching below needs it

    # Keyword tables do not depend on the document, so build them once.
    platforms = ["네이버", "카카오", "유튜브", "인스타그램", "naver", "kakao", "youtube", "instagram"]
    keyword_to_topic = {
        "음악": "음악사용", "배경음악": "음악사용", "이미지": "이미지사용",
        "ai": "ai저작권", "인공지능": "ai저작권", "공정이용": "공정이용",
        "인용": "인용", "계약": "저작권계약", "저작권료": "저작권계약",
        "공공저작물": "공공저작물"
    }

    def _mentions(keyword: str, text: str) -> bool:
        """Return True when ``keyword`` occurs in the lower-cased ``text``.

        Purely ASCII-alphabetic keywords (here: "ai") must not be embedded in
        a longer Latin word; a plain substring test would tag "fair use",
        "email" or "available" as AI-related. Neighbouring Hangul, digits,
        spaces and punctuation still count as a match.
        """
        if keyword.isascii() and keyword.isalpha():
            pattern = rf"(?<![a-z]){re.escape(keyword)}(?![a-z])"
            return re.search(pattern, text) is not None
        return keyword in text

    result = []
    for doc in docs:
        text = doc.page_content.lower()
        metadata = doc.metadata.copy()

        # Platform
        matched_platforms = sorted({p for p in platforms if p.lower() in text})
        if matched_platforms:
            metadata["platform"] = ", ".join(matched_platforms)

        # Legal scope
        if any(w in text for w in ["fair use", "dmca", "united states", "미국"]):
            metadata["law_scope"] = "해외"
        elif any(w in text for w in ["저작권법", "공공누리", "kogl", "대한민국"]):
            metadata["law_scope"] = "국내"

        # Document type
        # NOTE(review): the single-character keywords "법" and "제" look very
        # broad for a substring match - confirm this fallback is intended.
        if any(w in text for w in ["사례", "faq"]):
            metadata["doc_type"] = "사례집"
        elif any(w in text for w in ["가이드", "guide"]):
            metadata["doc_type"] = "가이드"
        elif any(w in text for w in ["법", "조항", "제"]):
            metadata["doc_type"] = "법령"

        # Source
        if "저작권법" in text:
            metadata["source"] = "저작권법"
        elif "dmca" in text:
            metadata["source"] = "DMCA"
        elif "공공누리" in text or "kogl" in text:
            metadata["source"] = "KOGL"
        elif "크리에이티브 커먼즈" in text or "creative commons" in text:
            metadata["source"] = "CC"

        # Topics
        topics = {tag for kw, tag in keyword_to_topic.items() if _mentions(kw, text)}
        if topics:
            metadata["topic"] = ", ".join(sorted(topics))

        doc.metadata = metadata
        result.append(doc)

    return result


In [None]:
from dotenv import load_dotenv

# Load variables from a .env file (presumably the API key the embedding model needs - confirm).
load_dotenv()

True

In [None]:
# Tag both document sets, then merge them.
result_docs = assign_metadata(final_split_docs)
json_docs = assign_metadata(json_docs)

# Drop documents whose page_content is empty or whitespace-only before they
# are embedded: they carry no retrievable text, so storing them only adds
# meaningless vectors to the collection.
total_docs = [doc for doc in result_docs + json_docs if doc.page_content.strip()]

from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_chroma import Chroma
from langchain_core.documents import Document
import math

# Embedding model shared by every document added to the store.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")

# 1. Create the Chroma instance, persisted under the given directory
vector_store = Chroma(
    collection_name="rag_chatbot",
    persist_directory="vector_store/chroma/rag_chatbot",
    embedding_function=embedding_model,
)

# # 2. 문서 배치 추가 함수 정의
def batch_add_documents(vector_store, documents: list[Document], batch_size: int = 500):
    """Add documents to the vector store in consecutive slices of ``batch_size``.

    Args:
        vector_store: Object exposing ``add_documents(list_of_documents)``.
        documents: Documents to add, sent in their original order.
        batch_size: Maximum number of documents per ``add_documents`` call.

    Prints one progress line after each batch has been added.
    """
    total = len(documents)
    num_batches = math.ceil(total / batch_size)

    for batch_no in range(1, num_batches + 1):
        start = (batch_no - 1) * batch_size
        batch = documents[start : start + batch_size]
        vector_store.add_documents(batch)
        print(f"✅ Added batch {batch_no}/{num_batches} (size: {len(batch)})")

# Embed and store every document, 500 per request.
# NOTE(review): re-running this cell presumably appends the same documents to the persisted collection again - confirm.
batch_add_documents(vector_store, total_docs, batch_size=500)

✅ Added batch 1/12 (size: 500)
✅ Added batch 2/12 (size: 500)
✅ Added batch 3/12 (size: 500)
✅ Added batch 4/12 (size: 500)
✅ Added batch 5/12 (size: 500)
✅ Added batch 6/12 (size: 500)
✅ Added batch 7/12 (size: 500)
✅ Added batch 8/12 (size: 500)
✅ Added batch 9/12 (size: 500)
✅ Added batch 10/12 (size: 500)
✅ Added batch 11/12 (size: 500)
✅ Added batch 12/12 (size: 180)


In [None]:
# Keep only the documents whose content is non-empty after stripping whitespace.
filtered_docs = list(filter(lambda d: d.page_content.strip(), total_docs))
print(f"✅ page_content 있는 문서 수: {len(filtered_docs)}")

✅ page_content 있는 문서 수: 5649
