In [2]:
from dotenv import load_dotenv
# Load settings from a local .env file via python-dotenv before anything else
# runs; the `True` recorded as this cell's output is the call's return value.
load_dotenv()

True

In [3]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
import fitz  # PyMuPDF
from langchain.schema import Document


In [None]:
# Folder that holds the PDF files to ingest.
# `path` and `pdf_folder` are two names for the same folder; both are kept
# because other cells may refer to either one.
pdf_folder = "data"
path = pdf_folder
# os.listdir() returns entries in arbitrary order. Sort the listing so that
# positional slices of `file_list` are reproducible across runs and machines.
file_list = sorted(os.listdir(path))

# Embedding model to use. Alternatives that were tried:
# model_name = "Bllossom/llama-3.2-Korean-Bllossom-3B"
# model_name = "kakaocorp/kanana-nano-2.1b-embedding"
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = HuggingFaceEmbeddings(model_name=model_name)

# Chroma vector store backed by a local directory.
vectorstore_path = "./chroma_db"  # local persistence path
vectorstore = Chroma(embedding_function=embedder, persist_directory=vectorstore_path)


In [16]:
# List the PDF files in the folder. The loop below does not iterate over this
# list (its start offset indexes `file_list`); it is kept for inspection and
# for any other cell that may read it.
pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith(".pdf")]

# Index into `file_list` at which ingestion starts; earlier entries are skipped.
# NOTE(review): 80 looks like a resume point left over from an interrupted
# run - set it to 0 to ingest every file.
start_index = 80


def _clean_text(text):
    """Replace newlines with spaces and collapse every run of spaces to one."""
    text = text.replace('\n', ' ')
    # A single replace('  ', ' ') pass only halves long runs of spaces, so
    # repeat until no double space is left.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text


def _load_with_pymupdf(file_path):
    """Fallback loader: build one Document per page using PyMuPDF (fitz).

    Pages whose extracted text contains "MuPDF error:" are dropped. The
    metadata keys are the ones the downstream code already relied on.
    """
    documents = []
    with fitz.open(file_path) as pdf:
        # PyMuPDF documents metadata values as "strings or None" (and the
        # dict itself may be None), so `.get(key, '')` is not enough to
        # guarantee a string. Coerce missing/None values to ''.
        # NOTE(review): presumably the vector store rejects None metadata
        # values - confirm against the Chroma version in use.
        metadata = pdf.metadata or {}

        for page_num, page in enumerate(pdf):
            page_text = page.get_text()
            if "MuPDF error:" in page_text:
                continue
            documents.append(Document(
                page_content=page_text,
                metadata={
                    'producer': metadata.get('producer') or '',
                    'creator': metadata.get('creator') or '',
                    'creationdate': metadata.get('creationDate') or '',
                    'title': metadata.get('title') or '',
                    'author': metadata.get('author') or '',
                    'moddate': metadata.get('modDate') or '',
                    'pdfversion': metadata.get('pdfVersion') or '',
                    'source': file_path,
                    'total_pages': pdf.page_count,
                    'page': page_num,
                    'page_label': str(page_num + 1)
                }
            ))
    return documents


# The splitter does not depend on the file being processed, so build it once.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,    # maximum number of characters per chunk
    chunk_overlap=1000  # characters shared by neighbouring chunks, to keep context
)

for file in file_list[start_index:]:
    # `file_list` holds every directory entry; only PDFs can be ingested.
    if not file.lower().endswith(".pdf"):
        continue

    # os.path.join keeps the path portable (no hard-coded '\\' separator).
    file_path = os.path.join(pdf_folder, file)
    try:
        loader = PyPDFLoader(file_path)
        documents = loader.load()
    except Exception:
        # PyPDFLoader could not read the file; retry with PyMuPDF.
        # `Exception` (not a bare except) so Ctrl-C still interrupts the run.
        try:
            documents = _load_with_pymupdf(file_path)
        except Exception as exc:
            # Neither loader could read the file: report it and move on
            # instead of aborting the whole ingestion run.
            print(f"skipped {file}: {exc}")
            continue

    for doc in documents:
        doc.page_content = _clean_text(doc.page_content)
        doc.metadata["source"] = file  # store the bare file name as the source

    docs = splitter.split_documents(documents)
    # A file with no extractable text yields no chunks; there is nothing to add.
    if docs:
        vectorstore.add_documents(docs)
    print(file)
vectorstore.persist()  # save to the local persist directory