In [8]:
import subprocess
import sys

# pinecone 재설치
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "pinecone"])

print("✓ pinecone 설치 완료")

✓ pinecone 설치 완료


In [None]:
import sys
# pinecone 강제 새로고침
if 'pinecone' in sys.modules:
    del sys.modules['pinecone']

import os
import requests
import logging
from typing import List, Dict
from urllib.parse import urlparse

import PyPDF2
from bs4 import BeautifulSoup

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

from pinecone import Pinecone
from tqdm import tqdm
from uuid import uuid4

In [11]:
OPENAI_API_KEY = os.getenv("YOUR_OPENAI_KEY")
PINECONE_API_KEY = os.getenv("YOUR_PINECONE_KEY")
INDEX_NAME = "welldying-wisdom"

PDF_FILES = [
    "ALTIIA-3.1.pdf",
    "Death_Immortality_and_Meaning_in_Life_Precis_and_F.pdf"
]

TARGET_URLS = [
    "https://www.apa.org/topics/aging-older-adults/end-of-life-decisions",
    "https://plato.stanford.edu/entries/death/",
    "https://gadfly.igc.org/papers/pr/pipr.htm"
]

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [12]:
def extract_text_from_pdf(filepath: str) -> str:
    text = ""
    try:
        with open(filepath, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
        logger.info(f"[PDF] 추출 완료: {filepath} ({len(text)}자)")
        return text
    except Exception as e:
        logger.error(f"[PDF] 에러 발생 ({filepath}): {e}")
        return ""

In [13]:
def extract_text_from_url(url: str) -> str:
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 불필요한 태그 제거 (스크립트, 스타일, 네비게이션 등)
        for script in soup(["script", "style", "header", "footer", "nav"]):
            script.decompose()
            
        # 본문 텍스트 추출
        text = soup.get_text(separator='\n')
        
        # 공백 정리
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        clean_text = '\n'.join(chunk for chunk in chunks if chunk)
        
        logger.info(f"[URL] 크롤링 완료: {url} ({len(clean_text)}자)")
        return clean_text
    except Exception as e:
        logger.error(f"[URL] 에러 발생 ({url}): {e}")
        return ""

In [14]:
def process_and_upload(texts: List[Dict[str, str]]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    
    pc = Pinecone(api_key=PINECONE_API_KEY)
    
    # 인덱스 확인 (없으면 생성)
    if INDEX_NAME not in pc.list_indexes().names():
        logger.info(f"인덱스 '{INDEX_NAME}' 생성 중...")
        from pinecone import ServerlessSpec
        pc.create_index(
            name=INDEX_NAME,
            dimension=1536, 
            metric='cosine',
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )
    
    index = pc.Index(INDEX_NAME)
    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
    
    documents_to_upload = []
    
    # 청킹 작업
    for item in texts:
        source_name = item['source']
        raw_text = item['text']
        
        chunks = text_splitter.split_text(raw_text)
        logger.info(f"Processing {source_name}: {len(chunks)} chunks generated.")
        
        for chunk in chunks:
            documents_to_upload.append({
                "id": str(uuid4()),
                "text": chunk,
                "source": source_name,
                "type": "wisdom" # search_welldying_wisdom_tool이 검색할 태그
            })
            
    # 임베딩 및 업로드 (Batch 처리)
    batch_size = 100
    total_docs = len(documents_to_upload)
    
    logger.info(f"총 {total_docs}개의 청크")
    
    for i in tqdm(range(0, total_docs, batch_size)):
        batch = documents_to_upload[i:i+batch_size]
        
        # 텍스트 리스트 추출
        batch_texts = [doc['text'] for doc in batch]
        
        # 임베딩 생성
        try:
            batch_embeddings = embeddings.embed_documents(batch_texts)
        except Exception as e:
            logger.error(f"임베딩 생성 중 오류: {e}")
            continue
        
        # 벡터 데이터 구성
        vectors = []
        for doc, vector in zip(batch, batch_embeddings):
            metadata = {
                "content": doc['text'], # 검색 시 LLM에게 보여줄 본문
                "source": doc['source'],
                "type": doc['type']
            }
            vectors.append((doc['id'], vector, metadata))
            
        index.upsert(vectors=vectors)
        
    logger.info("✅ 완료")

In [15]:
collected_data = []

for pdf in PDF_FILES:
    text = extract_text_from_pdf(pdf)
    if text:
        collected_data.append({"source": pdf, "text": text})
    
for url in TARGET_URLS:
    text = extract_text_from_url(url)
    if text:
        collected_data.append({"source": url, "text": text})
        
process_and_upload(collected_data)

INFO:__main__:[PDF] 추출 완료: ALTIIA-3.1.pdf (36748자)
INFO:__main__:[PDF] 추출 완료: Death_Immortality_and_Meaning_in_Life_Precis_and_F.pdf (68136자)
INFO:__main__:[PDF] 추출 완료: Death_Immortality_and_Meaning_in_Life_Precis_and_F.pdf (68136자)
INFO:__main__:[URL] 크롤링 완료: https://www.apa.org/topics/aging-older-adults/end-of-life-decisions (0자)
INFO:__main__:[URL] 크롤링 완료: https://www.apa.org/topics/aging-older-adults/end-of-life-decisions (0자)
INFO:__main__:[URL] 크롤링 완료: https://plato.stanford.edu/entries/death/ (109233자)
INFO:__main__:[URL] 크롤링 완료: https://plato.stanford.edu/entries/death/ (109233자)
INFO:__main__:[URL] 크롤링 완료: https://gadfly.igc.org/papers/pr/pipr.htm (67110자)
INFO:__main__:[URL] 크롤링 완료: https://gadfly.igc.org/papers/pr/pipr.htm (67110자)
INFO:__main__:인덱스 'welldying-wisdom' 생성 중...
INFO:__main__:인덱스 'welldying-wisdom' 생성 중...
INFO:__main__:Processing ALTIIA-3.1.pdf: 46 chunks generated.
INFO:__main__:Processing Death_Immortality_and_Meaning_in_Life_Precis_and_F.pdf: 86 chunks gene