In [None]:
!pip install langchain langchain_chroma langchain_openai langchain_community

In [None]:
from google.colab import drive
import os

# 먼저 구글 드라이브 마운트
drive.mount('/content/drive')

In [4]:
import os
from dotenv import load_dotenv

# .env 파일에서 환경 변수 로드
load_dotenv()

# 환경 변수에서 API 키 가져오기
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")


In [6]:
from langchain_community.document_loaders import TextLoader

# 문서 로더 설정
loaders = [
    TextLoader("./Data/How_to_invest_money.txt"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [9]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 부모 문서 생성을 위한 텍스트 분할기
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
# 자식 문서 생성을 위한 텍스트 분할기 (부모보다 작은 크기로 설정)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=200)

# 자식 문서 인덱싱을 위한 벡터 저장소
vectorstore = Chroma(
    collection_name="split_parents", embedding_function=OpenAIEmbeddings()
)
# 부모 문서 저장을 위한 저장소
store = InMemoryStore()

  vectorstore = Chroma(


In [11]:
# ParentDocumentRetriever 설정
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

# 문서 추가
retriever.add_documents(docs)

# 부모 문서 수 확인
print(f"Number of parent documents: {len(list(store.yield_keys()))}")


Number of parent documents: 219


In [12]:
# 질문 정의
query = "What are the types of investments?"

# 연관문서 수집
retrieved_docs = retriever.get_relevant_documents(query)

# 첫 번째 연관문서 출력
print(f"Parent Document: {retrieved_docs[0].page_content}")

  retrieved_docs = retriever.get_relevant_documents(query)


Parent Document: There are five chief points to be considered in the selection of all
forms of investment. These are: (1) safety of principal and interest;
(2) rate of income; (3) convertibility into cash; (4) prospect of
appreciation in intrinsic value; (5) stability of market price.

Keeping these five general factors in mind, the present chapter will
discuss real-estate mortgages as a form of investment, both as adapted
to the requirements of private funds and of a business surplus.


In [14]:
retrieved_docs

[Document(metadata={'source': './Data/How_to_invest_money.txt'}, page_content='There are five chief points to be considered in the selection of all\nforms of investment. These are: (1) safety of principal and interest;\n(2) rate of income; (3) convertibility into cash; (4) prospect of\nappreciation in intrinsic value; (5) stability of market price.\n\nKeeping these five general factors in mind, the present chapter will\ndiscuss real-estate mortgages as a form of investment, both as adapted\nto the requirements of private funds and of a business surplus.'),
 Document(metadata={'source': './Data/How_to_invest_money.txt'}, page_content='II. RAILROAD MORTGAGE BONDS              23\n\n    III. RAILROAD EQUIPMENT BONDS            40\n\n    IV. REAL-ESTATE MORTGAGES                51\n\n    V. INDUSTRIAL BONDS                      63\n\n    VI. PUBLIC-UTILITY BONDS                 76\n\n    VII. MUNICIPAL BONDS                     91\n\n    VIII. STOCKS                            100\n\n    I

In [13]:
# 자식 문서 검색
query = "What are the types of investments?"
sub_docs = vectorstore.similarity_search(query)
print(f"Child Document: {sub_docs[0].page_content}")

Child Document: forms of investment. These are: (1) safety of principal and interest;
(2) rate of income; (3) convertibility into cash; (4) prospect of
appreciation in intrinsic value; (5) stability of market price.
