In [None]:
!pip install -U langchain-community

In [None]:
!pip install pypdf

In [None]:
!pip install chromadb

In [None]:
!pip install faiss-cpu

In [None]:
import os
import urllib.request

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

In [None]:
import os
from dotenv import load_dotenv

# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()
api_key = os.getenv("API_KEY_OPENAI")

# Every OpenAIEmbeddings() in this notebook is constructed without arguments,
# so the key has to be visible under the variable name the OpenAI integration
# looks up (OPENAI_API_KEY). Previously `api_key` was read but never used.
# setdefault keeps an OPENAI_API_KEY that is already defined untouched.
if api_key:
    os.environ.setdefault("OPENAI_API_KEY", api_key)

In [None]:
# Download the sample PDF from GitHub and store it in the working directory
# under the file name that the loader cells open later.
pdf_url = (
    "https://github.com/chatgpt-kr/openai-api-tutorial/raw/main/ch06/"
    "2023_%EB%B6%81%ED%95%9C%EC%9D%B8%EA%B6%8C%EB%B3%B4%EA%B3%A0%EC%84%9C.pdf"
)
urllib.request.urlretrieve(pdf_url, filename="2023_북한인권보고서.pdf")

# Chroma

In [None]:
# Open the downloaded PDF and let the loader split it; the length of the
# returned list is reported as the chunk count.
pdf_path = "2023_북한인권보고서.pdf"
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()
page_count = len(pages)
print("청크의 수 :", page_count)

In [None]:
# Splitter configured for chunks of at most 1000 characters with no overlap
# between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)

In [None]:
# Run the loaded pages through the splitter configured above and report how
# many documents come out.
splitted_docs = text_splitter.split_documents(pages)
split_count = len(splitted_docs)
print("분할된 청크의 수 :", split_count)

In [None]:
# Collect the raw text of every split document, then summarise the text
# lengths (max / min / mean) as a sanity check on the splitter settings.
chunks = [doc.page_content for doc in splitted_docs]
chunk_lengths = [len(text) for text in chunks]
print('청크의 최대 길이 :', max(chunk_lengths))
print('청크의 최소 길이 :', min(chunk_lengths))
print('청크의 평균 길이 :', sum(chunk_lengths) / len(chunk_lengths))

In [None]:
# Index the split documents in batches of `chunk_size` documents.
#
# The store is created once from the first batch and every later batch is
# appended with add_documents(). Previously each iteration rebound `db` to a
# brand-new Chroma.from_documents(...) result, so whether `db` ended up with
# all documents depended on the backend sharing one collection between
# separately created stores.
chunk_size = 400
embeddings = OpenAIEmbeddings()  # one embedding client reused for all batches
db = None
for i in range(0, len(splitted_docs), chunk_size):
    batch = splitted_docs[i : i + chunk_size]
    if db is None:
        db = Chroma.from_documents(batch, embeddings)
    else:
        db.add_documents(batch)

In [None]:
# Ask the store's underlying collection how many entries it holds.
# NOTE(review): `_collection` is a private attribute and may change between
# library versions.
stored_count = db._collection.count()
print('문서의 수 :', stored_count)

In [None]:
# Run a similarity search for the question against the Chroma store and
# report how many documents came back.
question = "북한의 교육과정"
docs = db.similarity_search(question)
hit_count = len(docs)
print('문서의 수 :', hit_count)

In [None]:
# Print every retrieved document followed by a 100-character dashed rule.
separator = '--' * 50
for hit in docs:
    print(hit)
    print(separator)

In [None]:
# Build a Chroma store from all split documents, pointing it at an on-disk
# directory, then print how many entries its collection holds.
db_to_file = Chroma.from_documents(
    splitted_docs,
    OpenAIEmbeddings(),
    persist_directory='./chroma_test.db',
)
persisted_count = db_to_file._collection.count()
print(persisted_count)

In [None]:
# Open a Chroma store on the persisted directory with the same kind of
# embedding function and print how many entries its collection holds.
db_from_file = Chroma(
    persist_directory='chroma_test.db',
    embedding_function=OpenAIEmbeddings(),
)
reloaded_count = db_from_file._collection.count()
print('문서의 수 :', reloaded_count)

In [None]:
# Query the store reloaded from disk for the 3 best matches together with
# their relevance scores.
question = '북한의 교육 과정'
# Fixed: the method name was misspelled ("similariry_..."), which raised
# AttributeError before any search was performed.
top_docs = db_from_file.similarity_search_with_relevance_scores(question, k=3)

# Each result is printed exactly as returned by the store (per the LangChain
# API this is a (document, score) pair), followed by a dashed rule.
for doc in top_docs:
    print(doc)
    print('--' * 50)

# FAISS

In [None]:
# FAISS section: load the same PDF again so this part of the notebook can be
# run without the Chroma section; the list length is printed as the chunk count.
loader = PyPDFLoader("2023_북한인권보고서.pdf")
pages = loader.load_and_split()
n_pages = len(pages)
print('청크의 수 :', n_pages)

In [None]:
# Same splitter settings as in the Chroma section: chunks of at most 1000
# characters, no overlap.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 0}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

In [None]:
# Split the loaded pages with the splitter above and print the resulting
# number of documents.
splitted_docs = text_splitter.split_documents(pages)
n_split = len(splitted_docs)
print('분할된 청크의 수 :', n_split)

In [None]:
# Extract the text of each split document and print max / min / mean text
# length, computing the lengths only once.
chunks = [item.page_content for item in splitted_docs]
sizes = list(map(len, chunks))
largest, smallest = max(sizes), min(sizes)
print('청크의 최대 길이 :', largest)
print('청크의 최소 길이 :', smallest)
print('청크의 평균 길이 :', sum(sizes) / len(sizes))

In [None]:
# Index the split documents into a single FAISS store, in batches of
# `chunk_size` documents.
#
# Fixed: FAISS.from_documents() builds a new index on every call, so rebinding
# `faiss_db` inside the loop discarded all earlier batches and left only the
# last (at most 400) documents to be counted, saved and searched. The index is
# now created from the first batch and later batches are appended to it.
chunk_size = 400
embeddings = OpenAIEmbeddings()  # one embedding client reused for all batches
faiss_db = None

for i in range(0, len(splitted_docs), chunk_size):
    batch = splitted_docs[i : i + chunk_size]
    if faiss_db is None:
        faiss_db = FAISS.from_documents(batch, embeddings)
    else:
        faiss_db.add_documents(batch)

In [None]:
# Report the `ntotal` value of the store's index as the number of documents.
indexed_total = faiss_db.index.ntotal
print('문서의 수 :', indexed_total)

In [None]:
# Persist the FAISS store to the local 'faiss_index' folder so it can be
# reloaded without re-embedding.
index_dir = 'faiss_index'
faiss_db.save_local(index_dir)

In [None]:
# Reload the store saved in 'faiss_index'.
# NOTE(review): allow_dangerous_deserialization=True explicitly opts in to a
# loading path the library itself labels dangerous. Only load index folders
# that this notebook (or another trusted source) produced.
new_faiss = FAISS.load_local(
    'faiss_index',
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True,
)

In [None]:
# Search the reloaded FAISS store for the question and print each hit,
# followed by a 100-character dashed rule.
question = "북한의 교육과정"
docs = new_faiss.similarity_search(question)

rule = '--' * 50
for match in docs:
    print(match)
    print(rule)