# OpenAI Key Setting

In [None]:
import getpass

# Read both credentials interactively so they are never echoed or stored in the notebook.
api_key = getpass.getpass(prompt="Please enter your input API KEY :")
organization = getpass.getpass(
    prompt="Please enter your input ORGANIZATION ID :"
)

In [None]:
# Directory that holds the input data files.
path = "../data"

# Import Package and Data Setting

- LangChain 관련 : langchain, langchain-openai, langchain-community
- Document Loading 관련: unstructured, pypdf, pymupdf, arxiv
- Document Embedding 관련: sentence-transformers
- Vector Store 관련: chromadb, faiss-cpu 등
- poppler-utils: PDF 파일을 조작하고 다른 형식으로 변환하기 위한 사전 컴파일된 명령줄 유틸리티

In [None]:
!pip install langchain
!pip install langchain-openai
!pip install langchain-community
!pip install torch torchvision
!pip install unstructured==0.5.6 unstructured-inferencenstall 
!pip install -q pypdf
!pip install pymupdf
!pip install -qU arxiv
!pip install -U sentence-transformers
!pip install chromadb
!pip install faiss-cpu
!pip install unstructured_pytesseract
!pip install -q pypdf pymupdf

!apt-get install -y poppler-utils

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain.schema.runnable import RunnablePassthrough
from langchain_community.document_loaders import PyPDFDirectoryLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.retrievers.document_compressors import LLMChainFilter

# Model Setting

In [None]:
# Chat model shared by the retrievers and the final answer chain; it uses the
# credentials collected at the top of the notebook.
llm = ChatOpenAI(
    organization=organization,
    api_key=api_key,
    model="gpt-3.5-turbo-0125",
)

# PyPDFDirectoryLoader

In [None]:
# Build a loader over the data directory and load its PDF contents as documents.
loader = PyPDFDirectoryLoader(path)
pdfs = loader.load()

In [None]:
# Collect the distinct source paths of the loaded documents, in first-seen order.
# dict.fromkeys de-duplicates in a single pass while preserving insertion order,
# instead of rescanning a growing list for every document.
docs_list = list(dict.fromkeys(doc.metadata["source"] for doc in pdfs))

print(docs_list)

# TextSplitter & Tokenizer

In [None]:
# Recursive splitter whose length function is built from a tiktoken encoding.
# NOTE(review): "p50k_base" is not the encoding tiktoken associates with
# gpt-3.5-turbo (that is "cl100k_base"), so the measured chunk sizes may differ
# from what the model counts -- confirm whether this choice is intentional.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="p50k_base",
    chunk_size=2000,
    chunk_overlap=1000,
)

# Split the loaded PDF documents into overlapping chunks for embedding.
documents = text_splitter.split_documents(pdfs)

# Vector DB & Retriever

In [None]:
# Embed the chunks with OpenAI embeddings and index them in a FAISS vector store.
faissDB = FAISS.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(
        api_key=api_key,
        organization=organization,
    ),
    distance_strategy=DistanceStrategy.COSINE,
)

In [None]:
# Multi-query retriever built on top of the FAISS store, using the chat model to
# generate the query variants.
mq_retriever = MultiQueryRetriever.from_llm(
    retriever=faissDB.as_retriever(
        search_type="mmr",
        # The option is `search_kwargs` (plural). The original `search_kwarg`
        # is not a retriever option, so the k / fetch_k settings were not applied.
        search_kwargs={"k": 10, "fetch_k": 20},
    ),
    llm=llm,
)

In [None]:
# Wrap the multi-query retriever so that its results pass through an LLM-based
# filter before they are returned.
relevance_filter = LLMChainFilter.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=mq_retriever,
    base_compressor=relevance_filter,
)

# LangChain Agent

In [None]:
# Prompt text with two placeholders: {context} (retrieved passages) and
# {question} (the user's query).
template = (
    'Answer the question based only on the following context:\n'
    '{context}\n'
    '\n'
    'Question: {question}\n'
)

# Turn the raw template string into a chat prompt object for the chain.
prompt = ChatPromptTemplate.from_template(
    template
)

In [None]:
def _join_page_contents(docs):
    """Concatenate the page_content of each document, separated by blank lines."""
    return '\n\n'.join(d.page_content for d in docs)


# Pipeline: retrieve and format the context alongside the untouched question,
# fill the prompt, call the chat model, and reduce the reply to a plain string.
chain = (
    {
        'context': compression_retriever | _join_page_contents,
        'question': RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Question to ask the chain; replace with your own query.
query = "AI trend 요약해줘"

In [None]:
import warnings

# Suppress the UserWarning whose message starts with the predict_and_parse
# deprecation text so that it does not clutter the cell output.
warnings.filterwarnings(
    action="ignore",
    message="The predict_and_parse method is deprecated",
    category=UserWarning,
)

# Run the chain on the query and display the answer.
response = chain.invoke(query)
print(response)