In [1]:
import os
import pickle
import openai
# Take the OpenAI key from the environment instead of hard-coding it in the
# notebook; the lookup raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

# from langchain.document_loaders import ReadTheDocsLoader
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain.retrievers import BM25Retriever, TFIDFRetriever, EnsembleRetriever
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.chains import RetrievalQA, LLMChain
from langchain.llms import OpenAI

from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
from langchain.memory import ConversationBufferMemory

In [2]:
# Build the chunked corpus: load every PDF found in the papers directory and
# split the pages into overlapping token windows.
#
# The directory can be overridden with the GNN_PAPER_DIR environment variable so
# the notebook is not tied to a single machine; the default is the original
# location. (To index one paper only, PyPDFLoader can be pointed at a single PDF.)
gnn_path = os.environ.get(
    "GNN_PAPER_DIR",
    "/Users/jeonjunhwi/문서/Projects/GNN_Covid/refference/GNN논문/",
)
loader = PyPDFDirectoryLoader(path=gnn_path)
documents = loader.load()

# chunk_size is also embedded in the names of the index files written and read
# by the other cells, so it has to match the value used there.
chunk_size = 128

sentence_splitter = SentenceTransformersTokenTextSplitter(
    chunk_overlap=30,  # tokens shared between neighbouring chunks
    model_name='sentence-transformers/all-mpnet-base-v2',
    tokens_per_chunk=chunk_size,
)

docs = sentence_splitter.split_documents(documents=documents)

Invalid parent xref., rebuild xref
Multiple definitions in dictionary at byte 0x587b8 for key /MediaBox
Multiple definitions in dictionary at byte 0x58cb2 for key /MediaBox
Multiple definitions in dictionary at byte 0x58e64 for key /MediaBox
Multiple definitions in dictionary at byte 0x59116 for key /MediaBox
Multiple definitions in dictionary at byte 0x593cd for key /MediaBox
Multiple definitions in dictionary at byte 0x5979f for key /MediaBox
Multiple definitions in dictionary at byte 0x59ad2 for key /MediaBox
Multiple definitions in dictionary at byte 0x59f54 for key /MediaBox
Multiple definitions in dictionary at byte 0x5a137 for key /MediaBox
Multiple definitions in dictionary at byte 0x5a2ea for key /MediaBox
Multiple definitions in dictionary at byte 0x5a475 for key /MediaBox
Multiple definitions in dictionary at byte 0x5a618 for key /MediaBox
Multiple definitions in dictionary at byte 0x5a803 for key /MediaBox
Multiple definitions in dictionary at byte 0x5ab12 for key /MediaBox

In [3]:
# Dense index: embed every chunk with OpenAI embeddings and persist the
# resulting FAISS store; the chunk size is part of the folder name.
embeddings = OpenAIEmbeddings()
faiss_index_dir = f"./Retrieval_Database/Vector_Database/paper_faiss_openai_{chunk_size}_sentence_splitter"

vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)
vectorstore.save_local(faiss_index_dir)

In [6]:
# Keyword (sparse) search: build BM25 and TF-IDF retrievers over the same chunks
# and persist both under the keyword database directory.
keyword_dir = "./Retrieval_Database/Keyword_Database"

# open(..., 'wb') does not create missing parent directories, so make sure the
# target directory exists; otherwise a fresh run fails with FileNotFoundError
# before the BM25 pickle is written.
os.makedirs(keyword_dir, exist_ok=True)

# BM25 is persisted by pickling the retriever object.
keyword_store = BM25Retriever.from_documents(docs)
with open(f"{keyword_dir}/paper_BM25_{chunk_size}_sentence_splitter.pkl", 'wb') as f:
    pickle.dump(keyword_store, f)

# keyword_store is rebound here: after this cell it refers to the TF-IDF
# retriever. The "TDIDF" spelling in the folder name is kept on purpose because
# the loading cell reads from the same path.
keyword_store = TFIDFRetriever.from_documents(docs)
keyword_store.save_local(f"{keyword_dir}/paper_TDIDF_{chunk_size}_sentence_splitter")

In [16]:
# Reload the persisted indexes. chunk_size and embeddings are declared again
# here, so this cell does not need the index-building cells to have run in the
# current kernel session.
chunk_size = 128
embeddings = OpenAIEmbeddings()

faiss_index_dir = f"./Retrieval_Database/Vector_Database/paper_faiss_openai_{chunk_size}_sentence_splitter"
tfidf_index_dir = f"./Retrieval_Database/Keyword_Database/paper_TDIDF_{chunk_size}_sentence_splitter"
bm25_pickle_path = f"./Retrieval_Database/Keyword_Database/paper_BM25_{chunk_size}_sentence_splitter.pkl"

paper_vectorstore = FAISS.load_local(faiss_index_dir, embeddings)
paper_tdidf = TFIDFRetriever.load_local(tfidf_index_dir)

# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load a pickle that was produced by this notebook.
with open(bm25_pickle_path, 'rb') as bm25_file:
    paper_bm25 = pickle.load(bm25_file)

# Despite its name, paper_info_retriever is a Chroma vector store; the cell that
# builds the tools calls .as_retriever() on it.
paper_info_retriever = Chroma(
    persist_directory='./Retrieval_Database/Vector_Database/paper_info_chroma_openai_128_sentence_splitter',
    embedding_function=embeddings,
)


In [17]:
# Hybrid retrieval: one dense retriever (FAISS) combined with two sparse ones
# (TF-IDF and BM25). The weights are listed in the same order as the retrievers.
dense_retriever = paper_vectorstore.as_retriever()
paper_ensemble_retriever = EnsembleRetriever(
    retrievers=[dense_retriever, paper_tdidf, paper_bm25],
    weights=[0.5, 0.25, 0.25],
)

# Expose both retrievers to the agent as named tools; the agent sees the name
# and the description of each tool.
paper_retrieval_tool = create_retriever_tool(
    retriever=paper_ensemble_retriever,
    name="Search_GNN_Papers",
    description="Searches and returns papers regarding the GNN.",
)
paper_info_retrieval_tool = create_retriever_tool(
    retriever=paper_info_retriever.as_retriever(),
    name="Search_GNN_Papers_info",
    description="Searches and returns papers regarding the information",
)

tools = [paper_retrieval_tool, paper_info_retrieval_tool]

In [18]:
# Prompt for the zero-shot agent. create_prompt places the tool descriptions
# between the prefix and the suffix.
prefix = """Answer the following questions as best you can. You have access to the following tools:"""

# {chat_history} is filled from the ConversationBufferMemory (memory_key
# "chat_history") that is attached to the AgentExecutor. Without this slot the
# memory is recorded but never shown to the model, so follow-up questions would
# be answered without any conversational context.
suffix = """

{chat_history}
Question: {input}
{agent_scratchpad}"""

prompt = ZeroShotAgent.create_prompt(
    tools,
    prefix=prefix,
    suffix=suffix,
    input_variables=["input", "chat_history", "agent_scratchpad"],
)

In [19]:
# Bind the agent prompt to an OpenAI completion model at temperature 0 and wrap
# the resulting chain as a ZeroShotAgent over the retriever tools.
llm_chain = LLMChain(
    prompt=prompt,
    llm=OpenAI(temperature=0),
)
agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools)

# Conversation buffer handed to the executor; it exposes the history under the
# "chat_history" key.
memory = ConversationBufferMemory(memory_key="chat_history")

In [21]:
# Assemble the executor from the agent, its tools and the conversation memory;
# verbose=True prints the intermediate Thought/Action/Observation steps.
agent_executor = AgentExecutor.from_agent_and_tools(
    agent=agent,
    tools=tools,
    memory=memory,
    verbose=True,
)

# The returned answer is the cell's displayed output.
agent_executor.run(input="what is DropEdge?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: DropEdge is a graph neural network (GNN) technique
Action: Search_GNN_Papers
Action Input: DropEdge[0m
Observation: [36;1m[1;3m[Document(page_content='in network depth. this paper proposes dropedge, a novel and ﬂexible technique to alleviate both issues. at its core, dropedge randomly removes a certain number of edges from the input graph at each training epoch, acting like a data augmenter and also a message passing reducer. furthermore, we theoretically demonstrate that dropedge either reduces the convergence speed of over - smoothing or relieves the information loss caused by it. more importantly, our dropedge is a general skill that can be equipped with many other backbone models ( e. g. gcn, resgcn, graphsage, and jknet ) for enhanced', metadata={'source': '/Users/jeonjunhwi/문서/Projects/GNN_Covid/refference/GNN논문/DropEdge.pdf', 'page': 0}), Document(page_content='published as a conference paper at iclr 202

'DropEdge is a graph neural network technique that randomly removes a certain number of edges from the input graph at each training epoch, acting like a data augmenter and also a message passing reducer. It is used to prevent over-fitting and reduce over-smoothing, and can be applied to many popular GNN backbones such as GCN, ResGCN, GraphSAGE, and JKNet.'

In [35]:
# Other queries tried during exploration:
#   "Give me the gist of Graph Attention Network(GAT) in 3 sentences. Tell me you don't know if you don't know."
#   "Give me the gist of DropEdge and github adress. Tell me you don't know if you don't know."
#   "Give me the gist of Body Composition. Based on a given context, tell me you don't know if you don't know."
#
# NOTE(review): no earlier cell in this notebook defines `qa`, and the
# "query" / "result" keys used with it look like a RetrievalQA-style chain
# rather than the ConversationalRetrievalChain built further down — confirm
# which chain this cell expects before relying on Restart & Run All.
query = "Sahai's paper?"
qa_inputs = {"query": query}
res = qa(qa_inputs)

In [36]:
# Echo the query stored in the chain result (displayed as the cell output).
res["query"]

"Sahai's paper?"

In [37]:
# Show the generated answer, then the text and metadata of the second retrieved
# source document, each separated by a blank line. Index 1 raises IndexError
# when fewer than two documents were returned.
print(res['result'], end="\n\n")
second_source = res['source_documents'][1]
print(second_source.page_content, second_source.metadata, sep="\n\n")

 Sahai did not write the paper.

n. b., a. b., c. c., j. f., s. h., m. k., j. k., j. r., v. s., e. s., and h. j. s. wrote the paper. the authors declare no competing interest. this article is a pnas direct submission. published under the pnas license. 1j. l. and m. m. contributed equally to this work. 2to whom correspondence may be addressed. email : josef. ludescher @ pik - potsdam. de ormaria. martin @

{'source': '/Users/jeonjunhwi/문서/Projects/GNN_Covid/refference/GNN논문/Network-based forecasting for climate phenomena.pdf', 'page': 0}


In [None]:
import os
import sys
from dotenv import load_dotenv
# Load environment variables from the .env file
load_dotenv()

# Append './' to the module search path (presumably so local modules can be
# imported; none are imported in the cells shown here — confirm it is needed).
sys.path.append('./')
from langchain.memory import ConversationBufferMemory
from typing import Any

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, ConversationalRetrievalChain, HypotheticalDocumentEmbedder, LLMChain
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
# Conversational retrieval QA over the FAISS index: retrieve chunks whose
# relevance score clears a threshold and answer with a chat model while the
# dialogue is kept in memory.
query = "What is DropEdge"
k = 3            # maximum number of chunks to retrieve
threshold = 0.5  # minimum relevance score a chunk must reach

# One embeddings object is enough; it is only needed to load the index.
embeddings = OpenAIEmbeddings()
# Other indexes tried earlier: "faiss_index_256", "faiss_openai_512_sentence_splitter".
docsearch = FAISS.load_local("faiss_openai_128_sentence_splitter", embeddings)

# Prompt used for the question-condensing step. An instruction was added to say
# so when no answer can be found and to fall back on the model's own knowledge.
custom_template = """
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. If you do not know the answer reply with 'I am sorry'.
If you can't find an answer, say you can't find an answer and rely on your own knowledge.
Chat History:
{chat_history}

Question:
{question}
Answers:"""

CUSTOM_QUESTION_PROMPT = PromptTemplate.from_template(custom_template)

# output_key names the chain output the memory should record; the chain also
# returns source documents, so the answer key is stated explicitly.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    output_key='answer',
    return_messages=True,
)

# To stream tokens to stdout, pass streaming=True and
# callbacks=[StreamingStdOutCallbackHandler()] as well.
llm = ChatOpenAI(verbose=True, temperature=0)

qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=docsearch.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={
            "k": k,
            "score_threshold": threshold,
        },
    ),
    condense_question_prompt=CUSTOM_QUESTION_PROMPT,
    return_source_documents=True,
    chain_type='refine',
    memory=memory,
)

# The result dict is the cell's displayed output.
qa({"question": query})

{'question': 'What is DropEdge',
 'chat_history': [HumanMessage(content='What is DropEdge'),
  AIMessage(content='DropEdge is a technique that can be employed in conjunction with dropnode methods to further enhance their performance. This enhancement can be observed by referring to the increase in performance achieved by DropEdge for GraphSAGE in Table 1.5.2. DropEdge helps address the limitations of deep neural networks for graph data by mitigating overfitting and oversmoothing issues. Although the detailed analysis and evaluations of DropEdge and its variants are mainly focused on the Cora dataset, the technique has shown promising results. It is worth noting that DropEdge is not primarily concerned with pushing state-of-the-art results but rather aims to provide a deeper understanding of its effectiveness.')],
 'answer': 'DropEdge is a technique that can be employed in conjunction with dropnode methods to further enhance their performance. This enhancement can be observed by referri

In [4]:
# Hypothetical Document Embeddings (HyDE): the LLM chain first generates text
# for the query, and that generated text is what gets embedded.
custom_template = """
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question. If you do not know the answer reply with 'I am sorry'.
If you can't find an answer, say you can't find an answer and rely on your own knowledge.
Chat History:
{chat_history}

Question:
{question}
Answers:"""

# embed_query feeds the chain a single input (the query text). The template also
# contains {chat_history}, so it is bound up front as an empty partial variable;
# declaring it as a second input variable makes the chain fail with a missing
# "chat_history" input when embed_query runs.
prompt = PromptTemplate(
    input_variables=["question"],
    partial_variables={"chat_history": ""},
    template=custom_template,
)

llm_chain = LLMChain(llm=llm, prompt=prompt)

# Despite the variable name, the base model here is OpenAIEmbeddings.
bge_embeddings = OpenAIEmbeddings()

# `embeddings` is rebound to the HyDE wrapper from this point on.
embeddings = HypotheticalDocumentEmbedder(
    llm_chain=llm_chain,
    base_embeddings=bge_embeddings,
)

result = embeddings.embed_query(
    "What is DropEdge?"
)