# ChatBot

In [1]:
import os
import openai
from dotenv import load_dotenv
import requests
import textract
import tiktoken
from llama_index import download_loader, GPTSimpleVectorIndex, SimpleDirectoryReader
from pathlib import Path
from llama_index import GPTListIndex, LLMPredictor
from langchain import OpenAI
from llama_index.composability import ComposableGraph
from langchain.agents import Tool
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent
from llama_index.langchain_helpers.agents import LlamaToolkit, create_llama_chat_agent, IndexToolConfig, GraphToolConfig
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform

load_dotenv()

os.environ["http_proxy"] = "http://10.10.1.3:10000"
os.environ["https_proxy"] = "http://10.10.1.3:10000"

# Load your API key from an environment variable or secret management service
openai.api_key = os.getenv("OPENAI_API_KEY")
os.environ['OPENAI_API_KEY'] = openai.api_key

In [2]:
# PDFReader = download_loader("PDFReader")
# loader = PDFReader()

# paper_docs = loader.load_data(file=Path('./data/radiolarian/466.pdf'))
# paper_index = GPTSimpleVectorIndex(paper_docs, chunk_size_limit=512)
# paper_index.save_to_disk(f'./data/indexed/radiolarian_index_466.json')

In [3]:
paper_index = GPTSimpleVectorIndex.load_from_disk(f'./data/indexed/radiolarian_index_466.json')

In [4]:
index_configs = []
tool_config = IndexToolConfig(
    index=paper_index,
    name=f"Vector Index 466",
    description=f"useful for when you want to answer queries about the paper 446",
    index_query_kwargs={"similarity_top_k": 3},
    tool_kwargs={"return_direct": True}
)
index_configs.append(tool_config)

toolkit = LlamaToolkit(
    index_configs=index_configs
)

memory = ConversationBufferMemory(memory_key="chat_history")
llm=ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
agent_chain = create_llama_chat_agent(
    toolkit,
    llm,
    memory=memory,
    verbose=True
)

In [17]:
# input_str = "hi, i am bob"
# print(f'Input: {input_str}')
# response = agent_chain.run(input=input_str)
# print(f'Agent: {response}')

# print("-" * 80)

query_str = 'Extract key pieces of information from this indexed file.\nIf a particular piece of information is not present, output \"Not specified\".\nWhen you extract a key piece of information, include the closest page number.\nUse the following format:\n0. What is the title\n1. What is the number of species or genera found/identified\"\n\nDocument: \"\"\"{document}\"\"\"\n\n0. Who is the title: radiolarain paper 446 (Page 1)\n1.'
print(f'Input: {query_str}')
response = agent_chain.run(input=query_str)
print(f'Agent: {response}')


# while True:
#     text_input = input("User: ")
#     response = agent_chain.run(input=text_input)
#     print(f'Agent: {response}')


'''
Extract key pieces of information from this regulation document.
If a particular piece of information is not present, output \"Not specified\".
When you extract a key piece of information, include the closest page number.
Use the following format:\n0. What is the title\n1. What is the location or address related to boulder, samples and radiolarians\n2. Which sentence is in the form of "XXX (boulder, samples, radiolarians) was/were collected in/from\n\nDocument: \"\"\"{document}\"\"\"\n\n0. Who is the title: paper title (Page 1)\n1.
'''

Input: Extract key pieces of information from this indexed file.
If a particular piece of information is not present, output "Not specified".
When you extract a key piece of information, include the closest page number.
Use the following format:
0. What is the title
1. What is the number of species or genera found/identified"

Document: """{document}"""

0. Who is the title: radiolarain paper 446 (Page 1)
1.


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Do I need to use a tool? No
AI: The title of the document is "Radiolarian Paper 446". In the Buryella tetradica -Bekoma campechensis Interval Zone, a total of fourteen species or genera were found/identified.[0m

[1m> Finished chain.[0m
Agent: The title of the document is "Radiolarian Paper 446". In the Buryella tetradica -Bekoma campechensis Interval Zone, a total of fourteen species or genera were found/identified.


'\nExtract key pieces of information from this regulation document.\nIf a particular piece of information is not present, output "Not specified".\nWhen you extract a key piece of information, include the closest page number.\nUse the following format:\n0. What is the title\n1. What is the location or address related to boulder, samples and radiolarians\n2. Which sentence is in the form of "XXX (boulder, samples, radiolarians) was/were collected in/from\n\nDocument: """{document}"""\n\n0. Who is the title: paper title (Page 1)\n1.\n'