# Pandas Dataframe Agent

This notebook shows how to use agents to interact with a pandas dataframe. It is mostly optimized for question answering.

**NOTE: this agent calls the Python agent under the hood, which executes LLM generated Python code - this can be bad if the LLM generated Python code is harmful. Use cautiously.**

In [1]:
import json
filepath = 'credentials.json'
file = open(filepath, 'r')

# Open the credentials file with json.load
credentials = json.load(file)
file.close()

In [2]:
!pip install langchain
!pip install pandas
!pip install openpyxl
!pip install openai
!pip install tabulate
!pip install asyncio
!pip install pyee
!pip install playwright



In [3]:
from scraper.fda_generated_scraper import *

In [4]:
data = await main()

INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent object is the object
INFO: selector not valid - probably the parent o

In [5]:
import os
import pandas as pd
from langchain.experimental.autonomous_agents.autogpt.agent import AutoGPT
from langchain.chat_models import ChatOpenAI

from langchain.agents.agent_toolkits.pandas.base import create_pandas_dataframe_agent
from langchain.docstore.document import Document
import asyncio
import nest_asyncio
# Needed synce jupyter runs an async eventloop
nest_asyncio.apply()

In [6]:
import os
from contextlib import contextmanager
from typing import Optional
from langchain.agents import tool, Tool
from langchain.tools.file_management.read import ReadFileTool
from langchain.tools.file_management.write import WriteFileTool

ROOT_DIR = "./data/"
if not os.path.exists(ROOT_DIR):
    os.makedirs(ROOT_DIR)

In [7]:
@contextmanager
def pushd(new_dir):
    """Context manager for changing the current working directory."""
    prev_dir = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        os.chdir(prev_dir)

@tool
def process_csv(
    csv_file_path: str, instructions: str, output_path: Optional[str] = None
) -> str:
    """Process a CSV by with pandas in a limited REPL.\
 Only use this after writing data to disk as a csv file.\
 Any figures must be saved to disk to be viewed by the human.\
 Instructions should be written in natural language, not code. Assume the dataframe is already loaded."""
    with pushd(ROOT_DIR):
        try:
            df = pd.read_csv(csv_file_path)
        except Exception as e:
            return f"Error: {e}"
        agent = create_pandas_dataframe_agent(llm, df, max_iterations=30, verbose=True)
        if output_path is not None:
            instructions += f" Save output to disk at {output_path}"
        try:
            result = agent.run(instructions)
            return result
        except Exception as e:
            return f"Error: {e}"

In [8]:
!playwright install

In [9]:
from bs4 import BeautifulSoup

In [10]:
async def async_load_webpage(url: str) -> str:
    """Load the specified URLs using Playwright and parse using BeautifulSoup."""
    browser = await launch(
        headless=True,
        timeout=100000,
        ignoreDefaultArgs=["--enable-automation"],
        args=[],
        defaultViewport=None
    )

    page = await browser.newPage()

    await page.goto(url, waitUntil=["networkidle2"], timeout=15000)
    
    # Get the HTML content of the page
    content = await page.content()

    soup = BeautifulSoup(content, "html.parser")

    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    results = "\n".join(chunk for chunk in chunks if chunk)

    await browser.close()
    return results

In [11]:
url = 'https://www.google.com'
page = await async_load_webpage(url)

In [12]:
from langchain.tools import BaseTool, DuckDuckGoSearchRun
from langchain.text_splitter import RecursiveCharacterTextSplitter

from pydantic import Field
from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain, BaseCombineDocumentsChain

In [13]:
def _get_text_splitter():
    return RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        chunk_size = 500,
        chunk_overlap  = 20,
        length_function = len,
    )


class WebpageQATool(BaseTool):
    name = "query_webpage"
    description = "Browse a webpage and retrieve the information relevant to the question."
    text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=_get_text_splitter)
    qa_chain: BaseCombineDocumentsChain
    
    def _run(self, url: str, question: str) -> str:
        """Useful for browsing websites and scraping the text information."""
        result = asyncio.get_event_loop().run_until_complete(async_load_webpage(url))
        # print(type(result))
        docs = [Document(page_content=result, metadata={"source": url})]
        web_docs = self.text_splitter.split_documents(docs)
        results = []
        # TODO: Handle this with a MapReduceChain
        for i in range(0, len(web_docs), 4):
            input_docs = web_docs[i:i+4]
            window_result = self.qa_chain({"input_documents": input_docs, "question": question}, return_only_outputs=True)
            results.append(f"Response from window {i} - {window_result}")
        results_docs = [Document(page_content="\n".join(results), metadata={"source": url})]
        return self.qa_chain({"input_documents": results_docs, "question": question}, return_only_outputs=True)
    
    async def _arun(self, url: str, question: str) -> str:
        raise NotImplementedError

In [15]:
!pip install faiss-cpu



In [14]:
# Memory
import faiss
from langchain.vectorstores import FAISS
from langchain.docstore import InMemoryDocstore
from langchain.embeddings import OpenAIEmbeddings
from langchain.tools.human.tool import HumanInputRun
from langchain.utilities import SerpAPIWrapper


embeddings_model = OpenAIEmbeddings(openai_api_key=credentials["OPENAI_API_KEY"])
embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})

In [17]:
# !pip install duckduckgo_search
# web_search = DuckDuckGoSearchRun()

search = SerpAPIWrapper(serpapi_api_key=credentials["SERPAPI_API_KEY"],)

In [15]:
data[0].keys()

dict_keys(['drugName', 'link'])

In [16]:
ClinicalData = data
drug_names = [d['drugName'] for d in ClinicalData]
drug_tuples = [(d['drugName'], embeddings_model.embed_query(d['drugName'])) for d in ClinicalData]

In [17]:
index = faiss.IndexFlatL2(embedding_size)
drug_store = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})
drug_store.add_embeddings(drug_tuples)

['9d0b470c-9776-4145-88d5-f6913fc9eae4',
 '04154076-23df-43d0-b069-31a5ee22fef1',
 '453cb1df-117e-456c-9221-f2918f57fe64',
 'f3936892-e33b-4b53-8619-133d1010a94e',
 '50106ec1-4f9c-4f5d-9acd-fe0ffcde0e8f',
 '4e683537-f674-4a1d-b206-c977157a2143',
 '1ae76cdc-7836-48ed-9525-d28c9254ea4b',
 '6b92bb34-1bba-4f26-8d63-f1d94ac64287',
 '3e4544bd-0612-45f8-8468-29655418a441',
 'b08ae60c-80c2-45fb-bda3-9b1546692c25',
 'bfde4944-019a-46e0-8b8c-1e92e4a93e23',
 'f9f5d5d6-4d37-4ef7-ae33-0315cf1143fb',
 'afff2bc2-a8f9-4df5-9acb-8714a4e75441',
 '131b8bb7-a10e-4988-8a95-a23ed18e0c1a',
 'db719640-3c03-47a4-8df2-a52fa587c476',
 'e08fa2f6-ba4e-4daa-9fb9-bac4c33082ac',
 '450b492c-3b3c-4b61-ad73-a5c8b13088ca',
 '31a6ef96-c18f-41a7-befe-6038b9845c27',
 '108aa089-d26e-4045-9844-00a9c581801e',
 'fdbd8193-ea4f-4d1c-9952-060c6d89bb7e']

In [21]:
closest_drug = drug_store.similarity_search("Abcema", k=1)[0].page_content

In [22]:
closest_link = ClinicalData[drug_names.index(closest_drug)]["link"]

In [23]:
closest_link

'https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/4697-abcema-idecabtagene-vicleucel'

In [18]:
class ClinicalLinkTool(BaseTool):
    name = "get_link_for_clinical_trial"
    description = "useful for getting  the link of the clinical trial data for a specific drug"
    drug_name: str = Field(default="", description="the drug name")

    def _run(self, drug_name: str) -> str:
        # query = embeddings_model.embed_query(drug_name)
        closest_drug = drug_store.similarity_search(drug_name, k=1)[0].page_content
        closest_link = ClinicalData[drug_names.index(closest_drug)]["link"]
        return f"The closest drug found was {closest_drug} whose corresponding link is {closest_link}"
    
    async def _arun(self, url: str, question: str) -> str:
        raise NotImplementedError

In [19]:
llm = ChatOpenAI(temperature=1.0, openai_api_key=credentials["OPENAI_API_KEY"])

In [20]:
query_website_tool = WebpageQATool(qa_chain=load_qa_with_sources_chain(llm))

In [21]:
query_website_tool._run('https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/4697-abcema-idecabtagene-vicleucel', "What are the clinical trial results?")

TimeoutError: Navigation Timeout Exceeded: 15000 ms exceeded.

In [33]:
ClinicalLinkTool()._run("Abecema")

'The closest drug found was Abcema (idecabtagene vicleucel) whose corresponding link is https://www.centerwatch.com/directories/1067-fda-approved-drugs/listing/4697-abcema-idecabtagene-vicleucel'

In [26]:
tools = [
    # Tool(
    #     name = "search",
    #     func=search.run,
    #     description="General internet search for high level information "
    # ),
    ClinicalLinkTool(),
    WriteFileTool(root_dir="./data"),
    ReadFileTool(root_dir="./data"),
    process_csv,
    query_website_tool,
    HumanInputRun(), # Activate if you want the permit asking for help from the human
]

In [None]:
!pip install google-search-results



In [27]:
agent = AutoGPT.from_llm_and_tools(
    ai_name="Subbarao",
    ai_role="Assistant",
    tools=tools,
    llm=llm,
    memory=vectorstore.as_retriever(search_kwargs={"k": 8}),
    # human_in_the_loop=True, # Set to True if you want to add feedback at each step.
)
agent.chain.set_verbose(True)

True

In [37]:
agent.run(["What is the clinical trial results of the drug Abcema? Use the link from the clinical trials tool, query the webpage, and write the response to a text file"])

{
    "thoughts": {
        "text": "I need to find information on the clinical trial results of a drug called Abcema. I should use the get_link_for_clinical_trial command to get the link to the clinical trial data for Abcema, and then use the query_webpage command to get the relevant data from the webpage. Finally, I should use the write_file command to write the data to a text file.",
        "reasoning": "This approach will allow me to efficiently locate the relevant data and store it for future reference.",
        "plan": "- Use get_link_for_clinical_trial to get the link to the clinical trial data for Abcema\n- Use query_webpage to get the relevant data from the webpage\n- Use write_file to write the data to a text file",
        "criticism": "",
        "speak": "I will use the get_link_for_clinical_trial, query_webpage, and write_file commands to get the clinical trial results for Abcema."
    },
    "command": {
        "name": "get_link_for_clinical_trial",
        "args": {


'All objectives have been successfully completed.'

In [None]:
agent.run(["What is the latest data about the top pharamaceutical companies with the most recent clinically approved drugs on the market.  Use the link from the clinical trials tool, query the webpage, and write the response to a text file."])

In [40]:
print(agent.llm_chain.prompt.messages[0].prompt.template)

AttributeError: 'AutoGPT' object has no attribute 'llm_chain'

In [None]:
sys_msg = """Assistant is a large language model trained by OpenAI.

Assistant is designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. As a language model, Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

Assistant is constantly learning and improving, and its capabilities are constantly evolving. It is able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. Additionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide explanations and descriptions on a wide range of topics.

Unfortunately, Assistant is terrible at maths. When provided with math questions, no matter how simple, assistant always refers to it's trusty tools and absolutely does NOT try to answer math questions by itself

Overall, Assistant is a powerful system that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics. Whether you need help with a specific question or just want to have a conversation about a particular topic, Assistant is here to assist.
"""

In [None]:
new_prompt = agent.agent.create_prompt(
    system_message=sys_msg,
    tools=tools
)

agent.agent.llm_chain.prompt = new_prompt
agent.tools = tools


In [41]:
import requests, json

In [None]:
url = "https://www.parsehub.com/api/v2/projects/accessdata.fda.gov/run"
headers = {
    "Accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": f"Bearer {credentials['ParseHub_API_KEY']}",
}
data = {
    "start_url": "https://www.example.com",
    "api_key": 'API_KEY',
    "output_format": "json"
}
response = requests.post(url, headers=headers, data=json.dumps(data))
if response.status_code == 200:
    run_token = response.json()["run_token"]
    # retrieve the data from the Parsehub API using the run token
else:
    print("Error:", response.content)

In [None]:
import requests
project_token = 'tkkE7ZdThusq'
api_key = credentials["ParseHub_API_KEY"]
url = f'https://www.parsehub.com/api/v2/projects/{project_token}/run'
headers = {
    'Content-Type': 'application/json',
    'Authorization': f'Bearer {api_key}',
}
data = {
    'start_url': 'https://www.example.com',
    'start_template': 'main_template',
    'start_value_override': '{"query": "python"}',
    'start_url_params': {'lang': 'en'},
    'send_email': False,
    'email': '',
    'webhook': '',
    'output_format': 'csv',
    'store_data': True,
    'csv_filename': 'output.csv',
    'json_filename': '',
    'xml_filename': '',
}
response = requests.post(url, headers=headers, json=data)
if response.status_code == 200:
    run_token = response.json()['run_token']
    print(f'Run started successfully. Run token: {run_token}')
else:
    print(f'Error starting run. Status code: {response.status_code}, message: {response.text}')

In [74]:
params = {
  "api_key": api_key,
  "format": "json"
}
run_token = "tH0n_Abvu-R8"
r = requests.get(f'https://www.parsehub.com/api/v2/runs/{run_token}/data', params=params)
fda_data = json.loads(r.text)

In [54]:
import os
import yaml

from langchain.agents import (
    create_json_agent,
    AgentExecutor
)
from langchain.agents.agent_toolkits import JsonToolkit
from langchain.chains import LLMChain
from langchain.llms.openai import OpenAI
from langchain.requests import TextRequestsWrapper
from langchain.tools.json.tool import JsonSpec

In [68]:
json_spec = JsonSpec(dict_=fda_data, max_value_length=4000)
json_toolkit = JsonToolkit(spec=json_spec)

json_agent_executor = create_json_agent(
    llm=llm,
    toolkit=json_toolkit,
    verbose=True
)

In [None]:
json_agent_executor.run("What is the earliest data of the first drug to be approved and from what company is it from?")

In [160]:
params = {
    "api_key": credentials['ParseHub_API_KEY'],
    "format": "csv"
}
run_token = "txQULvz0mXRa"
r = requests.get(f'https://www.parsehub.com/api/v2/runs/{run_token}/data', params=params)
fda_data = r.text

In [136]:
from langchain.tools import BaseTool, DuckDuckGoSearchRun
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import Field
from langchain.chains.combine_documents.base import BaseCombineDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

# from langchain.chains.qa_with_sources.loading import load_qa_with_sources_chain, BaseCombineDocumentsChain
def _get_text_splitter():
    return RecursiveCharacterTextSplitter(
        # Set a really small chunk size, just to show.
        chunk_size = 500,
        chunk_overlap  = 20,
        length_function = len,
    )

In [148]:
from langchain.chains.llm import LLMChain
from langchain import PromptTemplate, OpenAI, LLMChain


template = """{context} What do you know about the documents?"""
prompt = PromptTemplate(template=template, input_variables=["context"])
# Prepare LLMChain instance
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Create StuffDocumentsChain instance
stuff_documents_chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_variable_name="context",
    document_separator="\n\n"
)

# Prepare documents
doc1 = Document(page_content="This is document 1.")
doc2 = Document(page_content="This is document 2.")

# Combine documents
combined_documents = stuff_documents_chain.chains([doc1, doc2])

# Generate result using the combined_documents as context
result = llm_chain.generate(
    prompt="What do you know about the documents?",
    context=combined_documents.page_content
)

print(result)

AttributeError: 'StuffDocumentsChain' object has no attribute 'chains'

In [None]:
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate

from langchain.docstore.document import Document

docs = [Document(page_content=t) for t in texts[:3]]
chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce", return_intermediate_steps=True)
chain.run(docs)
prompt_template = """Write a concise summary of the following:


{text}


CONCISE SUMMARY IN ITALIAN:"""
PROMPT = PromptTemplate(template=prompt_template, input_variables=["text"])
chain = load_summarize_chain(OpenAI(temperature=0), chain_type="map_reduce", return_intermediate_steps=True, map_prompt=PROMPT, combine_prompt=PROMPT)
chain({"input_documents": docs}, return_only_outputs=True)

In [150]:
doc1 = Document(page_content="This is document 1.")
doc2 = Document(page_content="This is document 2.")
docs = [doc1, doc2]

In [167]:
fda_data = (r.text)
docs = [Document(page_content=fda_data)]
# web_docs = RecursiveCharacterTextSplitter().split_text(fda_data)
web_docs = RecursiveCharacterTextSplitter().split_documents(docs)
len(web_docs)

222

In [173]:
from langchain.chains.summarize import load_summarize_chain
import textwrap
question = "How many drugs are there in this dataset?"
prompt_template = """Using the below dataset
{text}

Answer this """ + question + ":"

fda_data = (r.text)
docs = [Document(page_content=fda_data)]
# web_docs = RecursiveCharacterTextSplitter().split_text(fda_data)
web_docs = RecursiveCharacterTextSplitter().split_documents(docs)


BULLET_POINT_PROMPT = PromptTemplate(template=prompt_template,
                        input_variables=["text"])

chain = load_summarize_chain(llm, 
                             chain_type="map_reduce")
                            #  prompt = BULLET_POINT_PROMPT)

output_summary = chain.run(web_docs)
wrapped_text = textwrap.fill(output_summary, 
                             width=100,
                             break_long_words=False,
                             replace_whitespace=False)
print(wrapped_text)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 4773f1cff2f475b82c1652a8045f09a6 in your message.).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can retry your request, or contact us through our help center at help.openai.com if the error persists. (Please include the request ID 32ff9f8b4e39741969cc4bc8f870f1bf in your message.).
Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 1.0 seconds as it raised RateLimitError: That model is currently overloaded with other requests. You can r

KeyboardInterrupt: 

In [None]:
class FDADatabaseTool(BaseTool):
    name = "table_for_searching_through_fda_database"
    description = "useful for getting the data about FDA approved drugs"
    question: str = Field(default="", description="the question you want to ask of the database")
    text_splitter: RecursiveCharacterTextSplitter = Field(default_factory=_get_text_splitter)

    def _run(self, question: str) -> str:
        """Useful for browsing websites and scraping the text information."""
        prompt_template = """Using the below dataset
        {text}

        Answer this """ + question + ":"

        BULLET_POINT_PROMPT = PromptTemplate(template=prompt_template,
                                input_variables=["text"])

        chain = load_summarize_chain(llm, 
                                    chain_type="stuff",
                                    prompt = BULLET_POINT_PROMPT)
        web_docs = self.text_splitter.split_documents(docs)

        output_summary = chain.run(web_docs)
        wrapped_text = textwrap.fill(output_summary, 
                                    width=100,
                                    break_long_words=False,
                                    replace_whitespace=False)
        
        return wrapped_text
    
    async def _arun(self, question: str) -> str:
        raise NotImplementedError

In [158]:
FDADatabaseTool()._run('What drug is the most recently approved?')

'There is no relevant information in the given dataset to answer this question.'

In [174]:
tools = [
    Tool(
        name = "search",
        func=search.run,
        description="General internet search for high level information "
    ),
    ClinicalLinkTool(),
    # FDADatabaseTool(),
    WriteFileTool(root_dir="./data"),
    ReadFileTool(root_dir="./data"),
    process_csv,
    query_website_tool,
    HumanInputRun(), # Activate if you want the permit asking for help from the human
]

In [175]:
agent = AutoGPT.from_llm_and_tools(
    ai_name="Subbarao",
    ai_role="Assistant",
    tools=tools,
    llm=llm,
    memory=vectorstore.as_retriever(search_kwargs={"k": 8}),
    # human_in_the_loop=True, # Set to True if you want to add feedback at each step.
)
agent.chain.set_verbose(True)

True