In [170]:
# !pip install feedparser
# !pip install arxiv
# !pip install langchain
# !pip install langgraph
# !pip install langchain_openai
# !pip install langchain_core
# !pip install langchain_community

In [2]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import create_react_agent
from langchain_core.tools import tool
from langchain.agents import load_tools

import os
import urllib, urllib.request
import time
import feedparser
from datetime import datetime

import re
import json

In [3]:
# Set API key to call on LLM.
# SECURITY: never paste the key into the notebook source. It is read from the
# OPENAI_API_KEY environment variable; if that is unset, prompt for it without
# echoing so the secret does not end up in the file or in cell output.
# Any key that was ever committed in this notebook must be revoked and rotated.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [4]:
# Module-level list holding a single empty dict. The functions visible in this
# notebook either take `obj` as a parameter or build their own local `obj`, so
# this looks like a scratch placeholder — TODO confirm it is still needed.
obj = [{}]

In [45]:
# @tool
def check_history(obj, filename='test.json'):
    """Write DAS paper information to output file if not already in history.

    The output file is treated as JSON Lines: one JSON object per line. Every
    entry is compared by its canonical form (``json.dumps(..., sort_keys=True)``),
    so two dicts with the same keys/values in a different order count as equal.

    Args:
        obj: Iterable of JSON-serialisable entries to record.
        filename: Path of the JSON Lines history file. It is created on the
            first write; nothing is created when there is nothing new to add.

    Returns:
        None. The only effect is appending the unseen entries to ``filename``.
    """
    # Canonical JSON strings already recorded (set for O(1) membership tests)
    history = set()

    # Read through pre-existing json file
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        with open(filename, 'r') as file:
            for line in file:
                try:
                    # Re-serialise each line so key order cannot defeat the comparison
                    history.add(json.dumps(json.loads(line.strip()), sort_keys=True))
                except json.JSONDecodeError:
                    continue  # Skip blank or invalid lines

    # Extract entries that have not been added already. Each accepted entry is
    # added to `history` immediately so a duplicate later in the same batch is
    # dropped as well (previously such duplicates were all written).
    new_lines = []
    for entry in obj:
        serialised = json.dumps(entry, sort_keys=True)
        if serialised not in history:
            history.add(serialised)
            new_lines.append(serialised)

    # If there are new entries to add, open the corresponding file and append them
    if new_lines:
        with open(filename, 'a') as file:
            for serialised in new_lines:
                file.write(serialised + '\n')

In [46]:
@tool
def write(obj, filename='test.json'):
    """Write DAS paper information to output file for records.

    Entries are appended to ``<project dir>/out/<filename>`` as JSON Lines;
    entries already present in that file are skipped (see ``check_history``).

    Args:
        obj: List of JSON-serialisable paper records, e.g. the list returned
            by the ``search`` tool.
        filename: Name of the output file inside the ``out`` folder.
    """
    # Project folder where found papers' information is stored. The default is
    # the original location; set DAS_PROJ_DIR to run on another machine.
    path = os.environ.get('DAS_PROJ_DIR', '/Users/salblanco/Desktop/das/das_proj/')
    out_dir = os.path.join(path, 'out')

    # If corresponding output folder does not exist, create it
    os.makedirs(out_dir, exist_ok=True)

    # Pass an explicit path instead of os.chdir()-ing into the folder, so the
    # working directory of the whole kernel is not changed as a side effect.
    # check_history opens and closes the file itself, so no handle is left
    # open here and `filename` is honoured.
    check_history(obj, os.path.join(out_dir, filename))

In [47]:
@tool
def paginate(papers_per_call, index, success=True):
    """Increment counter for pagination."""
    # NOTE: the docstring above is what the @tool decorator exposes as the tool
    # description, so it is kept verbatim; details live in these comments.
    #
    # papers_per_call: amount added to the index after a successful call
    # index:           current starting index
    # success:         when falsy, the index is returned unchanged
    if success:
        new_index = index + papers_per_call
    else:
        new_index = index

    # Log the transition so the notebook output shows how the index moved
    print(f"Paginating: index {index} → {new_index} (success={success})")
    return new_index

In [48]:
@tool
# adapted from https://info.arxiv.org/help/api/examples/python_arXiv_paging_example.txt
def search(results_per_iteration, start):
    """Search for DAS papers

    Fetches one page of arXiv results for Distributed Acoustic Sensing papers
    submitted between 2010-01-01 and the current time.

    Args:
        results_per_iteration: Number of papers to request for this page.
        start: Index of the first result to return (arXiv result offset).

    Returns:
        A list with one dict per paper: 'url' is the arXiv entry id and
        'content' is a text block with the paper id, title, author and summary.
        The list is empty when the API returns nothing.
    """
    obj = []

    # Upper bound of the date filter: now, formatted as YYYYMMDDHHMM
    today = datetime.now().strftime("%Y%m%d%H%M")

    # Base API query url
    base_url = 'http://export.arxiv.org/api/query?'

    # API search parameters for DAS papers from 1/01/2010 to today.
    # The date range has to be joined with AND and wrapped in [ ... TO ... ];
    # without that arXiv does not apply it as a submittedDate filter.
    search_query = (
        'all:DAS+AND+all:distributed+AND+all:acoustic+AND+all:sensing'
        f'+AND+submittedDate:[201001010000+TO+{today}]'
    )

    # Number of seconds to wait between calls; recommended by arXiv documentation
    wait_time = 3

    print(f'Searching arXiv for {search_query}')
    print(f"Results {start} - {start+results_per_iteration}")

    # max_results is the page size, not an end index: sending
    # start+results_per_iteration made every page after the first over-fetch
    # and overlap with the following page.
    query = f'search_query={search_query}&start={start}&max_results={results_per_iteration}'

    # GET request using the base_url and query; the context manager closes the connection
    with urllib.request.urlopen(base_url+query) as http_response:
        response = http_response.read()

    if not response:
        print("No more results. Please try again.")
    else:
        # Parse the response using feedparser
        feed = feedparser.parse(response)

        # Run through each entry, and print out information
        for entry in feed.entries:
            paper_id = entry.id.split('/abs/')[-1]
            title = entry.title
            # Entries without an author/summary field fall back to an empty
            # string instead of raising AttributeError.
            author = getattr(entry, 'author', "")
            summary = getattr(entry, 'summary', "")

            # Format as a search result
            content = f"Paper ID: {paper_id}\nTitle: {title}\nAuthor: {author}\nSummary: {summary}"

            # Add to results
            obj.append({
                'url': entry.id,  # Use arXiv URL
                'content': content
            })

            print("arxiv-id: ", paper_id)
            print("Title: ", title)
            # feedparser v4.1 only grabs the first author
            print("First Author: ", author)

    # Sleep before the API can be called again
    print(f"Sleeping for {wait_time} seconds")
    time.sleep(wait_time)

    return obj

In [49]:
# TODO: add a tool so the agent can follow the links it found and extract data availability statements and/or links

In [50]:
# Starting index for search (interpolated into the agent prompt as the index to start from)
index = 0
# Number of papers requested per search call (interpolated into the agent prompt)
papers_per_call = 5

In [52]:
# Reset the search start index to 0 (repeats the assignment made in the previous cell)
index = 0

In [None]:
# Build the ReAct agent from the four tools, an in-memory checkpointer and the chat model
tools = [search, paginate, write, check_history]
memory = MemorySaver()
model = ChatOpenAI(model_name="gpt-4o")
agent_executor = create_react_agent(model, tools, checkpointer=memory)

# Thread id passed to the agent together with the checkpointer
config = {"configurable": {"thread_id": "t15"}}

# Initial prompt; `papers_per_call` and `index` are filled in from the notebook state
prompt = f"""
    You are a research assistant helping me find Distributed Acoustic Sensing (DAS) papers on arXiv. 

    **Task Instructions:**  
    1. Use the `search` tool to retrieve the next {papers_per_call} papers, starting from index {index}.
    2. Return the following details for each paper:  
        - title  
        - id  
        - author
        - link
    3. If you are able to correctly search, use the return 'obj' from the 'search' tool as the input for the 'write' tool. 
    4. If you are able to correctly search and write to the corresponding file, update the next index using the `paginate` tool. 
    5. Return the next starting index for future searches with the format: 'The next starting index for future searches is [index].'

    **Notes:**  
    - Always start from the most recent index and paginate correctly to avoid duplicates.  
    - If no new papers are found, return an empty list along with the current index.  
    """

# Sentence the prompt asks the agent to finish with; the captured digits are the next index
next_index_pattern = re.compile(r"The next starting index for future searches is (\d+)")

# Track the last index: stream the run, print each newest message and pick up
# the next starting index whenever a message reports one.
for step in agent_executor.stream(
    {"messages": [HumanMessage(content=prompt)]},
    config,
    stream_mode="values",
):
    last_message = step["messages"][-1]
    last_message.pretty_print()
    match = next_index_pattern.search(str(last_message))
    if match:
        index = int(match.group(1))  # Extract and convert to integer




    You are a research assistant helping me find Distributed Acoustic Sensing (DAS) papers on arXiv. 

    **Task Instructions:**  
    1. Use the `search` tool to retrieve the next 5 papers, starting from index 5.
    2. Return the following details for each paper:  
        - title  
        - id  
        - author
        - link
    3. If you are able to correctly search, use the return 'obj' from the 'search' tool as the input for the 'write' tool. 
    4. If you are able to correctly search, update the next index using the `paginate` tool. 
    5. Return the next starting index for future searches with the format: 'The next starting index for future searches is [index].'

    **Notes:**  
    - Always start from the most recent index and paginate correctly to avoid duplicates.  
    - If no new papers are found, return an empty list along with the current index.  
    
Tool Calls:
  search (call_0bj60C8csqO9noQH6e90392j)
 Call ID: call_0bj60C8csqO9noQH6e90392j
  Args:
    result