In [1]:
import logging
import os

import requests
from dotenv import load_dotenv

# Load settings from a local .env file (if any) so the os.getenv() lookups
# for ASK_XDD_APIKEY, ASK_XDD_URL and XDD_ARTICLE_ENDPOINT can find them.
load_dotenv()

True

Step 1: Call the Ask-xDD hybrid endpoint to get:

- `paper_id`
- `preprocessor_id`
- `hashed_text` (treated here as the paragraph ID)

Step 2: Gather extra information from xDD:

- publisher's URL

In [5]:
class USGSRetriever:
    """This is a mockup for the USGS specific retriever."""

    def query_ask_xdd(self, query: str) -> dict:
        """Query the AskXDD API and return the response."""

        ASK_XDD_APIKEY = os.getenv("ASK_XDD_APIKEY")
        ASK_XDD_URL = os.getenv("ASK_XDD_URL")
        headers = {"Content-Type": "application/json", "Api-Key": ASK_XDD_APIKEY}
        data = {
            "topic": "criticalmaas",
            "question": query,
            "top_k": 1,
        }

        response = requests.post(ASK_XDD_URL + "/hybrid", headers=headers, json=data)
        response.raise_for_status()
        paragraph = response.json()[0]
        paragraph["url"] = self.get_url(paragraph["paper_id"])
        return paragraph

    def get_url(self, paper_id: str) -> str | None:
        """Get the URL for a paper in the XDD database."""

        XDD_ARTICLE_ENDPOINT = os.getenv("XDD_ARTICLE_ENDPOINT")
        response = requests.get(f"{XDD_ARTICLE_ENDPOINT}?docid={paper_id}")
        response.raise_for_status()

        try:
            data = response.json()["success"]["data"]
            # Return the first publisher link
            for d in data:
                links = d["link"]
                for link in links:
                    if link["type"] == "publisher":
                        return link["url"]
            return links
        except Exception as e:
            logging.error(f"Error getting URL for paper {paper_id}: {e}")

In [6]:
# Instantiate the retriever; it takes no constructor arguments.
retriever = USGSRetriever()

In [7]:
# Example query: the returned paragraph dict is displayed as the cell output.
retriever.query_ask_xdd("Iron ore in the US.")

{'paper_id': '55b7deb7e13823bd29ba840d',
 'preprocessor_id': 'haystack_v0.0.2',
 'doc_type': 'paragraph',
 'topic_list': ['criticalmaas'],
 'text_content': 'The iron industry on the Pacific coast................................. 7 Literature............................................................. 8 Scope of the work..................................................... 9 Acknowledgements................................................... 10 Geography and topography................................................ 10 Location of the district................................................ 10\nWatering places...................................................... 14 Cottonwood springs............................................... 14 Boulder well....................................^................ 15 Placer Canyon reservoir........................................... 15 Eagle tank...................................................... 15 Other watering places............................