# JSONQueryEngine RAG

In [1]:
!pip install llama-index llama-index-embeddings-huggingface llama_index-llms-ollama
!pip install llama-index-readers-json
!pip install jsonpath-ng





Collecting jsonpath-ng
  Downloading jsonpath_ng-1.6.1-py3-none-any.whl.metadata (18 kB)
Downloading jsonpath_ng-1.6.1-py3-none-any.whl (29 kB)
Installing collected packages: jsonpath-ng
Successfully installed jsonpath-ng-1.6.1


In [21]:
# NOTE: This is ONLY necessary in jupyter notebook.
# Details: Jupyter runs an event-loop behind the scenes.
#          This results in nested event-loops when we start an event-loop to make async queries.
#          This is normally not allowed, we use nest_asyncio to allow it for convenience.
import nest_asyncio

nest_asyncio.apply()

In [3]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)
from llama_index.core import Settings
from llama_index.core.agent import ReActAgent
from llama_index.core.indices.struct_store import JSONQueryEngine
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from IPython.display import Markdown, display

In [15]:
# Set up OpenAI
import os
import getpass
import openai

os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
openai.api_key = os.environ["OPENAI_API_KEY"]

OpenAI API Key:········


# RAG Search on An Academic Paper in Llama-Index

In [22]:
# Set LLM
embed_model = HuggingFaceEmbedding(model_name='sentence-transformers/msmarco-distilbert-base-v4')
llm = Ollama(model="llama3", temperature=0)

## JSON Query Engine

https://docs.llamaindex.ai/en/stable/examples/query_engine/json_query_engine/

In [42]:
import json

background_json = "./data/pubmed_background.json"

with open(background_json) as f:
   json_value = json.load(f)

print(json_value[0])

{'article_title': 'Disulfidptosis: a new target for metabolic cancer therapy.', 'article_abstract': 'Altered metabolism is a hallmark of cancer and presents a vulnerability that can be exploited in cancer treatment. Regulated cell death (RCD) plays a crucial role in cancer metabolic therapy. A recent study has identified a new metabolic-related RCD known as disulfidptosis. Preclinical findings suggest that metabolic therapy using glucose transporter (GLUT) inhibitors can trigger disulfidptosis and inhibit cancer growth. In this review, we summarize the specific mechanisms underlying disulfidptosis and outline potential future research directions. We also discuss the challenges that may arise in the clinical translation of disulfidptosis research.', 'pub_date': {'year': '2023', 'month': '04', 'day': '27'}}


In [46]:
# JSON Schema object that the above JSON value conforms to
json_schema = {
    "$schema": "http://json-schema.org/draft-07/schema#",
    "description": "Schema for research documents",
        "type": "object",
        "properties": {
            "article_title": {
                "description": "Document title",
                "type": "string",
            },
            "article_abstract": {
                "description": "Document summary",
                "type": "string",
            },
            "pub_date": {
                "description": "Publication date",
                "type": "object",
                "properties": {
                    "year": {
                        "description": "Publication year",
                        "type": "integer",
                    },
                    "month": {
                        "description": "Publication year",
                        "type": "integer",
                    },
                    "day": {
                        "description": "Publication year",
                        "type": "integer",
                    },
                },
                "required": ["year", "month", "day"],
            },
        },
        "required": ["article_title","article_abstract", "pub_date"]
}

In [47]:
from llama_index.core.indices.struct_store import JSONQueryEngine

nl_query_engine = JSONQueryEngine(
    json_value=json_value,
    json_schema=json_schema,
    llm=llm,
    verbose=True
)
raw_query_engine = JSONQueryEngine(
    json_value=json_value,
    json_schema=json_schema,
    llm=llm,
    synthesize_response=False,
    verbose=True
)

In [48]:
nl_response = nl_query_engine.query(
    "What is a good title for medical research?",
)
display(Markdown(f"Natural language Response{nl_response}"))
# get the json path query string. Same would apply to raw_response
print(nl_response.metadata["json_path_response_str"])

> JSONPath Instructions:
```
Based on the provided schema, I would recommend the following JSON Path query:

JSONPath: $.article_title

This query will retrieve the value of the "article_title" property from the JSON object that matches the schema.
```


ValueError: Invalid JSON Path: Based on the provided schema

In [None]:
raw_response = raw_query_engine.query(
    "What is a good title for medical research?",
)
display(Markdown(f"Raw JSON Response{raw_response}"))
# get the json path query string. Same would apply to raw_response
print(raw_response.metadata["json_path_response_str"])