# PageIndex

In [1]:
from pageindex import PageIndexClient
import pageindex.utils as utils

import os

# Get your PageIndex API key from https://dash.pageindex.ai/api-keys and expose it
# to the notebook through the PAGEINDEX_API_KEY environment variable.
# Secrets must never be written into the notebook itself: anything typed here ends up
# in version control and in shared copies of the file.
# NOTE(review): a literal key was previously stored in this cell; it should be revoked.
PAGEINDEX_API_KEY = os.environ.get("PAGEINDEX_API_KEY")
if not PAGEINDEX_API_KEY:
    raise RuntimeError(
        "PAGEINDEX_API_KEY is not set. Create a key at "
        "https://dash.pageindex.ai/api-keys and export it before running this notebook."
    )

# Client used by every PageIndex call further down in the notebook.
pi_client = PageIndexClient(api_key=PAGEINDEX_API_KEY)

In [34]:
from langchain_ollama import ChatOllama

def call_llm(prompt, model="deepseek-r1:8b", temperature=0):
    """Send ``prompt`` to an Ollama chat model and return the model's reply.

    Args:
        prompt: Input handed unchanged to ``ChatOllama.invoke``.
        model: Ollama model tag used to build the chat model.
        temperature: Sampling temperature forwarded to the chat model.

    Returns:
        The object returned by ``ChatOllama.invoke``; cells in this notebook
        read its ``content`` and ``usage_metadata`` attributes.
    """
    # A fresh chat model is built on every call so each invocation honours
    # the model/temperature arguments it was given.
    chat_model = ChatOllama(model=model, temperature=temperature)
    return chat_model.invoke(prompt)

## Step 1: PageIndex Tree Generation

1.1 Submit a document to generate its PageIndex tree

In [None]:
# import os, requests

# # You can also use our GitHub repo to generate PageIndex tree
# # https://github.com/VectifyAI/PageIndex

# pdf_url = ""
# pdf_path = os.path.join("data", pdf_url.split('/')[-1])
# os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

# response = requests.get(pdf_url)
# with open(pdf_path, "wb") as f:
#     f.write(response.content)
# print(f"Downloaded {pdf_url}")

Downloaded https://arxiv.org/pdf/2501.12948.pdf


In [3]:
# NOTE(review): machine-specific absolute path; point it at wherever the PDF lives locally.
pdf_path = r"C:\Users\prana\Desktop\VS_CODE\LangChainModels\PDFs\Financial_Market_Securities_BSE.one.pdf"

# Submit the PDF and keep the document id from the response; later cells pass it
# to the readiness check and the tree fetch.
submission = pi_client.submit_document(pdf_path)
doc_id = submission["doc_id"]
print('Document Submitted:', doc_id)

Document Submitted: pi-cmm5u3rl006fi0io94szy4mxr


1.2 Get the generated PageIndex tree structure

In [5]:
# The tree is only fetched once PageIndex reports the document as retrieval-ready;
# otherwise `tree` is left undefined and this cell has to be re-run later.
if not pi_client.is_retrieval_ready(doc_id):
    print("Processing document, please try again later...")
else:
    tree_response = pi_client.get_tree(doc_id, node_summary=True)
    tree = tree_response['result']
    print('Simplified Tree Structure of the Document:')
    utils.print_tree(tree)

Simplified Tree Structure of the Document:
[{'title': 'Comprehensive Study Material on Financia...',
  'node_id': '0000',
  'prefix_summary': '# Comprehensive Study Material on Financ...',
  'nodes': [{'title': 'Topic 1: Financial Market Securities Tra...',
             'node_id': '0001',
             'prefix_summary': '## Topic 1: Financial Market Securities ...',
             'nodes': [{'title': '1.1 Equity Instruments',
                        'node_id': '0002',
                        'summary': 'The text defines equity instruments as r...'},
                       {'title': '1.2 Debt Instruments',
                        'node_id': '0003',
                        'summary': 'The text defines debt instruments as loa...'},
                       {'title': '1.3 Derivatives',
                        'node_id': '0004',
                        'summary': 'The text describes derivatives as financ...'},
                       {'title': '1.4 Other Traded Instruments',
                     

In [15]:
# Show which fields the first child of the document's top-level node carries.
tree[0]['nodes'][0].keys()

dict_keys(['title', 'node_id', 'page_index', 'prefix_summary', 'text', 'nodes'])

In [20]:
# Derive a tree without the 'text' field, then list the keys of the same node
# inspected in the previous cell so the two key sets can be compared.
tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])
tree_without_text[0]['nodes'][0].keys()

dict_keys(['title', 'node_id', 'page_index', 'prefix_summary', 'nodes'])

In [33]:
from anytree import Node, RenderTree

data = tree  # PageIndex tree fetched above; data[0] is used below as the root node

def build_tree(node_data, parent=None, include_summary=False):
    """Recursively convert a PageIndex node dict into an ``anytree`` ``Node``.

    Args:
        node_data: Dict describing one node. The keys read here are
            ``node_id``, ``title``, ``prefix_summary`` and ``nodes`` (the
            list of child dicts); all of them are optional.
        parent: ``anytree`` node the new node is attached to, or ``None``
            when building the root.
        include_summary: When true, a non-empty ``prefix_summary`` is added
            to the label on a second line. Defaults to ``False``, which
            yields the plain ``"<node_id> - <title>"`` labels.

    Returns:
        The ``Node`` created for ``node_data``, with all descendants attached.
    """
    node_id = node_data.get("node_id", "")
    title = node_data.get("title", "")

    # Previously the summary was hard-wired to "", so the summary branch could
    # never run; it is now read from the node, but only when requested.
    prefix_summary = node_data.get("prefix_summary", "") if include_summary else ""

    if prefix_summary:
        node_label = f"{node_id} - {title}\n  →   {prefix_summary}"
    else:
        node_label = f"{node_id} - {title}"

    node = Node(node_label, parent=parent)

    # Children inherit the same labelling choice as their parent.
    for child in node_data.get("nodes", []):
        build_tree(child, node, include_summary)

    return node


# Build the anytree hierarchy from the first (top-level) entry of the PageIndex tree.
root = build_tree(data[0])

# Render every node as "<drawing prefix><label>" and emit the outline in one print.
rendered_lines = [f"{pre}{node.name}" for pre, _fill, node in RenderTree(root)]
print("\n".join(rendered_lines))

0000 - Comprehensive Study Material on Financial Markets
├── 0001 - Topic 1: Financial Market Securities Traded in BSE, India
│   ├── 0002 - 1.1 Equity Instruments
│   ├── 0003 - 1.2 Debt Instruments
│   ├── 0004 - 1.3 Derivatives
│   └── 0005 - 1.4 Other Traded Instruments
├── 0006 - Topic 2: Characteristics of Securities &amp; Price Dynamics
│   ├── 0007 - 2.1 Equity Shares: Price Dynamics
│   ├── 0008 - 2.2 Debt Instruments (Bonds): Price Dynamics
│   ├── 0009 - 2.3 Derivatives (Options &amp; Futures): Price Dynamics
│   └── 0010 - 2.4 ETFs and Mutual Funds: Price Dynamics
├── 0011 - Topic 3: Debt and Debt Instruments
│   ├── 0012 - 3.1 Government Debt Instruments
│   ├── 0013 - 3.2 Corporate Debt Instruments
│   └── 0014 - 3.3 Money Market Instruments
└── 0015 - Topic 4: Risk Management in Financial Markets
    ├── 0016 - 4.1 The Importance of Risk Management
    ├── 0017 - 4.2 Major Types of Financial Risk
    ├── 0018 - 4.3 The Risk Management Process &amp; Strategies
    └── 001

## Step 2: Reasoning-Based Retrieval with Tree Search

2.1 Use an LLM to search the tree and identify nodes that might contain relevant context

In [36]:
import json

query = "what is a preferencec Stock"

# The prompt carries the tree without each node's 'text' field; the full text
# is only looked up later for the nodes the LLM selects.
tree_without_text = utils.remove_fields(tree.copy(), fields=['text'])
tree_json = json.dumps(tree_without_text, indent=2)

search_prompt = f"""
You are given a question and a tree structure of a document.
Each node contains a node id, node title, and a corresponding summary.
Your task is to find all nodes that are likely to contain the answer to the question.

Question: {query}

Document tree structure:
{tree_json} 

Please reply in the following JSON format:
{{
    "thinking": "<Your thinking process on which nodes are relevant to the question>",
    "node_list": ["node_id_1", "node_id_2", ..., "node_id_n"]
}}
Directly return the final JSON structure. Do not output anything else.
"""

# `response` is kept as well because a later cell reads its usage metadata.
response = call_llm(search_prompt)
tree_search_result = response.content.strip()

2.2 Print retrieved nodes and reasoning process

In [43]:
import re


def _parse_llm_json(raw_text):
    """Parse the JSON object contained in an LLM reply.

    The prompt asks for bare JSON, but models may still wrap the object in
    ``<think>...</think>`` blocks, Markdown code fences or surrounding prose.
    Those wrappers are tolerated here.

    Args:
        raw_text: Text returned by the LLM.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no decodable JSON object can be found.
    """
    cleaned = re.sub(r"<think>.*?</think>", "", raw_text, flags=re.DOTALL).strip()
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the outermost {...} span, which drops fences and chatter.
        start, end = cleaned.find("{"), cleaned.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(cleaned[start:end + 1])


node_map = utils.create_node_mapping(tree)
tree_search_result_json = _parse_llm_json(tree_search_result)

print('Reasoning Process:')
utils.print_wrapped(tree_search_result_json['thinking'])

print('\nRetrieved Nodes:')
for node_id in tree_search_result_json["node_list"]:
    node = node_map.get(node_id)
    if node is None:
        # The LLM can return ids that are not in the tree; report them
        # explicitly instead of failing with a bare KeyError.
        print(f"Node ID: {node_id}\t (not found in the document tree)")
        continue
    print(f"Node ID: {node['node_id']}\t Page: {node['page_index']}\t Title: {node['title']}")

Reasoning Process:
The question asks about 'preferencec Stock', which appears to be a typo for 'preference stock'. I
will search the document for nodes discussing preference shares. Node 0002 under Topic 1 explicitly
defines preference shares as a hybrid security with preferential rights, including fixed dividends
and claims on assets over equity shareholders. Other nodes discuss debt instruments, derivatives,
ETFs, price dynamics, and risk management, but do not mention preference stocks. Therefore, only
node 0002 is relevant.

Retrieved Nodes:
Node ID: 0002	 Page: 1	 Title: 1.1 Equity Instruments


In [54]:
# Keep the usage metadata of the tree-search LLM call (`response` from step 2.1)
# so it can be added to the answer-generation usage at the end of the notebook.
retrieval_token_count = response.usage_metadata
retrieval_token_count

{'input_tokens': 3115, 'output_tokens': 487, 'total_tokens': 3602}

## Step 3: Answer Generation

3.1 Extract relevant context from retrieved nodes

In [47]:
# Reuse the JSON already parsed in step 2.2 rather than decoding the raw LLM
# text a second time; both steps then always agree on the selected nodes.
node_list = tree_search_result_json["node_list"]

# Concatenate the full text of every selected node, separated by blank lines.
relevant_content = "\n\n".join(node_map[node_id]["text"] for node_id in node_list)

# Only a preview is printed; the ellipsis is appended solely when text was cut,
# so a short context is not shown as if it had been truncated.
PREVIEW_CHARS = 1000
preview = relevant_content[:PREVIEW_CHARS]
if len(relevant_content) > PREVIEW_CHARS:
    preview += '...'

print('Retrieved Context:\n')
utils.print_wrapped(preview)

Retrieved Context:

### 1.1 Equity Instruments

Equity instruments represent ownership in a company. When you buy a share, you become a part-owner
of the business, entitling you to a portion of its profits and a say in its governance, depending on
the type of share.

#### Equity Shares (Common Shares)

Equity shares are the most common type of security traded on the BSE. They represent the primary
ownership stake in a corporation. Shareholders are entitled to voting rights on key corporate
matters and receive dividends, which are a portion of the company's profits distributed to
shareholders. However, dividend payments are not guaranteed and depend on the company's
profitability and board decisions. In the event of liquidation, equity shareholders have the last
claim on the company's assets after all debts and other obligations have been settled (Bajaj
Finserv).

#### Preference Shares (Preferred Stock)

Preference shares are a hybrid instrument, blending features of both equity and de

3.2 Generate answer based on retrieved context

In [48]:
# Inspect the last retrieved node. Index `node_list` explicitly instead of
# relying on `node_id`, a loop variable left over from an earlier cell.
node_map[node_list[-1]]

{'title': '1.1 Equity Instruments',
 'node_id': '0002',
 'page_index': 1,
 'summary': 'The text defines equity instruments as representing company ownership. It details equity shares, highlighting their commonality, ownership stake, voting rights, variable dividends, and last claim on assets during liquidation. It then explains preference shares as a hybrid with preferential rights, including priority for fixed dividends and claims on assets over equity shareholders, but typically without voting rights. Various types of preference shares are also mentioned.',
 'text': "### 1.1 Equity Instruments\n\nEquity instruments represent ownership in a company. When you buy a share, you become a part-owner of the business, entitling you to a portion of its profits and a say in its governance, depending on the type of share.\n\n#### Equity Shares (Common Shares)\n\nEquity shares are the most common type of security traded on the BSE. They represent the primary ownership stake in a corporation. Sha

In [51]:
# Ask the LLM to answer the question using only the text of the retrieved nodes.
answer_prompt = f"""
Answer the question based on the context:

Question: {query}
Context: {relevant_content}

Provide a clear, concise answer based only on the context provided.
"""

print('Generated Answer:\n')
# `answer` is kept as a whole because a later cell reads its usage metadata.
answer = call_llm(answer_prompt)
answer_text = answer.content.strip()
utils.print_wrapped(answer_text)

Generated Answer:

Based on the provided context:

A **Preference Share (or Preferred Stock)** is a type of equity instrument that is a **hybrid**
blending features of both equity and debt.

Its key features include:

*   **Dividend Priority:** Receives dividends before equity shareholders, usually at a **fixed
rate**.
*   **Liquidation Priority:** Has a prior claim on company assets during liquidation, **after**
creditors but **before** equity shareholders.
*   **No Voting Rights:** Generally lacks voting rights, unlike equity shareholders.

There are various types, such as cumulative (unpaid dividends accumulate), non-cumulative,
convertible, and redeemable preference shares.


In [58]:
# Usage metadata of the answer-generation LLM call (`answer` from the previous cell).
final_token_count = answer.usage_metadata
final_token_count

{'input_tokens': 417, 'output_tokens': 683, 'total_tokens': 1100}

In [63]:
# Add up the usage of both LLM calls: tree search first, answer generation second.
token_reports = (retrieval_token_count, final_token_count)
input_tokens = sum(report['input_tokens'] for report in token_reports)
output_tokens = sum(report['output_tokens'] for report in token_reports)

# Total tokens for the whole retrieve-then-answer pipeline.
input_tokens + output_tokens

4702