# **SimpleNodeParser + Node References (Metadata References)**

In [2]:
!pip install -qU llama-index

!pip install -qU llama-hub
!pip install -qU PyMuPDF

!pip install -qU langchain
!pip install -qU sentence-transformers
!pip install -qU chromadb
!pip install -qU "arize-phoenix[experimental]"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m784.6/784.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.8/143.8 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [1]:
!mkdir data

**Documents Loader**

In [3]:
import os
from llama_hub.file.pymu_pdf.base import PyMuPDFReader
import multiprocessing
import itertools
import psutil

# Single reader instance; load_document() calls loader.load() for every PDF path.
loader = PyMuPDFReader()
# CPU count with logical=False, used as the worker count of the multiprocessing
# pool in dir_loader().
# NOTE(review): psutil may report None when it cannot determine the count -- confirm
# the pool falls back to a sensible default in that case.
num_cpus = psutil.cpu_count(logical=False)

def load_document(file_path):
    """Read a single file with the module-level ``loader``.

    Kept as a top-level function so it can be handed to ``pool.map`` in
    ``dir_loader``.

    Args:
        file_path: Path of the file passed straight to ``loader.load``.

    Returns:
        Whatever ``loader.load`` returns for that path.
    """
    loaded = loader.load(file_path)
    return loaded

def dir_loader(dir_path='.'):
    """Load every PDF located directly inside ``dir_path`` in parallel.

    Args:
        dir_path: Directory to scan (not recursive). ``'.'`` means the
            current working directory.

    Returns:
        A flat list containing the items returned by ``load_document`` for
        each PDF, in sorted file-path order. An empty list when the directory
        holds no PDF files.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``dir_path`` cannot be
            listed (e.g. it does not exist).
    """
    if dir_path == '.':
        dir_path = os.getcwd()

    # Ensure dir_path is an absolute path
    dir_path = os.path.abspath(dir_path)

    # os.listdir() returns entries in arbitrary order; sort so that the order
    # of the loaded documents (and anything concatenated from them later) is
    # reproducible between runs. The extension check is case-insensitive so
    # files named "*.PDF" are picked up as well.
    file_paths = sorted(
        os.path.join(dir_path, file_name)
        for file_name in os.listdir(dir_path)
        if file_name.lower().endswith('.pdf')
    )

    # Nothing to load: avoid spawning worker processes for no work.
    if not file_paths:
        return []

    # The context manager releases the worker processes even when one of the
    # loads raises; pool.map() has collected every result before the block exits.
    with multiprocessing.Pool(num_cpus) as pool:
        docs = pool.map(load_document, file_paths)

    # Flatten the per-file lists into a single list of documents
    return list(itertools.chain.from_iterable(docs))


# Load every PDF found under ./data and report how many document objects came back.
# NOTE(review): the count is the number of objects returned by the reader, which
# is not necessarily the number of PDF files -- confirm the reader's granularity.
documents = dir_loader('./data')
print(f"loaded {len(documents)} documents")

loaded 98 documents


**Vector Store**

In [4]:
# save to disk
from llama_index.vector_stores import ChromaVectorStore
import chromadb

# Create a Chroma client backed by ./storage, fetch the "Quickstart" collection
# (it is created only when it does not exist yet) and wrap it as a llama-index
# vector store.
# NOTE(review): because the collection is reused when present, re-running the
# notebook presumably adds the same embeddings again -- confirm and clear
# ./storage between runs if duplicates matter.
db = chromadb.PersistentClient(path="./storage")
chroma_collection = db.get_or_create_collection("Quickstart")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [5]:
# API keys
# Secrets must never be committed with the notebook. Each key is taken from the
# process environment when already set; otherwise the user is prompted without
# echoing the value. The credentials that used to be hardcoded in this cell
# have to be treated as leaked and revoked.
import os
from getpass import getpass

import openai

for _secret_name in ("HUGGINGFACEHUB_API_TOKEN", "OPENAI_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")

openai.api_key = os.environ["OPENAI_API_KEY"]

**Merge the Loaded Documents into a Single Text Document**

In [8]:
from llama_index import Document

# Concatenate the content of every loaded document, separated by a blank line,
# and wrap the combined text in one Document so it is chunked as a single text.
doc_text = "\n\n".join(loaded_doc.get_content() for loaded_doc in documents)
docs = [Document(text=doc_text)]

**Node Parser - Metadata Node References**

In [6]:
import json

# cache metadata dicts
def save_metadata_dicts(path, dicts=None):
    """Write metadata dicts to ``path`` as JSON Lines (one JSON object per line).

    Args:
        path: Destination file; it is overwritten when it already exists.
        dicts: Iterable of JSON-serialisable dicts to write. When omitted, the
            notebook-level ``metadata_dicts`` variable is written, which is the
            original behaviour of this helper.

    Raises:
        NameError: If ``dicts`` is omitted and ``metadata_dicts`` has not been
            defined yet. The lookup happens before the file is opened so an
            existing cache file is not truncated in that case.
    """
    if dicts is None:
        dicts = metadata_dicts

    with open(path, "w", encoding="utf-8") as fp:
        for m in dicts:
            fp.write(json.dumps(m) + "\n")


def load_metadata_dicts(path):
    """Read metadata dicts back from a JSON Lines file.

    Args:
        path: File with one JSON object per line, as produced by
            ``save_metadata_dicts``.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as fp:
        # Iterate lazily instead of readlines(), and skip blank lines (e.g. a
        # trailing empty line) which json.loads would reject.
        return [json.loads(line) for line in fp if line.strip()]

In [9]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.schema import IndexNode
from llama_index.node_parser.extractors import SummaryExtractor, QuestionsAnsweredExtractor, MetadataExtractor
import copy


# Split the merged document into base nodes using a chunk size of 1024.
node_parser = SimpleNodeParser.from_defaults(chunk_size=1024)
base_nodes = node_parser.get_nodes_from_documents(docs)


# Per-node metadata: a summary of the node itself ("self") and 5 questions the
# node can answer.
# NOTE(review): both extractors presumably call the LLM once per node, which
# makes this the expensive cell of the notebook -- confirm, and consider
# reusing a saved metadata file on re-runs.
metadata_extractor = MetadataExtractor(
    extractors=[
        SummaryExtractor(summaries=["self"], show_progress=True),
        QuestionsAnsweredExtractor(questions=5, show_progress=True),
    ],
)

# Later cells pair metadata_dicts[i] with base_nodes[i] by position, so the
# result is assumed to hold one dict per base node, in the same order.
metadata_dicts = metadata_extractor.extract(base_nodes)

Extracting summaries:   0%|          | 0/73 [00:00<?, ?it/s]

Extracting questions:   0%|          | 0/73 [00:00<?, ?it/s]

In [10]:
# Cache the extracted metadata as JSON Lines next to the source PDFs.
save_metadata_dicts("data/metadata_dicts.jsonl")

In [11]:
# all_nodes consists of a deep copy of the source (base) nodes plus, for every
# base node, two IndexNode references built from its extracted metadata: one
# holding the generated questions and one holding the section summary. Each
# reference stores the id of the base node it was derived from as index_id.
all_nodes = copy.deepcopy(base_nodes)
for idx, d in enumerate(metadata_dicts):
    inode_q = IndexNode(
        text=d["questions_this_excerpt_can_answer"],
        index_id=base_nodes[idx].node_id,
    )
    inode_s = IndexNode(
        text=d["section_summary"], index_id=base_nodes[idx].node_id
    )
    all_nodes.extend([inode_q, inode_s])

In [12]:
# Lookup table from node id to node for everything in all_nodes; it is passed
# to RecursiveRetriever as node_dict further down.
all_nodes_dict = {n.node_id: n for n in all_nodes}

In [None]:
# Give the base nodes readable ids ("node-0", "node-1", ...).
# NOTE(review): all_nodes was built earlier from copy.deepcopy(base_nodes) and
# the IndexNode references already captured the previous ids, so renaming
# base_nodes here does not change all_nodes, and the dict rebuilt below should
# be identical to the one from the previous cell. If readable ids are wanted,
# this loop presumably has to run before all_nodes is created -- confirm intent.
for idx, node in enumerate(base_nodes):
    node.id_ = f"node-{idx}"

all_nodes_dict = {n.node_id: n for n in all_nodes}

**Embeddings**

In [15]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.embeddings import LangchainEmbedding

# Define the embedding model: the HuggingFace "BAAI/bge-base-en" model loaded
# through LangChain and wrapped in llama-index's LangchainEmbedding adapter.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="BAAI/bge-base-en")
)

Downloading (…)9a243/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)1e3c49a243/README.md:   0%|          | 0.00/90.1k [00:00<?, ?B/s]

Downloading (…)3c49a243/config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)9a243/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)1e3c49a243/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)c49a243/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

**Large Language Model**

In [16]:
from llama_index import LLMPredictor
from llama_index.llms import OpenAI

# Define the LLM predictor: gpt-3.5-turbo with temperature 0 and answers capped
# at 256 tokens (the same value is used as num_output in the prompt helper).
llm = OpenAI(
      model="gpt-3.5-turbo",
      temperature=0,
      max_tokens=256)
llm_predictor = LLMPredictor(llm=llm)

In [17]:
from llama_index.indices.prompt_helper import PromptHelper

# Prompt helper: describes the prompt budget to llama-index.
# num_output mirrors max_tokens=256 configured on the LLM.
# NOTE(review): context_window=4096 presumably corresponds to the context size
# of the configured gpt-3.5-turbo model -- confirm if the model changes.
prompt_helper = PromptHelper(
  context_window=4096,
  num_output=256,
  chunk_overlap_ratio=0.1,
  chunk_size_limit=None
)

In [18]:
# Traces can only be viewed in Phoenix once a Phoenix server is running, so
# start one here; the returned session is kept in `session`.
import phoenix as px
session = px.launch_app()

🌍 To view the Phoenix app in your browser, visit https://gt01q10w6bu1-496ff2e9c6d22116-6060-colab.googleusercontent.com/
📺 To view the Phoenix app in a notebook, run `px.active_session().view()`
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [19]:
from llama_index.logger.base import LlamaLogger
from llama_index.callbacks import CallbackManager
from phoenix.trace.llama_index import OpenInferenceTraceCallbackHandler

# Register the Phoenix OpenInference handler as the only callback handler. The
# handler object is kept because its collected spans are exported at the end of
# the notebook via get_spans().
openinference_callback = OpenInferenceTraceCallbackHandler()
callback_manager = CallbackManager(handlers=[openinference_callback])

**Context**

In [20]:
from llama_index import StorageContext, ServiceContext

# Storage Context: tells the index to keep its vectors in the Chroma-backed
# vector store created earlier.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

# Service Context: bundles the node parser, embedding model, LLM predictor,
# prompt helper, logger and tracing callbacks defined in the previous cells so
# that the index and the query engines all share one configuration.
service_context = ServiceContext.from_defaults(
    node_parser=node_parser,
    embed_model=embed_model,
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
    llama_logger=LlamaLogger(),
    callback_manager=callback_manager,
)

**Index**

In [21]:
from llama_index import VectorStoreIndex

# Build the vector index over all_nodes (base nodes plus the question/summary
# IndexNode references), using the storage and service contexts defined above.
index = VectorStoreIndex(
    all_nodes,
    show_progress=True,
    storage_context=storage_context,
    service_context=service_context,
)

Generating embeddings:   0%|          | 0/219 [00:00<?, ?it/s]

In [22]:
from IPython.display import Markdown, display

**Base Query Engine**

In [23]:
# Baseline: query the index through its default query engine, retrieving the
# 3 most similar nodes, and render the answer in bold.
base_query_engine = index.as_query_engine(similarity_top_k=3)

response = base_query_engine.query("What were the developments regarding Russian troop deployments near the Ukraine border in April 2021 and November 2021, including details about troop numbers, formations, and any notable statements or actions from Ukrainian officials or President Zelenskyy?")
display(Markdown(f"<b>{response}</b>"))

<b>I'm sorry, but the provided context information does not include any developments regarding Russian troop deployments near the Ukraine border in April 2021. However, it does mention that in November 2021, President Zelenskyy stated that nearly 100,000 Russian troops had massed on the border with Ukraine.</b>

**Base Retriever + Retriever Query Engine**

In [24]:
from llama_index.query_engine import RetrieverQueryEngine

# Base Retriever: plain top-3 similarity retrieval over the index.
retriever = index.as_retriever(
    similarity_top_k=3,
)

# Retriever Query Engine: answers questions from whatever the retriever
# returns, using the shared service context.
rq_query_engine = RetrieverQueryEngine.from_args(
    retriever, service_context=service_context
)

# Same question as in the baseline cell, so the answers can be compared.
response = rq_query_engine.query("What were the developments regarding Russian troop deployments near the Ukraine border in April 2021 and November 2021, including details about troop numbers, formations, and any notable statements or actions from Ukrainian officials or President Zelenskyy?")
display(Markdown(f"<b>{response}</b>"))

<b>I'm sorry, but the provided context information does not include any developments regarding Russian troop deployments near the Ukraine border in April 2021. However, it does mention that in November 2021, President Zelenskyy stated that nearly 100,000 Russian troops had massed on the border with Ukraine.</b>

**Recursive Retriever + Recursive Query Engine**

In [25]:
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.retrievers import RecursiveRetriever

# Top-3 similarity retriever used as the root ("vector") retriever below.
base_retriever = index.as_retriever(
    similarity_top_k=3,
)
# The recursive retriever starts from the "vector" retriever and is given
# all_nodes_dict as node_dict so node ids can be looked up.
# NOTE(review): presumably this is what lets a retrieved IndexNode reference be
# swapped for the base node named by its index_id -- confirm against the
# RecursiveRetriever documentation.
recursive_retriever = RecursiveRetriever(
    "vector",
    retriever_dict={"vector": base_retriever},
    node_dict=all_nodes_dict,
    verbose=False,
)

# Query engine that answers from the nodes returned by the recursive retriever.
rr_query_engine = RetrieverQueryEngine.from_args(
    recursive_retriever, service_context=service_context
)

In [26]:
# Ask the recursive query engine the same question used for the other two
# engines and render the answer in bold.
response = rr_query_engine.query("What were the developments regarding Russian troop deployments near the Ukraine border in April 2021 and November 2021, including details about troop numbers, formations, and any notable statements or actions from Ukrainian officials or President Zelenskyy?")
display(Markdown(f"<b>{response}</b>"))

<b>I'm sorry, but the provided context information does not include any developments regarding Russian troop deployments near the Ukraine border in April 2021. However, it does mention that in November 2021, President Zelenskyy stated that nearly 100,000 Russian troops had massed on the border with Ukraine.</b>

In [27]:
from phoenix.trace.span_json_encoder import spans_to_jsonl

# Export the spans collected by the Phoenix callback handler as JSON Lines.
# The encoding is set explicitly so the written file does not depend on the
# platform's default locale if the traced prompts or answers contain
# non-ASCII text.
with open("metadatareference_trace.jsonl", "w", encoding="utf-8") as f:
    f.write(spans_to_jsonl(openinference_callback.get_spans()))