In [None]:
!pip install llama-index-llms-openai
!pip install llama-index
!pip install llama-index-embeddings-huggingface


In [None]:
import os
import logging
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.llms.ollama import Ollama

# Only surface ERROR-level (and above) log records in the notebook output.
logging.basicConfig(level=logging.ERROR)

# LLM served by an Ollama instance at the given base URL.
llm = Ollama(
    model="llama3.2:latest",
    base_url="http://localhost:11434",
    temperature=0.1,
    # Give local inference more headroom than the client's default request
    # timeout; long retrieval-augmented prompts can otherwise time out.
    request_timeout=120.0,
)

# HuggingFace embedding model used to embed the parsed nodes and queries.
embed_model = HuggingFaceEmbedding(
    model_name="sentence-transformers/all-mpnet-base-v2", max_length=512
)

# Register both models on the global Settings object.
Settings.llm = llm
Settings.embed_model = embed_model


In [None]:
# Baseline splitter with library defaults, also registered on the global
# Settings object as the text splitter.
text_splitter = SentenceSplitter()
Settings.text_splitter = text_splitter

# Sentence-window parser: window size 3, with the window stored under the
# "window" metadata key and the original text under "original_text".
node_parser = SentenceWindowNodeParser.from_defaults(
    original_text_metadata_key="original_text",
    window_metadata_key="window",
    window_size=3,
)


In [None]:
# Download sample document
!curl -o IPCC_AR6_WGII_Chapter03.pdf https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf

In [None]:
from pathlib import Path

from llama_index.core import SimpleDirectoryReader

# Load the document.
# This cell originally read the PDF from ../data_ipcc, but the download cell
# saves it into the working directory. Accept either location (original path
# first) so a fresh Restart & Run All works without moving files by hand.
pdf_candidates = [
    Path("../data_ipcc/IPCC_AR6_WGII_Chapter03.pdf"),
    Path("IPCC_AR6_WGII_Chapter03.pdf"),
]
pdf_path = next((p for p in pdf_candidates if p.is_file()), None)
if pdf_path is None:
    # Fail early with an actionable message instead of a reader-internal error.
    raise FileNotFoundError(
        "IPCC_AR6_WGII_Chapter03.pdf not found; run the download cell first "
        f"(looked in: {', '.join(str(p) for p in pdf_candidates)})"
    )

documents = SimpleDirectoryReader(input_files=[str(pdf_path)]).load_data()

In [None]:
# Sanity check: number of entries returned by the reader's load_data() call.
# NOTE(review): presumably one entry per PDF page — confirm against the reader.
len(documents)

In [None]:
# Baseline nodes: the same documents run through the default sentence splitter.
base_nodes = text_splitter.get_nodes_from_documents(documents)

# Sentence-window nodes: the same documents run through the window parser.
nodes = node_parser.get_nodes_from_documents(documents)


In [None]:
# Node counts, one per line: sentence-window nodes first, then baseline nodes.
print(len(nodes), len(base_nodes), sep="\n")

In [None]:
from llama_index.core import VectorStoreIndex

# Build one vector index per chunking strategy so the two can be compared.
sentence_index = VectorStoreIndex(nodes=nodes)
base_index = VectorStoreIndex(nodes=base_nodes)


In [None]:
# Sentence-window query engine: top-2 retrieval, with a post-processor
# configured to target the "window" metadata key on the retrieved nodes.
window_postprocessor = MetadataReplacementPostProcessor(target_metadata_key="window")
query_engine = sentence_index.as_query_engine(
    node_postprocessors=[window_postprocessor],
    similarity_top_k=2,
)

# Ask the question and show the synthesized answer.
response = query_engine.query("What are the concerns surrounding the AMOC?")
print(response)

# Inspect the first source node: its stored window and its original text.
top_metadata = response.source_nodes[0].node.metadata
window = top_metadata["window"]
original_sentence = top_metadata["original_text"]

print(
    f"Window: {window}",
    "------------------",
    f"Original Sentence: {original_sentence}",
    sep="\n",
)


In [None]:
# Baseline engine over the default-split index, also retrieving the top 2 nodes.
base_query_engine = base_index.as_query_engine(
    similarity_top_k=2,
)

# Run the query against the baseline index and show the answer.
base_response = base_query_engine.query(
    "What are the concerns surrounding the AMOC?"
)
print(base_response)


In [None]:
# Side-by-side look at what each retrieval strategy returned.

# Sentence-window index: show the original text stored on each source node.
print("Sentence Window Method:")
for source_node in response.source_nodes:
    print(source_node.node.metadata["original_text"], "--------", sep="\n")

# Baseline index: report only whether each retrieved chunk mentions "AMOC".
print("Base Index Method:")
for source_node in base_response.source_nodes:
    mentions_amoc = "AMOC" in source_node.node.text
    print(f"AMOC mentioned? {mentions_amoc}\n--------")
