In [1]:
### Fix import errors by adding the project root to sys.path

import sys
import os

# Resolve the parent of the current working directory. The notebook is
# expected to be run from 'examples/', which makes the parent the project's
# root directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Put the project root at the front of the module search path unless it is
# already listed there, and report which of the two cases applied.
root_already_listed = project_root in sys.path
if root_already_listed:
    print(f"Project root is already in Python path: {project_root}")
else:
    sys.path.insert(0, project_root)
    print(f"Added project root to Python path: {project_root}")

# Optional sanity check: show the first five search-path entries.
print("\nVerifying sys.path:")
for position, entry in enumerate(sys.path[:5]):
    print(f"{position}: {entry}")

Added project root to Python path: /home/nick/projects/Llama-Index-GliREL-GraphRAG

Verifying sys.path:
0: /home/nick/projects/Llama-Index-GliREL-GraphRAG
1: /usr/lib/python312.zip
2: /usr/lib/python3.12
3: /usr/lib/python3.12/lib-dynload
4: 


In [2]:
import os
import argparse
import json
import logging
from typing import Dict, List
from dotenv import load_dotenv
from pathlib import Path
from transformers import AutoTokenizer
from tqdm import tqdm
import asyncio
import nest_asyncio

#llama index imports
from llama_index.core import SimpleDirectoryReader, PropertyGraphIndex,Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import StorageContext, load_index_from_storage

In [3]:
# Read variables from a local .env file (if present) into the environment
load_dotenv()

# Make only CUDA device index 1 visible to this process
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Log to the notebook output and, in parallel, to a log file in the
# current working directory
log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler("llamaindex_processing_test.log"),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=log_handlers,
)

In [4]:
# Chat model served by the local Ollama instance; temperature 0.0 is used
# for the answers generated in this notebook.
llm = Ollama(
    model="gemma3:12b",
    temperature=0.0,
    context_window=4096,  # 8128 is noted as an alternative value
    request_timeout=120.0,
)

# Register the model and the chunking parameters as LlamaIndex global defaults
Settings.llm = llm
Settings.chunk_size = 512
Settings.chunk_overlap = 64

# Embedding model, also served by Ollama, registered as the global default
embed_model = OllamaEmbedding(
    model_name="snowflake-arctic-embed2:latest",
    ollama_additional_kwargs={"mirostat": 0},
)
Settings.embed_model = embed_model

In [5]:
def group_questions_by_source(question_list: List[dict]) -> Dict[str, List[dict]]:
    """Bucket question records by the value of their "source" key.

    Args:
        question_list: Question records (dicts). A record without a
            "source" key is filed under the key ``None``.

    Returns:
        A plain dict mapping each source to the list of its records, with
        sources and records kept in the order they were first encountered.
    """
    by_source = {}
    for entry in question_list:
        # setdefault creates the bucket on first sight of a source
        by_source.setdefault(entry.get("source"), []).append(entry)
    return by_source


In [6]:
async def process_corpus(
    corpus_name: str,
    questions: Dict[str, List[dict]],
    mode: str
):
    """Answer all questions of one corpus with its persisted index and save the predictions.

    Args:
        corpus_name: Name of the corpus. Used to look up its questions and
            to build the index and output paths.
        questions: Mapping from corpus name to that corpus' question records
            (the shape returned by ``group_questions_by_source``). Every
            record must provide "id" and "question"; "answer", "evidence"
            and "question_type" are optional and default to "".
        mode: Path component selecting the persisted index under
            ``./.persistent_storage/{mode}/{corpus_name}``; it is also part
            of the results path.

    Returns:
        None. The predictions are written as JSON to
        ``./.persistent_storage/.results/{mode}/{corpus_name}/predictions_{corpus_name}.json``.
        If the corpus has no questions, a warning is logged and nothing is
        loaded or written (the output directory is still created).
    """
    logging.info(f"📚 Processing corpus: {corpus_name}")

    # Prepare output directory
    output_dir = f"./.persistent_storage/.results/{mode}/{corpus_name}"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"predictions_{corpus_name}.json")

    # Look the questions up first: a corpus without questions is skipped
    # before the persisted index is read from disk.
    corpus_questions = questions.get(corpus_name, [])
    if not corpus_questions:
        logging.warning(f"⚠️ No questions found for corpus: {corpus_name}")
        return

    logging.info(f"🔍 Found {len(corpus_questions)} questions for {corpus_name}")

    # Load the persisted index for this corpus and build the query engine
    storage_context = StorageContext.from_defaults(
        persist_dir=f"./.persistent_storage/{mode}/{corpus_name}"
    )
    index = load_index_from_storage(storage_context=storage_context)
    logging.info(f"✅ Indexed corpus: {corpus_name}")

    query_engine = index.as_query_engine(
        llm=llm,
        response_mode="compact",
        similarity_top_k=8,
        embedding_mode="hybrid",
        include_text=True,
    )

    # Query sequentially and build each result next to its own question, so
    # records with identical question text still get their own answer and no
    # lookup by question text is needed afterwards.
    results = []
    for question in corpus_questions:
        response_object = await query_engine.aquery(question["question"])
        results.append({
            "id": question["id"],
            "question": question["question"],
            "source": corpus_name,
            "context": response_object.get_formatted_sources(10000),
            "evidence": question.get("evidence", ""),
            "question_type": question.get("question_type", ""),
            "generated_answer": response_object.response,
            "gold_answer": question.get("answer", "")
        })

    # Save results
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    logging.info(f"💾 Saved {len(results)} predictions to: {output_path}")

In [7]:
try:
    with open("../.data/novel.json", "r", encoding="utf-8") as f:
        corpus_data = json.load(f)
    logging.info(f"📖 Loaded corpus with {len(corpus_data)} documents from ../.data/novel.json")
except Exception as e:
    logging.error(f"❌ Failed to load corpus: {e}")
    #return

# Sample corpus data if requested

# Load question data
try:
    with open("../.data/novel_questions.json", "r", encoding="utf-8") as f:
        question_data = json.load(f)
    grouped_questions = group_questions_by_source(question_data)
    logging.info(f"❓ Loaded questions with {len(question_data)} entries from ../.data/novel_questions.json")
except Exception as e:
    logging.error(f"❌ Failed to load questions: {e}")
    #return

# Process each corpus in the subset
for mode in ["gli","hybrid","llm"]:
    for item in corpus_data:
        corpus_name = item["corpus_name"]
        context = item["context"]
        nest_asyncio.apply()
        await process_corpus(
            corpus_name=corpus_name,
            questions=grouped_questions,
            mode=mode
        )

2025-07-22 20:09:55,063 - INFO - 📖 Loaded corpus with 20 documents from ../.data/novel.json
2025-07-22 20:09:55,073 - INFO - ❓ Loaded questions with 2010 entries from ../.data/novel_questions.json
2025-07-22 20:09:55,074 - INFO - 📚 Processing corpus: Novel-30752


Loading llama_index.core.storage.kvstore.simple_kvstore from ./.persistent_storage/gli/Novel-30752/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./.persistent_storage/gli/Novel-30752/index_store.json.


2025-07-22 20:09:56,791 - INFO - 🔍 Found 87 questions for Novel-30752
2025-07-22 20:09:56,792 - INFO - Loading all indices.
2025-07-22 20:09:56,793 - INFO - ✅ Indexed corpus: Novel-30752
2025-07-22 20:09:57,025 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-07-22 20:09:57,557 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:09:58,431 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:09:58,469 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-07-22 20:09:58,965 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:10:00,050 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:10:00,505 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:10:01,244 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HT

Loading llama_index.core.storage.kvstore.simple_kvstore from ./.persistent_storage/gli/Novel-51410/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./.persistent_storage/gli/Novel-51410/index_store.json.


2025-07-22 20:16:37,785 - INFO - 🔍 Found 78 questions for Novel-51410
2025-07-22 20:16:37,785 - INFO - Loading all indices.
2025-07-22 20:16:37,786 - INFO - ✅ Indexed corpus: Novel-51410
2025-07-22 20:16:38,056 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-07-22 20:16:38,506 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:16:39,478 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:16:39,826 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:16:40,195 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:16:40,280 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-07-22 20:16:40,669 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:16:41,642 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HT

Loading llama_index.core.storage.kvstore.simple_kvstore from ./.persistent_storage/gli/Novel-58553/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./.persistent_storage/gli/Novel-58553/index_store.json.


2025-07-22 20:22:57,319 - INFO - 🔍 Found 112 questions for Novel-58553
2025-07-22 20:22:57,319 - INFO - Loading all indices.
2025-07-22 20:22:57,320 - INFO - ✅ Indexed corpus: Novel-58553
2025-07-22 20:22:57,434 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-07-22 20:22:57,984 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:22:59,109 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:22:59,575 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:23:00,263 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:23:00,341 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-07-22 20:23:00,742 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:23:01,703 - INFO - HTTP Request: POST http://localhost:11434/api/chat "H

Loading llama_index.core.storage.kvstore.simple_kvstore from ./.persistent_storage/gli/Novel-10762/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./.persistent_storage/gli/Novel-10762/index_store.json.


2025-07-22 20:29:15,485 - INFO - 🔍 Found 78 questions for Novel-10762
2025-07-22 20:29:15,485 - INFO - Loading all indices.
2025-07-22 20:29:15,486 - INFO - ✅ Indexed corpus: Novel-10762
2025-07-22 20:29:15,672 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-07-22 20:29:16,116 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:29:16,740 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:29:16,835 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-07-22 20:29:17,110 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:29:17,582 - INFO - HTTP Request: POST http://localhost:11434/api/chat "HTTP/1.1 200 OK"
2025-07-22 20:29:17,696 - INFO - HTTP Request: POST http://localhost:11434/api/embeddings "HTTP/1.1 200 OK"
2025-07-22 20:29:17,990 - INFO - HTTP Request: POST http://localhost:11434/api/ch

CancelledError: 