In [None]:
### Fix import errors by adding the project root to sys.path

import sys
import os

# Resolve the parent of the current working directory. When the notebook is
# started from 'examples/', that parent is the project's root directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Put the project root at the front of sys.path unless it is already listed.
if project_root in sys.path:
    print(f"Project root is already in Python path: {project_root}")
else:
    sys.path.insert(0, project_root)
    print(f"Added project root to Python path: {project_root}")

# Optional sanity check: show the first five sys.path entries.
print("\nVerifying sys.path:")
for position, entry in enumerate(sys.path[:5]):
    print(f"{position}: {entry}")

In [None]:
import os
import argparse
import json
import logging
from typing import Dict, List
from dotenv import load_dotenv
from pathlib import Path
from transformers import AutoTokenizer
from tqdm import tqdm
import asyncio
import nest_asyncio

#llama index imports
from llama_index.core import SimpleDirectoryReader, PropertyGraphIndex,Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import StorageContext, load_index_from_storage
import openlit

In [None]:
# Initialise OpenLIT for this notebook run: it is pointed at the OTLP endpoint
# on 127.0.0.1:4318 and the run is labelled with an application name and an
# environment name.
# NOTE(review): the environment label is spelled "enviroment"; it is a runtime
# value, so confirm with whoever consumes the telemetry before renaming it.
openlit.init(
  otlp_endpoint="http://127.0.0.1:4318",
  application_name="query2",
  environment="obama_enviroment")


In [None]:
# Load environment variables (python-dotenv)
load_dotenv()

# Restrict CUDA to device index "1"; assigned after load_dotenv() so this
# value is the one left in os.environ.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

# Root logger: INFO and above, written to both the console stream and a log file.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler("llamaindex_processing_test.log"),
    ],
)

In [None]:
# Generation model served through Ollama. The same object is registered as the
# global default below and is also passed explicitly to the query engine in
# process_corpus.
# NOTE(review): request_timeout is presumably in seconds, and the trailing
# "#8128" looks like an alternative context window that was tried — confirm.
llm = Ollama(
    model= "gemma3:12b",
    request_timeout=120.0,
    context_window=4096, #8128,
    temperature=0.0
)

# Register the LLM and the chunking parameters on LlamaIndex's global Settings.
Settings.llm = llm
Settings.chunk_size=512
Settings.chunk_overlap=64

# Embedding model, also served through Ollama, registered as the global default.
# NOTE(review): {"mirostat": 0} is forwarded to Ollama as an extra option;
# presumably it disables Mirostat sampling — verify against the Ollama docs.
embed_model = OllamaEmbedding(
    model_name="snowflake-arctic-embed2:latest",
    ollama_additional_kwargs={"mirostat": 0},
)
Settings.embed_model = embed_model

In [None]:
def group_questions_by_source(question_list: List[dict]) -> Dict[str, List[dict]]:
    """Bucket question dicts by the value of their "source" key.

    Questions keep their input order inside each bucket, and buckets appear in
    the order their source is first seen. A question without a "source" key is
    filed under the key ``None``.
    """
    buckets = {}
    for entry in question_list:
        # setdefault creates the bucket on first sight and returns it either way.
        buckets.setdefault(entry.get("source"), []).append(entry)
    return buckets


In [None]:
# Collects one "<mode> : <corpus_name>" string per corpus whose query loop
# raised; process_corpus appends to it. Re-running this cell resets the list.
all_errors = []

In [None]:
async def process_corpus(
    corpus_name: str,
    questions: Dict[str, List[dict]],
    mode: str
) -> None:
    """Answer all questions of one corpus from its persisted index and save the predictions.

    Args:
        corpus_name: Name of the corpus; selects the questions, the persisted
            index directory and the output file.
        questions: Mapping from corpus name to that corpus' question dicts
            (the shape produced by ``group_questions_by_source``). Each
            question must provide "id" and "question"; "answer", "evidence"
            and "question_type" are optional and default to "".
        mode: Name of the indexing mode; used as a path component for both the
            persisted index and the results.

    Returns:
        None. On success the predictions are written to
        ``./.persistent_storage/.results2/{mode}/{corpus_name}/predictions_{corpus_name}.json``.

    Behaviour notes:
        * If the predictions file already exists the corpus is skipped.
        * If the corpus has no questions the function returns before the
          persisted index is loaded.
        * If any query raises, the exception is logged, ``"{mode} : {corpus_name}"``
          is appended to the module-level ``all_errors`` list and nothing is
          written, so the corpus is retried on the next run.
    """
    logging.info(f"📚 Processing corpus: {corpus_name}")

    output_dir = f"./.persistent_storage/.results2/{mode}/{corpus_name}"
    output_path = os.path.join(output_dir, f"predictions_{corpus_name}.json")

    # The predictions file doubles as the "already done" marker.
    if Path(output_path).is_file():
        logging.info(f"BASE: {corpus_name} has alleady been analized")
        return

    # Check for questions first so an empty corpus never pays for loading the index.
    corpus_questions = questions.get(corpus_name, [])
    if not corpus_questions:
        logging.warning(f"⚠️ No questions found for corpus: {corpus_name}")
        return

    logging.info(f"🔍 Found {len(corpus_questions)} questions for {corpus_name}")

    # Load the previously persisted index for this (mode, corpus) pair.
    storage_context = StorageContext.from_defaults(
        persist_dir=f"./.persistent_storage/.storage_context/{mode}/{corpus_name}"
    )
    index = load_index_from_storage(storage_context=storage_context)
    logging.info(f"✅ Indexed corpus: {corpus_name}")

    query_engine = index.as_query_engine(
        llm=llm,
        response_mode="compact",
        similarity_top_k=8,
        embedding_mode="hybrid",
        include_text=True,
    )

    # Query and assemble each result in a single pass, so every question is
    # paired with its own response (no lookup by question text afterwards).
    results = []
    for question in corpus_questions:
        query = question["question"]
        try:
            response_object = await query_engine.aquery(query)
        except Exception:
            # Record the failure with its traceback instead of dropping it silently.
            logging.exception(
                f"❌ Query failed for corpus {corpus_name} (mode={mode}): {query!r}"
            )
            all_errors.append(f"{mode} : {corpus_name}")
            return
        results.append({
            "id": question["id"],
            "question": query,
            "source": corpus_name,
            "context": response_object.get_formatted_sources(10000),
            "evidence": question.get("evidence", ""),
            "question_type": question.get("question_type", ""),
            "generated_answer": response_object.response,
            "gold_answer": question.get("answer", ""),
        })

    # Write to a temporary file and rename it into place: an interrupted write
    # must not leave a partial predictions file that a later run would treat
    # as a finished corpus.
    os.makedirs(output_dir, exist_ok=True)
    tmp_path = f"{output_path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, output_path)
    logging.info(f"💾 Saved {len(results)} predictions to: {output_path}")

In [None]:
try:
    with open("../.data/novel.json", "r", encoding="utf-8") as f:
        corpus_data = json.load(f)
    logging.info(f"📖 Loaded corpus with {len(corpus_data)} documents from ../.data/novel.json")
except Exception as e:
    logging.error(f"❌ Failed to load corpus: {e}")
    #return

# Sample corpus data if requested

# Load question data
try:
    with open("../.data/novel_questions.json", "r", encoding="utf-8") as f:
        question_data = json.load(f)
    grouped_questions = group_questions_by_source(question_data)
    logging.info(f"❓ Loaded questions with {len(question_data)} entries from ../.data/novel_questions.json")
except Exception as e:
    logging.error(f"❌ Failed to load questions: {e}")
    #return

# Process each corpus in the subset
for mode in ["embed",#"gli","hybrid","llm"
             ]:
    for item in corpus_data:
        corpus_name = item["corpus_name"]
        context = item["context"]
        nest_asyncio.apply()
        await process_corpus(
            corpus_name=corpus_name,
            questions=grouped_questions,
            mode=mode
        )

In [None]:
# Display the "<mode> : <corpus_name>" entries recorded for corpora whose
# querying failed; an empty list means no corpus failed.
all_errors