In [None]:
!pip install -q transformers bitsandbytes accelerate sentence_transformers langchain faiss-cpu rank_bm25 langchain-community quepasa openai anthropic xlsxwriter codecarbon==2.8.4 ecologits[anthropic,openai]

In [None]:
import torch

# Pick the compute device once. Falling back to CPU guarantees that `device` is always
# defined, so later cells do not hit a NameError on sessions without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
import os
from kaggle_secrets import UserSecretsClient

# Copy each credential from the Kaggle secret store into the environment variable
# that the rest of the notebook reads it from.
user_secrets = UserSecretsClient()
for _env_name, _secret_label in (
    ("HUGGINGFACEHUB_API_TOKEN", "HF_TOKEN"),
    ("QUEPASA_API_TOKEN", "QUEPASA_TOKEN"),
    ("OPENAI_API_KEY", "ILB_OpenAI_key"),
    ("ANTHROPIC_API_KEY", "ILB_Anthropic_key"),
    ("NEBIUS_API_KEY", "QuePasa_Nebius_key"),
):
    os.environ[_env_name] = user_secrets.get_secret(_secret_label)

In [None]:
# Configure git's `store` credential helper, then log the Hugging Face CLI in with the
# token placed in the HUGGINGFACEHUB_API_TOKEN environment variable by the secrets cell.
!git config --global credential.helper store
!huggingface-cli login --token $HUGGINGFACEHUB_API_TOKEN

In [None]:
"""
RAG Pipeline - Query Answering
------------------------------------
This script demonstrates a robust pipeline for:

1) Loading previously built Vector Stores (Docling-based or LangChain-based).
2) Retrieving relevant chunks using an Ensemble (hybrid: FAISS + BM25) or minimal approach.
3) Optionally re-ranking with a CrossEncoder (only relevant if retrieval_mode="hybrid").
4) Generating final answers via different LLM backends:
   - Local Nemo model (4-bit)
   - Local LLaMA model (4-bit)
   - OpenAI GPT-4
   - Claude 3.5 (Anthropic's API)
   - QuePasa LLMs (Qwen, Claude Sonnet, DeepSeek, etc.)
5) Storing results to an Excel file with column names that reflect chosen settings.

Author: [PLADIFES]
Date: [17_02_2025]
"""

import os
import time
import pickle
import re
import pandas as pd
from typing import List, Optional, Tuple, Union

# -------------- LangChain & Transformers Imports --------------
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.prompts import PromptTemplate

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

# -------------- Cross-Encoder Re-ranking --------------
from sentence_transformers import CrossEncoder

# -------------- QuePasa Imports --------------
import quepasa
from quepasa.rest import ApiException

# -------------- OpenAI / Anthropic (Claude) --------------
from openai import OpenAI
import anthropic

# ----------------------- Emissions -----------------------
from codecarbon import EmissionsTracker
from ecologits import EcoLogits

# -------------- Constants and Configuration --------------
# Input workbook, output workbook path, and the worksheet name used with them.
XLSX_PATH = "/kaggle/input/cfb-vector-stores/Climate Finance Bench - Dataset.xlsx"
OUTPUT_XLSX = "./Climate Finance Bench - Dataset_with_answers.xlsx"
EXCEL_SHEET_NAME = "Annotations"

# Root directory containing the two styles of vector stores: "docling" and "langchain".
# Per-company stores are looked up at <root>/<style>/<SP500|CAC40|Other>/<company>.
FAISS_DB_ROOT = "/kaggle/input/cfb-vector-stores/FAISS_DB"

# If using a shared store, it lives under:
#   /kaggle/input/cfb-vector-stores/FAISS_DB/docling/GLOBAL_DB  (Docling)
#   /kaggle/input/cfb-vector-stores/FAISS_DB/langchain/GLOBAL_DB (LangChain)
# (We will switch based on the vector_store_style parameter.)
GLOBAL_DB_SUBFOLDER = "GLOBAL_DB"

TARGET_PDF_DIRECTORY = "/kaggle/input/sustainability-reports/sustain_reports/"

EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2" # alternative: "intfloat/multilingual-e5-large"
CROSS_ENCODER_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-12-v2"

# ------------------ Retrieval & Ranking Parameters ------------------
TOP_K = 20  # Maximum number of chunks retrieved from each method (FAISS/BM25).
KEEP_FIRST_N = 8  # Retrieved docs kept in their original order, ahead of the re-ranked additions.
ADD_AFTER_RERANK = 4  # Extra docs taken from the top of the re-ranked list.
# Total final doc count is at most KEEP_FIRST_N + ADD_AFTER_RERANK = 12

# ------------------ Generation Parameters ------------------
# Defaults of the API-backed generators (Nebius, OpenAI, Claude); the local Nemo/LLaMA
# generators declare their own defaults (512 new tokens, temperature 0.2).
RAG_MAX_TOKENS = 256  # Max number of tokens (or new tokens) in generation
RAG_TEMPERATURE = 1

# -------------- Model choices --------------
MODEL_CHOICE = "nemo"  # "nemo", "llama", "openai", "claude", "QuePasa"
LLAMA_MODEL_PATH = "meta-llama/Llama-3.1-8B-Instruct"
NEMO_MODEL_PATH = "mistralai/Mistral-Nemo-Instruct-2407"

# -------------- Vector Store choices --------------
VECTOR_STORE_STYLE = "docling"  # "docling" or "langchain"
STORE_SCOPE = "single"          # "single" (per-company store) or "shared" (GLOBAL_DB)
RETRIEVAL_MODE = "hybrid"       # "minimal" (FAISS only) or "hybrid" (FAISS + BM25)
DO_RERANK = True

# -------------- Prompt templates --------------
RAG_PROMPT_TEMPLATE = """
You're a documentary assistant.
Answer the question indicated between <<< and >>> about the company "{company}" based on the context provided below extracted from climate or sustainability reports.
Do not add any additional notes. 
If the answer to the question is missing from the provided context and you cannot conclude on it on your own, indicate this sincerely.

Here are three examples of the format to follow in your reply as an AI assistant:
###
Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: Yes, the company aims to become net zero by 2010, on their Scopes 2 and 3 emissions.

Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: No, the company clarifies its will to not achieve net zero.

Human: Does the company disclose a Transition Plan for FY2023? If yes, highlight its main characteristics.
AI: Not available in the retrieved information.
###

Here are excerpts from documents that may contain the information, serving as context to help you:
###
{context}
###

Here's the question asked by the user:
Question: <<< {question} >>>

Read the instructions again:

You're a documentary assistant.
Answer the question indicated between <<< and >>> about the company "{company}" based on the context provided below extracted from climate or sustainability reports.
Do not add any additional notes. 
If the answer to the question is missing from the provided context and you cannot conclude on it on your own, indicate this sincerely.

Here are three examples of the format to follow in your reply as an AI assistant:
###
Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: Yes, the company aims to become net zero by 2010, on their Scopes 2 and 3 emissions.

Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: No, the company clarifies its will to not achieve net zero.

Human: Does the company disclose a Transition Plan for FY2023? If yes, highlight its main characteristics.
AI: Not available in the retrieved information.
###

Here's the question asked by the user:
Question: <<< {question} >>>

[Your answer here]
"""

RAG_PROMPT_TEMPLATE_NEMO = """
[INST]
You're a documentary assistant.
Answer the question indicated between <<< and >>> about the company "{company}" based on the context provided below extracted from climate or sustainability reports.
Do not add any additional notes. 
If the answer to the question is missing from the provided context and you cannot conclude on it on your own, indicate this sincerely.

Here are three examples of the format to follow in your reply as an AI assistant:
###
Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: Yes, the company aims to become net zero by 2010, on their Scopes 2 and 3 emissions.

Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: No, the company clarifies its will to not achieve net zero.

Human: Does the company disclose a Transition Plan for FY2023? If yes, highlight its main characteristics.
AI: Not available in the retrieved information.
###

Here's the question asked by the user:
Question: <<< {question} >>>

Read the instructions again:

You're a documentary assistant.
Answer the question indicated between <<< and >>> about the company "{company}" based on the context provided below extracted from climate or sustainability reports.
Do not add any additional notes. 
If the answer to the question is missing from the provided context and you cannot conclude on it on your own, indicate this sincerely.

Here are three examples of the format to follow in your reply as an AI assistant:
###
Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: Yes, the company aims to become net zero by 2010, on their Scopes 2 and 3 emissions.

Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: No, the company clarifies its will to not achieve net zero.

Human: Does the company disclose a Transition Plan for FY2023? If yes, highlight its main characteristics.
AI: Not available in the retrieved information.
###

Here are excerpts from documents that may contain the information, serving as context to help you:
###
{context}
###

Here's the question asked by the user:
Question: <<< {question} >>>

[Your answer here]
[/INST]
"""

API_PROMPT_TEMPLATE = """\
You are a documentary assistant.
Answer the question about the mentioned company based on the provided context that was extracted from climate or sustainability reports.
Do not add any additional notes. 
If the answer to the question is missing from the provided context and you cannot conclude on it on your own, indicate this sincerely.

Here are three examples of the format to follow in your reply:
###
Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: Yes, the company aims to become net zero by 2010, on their Scopes 2 and 3 emissions.

Human: Does the company have a climate change mitigation objective for FY2023? If yes, specify it.
AI: No, the company clarifies its will to not achieve net zero.

Human: Does the company disclose a Transition Plan for FY2023? If yes, highlight its main characteristics.
AI: Not available in the retrieved information.
###
"""

# --------------------------------------------------------------------------
#                          CROSS-ENCODER LOGIC
# --------------------------------------------------------------------------
def local_rerank(query: str, docs: List[Document], cross_encoder_model: CrossEncoder) -> List[Document]:
    """
    Order documents by cross-encoder relevance to the query, best first.

    Each document's ``page_content`` is paired with the query and scored through
    ``cross_encoder_model.predict``; documents are then sorted by descending score.

    :param query: The user query/question.
    :param docs: Candidate Document objects.
    :param cross_encoder_model: Scoring model exposing ``predict(pairs)``.
    :return: A new list sorted by descending score, or ``docs`` itself (unchanged)
             when the model is None or there are no documents.
    """
    # Nothing to score: hand the input back untouched.
    if cross_encoder_model is None or not docs:
        return docs

    relevance = cross_encoder_model.predict([(query, doc.page_content) for doc in docs])

    ranked_pairs = list(zip(docs, relevance))
    ranked_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked_pairs]

def select_final_documents(
    question: str,
    docs: List[Document],
    cross_encoder_model: Optional[CrossEncoder] = None,
    top_k: int = TOP_K,
    keep_first_n: int = KEEP_FIRST_N,
    add_after_rerank: int = ADD_AFTER_RERANK
) -> List[Document]:
    """
    Build the final context set from retrieved documents.

    Steps:
      1) Keep only the first ``top_k`` retrieved documents.
      2) Take the first ``keep_first_n`` of those in retrieval order.
      3) Re-rank the ``top_k`` documents with the cross-encoder when one is given
         (otherwise keep retrieval order).
      4) Append up to ``add_after_rerank`` documents from that ranking that are not
         already selected.

    Two documents count as the same when their (page_content, filename, page) triple matches,
    so the result holds at most ``keep_first_n + add_after_rerank`` unique documents.

    :param question: The user question (used for re-ranking).
    :param docs: Document objects returned by the retriever.
    :param cross_encoder_model: Optional cross-encoder used for re-ranking.
    :param top_k: Maximum number of retrieved docs to consider.
    :param keep_first_n: Number of docs preserved in original order.
    :param add_after_rerank: Number of extra docs taken from the re-ranked list.
    :return: The selected Document objects.
    """
    def _identity(doc):
        # Deduplication key: content plus the source file and page it came from.
        info = doc.metadata or {}
        return (doc.page_content, info.get("filename", ""), info.get("page", ""))

    candidates = docs[:top_k]
    by_relevance = (
        local_rerank(question, candidates, cross_encoder_model)
        if cross_encoder_model
        else candidates
    )

    selected = []
    taken = set()

    # Pass 1: the head of the retrieval order.
    for doc in candidates[:keep_first_n]:
        key = _identity(doc)
        if key in taken:
            continue
        taken.add(key)
        selected.append(doc)

    # Pass 2: top of the (re-)ranked list, skipping anything already selected.
    remaining_slots = add_after_rerank
    for doc in by_relevance:
        if remaining_slots <= 0:
            break
        key = _identity(doc)
        if key in taken:
            continue
        taken.add(key)
        selected.append(doc)
        remaining_slots -= 1

    return selected


def load_cross_encoder(model_name: str = CROSS_ENCODER_MODEL_NAME) -> Optional[CrossEncoder]:
    """
    Instantiate the sentence-transformers cross-encoder used for re-ranking, on CPU.

    :param model_name: Model identifier passed to ``CrossEncoder``.
    :return: The CrossEncoder instance, or None when construction raises
             (the error is printed rather than propagated).
    """
    print(f"[INFO] Loading cross-encoder model: {model_name}")
    try:
        encoder = CrossEncoder(model_name, device="cpu")
    except Exception as e:
        print(f"[ERROR] Could not load cross-encoder: {e}")
        return None
    return encoder

# --------------------------------------------------------------------------
#                          VECTOR STORE LOADING
# --------------------------------------------------------------------------
def load_local_retriever_for_company(
    company_name: str,
    vector_store_style: str,
    retrieval_mode: str = "hybrid",
    embedding_model: str = EMBEDDING_MODEL
) -> Optional[Union[BM25Retriever, EnsembleRetriever]]:
    """
    Load the per-company retriever for ``company_name``.

    The store folder is looked up at
    ``FAISS_DB_ROOT/<vector_store_style>/<index>/<company_name>`` with ``<index>`` tried in
    the order SP500, CAC40, Other.

      - retrieval_mode="minimal": FAISS retriever (MMR search) only.
      - any other value: EnsembleRetriever of FAISS (weight 0.75) and the pickled BM25
        retriever (weight 0.25); falls back to FAISS only when the BM25 pickle is missing.

    Both retrievers are configured to return ``TOP_K`` documents.

    :param company_name: Name of the company folder under SP500/CAC40/Other.
    :param vector_store_style: Either "docling" or "langchain".
    :param retrieval_mode: Either "minimal" or "hybrid".
    :param embedding_model: HF embedding model for the FAISS store (loaded on CPU).
    :return: A FAISS retriever, an EnsembleRetriever, or None when no store folder exists.
    """
    # Locate the company's store under one of the known index folders.
    found_path = None
    for index_name in ("SP500", "CAC40", "Other"):
        path_candidate = os.path.join(FAISS_DB_ROOT, vector_store_style, index_name, company_name)
        if os.path.isdir(path_candidate):
            found_path = path_candidate
            break

    if not found_path:
        print(f"[WARNING] No FAISS folder found for '{company_name}' with style '{vector_store_style}'.")
        return None

    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model,
        model_kwargs={"device": "cpu", "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )
    # Load the FAISS store
    db = FAISS.load_local(found_path, embeddings, allow_dangerous_deserialization=True)

    # The result count must be passed inside `search_kwargs`: a bare `k=` keyword to
    # `as_retriever` is not a retriever setting, so the store's default count would be used
    # instead of TOP_K.
    # NOTE(review): MMR selects from a `fetch_k` candidate pool; confirm whether that pool
    # should be made larger than TOP_K.
    vector_retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": TOP_K})

    if retrieval_mode == "minimal":
        print(f"[INFO] Using minimal FAISS retriever for '{company_name}' (style={vector_store_style}).")
        return vector_retriever

    # Attempt to load BM25 for the ensemble
    retriever_pickle = os.path.join(found_path, "retrievers", "keyword_retriever.pkl")
    if not os.path.isfile(retriever_pickle):
        print(f"[WARNING] No BM25 found for '{company_name}'. Using FAISS only.")
        return vector_retriever

    # SECURITY: unpickling executes code embedded in the file; only load stores from a
    # trusted dataset.
    with open(retriever_pickle, "rb") as f:
        bm25_retriever: BM25Retriever = pickle.load(f)
    bm25_retriever.k = TOP_K

    ensemble = EnsembleRetriever(
        retrievers=[vector_retriever, bm25_retriever],
        weights=[0.75, 0.25]
    )
    print(f"[INFO] Using hybrid (FAISS+BM25) retriever for '{company_name}' (style={vector_store_style}).")
    return ensemble

def load_global_retriever(
    vector_store_style: str,
    retrieval_mode: str = "hybrid",
    embedding_model: str = EMBEDDING_MODEL
) -> Optional[Union[BM25Retriever, EnsembleRetriever]]:
    """
    Load the shared retriever stored under ``FAISS_DB_ROOT/<vector_store_style>/GLOBAL_DB``.

      - retrieval_mode="minimal": FAISS retriever (MMR search) only.
      - any other value: EnsembleRetriever of FAISS (weight 0.75) and the pickled BM25
        retriever (weight 0.25); falls back to FAISS only when the BM25 pickle is missing.

    Both retrievers are configured to return ``TOP_K`` documents.

    :param vector_store_style: "docling" or "langchain".
    :param retrieval_mode: "minimal" or "hybrid".
    :param embedding_model: HF embedding model for the FAISS store.
    :return: A FAISS retriever, an EnsembleRetriever, or None when the folder does not exist.
    """
    global_folder = os.path.join(FAISS_DB_ROOT, vector_store_style, GLOBAL_DB_SUBFOLDER)
    if not os.path.isdir(global_folder):
        print(f"[ERROR] No global DB found at {global_folder}.")
        return None

    # NOTE(review): no explicit device is requested here, whereas the per-company loader
    # pins the embedding model to CPU; confirm which placement is intended.
    embeddings = HuggingFaceEmbeddings(
        model_name=embedding_model,
        model_kwargs={"trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )
    db = FAISS.load_local(global_folder, embeddings, allow_dangerous_deserialization=True)

    # The result count must be passed inside `search_kwargs`: a bare `k=` keyword to
    # `as_retriever` is not a retriever setting, so the store's default count would be used
    # instead of TOP_K.
    vector_retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": TOP_K})

    if retrieval_mode == "minimal":
        print(f"[INFO] Using minimal global retriever (FAISS only) for style={vector_store_style}.")
        return vector_retriever

    retriever_pickle = os.path.join(global_folder, "retrievers", "keyword_retriever.pkl")
    if not os.path.isfile(retriever_pickle):
        print(f"[WARNING] No BM25 found in global store (style={vector_store_style}). Using FAISS only.")
        return vector_retriever

    # SECURITY: unpickling executes code embedded in the file; only load stores from a
    # trusted dataset.
    with open(retriever_pickle, "rb") as f:
        bm25_retriever: BM25Retriever = pickle.load(f)
    bm25_retriever.k = TOP_K

    ensemble = EnsembleRetriever(
        retrievers=[vector_retriever, bm25_retriever],
        weights=[0.75, 0.25]
    )
    print(f"[INFO] Using hybrid shared retriever (FAISS+BM25) for style={vector_store_style}.")
    return ensemble

# --------------------------------------------------------------------------
#                Combine Documents into a Single String
# --------------------------------------------------------------------------
# Per-document layout used when building the context string: three metadata header lines
# (company, filename, heading) followed by the chunk text.
# format_document supplies "N/A" for any metadata key that is missing.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(
    template=(
        "<Company: {company}>\n"
        "<Filename: {filename}>\n"
        "<Heading: {heading}>\n"
        "<Content>:\n{page_content}"
    )
)

def format_document(doc: Document, document_prompt: PromptTemplate) -> str:
    """
    Render one Document through ``document_prompt``.

    The template receives ``company``, ``filename`` and ``heading`` from the document's
    metadata ("N/A" for any key that is absent, or when metadata is empty/None) together
    with the document's ``page_content``.

    :param doc: The Document to render.
    :param document_prompt: A PromptTemplate with the placeholders listed above.
    :return: The formatted string for this Document.
    """
    metadata = doc.metadata or {}
    fields = {name: metadata.get(name, "N/A") for name in ("company", "filename", "heading")}
    fields["page_content"] = doc.page_content
    return document_prompt.format(**fields)

def combine_documents(
    docs: List[Document],
    document_prompt: PromptTemplate = DEFAULT_DOCUMENT_PROMPT,
    document_separator: str = "\n\n### NEW SOURCE ###\n\n"
) -> str:
    """
    Render every document with ``format_document`` and join the results into one string.

    :param docs: Document objects to include, in the order they should appear.
    :param document_prompt: PromptTemplate applied to each Document.
    :param document_separator: Text inserted between consecutive rendered documents.
    :return: The concatenated context string (empty when ``docs`` is empty).
    """
    rendered_sources = []
    for source in docs:
        rendered_sources.append(format_document(source, document_prompt))
    return document_separator.join(rendered_sources)

# --------------------------------------------------------------------------
#              Unified Model Loading for Nemo / LLaMA
# --------------------------------------------------------------------------
def load_local_transformers_model(
    model_name: str,
    approach: str = "nemo",
    device_map: str = "auto",
    quantize: bool = True
) -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer]]:
    """
    Load a causal LM and its tokenizer from the Hugging Face hub.

    With ``quantize=True`` the model is loaded in 4-bit through BitsAndBytes (NF4 quant type,
    float16 compute dtype, double quantization disabled). Otherwise it is loaded without
    quantization, in float16 when CUDA is available and with the library default dtype if not.
    Load failures are printed and reported as ``(None, None)`` instead of raising.

    :param model_name: The HF model repo ID (e.g. "meta-llama/Llama-3.1-8B-Instruct").
    :param approach: Label for the model family; only used in the log message.
    :param device_map: Device placement passed to ``from_pretrained``.
    :param quantize: True for 4-bit quantized loading, False for a standard load.
    :return: (model, tokenizer), or (None, None) if loading fails.
    """
    if not quantize:
        print(f"[INFO] Loading local model '{model_name}' with *no* quantization.")
        # Half precision on GPU; None leaves the dtype choice to the library.
        torch_dtype = None
        if torch.cuda.is_available():
            torch_dtype = torch.float16

        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map=device_map,
                torch_dtype=torch_dtype
            )
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        except Exception as e:
            print(f"[ERROR] Failed to load non-quantized model: {e}")
            return None, None
        else:
            print("[INFO] Non-quantized model loaded successfully.")
            return model, tokenizer

    print(f"[INFO] Loading local model '{model_name}' with approach='{approach}' in 4-bit quant.")
    # BitsAndBytes 4-bit settings
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quant_config,
            device_map=device_map
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    except Exception as e:
        print(f"[ERROR] Failed to load 4-bit quantized model: {e}")
        return None, None
    else:
        print("[INFO] Local 4-bit model loaded successfully.")
        return model, tokenizer

# --------------------------------------------------------------------------
#     Nebius (DeepSeek R1 and Qwen 2.5), GPT-4o (OpenAI) and Claude 3.5 (Anthropic) Generation
# --------------------------------------------------------------------------
def generate_answer_nubius(
    question: str,
    context: str,
    company: str,
    nebius_api_key: str,
    model_name: str = "deepseek-ai/DeepSeek-R1",  # or "Qwen/Qwen2.5-72B-Instruct"
    temperature: float = RAG_TEMPERATURE,
    max_tokens: int = RAG_MAX_TOKENS,
    system_instructions: str = API_PROMPT_TEMPLATE
) -> str:
    """
    Generate an answer using Nebius's API. The interface is very similar to OpenAI's:
      client = OpenAI(base_url=..., api_key=...)
      client.chat.completions.create(...)

    For "deepseek-ai/DeepSeek-R1" the token budget is multiplied by 20 and every
    ``<think>...</think>`` section is removed from the reply, including a trailing section
    that was cut off before its closing tag.

    :param question: The user question.
    :param context: The retrieved context as a string.
    :param company: Company name for the answer prompt.
    :param nebius_api_key: Your Nebius API key.
    :param model_name: The Nebius model name, e.g. 'deepseek-ai/DeepSeek-R1' or 'Qwen/Qwen2.5-72B-Instruct'.
    :param temperature: Sampling temperature for generation.
    :param max_tokens: Maximum tokens for the model to generate.
    :param system_instructions: System prompt sent ahead of the user message.
    :return: The generated answer string (empty if the API returned no content),
             or an "[ERROR] ..." message on failure.
    """
    if not nebius_api_key:
        return "[ERROR] No NEBIUS_API_KEY provided."

    # Create a Nebius-based client – same usage as openai.OpenAI, just with the base_url changed
    client = OpenAI(
        base_url="https://api.studio.nebius.com/v1/",
        api_key=nebius_api_key,
    )

    # Build the prompt
    user_prompt = (
        f"""\
        Here are excerpts from documents about the company {company}:
        ###
        {context}
        ###
        
        Here's the question asked by the user:
        Question: <<< {question} >>>
        """)

    is_reasoning_model = model_name == "deepseek-ai/DeepSeek-R1"
    if is_reasoning_model:
        # The reply carries a <think> section (stripped below), so it gets a larger budget.
        max_tokens = max_tokens * 20

    # API call
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_instructions},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=max_tokens)
        # `content` can be None; normalise so the function always returns a string.
        answer_text = response.choices[0].message.content or ""
        if is_reasoning_model:
            # `(?:</think>|\Z)` also consumes an unterminated <think> section (output truncated
            # by the token limit) so reasoning text never leaks into the answer.
            answer_text = re.sub(r"<think>[\s\S]*?(?:</think>|\Z)", "", answer_text).strip()
        return answer_text
    except Exception as e:
        return f"[ERROR] Nebius {model_name} call failed: {e}"

def generate_answer_openai(
    question: str,
    context: str,
    company: str,
    openai_api_key: str,
    model_name: str = "gpt-4o",
    temperature: float = RAG_TEMPERATURE,
    max_tokens: int = RAG_MAX_TOKENS,
    system_instructions: str = API_PROMPT_TEMPLATE
) -> str:
    """
    Generate an answer using the OpenAI chat completions API (e.g. GPT-4o).

    The key passed in ``openai_api_key`` is the one used for the request. After a successful
    call, energy and GHG estimates are printed when the response carries an ``impacts``
    attribute; their absence never affects the returned answer.

    :param question: The user question.
    :param context: The retrieved context as a string.
    :param company: Company name for the answer prompt.
    :param openai_api_key: Your OpenAI API key.
    :param model_name: The OpenAI model name (e.g. "gpt-4o").
    :param temperature: Sampling temperature for generation.
    :param max_tokens: Maximum tokens for the model to generate.
    :param system_instructions: System prompt sent ahead of the user message.
    :return: The generated answer string or an "[ERROR] ..." message on failure.
    """
    if not openai_api_key:
        return "[ERROR] No OPENAI_API_KEY provided."

    # Use the key the caller supplied instead of silently relying on the environment.
    client = OpenAI(api_key=openai_api_key)

    # Build the prompt
    user_prompt = (
        f"""\
        Here are excerpts from documents about the company {company}:
        ###
        {context}
        ###
        
        Here's the question asked by the user:
        Question: <<< {question} >>>
        """)

    # API call
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": system_instructions},
                {"role": "user", "content": user_prompt}
            ],
            temperature=temperature,
            max_tokens=max_tokens)
        answer_text = response.choices[0].message.content
    except Exception as e:
        return f"[ERROR] OpenAI {model_name} call failed: {e}"

    # Impact reporting is best-effort and kept outside the try block: a response without
    # impact data must not turn a valid answer into an error string.
    # NOTE(review): `impacts` is presumably attached by EcoLogits once it has been
    # initialised elsewhere in the notebook — confirm.
    impacts = getattr(response, "impacts", None)
    energy = getattr(impacts, "energy", None)
    gwp = getattr(impacts, "gwp", None)
    if energy is not None and gwp is not None:
        print(f"Energy consumption: {energy.value} kWh")
        print(f"GHG emissions: {gwp.value} kgCO2eq")

    return answer_text

def generate_answer_claude(
    question: str,
    context: str,
    company: str,
    anthropic_api_key: str,
    model_name: str = "claude-3-5-sonnet-20241022",
    temperature: float = RAG_TEMPERATURE,
    max_tokens: int = RAG_MAX_TOKENS,
    system_instructions: str = API_PROMPT_TEMPLATE
) -> str:
    """
    Generate an answer using Anthropic's Claude via the official 'anthropic' library.

    The key passed in ``anthropic_api_key`` is the one used for the request. After a
    successful call, energy and GHG estimates are printed when the response carries an
    ``impacts`` attribute; their absence never affects the returned answer.

    :param question: The user question.
    :param context: The retrieved context as a string.
    :param company: Company name for the answer prompt.
    :param anthropic_api_key: Your Anthropic API key.
    :param model_name: The Claude model name (e.g. "claude-3-5-sonnet-20241022").
    :param temperature: Sampling temperature for generation.
    :param max_tokens: Maximum tokens for Claude to generate.
    :param system_instructions: System prompt passed as the ``system`` argument.
    :return: The generated answer string or an "[ERROR] ..." message on failure.
    """
    if not anthropic_api_key:
        return "[ERROR] No ANTHROPIC_API_KEY provided."

    # Use the key the caller supplied instead of silently relying on the environment.
    client = anthropic.Client(api_key=anthropic_api_key)
    user_prompt = (
        f"""\
        Here are excerpts from documents about the company {company}:
        ###
        {context}
        ###
        
        Here's the question asked by the user:
        Question: <<< {question} >>>
        """)

    try:
        response = client.messages.create(
            system=system_instructions,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": user_prompt
                        }
                    ]
                }
            ],
            model=model_name,
            max_tokens=max_tokens,
            temperature=temperature)
        answer_text = response.content[0].text
    except Exception as e:
        return f"[ERROR] Claude call failed: {e}"

    # Impact reporting is best-effort and kept outside the try block: a response without
    # impact data must not turn a valid answer into an error string.
    # NOTE(review): `impacts` is presumably attached by EcoLogits once it has been
    # initialised elsewhere in the notebook — confirm.
    impacts = getattr(response, "impacts", None)
    energy = getattr(impacts, "energy", None)
    gwp = getattr(impacts, "gwp", None)
    if energy is not None and gwp is not None:
        print(f"Energy consumption: {energy.value} kWh")
        print(f"GHG emissions: {gwp.value} kgCO2eq")

    return answer_text

# --------------------------------------------------------------------------
#     Nemo / LLaMA Generation
# --------------------------------------------------------------------------
def generate_answer_rag_nemo(
    question: str,
    context: str,
    company: str,
    model: Optional[AutoModelForCausalLM],
    tokenizer: Optional[AutoTokenizer],
    device: str = "cuda",
    max_new_tokens: int = 512,
    temperature: float = 0.2
) -> str:
    """
    Answer a question with the local Nemo model, prompted through RAG_PROMPT_TEMPLATE_NEMO.

    The filled prompt is tokenized, moved to ``device`` and passed to ``model.generate`` with
    sampling disabled (``do_sample=False``); only the tokens produced after the prompt are
    decoded and returned.

    :param question: The user question.
    :param context: The retrieved context to help answer.
    :param company: Company name for the prompt.
    :param model: The local Nemo model instance (CausalLM).
    :param tokenizer: The corresponding tokenizer.
    :param device: The device the tokenized prompt is moved to ("cuda" or "cpu").
    :param max_new_tokens: Max new tokens to generate.
    :param temperature: Forwarded to ``generate`` as-is.
        NOTE(review): sampling is off, so confirm whether this value has any effect.
    :return: The generated answer string or an error message if model not loaded.
    """
    if tokenizer is None or model is None:
        return "[ERROR] Nemo model not loaded."

    filled_prompt = RAG_PROMPT_TEMPLATE_NEMO.format(company=company, question=question, context=context)
    encoded = tokenizer(filled_prompt, return_tensors="pt").to(device)
    # Length of the prompt in tokens, used to slice the prompt off the generated sequence.
    n_prompt_tokens = encoded["input_ids"].shape[1]

    generated = model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        temperature=temperature
    )
    completion_ids = generated[0][n_prompt_tokens:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

def _llama_stop_token_ids(tokenizer) -> List[int]:
    """Return the token ids that end generation: EOS plus "<|eot_id|>" when the vocabulary has it."""
    stop_ids = [tokenizer.eos_token_id]
    # get_vocab() is part of the common tokenizer API, whereas the `.vocab`
    # attribute is not available on every tokenizer class.
    stop_ids.append(tokenizer.get_vocab().get("<|eot_id|>"))
    return [token_id for token_id in stop_ids if token_id is not None]


def generate_answer_rag_llama(
    question: str,
    context: str,
    company: str,
    model: Optional[AutoModelForCausalLM],
    tokenizer: Optional[AutoTokenizer],
    device: str = "cuda",
    max_new_tokens: int = 512,
    temperature: float = 0.2,
    system_content: str = API_PROMPT_TEMPLATE,
    top_p: float = 0.8,
) -> str:
    """
    Generates the final answer using a structured chat approach for Llama-based models.

    The prompt is built with the tokenizer's chat template when the tokenizer
    exposes ``apply_chat_template``; otherwise a plain
    "system / User / Assistant" text prompt is used. Sampling is enabled only
    for a strictly positive ``temperature``; with ``temperature <= 0`` the
    function decodes greedily instead of failing.

    :param question: The user question.
    :param context: The retrieved context to help answer.
    :param company: Company name for the prompt.
    :param model: The local LLaMA model instance (CausalLM).
    :param tokenizer: The corresponding tokenizer.
    :param device: The device to use ("cuda" or "cpu").
    :param max_new_tokens: Maximum tokens to generate after the prompt.
    :param temperature: Sampling temperature (<= 0 switches to greedy decoding).
    :param system_content: System message placed before the user message.
    :param top_p: Nucleus sampling top-p parameter.
    :return: The generated answer string, or an "[ERROR] ..." message if the
             model is not loaded or generation fails.
    """
    if model is None or tokenizer is None:
        return "[ERROR] Llama model or tokenizer not loaded."

    # -----------------------------
    # 1) User content
    # -----------------------------
    user_content = (
        f"Company: {company}\n\n"
        f"Context:\n{context}\n\n"
        f"Question: {question}"
    )

    # -----------------------------
    # 2) Build the conversation
    # -----------------------------
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user",   "content": user_content},
    ]

    try:
        # -----------------------------
        # 3) Encode the prompt
        # -----------------------------
        chat_template_fn = getattr(tokenizer, "apply_chat_template", None)
        if chat_template_fn is not None:
            encoded = chat_template_fn(
                messages,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(device)
            # Depending on the library version the result is either the bare
            # input_ids tensor or a mapping (input_ids, attention_mask, ...).
            model_inputs = dict(encoded) if hasattr(encoded, "keys") else {"input_ids": encoded}
        else:
            print("[WARNING] The tokenizer does not support apply_chat_template. Using a fallback prompt.")
            fallback_prompt = (f"{system_content}\n\nUser: {user_content}\n\nAssistant:")
            model_inputs = dict(tokenizer(fallback_prompt, return_tensors="pt").to(device))
        prompt_length = model_inputs["input_ids"].shape[-1]

        # -----------------------------
        # 4) Generation settings
        # -----------------------------
        generation_kwargs = {"max_new_tokens": max_new_tokens}
        if temperature is not None and temperature > 0:
            generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
        else:
            # Sampling requires a strictly positive temperature; fall back to greedy.
            generation_kwargs["do_sample"] = False

        stop_ids = _llama_stop_token_ids(tokenizer)
        if stop_ids:
            generation_kwargs["eos_token_id"] = stop_ids
        if tokenizer.eos_token_id is not None:
            generation_kwargs["pad_token_id"] = tokenizer.eos_token_id

        outputs = model.generate(**model_inputs, **generation_kwargs)

        # generate() returns prompt + continuation; keep only the continuation.
        generated_toks = outputs[0][prompt_length:]
        return tokenizer.decode(generated_toks, skip_special_tokens=True).strip()

    except Exception as e:
        return f"[ERROR] Llama generation error: {e}"

# --------------------------------------------------------------------------
#          QuePasa path (per-company or shared domain)
# --------------------------------------------------------------------------
def find_pdf_for_company(company_name: str) -> Optional[str]:
    """
    Look for a PDF in subfolders SP500/CAC40/Other/[company_name] of
    TARGET_PDF_DIRECTORY.

    Index folders are tried in the order SP500, CAC40, Other. Within a company
    folder the entries are scanned in sorted name order, so the same PDF is
    chosen on every run even when the folder holds several. Only regular files
    whose name ends with ".pdf" (case-insensitive) qualify.

    :param company_name: The company name to search for.
    :return: Path to the first PDF found or None if none is found.
    """
    for index_name in ("SP500", "CAC40", "Other"):
        company_dir = os.path.join(TARGET_PDF_DIRECTORY, index_name, company_name)
        if not os.path.isdir(company_dir):
            continue
        # os.listdir() order is arbitrary; sort to make the choice reproducible.
        for file_name in sorted(os.listdir(company_dir)):
            candidate = os.path.join(company_dir, file_name)
            if file_name.lower().endswith(".pdf") and os.path.isfile(candidate):
                return candidate
    return None

def _wait_for_quepasa_batch(
    client,
    batch_id,
    company_label: str,
    poll_seconds: float = 10,
    timeout_seconds: float = 1800,
) -> bool:
    """Poll the QuePasa batch status until it reports done; False on API error or timeout."""
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        time.sleep(poll_seconds)
        try:
            status_resp = client.get_batch_status(batch_id)
        except ApiException as e:
            print(f"[ERROR] Checking batch status: {e}")
            return False
        print(f"[QuePasa] Batch status for '{company_label}': {status_resp.status}")
        if status_resp.status == 'Batch state: done':
            return True
    print(f"[WARNING] QuePasa batch for '{company_label}' not done after {timeout_seconds}s. Continuing anyway.")
    return False


def process_quepasa(
    df: pd.DataFrame,
    answer_col: str,
    context_col: str,
    quepasa_llm: str = "nebius:Qwen/Qwen2.5-72B-Instruct",
    quepasa_use_per_company_domain: bool = True,
    batch_poll_seconds: float = 10,
    batch_timeout_seconds: float = 1800
) -> pd.DataFrame:
    """
    For each company in df:
      1) Finds its PDF under SP500/CAC40/Other/ (if any).
      2) Upserts it to QuePasa, either under a single domain or a per-company domain.
      3) Queries the LLM for each unanswered question, storing answer/context in the specified columns.

    The answer/context columns are created when missing and normalised to text
    (empty cells, including NaN read back from Excel, become ""), so a row
    counts as "already answered" only when it holds non-blank text. Rows with a
    blank question are skipped. The dataframe is modified in place and written
    to OUTPUT_XLSX after every processed question.

    :param df: The dataframe containing 'Company's name' and 'Question'.
    :param answer_col: The column name to store answers.
    :param context_col: The column name to store used context.
    :param quepasa_llm: The ID of the QuePasa LLM to be used, e.g. "anthropic:claude-3-5-sonnet-20240620".
    :param quepasa_use_per_company_domain: Whether to use a unique domain for each company (True) or a single shared domain (False).
    :param batch_poll_seconds: Delay between two ingestion-status checks.
    :param batch_timeout_seconds: Maximum time to wait for an ingestion batch before
                                  querying anyway (prevents an endless wait on a batch
                                  that never reports done).
    :return: Updated dataframe with answers and contexts.
    """
    configuration = quepasa.Configuration(
        access_token=os.environ.get("QUEPASA_API_TOKEN", "YOUR_QUEPASA_API_TOKEN")
    )
    api_client = quepasa.ApiClient(configuration)
    client = quepasa.DefaultApi(api_client)

    # Domain used when a single shared domain serves every company.
    global_domain = "Climate_Report_Global"

    # We unify the retrieval with top_k = KEEP_FIRST_N + ADD_AFTER_RERANK
    # so it matches the local retrieval approach in chunk count.
    top_k_for_quepasa = KEEP_FIRST_N + ADD_AFTER_RERANK

    # Normalise the result columns to text. str(NaN) is "nan", which would
    # otherwise make every empty cell look like an existing answer.
    for col in (answer_col, context_col):
        if col not in df.columns:
            df[col] = ""
        df[col] = df[col].map(lambda cell: "" if pd.isna(cell) else str(cell))

    for comp_name, group_df in df.groupby("Company's name"):
        comp_name_str = str(comp_name)

        # Rows of this company that still need an answer.
        pending_rows = [idx for idx in group_df.index if not str(df.at[idx, answer_col]).strip()]
        if not pending_rows:
            print(f"[INFO] All questions for '{comp_name_str}' already answered. Skipping.")
            continue

        pdf_path = find_pdf_for_company(comp_name_str)
        if not pdf_path:
            print(f"[WARNING] No PDF for '{comp_name_str}'. Skipping this company.")
            continue

        # Per-company domain (one store per company) or the shared global domain.
        if quepasa_use_per_company_domain:
            domain = f"Climate report_{comp_name_str}"
        else:
            domain = global_domain

        print(f"\n[QuePasa] Upserting PDF for '{comp_name_str}': {pdf_path}")
        try:
            response_upsert = client.upsert_files(domain, pdf_path)
            batch_id = response_upsert.data.batch_id if response_upsert and response_upsert.data else None
        except ApiException as e:
            print(f"[ERROR] QuePasa upsert failed for '{comp_name_str}': {e}")
            continue

        # Wait (bounded) for the ingestion batch to complete. Questions are
        # asked afterwards even if the wait ended on an error or a timeout.
        if batch_id:
            _wait_for_quepasa_batch(
                client,
                batch_id,
                comp_name_str,
                poll_seconds=batch_poll_seconds,
                timeout_seconds=batch_timeout_seconds,
            )

        # Now retrieve & store answers
        for idx in pending_rows:
            raw_question = df.at[idx, "Question"]
            original_question = "" if pd.isna(raw_question) else str(raw_question).strip()
            if not original_question:
                continue
            question = f"For '{comp_name_str}', {original_question}"

            try:
                ans_resp = client.retrieve_answer(
                    {
                        "question": question,
                        "domain": domain,
                        "llm": quepasa_llm
                    }
                )
                ctx_resp = client.retrieve_chunks({'question': question, "domain": domain, 'top_k': top_k_for_quepasa})

                chunks_list = ctx_resp.data if (ctx_resp and ctx_resp.data) else []
                context_text = "\n\n#####".join([ch.text for ch in chunks_list])

                answer_text = ans_resp.data.markdown if (ans_resp and ans_resp.data) else ""
                df.at[idx, answer_col] = answer_text
                df.at[idx, context_col] = context_text

                # Save after each question
                df.to_excel(OUTPUT_XLSX, sheet_name=EXCEL_SHEET_NAME, index=False, engine='xlsxwriter')
                print(f"Domain: {domain}")
                print(f"Q: {question}")
                print(f"A: {answer_text[:200]}...")

            except ApiException as e:
                print(f"[ERROR] QuePasa API => {e}")
                df.at[idx, answer_col] = f"[ERROR] {e}"
                df.to_excel(OUTPUT_XLSX, sheet_name=EXCEL_SHEET_NAME, index=False, engine='xlsxwriter')

    return df

# --------------------------------------------------------------------------
#                   COLUMN NAME CONSTRUCTION
# --------------------------------------------------------------------------
def shorten_quepasa_llm_name(quepasa_llm: str) -> str:
    """
    Turn a QuePasa LLM ID into a label usable inside a column name by
    replacing every ':' and '/' with '_'.

    :param quepasa_llm: The QuePasa LLM string (e.g. "anthropic:claude-3-5-sonnet-20240620").
    :return: The sanitised label (e.g. "anthropic_claude-3-5-sonnet-20240620").
    """
    separators_to_underscore = str.maketrans({":": "_", "/": "_"})
    return quepasa_llm.translate(separators_to_underscore)

def build_column_names(
    vector_store_style: str,
    store_scope: str,
    model_choice: str,
    retrieval_mode: str,
    do_rerank: bool,
    quepasa_llm: str,
    quepasa_use_per_company_domain: bool
) -> Tuple[str, str]:
    """
    Derive the (answer, context) column names that encode the run settings.

    Both names share one label and differ only in their prefix
    ("Answer_" / "Used_Context_"):

    - model_choice == "QuePasa": the label is "QuePasa_<short llm name>_single"
      when a per-company domain is used, "QuePasa_<short llm name>_shared" otherwise.
    - any other model_choice: the label joins store_scope, vector_store_style,
      model_choice and retrieval_mode with "_"; for retrieval_mode == "hybrid"
      it additionally ends with "withRerank" or "noRerank" depending on do_rerank.

    :param vector_store_style: "docling" or "langchain".
    :param store_scope: "single" or "shared".
    :param model_choice: The model selector string; "QuePasa" selects the QuePasa naming scheme.
    :param retrieval_mode: "minimal" or "hybrid".
    :param do_rerank: Whether cross-encoder re-ranking is used.
    :param quepasa_llm: The selected QuePasa LLM string.
    :param quepasa_use_per_company_domain: True for one QuePasa domain per company, False for one shared domain.
    :return: A tuple of (answer_column_name, context_column_name).
    """
    if model_choice == "QuePasa":
        domain_tag = "single" if quepasa_use_per_company_domain else "shared"
        label = f"QuePasa_{shorten_quepasa_llm_name(quepasa_llm)}_{domain_tag}"
    else:
        label_parts = [store_scope, vector_store_style, model_choice, retrieval_mode]
        # The re-rank marker is only meaningful for hybrid retrieval.
        if retrieval_mode == "hybrid":
            label_parts.append("withRerank" if do_rerank else "noRerank")
        label = "_".join(label_parts)

    return f"Answer_{label}", f"Used_Context_{label}"

# --------------------------------------------------------------------------
#                             MAIN PIPELINE
# --------------------------------------------------------------------------
def _generate_answer_for_choice(
    model_choice: str,
    question: str,
    context_text: str,
    company: str,
    model,
    tokenizer,
    device: str
) -> str:
    """Route one question to the generation backend selected by model_choice and return its answer text."""
    if model_choice == "nemo_4bit":
        return generate_answer_rag_nemo(
            question=question,
            context=context_text,
            company=company,
            model=model,
            tokenizer=tokenizer,
            device=device,
            max_new_tokens=RAG_MAX_TOKENS,
            temperature=RAG_TEMPERATURE
        )
    if model_choice.startswith("llama"):
        return generate_answer_rag_llama(
            question=question,
            context=context_text,
            company=company,
            model=model,
            tokenizer=tokenizer,
            device=device,
            max_new_tokens=RAG_MAX_TOKENS,
            temperature=RAG_TEMPERATURE
        )
    if model_choice == "openai":
        return generate_answer_openai(
            question=question,
            context=context_text,
            company=company,
            openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
            model_name="gpt-4o",
            temperature=RAG_TEMPERATURE,
            max_tokens=RAG_MAX_TOKENS
        )
    if model_choice == "claude":
        return generate_answer_claude(
            question=question,
            context=context_text,
            company=company,
            anthropic_api_key=os.environ.get("ANTHROPIC_API_KEY", ""),
            model_name="claude-3-5-sonnet-20241022",
            temperature=RAG_TEMPERATURE,
            max_tokens=RAG_MAX_TOKENS
        )

    # The two Nebius-hosted choices differ only in the model they address.
    nebius_models = {
        "nebius_deepseek": "deepseek-ai/DeepSeek-R1",
        "nebius_qwen": "Qwen/Qwen2.5-72B-Instruct",
    }
    if model_choice in nebius_models:
        return generate_answer_nubius(
            question=question,
            context=context_text,
            company=company,
            nebius_api_key=os.environ.get("NEBIUS_API_KEY", ""),
            model_name=nebius_models[model_choice],
            temperature=RAG_TEMPERATURE,
            max_tokens=RAG_MAX_TOKENS
        )

    return "[ERROR] Unknown model choice in generation step."


def main(
    vector_store_style: str = VECTOR_STORE_STYLE,  # "docling" or "langchain"
    store_scope: str = STORE_SCOPE,                # "single" or "shared"
    model_choice: str = MODEL_CHOICE,              # "nemo_4bit", "llama_4bit", "llama_full", "openai", "claude", "nebius_deepseek", "nebius_qwen", "QuePasa"
    retrieval_mode: str = RETRIEVAL_MODE,          # "minimal" or "hybrid"
    do_rerank: bool = DO_RERANK,
    quepasa_llm: str = "nebius:Qwen/Qwen2.5-72B-Instruct",
    quepasa_use_per_company_domain: bool = True
):
    """
    Main pipeline for RAG-based query answering, writing answers to Excel.

    Steps:
      1) Builds dynamic column names based on the chosen settings (including any QuePasa model).
      2) Reads the Excel (XLSX_PATH) and makes sure the answer/context columns exist as text
         columns (empty cells, including NaN read back from Excel, become "").
      3) If using QuePasa, upserts PDFs & retrieves answers from that LLM, stores them in those columns.
      4) Otherwise, does a local RAG approach with docling/langchain store, optional re-rank,
         and a local or API-based LLM.
      5) Writes the dataframe to OUTPUT_XLSX after every processed row.

    Rows whose answer cell already holds non-blank text, and rows with a blank
    question, are skipped, so an interrupted run can be resumed.

    :param vector_store_style: Which index style to use: "docling" or "langchain".
    :param store_scope: Whether to use "single" store (per-company) or "shared" store (global).
    :param model_choice: The LLM approach: "nemo_4bit", "llama_4bit", "llama_full", "openai",
                         "claude", "nebius_deepseek", "nebius_qwen", or "QuePasa".
    :param retrieval_mode: "minimal" => FAISS only, "hybrid" => FAISS + BM25 ensemble.
    :param do_rerank: If True and retrieval_mode=="hybrid", use cross-encoder re-ranking.
    :param quepasa_llm: If model_choice=="QuePasa", the LLM ID for the QuePasa service.
    :param quepasa_use_per_company_domain: If model_choice=="QuePasa", True uses one QuePasa
                                           domain per company, False one shared domain.
    """

    EcoLogits.init(providers=["anthropic", "openai"])

    # 1) Build dynamic column names
    answer_col, context_col = build_column_names(
        vector_store_style=vector_store_style,
        store_scope=store_scope,
        model_choice=model_choice,
        retrieval_mode=retrieval_mode,
        do_rerank=do_rerank,
        quepasa_llm=quepasa_llm,
        quepasa_use_per_company_domain=quepasa_use_per_company_domain
    )

    # 2) Load the Excel sheet
    df = pd.read_excel(XLSX_PATH, sheet_name=EXCEL_SHEET_NAME)
    if "Company's name" not in df.columns or "Question" not in df.columns:
        print("[ERROR] XLSX must have columns 'Company's name' and 'Question'. Exiting.")
        return

    # Ensure the answer/context columns exist and hold text. Empty cells come
    # back from Excel as NaN, and str(NaN) == "nan" would make every unanswered
    # row look answered.
    for col in (answer_col, context_col):
        if col not in df.columns:
            df[col] = ""
        df[col] = df[col].map(lambda cell: "" if pd.isna(cell) else str(cell))

    # ========== If using QuePasa => skip local vector store logic ==========
    if model_choice == "QuePasa":
        print(f"[INFO] Running pipeline with QuePasa LLM = '{quepasa_llm}'.")
        # For QuePasa, we ignore local retrieval logic and do direct PDF ingestion + QA
        df = process_quepasa(
            df=df,
            answer_col=answer_col,
            context_col=context_col,
            quepasa_llm=quepasa_llm,
            quepasa_use_per_company_domain=quepasa_use_per_company_domain
        )
        print(f"[DONE] QuePasa pipeline. Results in '{OUTPUT_XLSX}'")
        return

    # ========== Otherwise, local RAG approach ==========
    if store_scope not in ("single", "shared"):
        print(f"[ERROR] Store scope '{store_scope}' not recognized. Exiting.")
        return

    # 3) If store_scope == "shared", load the shared retriever once
    retriever_shared = None
    if store_scope == "shared":
        print("[INFO] Loading shared/global retriever.")
        retriever_shared = load_global_retriever(
            vector_store_style=vector_store_style,
            retrieval_mode=retrieval_mode,
            embedding_model=EMBEDDING_MODEL
        )
        if retriever_shared is None:
            print("[ERROR] Could not load shared retriever. Exiting.")
            return

    # 4) Load the chosen LLM (only if we need it for Nemo or LLaMA)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, tokenizer = None, None

    if model_choice == "nemo_4bit":
        model, tokenizer = load_local_transformers_model(
            NEMO_MODEL_PATH,
            approach="nemo",
            device_map="auto",
            quantize=True
        )
    elif model_choice == "llama_4bit":
        model, tokenizer = load_local_transformers_model(
            LLAMA_MODEL_PATH,
            approach="llama",
            device_map="auto",
            quantize=True
        )
    elif model_choice == "llama_full":
        # LLaMA non-quantized
        model, tokenizer = load_local_transformers_model(
            LLAMA_MODEL_PATH,
            approach="llama",
            device_map="auto",
            quantize=False
        )
    elif model_choice in ("openai", "claude", "nebius_deepseek", "nebius_qwen"):
        pass  # We'll handle generation via API
    else:
        print(f"[ERROR] Model choice '{model_choice}' not recognized. Exiting.")
        return

    # 5) If "hybrid" + do_rerank, load a cross-encoder for re-ranking
    cross_encoder_model = None
    if retrieval_mode == "hybrid" and do_rerank:
        cross_encoder_model = load_cross_encoder(CROSS_ENCODER_MODEL_NAME)

    # 6) Iterate over each row in the Excel
    # Consecutive rows usually belong to the same company, so the most recently
    # loaded per-company retriever is reused instead of being reloaded per question.
    cached_company, cached_retriever = None, None

    for idx, row in df.iterrows():
        company = str(row["Company's name"]).strip()
        raw_question = row["Question"]
        question = "" if pd.isna(raw_question) else str(raw_question).strip()

        if not question:
            continue
        if str(row[answer_col]).strip() != "":
            # Already answered => skip
            continue

        # 6a) Retrieve documents (single-company or global)
        if store_scope == "single":
            if company != cached_company:
                cached_retriever = load_local_retriever_for_company(
                    company_name=company,
                    vector_store_style=vector_store_style,
                    retrieval_mode=retrieval_mode,
                    embedding_model=EMBEDDING_MODEL
                )
                cached_company = company
            if not cached_retriever:
                df.at[idx, answer_col] = "[WARNING] No local retriever found for this company."
                df.to_excel(OUTPUT_XLSX, sheet_name=EXCEL_SHEET_NAME, index=False, engine='xlsxwriter')
                continue
            retrieved_docs = cached_retriever.get_relevant_documents(question)
        else:
            # store_scope == "shared"
            retrieved_docs = retriever_shared.get_relevant_documents(question)

        # 6b) Optionally re-rank
        final_docs = select_final_documents(
            question=question,
            docs=retrieved_docs,
            cross_encoder_model=cross_encoder_model,
            top_k=TOP_K,
            keep_first_n=KEEP_FIRST_N,
            add_after_rerank=ADD_AFTER_RERANK
        )

        # 6c) Combine context
        context_text = combine_documents(final_docs)

        # 6d) Generate answer based on model choice
        answer_text = _generate_answer_for_choice(
            model_choice=model_choice,
            question=question,
            context_text=context_text,
            company=company,
            model=model,
            tokenizer=tokenizer,
            device=device
        )

        # 6e) Store & save
        df.at[idx, answer_col] = answer_text
        df.at[idx, context_col] = context_text
        df.to_excel(OUTPUT_XLSX, sheet_name=EXCEL_SHEET_NAME, index=False, engine='xlsxwriter')

        # Optional short console print
        print(f"\n== Row {idx} / {company} ==")
        print(f"Q: {question}")
        print(f"A: {answer_text[:200]}...")

    print(f"[DONE] Queries processed. Output saved in '{OUTPUT_XLSX}'.")


if __name__ == "__main__":
    # Notebook entry point: the run settings are hard-coded below and passed
    # straight to main(); edit them here to try another configuration.
    #
    # With model_choice="QuePasa", process_quepasa upserts each company's PDF to
    # QuePasa before asking the questions.
    run_settings = dict(
        vector_store_style="langchain",   # "docling" or "langchain"
        store_scope="single",             # "single" or "shared"
        model_choice="QuePasa",           # "nemo_4bit", "llama_4bit", "llama_full", "openai", "claude", "nebius_deepseek", "nebius_qwen" or "QuePasa"
        retrieval_mode="hybrid",          # "hybrid" or "minimal"
        do_rerank=True,
        quepasa_llm="anthropic:claude-3-5-sonnet-20240620",
        # True  => one QuePasa domain per company (equivalent to a "single" vector store).
        # False => one QuePasa domain for all companies (equivalent to a "shared" vector store).
        # This switch is kept separate from store_scope so that the QuePasa
        # pipeline and the local pipeline are configured independently.
        quepasa_use_per_company_domain=True,
    )

    # Track the emissions of the whole run; the report goes to /kaggle/working/.
    with EmissionsTracker(project_name="Climate_Finance_Bench_RAG_pipeline", output_dir="/kaggle/working/"):
        main(**run_settings)

In [None]:
from IPython.display import FileLink

# Archive everything under /kaggle/working/ into file.zip, then display a
# link to that archive as the cell output.
!zip -r file.zip /kaggle/working/
FileLink(r'file.zip')

In [None]:
# Print the installed packages and their versions for this environment.
!pip freeze