In [13]:
# !pip install neo4j
# !pip install langchain
# !pip install PyPDF2
# !pip install tiktoken
# !pip install openai  # Only if you want to use the OpenAI API
# !pip install transformers  # For open (HF) models
# !pip install sentence_transformers
# !pip install -U langchain-community
# !pip install -qU  langchain_milvus
# !pip install -U langchain-ollama
# !pip install graphdatascience
# For advanced community detection with Leiden, you might need external libraries (e.g., igraph, networkx, etc.).


In [14]:
#!ollama pull llama3.1

In [15]:
import os
from typing import List, Dict, Any
import tqdm
import concurrent.futures

# -----------------------
# Neo4j Database imports
# -----------------------
from neo4j import GraphDatabase

# -----------------------
# LLM / Embeddings imports
# -----------------------
# If using HuggingFace transformers:
from transformers import pipeline

# If using LangChain for retrieval + QA
from langchain.embeddings import HuggingFaceEmbeddings

from langchain.text_splitter import RecursiveCharacterTextSplitter


from langchain_openai import ChatOpenAI
from langchain_community.chat_models import ChatOllama


from langchain_milvus import Milvus
from uuid import uuid4
from pymilvus import MilvusClient

from langchain.schema import Document
from langchain.schema import HumanMessage, SystemMessage

# If you want to use OpenAI, uncomment:
import openai
openai_model="gpt-4o-mini"

# -----------------------
# Load environment variables
import dotenv
dotenv.load_dotenv()

# -----------------------
# ArXiv API
# -----------------------
import arxiv

# -----------------------
# PDF Parsing library
# -----------------------
import PyPDF2  # or "pypdf" if needed

In [16]:
#############################################
# 1) CONFIGURATION: toggle open vs. OpenAI
#############################################

USE_OPENAI = True  # Set to True if you want to switch to OpenAI’s ChatGPT
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# For Neo4j:
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

In [17]:
##################################################
# 2.1 SOURCE DOCUMENTS → TEXT CHUNKS
##################################################
def parse_pdf(pdf_path: str) -> str:
    """
    Extract raw text from a PDF file using PyPDF2.
    """
    text = ""
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text

def find_metadata(doc_id: str) -> Dict[str, Any]:
    """
    (Step 2.1) Retrieve metadata for a document from a database or API.
    """
    client = arxiv.Client()
    search = arxiv.Search(
        id_list=[doc_id]
    )

    try:
        result = next(client.results(search))
        return \
            {"title": result.title, 
            "summary": result.summary, 
            "url": result.entry_id,
            "authors": ', '.join([a.name for a in result.authors]),
            "categories": ', '.join(result.categories)
            }
    except StopIteration:
        return {}

def chunk_text(text: str, chunk_size: int = 600, chunk_overlap: int = 100) -> List[str]:
    """
    (Step 2.1) Split text into chunks. 
    Following the guidance in 2.1, we use a smaller chunk size (e.g., ~600 tokens).
    This can improve entity recall at the cost of more LLM calls.
    """
    # Note: the chunk_size is in characters by default using this splitter;
    # you may want to adapt to token-based splitting.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    
    chunks = text_splitter.split_text(text)
    return chunks

In [18]:
##################################################
# EMBEDDING UTILITIES
##################################################

def get_hf_embedding_function(model_name: str = "sentence-transformers/all-MiniLM-L6-v2", device: str = "mps", USE_OPENAI: bool = False):
    """
    Returns a function that can generate embeddings using a HuggingFace model.
    """
    if not USE_OPENAI:
        hf_embed = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': device})
        return hf_embed
    else:
        def _embeddings(texts: List[str], model_name: str = "text-embedding-ada-002") -> List[List[float]]:
            response = openai.Embedding.create(
                input=texts,
                model=model_name
            )
            embeddings = [item["embedding"] for item in response["data"]]
            return embeddings
        return _embeddings


In [19]:
##################################################
# VECTORSTORE UTILITIES
##################################################

def init_milvus_db(collection_name: str, uri: str, embedding_function):
    """
    Initialize the Milvus database if it does not exist.
    """
    if not os.path.exists(uri):
        print(f"Creating database at {uri}")

    vectorstore = Milvus(
        collection_name=collection_name,
        embedding_function=embedding_function,
        connection_args={"uri": uri},
    )

    return vectorstore

# Function to add multiple documents to Milvus
def add_documents_to_milvus(docs: list, embedding_function, collection_name: str = "rag_milvus", uri: str = "./vector_db_graphRAG/milvus_ingest.db"):
    """
    Add multiple documents to the Milvus vector store.

    Args:
        docs (list): List of tuples containing text and metadata.
        collection_name (str): Name of the Milvus collection.
        uri (str): URI for the Milvus database.
    
    # Example usage
    docs = [
        {"text": "Chunk 1 of the document", "metadata": {"doc_id": "doc_1", "chunk": 1}},
        {"text": "Chunk 2 of the document", "metadata": {"doc_id": "doc_1", "chunk": 2}}
    ]

    add_documents_to_milvus(docs)
    """
    # Initialize the database
    vectorstore = init_milvus_db(collection_name, uri, embedding_function)

    # Prepare documents
    document_list = []
    for doc in docs:
        text = doc.get("text", "")
        metadata = doc.get("metadata", {})
        document_list.append(Document(page_content=text, metadata=metadata))
    
    uuids = [str(uuid4()) for _ in range(len(document_list))]

    # Add documents to the vector store
    vectorstore.add_documents(document_list, ids=uuids)

    return vectorstore

# Function to search for similar documents in Milvus
def search_milvus(query: str, vectorstore, top_k: int = 5):
    """
    Search for similar documents in the Milvus vector store.

    Args:
        query (str): Text to search for.
        vectorstore: Milvus vector store.
        top_k (int): Number of similar documents to return.
    """
    return vectorstore.similarity_search_with_score_by_vector(query, top_k=top_k)



In [20]:
##################################################
# 2.2. Text Chunks → Element Instances
##################################################

def extract_element_instances_from_chunk(
    chunk_text: str,
    gleaning_rounds: int = 1,
    USE_OPENAI: bool = True,
    local_llm: str = "llama3.1"
) -> List[Dict[str, Any]]:
    """
    (Step 2.2) Use an LLM prompt to identify entity references, relationships, and covariates.
    - Identifies entities (name, type, description) and relationships.
    - Supports multiple rounds of "gleanings" to find any missed entities.
    """
    extracted_elements = []
    not_parsed = []

    # Select the LLM to use
    if USE_OPENAI:
        llm = ChatOpenAI(model=openai_model, temperature=0.0)
    else:
        llm = ChatOllama(model=local_llm, temperature=0)

    # Base prompt for extracting entities and relationships
    base_prompt = (
        "Extract entities and relationships from the following text. "
        "For each entity, provide its name, type, and description. "
        "For each relationship, provide the source entity, target entity, and description. "
        "Text: \n{chunk_text}\n"
        "Output format: List of dictionaries with keys 'entity_name', 'entity_type', 'entity_description', 'relationship'. "
        "Follow this format in the example: [{{\"entity_name\": \"Alice\", \"entity_type\": \"Person\", \"entity_description\": \"A person of interest.\", \"relationship\": {{\"source_entity\": \"Alice\", \"target_entity\": \"Bob\", \"description\": \"Knows\"}}}}]"
        "Return just the list, so that we can parse it."
        "No bullet list or asterisks needed."
        "It must be a unique list, do NOT separate entities and relationships in different lists."
    )

    # Loop through gleaning rounds
    for round_num in range(gleaning_rounds):
        prompt = base_prompt.format(chunk_text=chunk_text)

        # Send prompt to the selected LLM
        response = llm.invoke([HumanMessage(content=prompt)])

        # Parse the LLM response
        new_elements = response.content
        new_elements = new_elements.replace('```json','').replace('```','')

        # Assume the response is already in JSON format
        try:
            new_elements = eval(new_elements)  # Convert string to list of dicts
        except Exception as e:
            print(f"Error parsing LLM output: {e}")
            not_parsed.extend(new_elements)
            new_elements = []

        # Add new elements to the result
        extracted_elements.extend(new_elements)

        # Check if gleaning is needed (e.g., ask LLM if entities were missed)
        if round_num < gleaning_rounds - 1:
            print("Asking for validation...")
            validation_prompt = (
                "Were any entities or relationships missed in the previous extraction? "
                "Answer 'Yes' or 'No'."
            )
            validation_response = llm.invoke(
                [ HumanMessage(content=validation_prompt)], 
                chat_history=
                [HumanMessage(content=prompt), SystemMessage(content=new_elements)]
            )

            # If LLM says 'No', break early
            if 'No' in validation_response.content:
                break

    # Return all extracted elements
    return extracted_elements, not_parsed


In [21]:
##################################################
# 2.3 ELEMENT INSTANCES → ELEMENT SUMMARIES
##################################################

summarization_instances_prompt = """
You are an expert in text summarization and knowledge graph construction. I will provide you with:

1. A list of initial nodes, each containing:
   - A unique identifier
   - A textual description extracted from the source
   - Additional properties (optional)

2. A list of relationships between these nodes (e.g., an entity "Einstein" related to another entity "Relativity").

Your task is to:
- Identify when multiple nodes actually refer to the same entity or concept.
- Generate *summarized nodes* by consolidating their textual descriptions and removing duplicates or near-duplicates.
- Maintain references to each node's original ID within your summarized node.
- Create new relationships among these summarized nodes that reflect the original relationships, but merged and simplified where appropriate.

**Important requirements and format details:**
1. Each summarized node should have:
   - A `summary` field with the merged description.
   - A list of `original_ids` that were merged into this new summary node.
   - Any relevant `type` or `label` (e.g., Person, Theory, Location) if it can be inferred from the text.
   - (Optional) A short list of `keywords` extracted from the descriptions.

2. Each relationship should:
   - Include `source` and `target` references to the new summarized nodes.
   - Provide a `relation_type` (e.g., "INVENTED", "WORKS_ON", "LOCATED_IN", etc.).
   - Have a `weight` or `relevance_score` if it can be inferred (e.g., frequency or importance).
   - (Optional) Include an `original_relationships` list indicating which original relationships were merged.

3. Return the final data in **JSON** format, containing two top-level keys: `summarized_nodes` and `summarized_relationships`.

4. Be concise but ensure the summaries and relationships accurately capture the original meaning.

---

### **Here is the initial Nodes and Relationships**:

{initial_data}

---

### **Instructions to the LLM**:
1. **Identify duplicates or near-duplicates** (e.g., "Albert Einstein" and "Einstein" might refer to the same entity).
2. **Create a new summarized node** that merges the descriptions of "N1" and "N2" if they represent the same entity (in this case, Albert Einstein).
3. **Consolidate relationships** so that if multiple original relationships lead to the same concept, you unify them into a single relationship with an updated weight (e.g., sum or average of the original).
4. Provide your final answer in the following **JSON** structure:

```json
{{
  "summarized_nodes": [
    {{
      "title": "NewTitle1",
      "summary": "Your merged summary text here...",
      "original_ids": ["ExampleID1", "ExampleID2", ...],
      "type": "Person",
      "keywords": ["Einstein", "relativity", "physics"]
    }},
    {{
      "title": "NewTitle2",
      "summary": "Your summary text here...",
      "original_ids": ["ExampleID3", "ExampleID4"],
      "type": "Theory",
      "keywords": ["relativity", "physics"]
    }}
  ],
  "summarized_relationships": [
    {{
      "source": "NewTitle1",
      "target": "NewTitle2",
      "relation_type": "DEVELOPED_OR_ASSOCIATED_WITH",
      "weight": 5,
      "original_relationships": ["N1->N3(DEVELOPED)", "N2->N3(ASSOCIATED_WITH)"]
    }}
  ]
}}
```

Please **only** output valid JSON in the format described above, without additional commentary so that we can parse it correctly. 
Make sure to capture the essence of each original node and relationship in your summarized version.
"""

In [22]:
def summarize_element_instances(
    element_instances: List[Dict[str, Any]], 
    USE_OPENAI: bool = True,
    local_llm: str = "llama3.1") -> str:
    """
    (Step 2.3) Summarize extracted nodes/relationships into a single descriptive block of text
    for each chunk. This is an additional LLM-based summarization step, forming "element summaries."
    """
    # Select the LLM to use
    if USE_OPENAI:
        llm = ChatOpenAI(model=openai_model, temperature=0.0)
    else:
        llm = ChatOllama(model=local_llm, temperature=0)

    prompt = summarization_instances_prompt.format(initial_data=element_instances)
    response = llm.invoke([HumanMessage(content=prompt)])
    response = response.content

    try:
        response = response.replace('```json','').replace('```','')
        response = eval(response)
    except Exception as e:
        print(f"Error parsing LLM output: {e}")
    
    return response

In [23]:
def store_element_summary_in_graph(tx, data: Dict[str, Any], doc_id: str, chunks_bounds: tuple):
    """
    Load the summarized graph data into Neo4j.

    Args:
        tx: Neo4j transaction object.
        data (Dict[str, Any]): Summarized nodes and relationships.
        doc_id (str): Document ID.
        chunks_bounds (tuple): Tuple containing the start and end positions
            of the text chunk in the original document.
    """
    
    # Creazione dei nodi
    for node in data["summarized_nodes"]:
        query_create_node = """
        CREATE (n:SummarizedNode {
            title: $title,
            summary: $summary,
            original_ids: $original_ids,
            type: $type,
            keywords: $keywords,
            doc_id : $doc_id,
            chunks_lower_bound: $chunks_lower_bound,
            chunks_upper_bound: $chunks_upper_bound
        })
        """
        tx.run(
            query_create_node,
            title=node.get("title"),
            summary=node.get("summary"),
            original_ids=node.get("original_ids"),
            type=node.get("type"),
            keywords=node.get("keywords"),
            doc_id=doc_id,
            chunks_lower_bound=chunks_bounds[0],
            chunks_upper_bound=chunks_bounds[1]
        )

    # Creazione delle relazioni
    for rel in data["summarized_relationships"]:
        query_create_rel = f"""
        MATCH (source:SummarizedNode {{title: $source_id}})
        MATCH (target:SummarizedNode {{title: $target_id}})
        CREATE (source)-[:RELATIONSHIP_TYPE {{
            type : $relation_type,
            weight: $weight,
            original_relationships: $original_rels
        }}]->(target)
        """
        tx.run(
            query_create_rel,
            source_id=rel["source"],
            target_id=rel["target"],
            relation_type=rel["relation_type"],
            weight=rel.get("weight", 1),  # default=1 if not provided
            original_rels=rel.get("original_relationships", [])
        )

In [24]:
##################################################
# 2.4 ELEMENT SUMMARIES → GRAPH COMMUNITIES
##################################################

class CommunityDetection:
    def __init__(self, driver: Any = None, uri: str = '', user: str = '', password: str = ''):
        """
        Initialize the CommunityDetection class with a Neo4j driver or connection details.

        Args:
            driver (Any): An existing Neo4j driver instance. If provided, `uri`, `user`, and `password` are ignored.
            uri (str): The URI for the Neo4j database.
            user (str): The username for the Neo4j database.
            password (str): The password for the Neo4j database.
        """
        # If a driver is provided, use it; otherwise, create a new driver instance
        self.driver = driver or GraphDatabase.driver(uri, auth=(user, password))
        self.graph_name = 'summarizedGraph'

    def close(self):
        self.driver.close()

    def project_graph(self, relationship_weight_property: str = "weight") -> None:
        """
        Projects the graph into memory for analysis.
        """
        with self.driver.session() as session:
            session.run(
                f"""
                CALL gds.graph.project(
                    '{self.graph_name}',
                    'SummarizedNode',
                    {{ RELATIONSHIP_TYPE: {{ orientation: 'UNDIRECTED', properties: ['{relationship_weight_property}'] }} }}
                )
                """
            )


    def set_communities(self, relationship_weight_property: str = "weight") -> List[Dict[str, Any]]:
        """
        Sets the community IDs directly into the graph database.
        """
        with self.driver.session() as session:
            result = session.run(
                f"""
                CALL gds.leiden.stream(
                    '{self.graph_name}', 
                    {{ relationshipWeightProperty: '{relationship_weight_property}' }})
                YIELD nodeId, communityId
                SET gds.util.asNode(nodeId).communityId = communityId
                """
            )

            return True
        
        return False
        
    def retrieve_communities(self) -> List[Dict[str, Any]]:
        """
        Retrieve the community assignments from the graph.
        """
        with self.driver.session() as session:
            result = session.run(
                f"""
                MATCH (n:SummarizedNode)-[r:RELATIONSHIP_TYPE]->(m:SummarizedNode)
                RETURN
                    n.communityId AS communityId,
                    n.title AS nodeTitle,      
                    n.summary AS nodeSummary,
                    n.keywords AS nodeKeywords,
                    r.type AS relationshipType,
                    r.weight AS relationshipWeight,
                    m.title AS targetNodeTitle
                """
            )

            communities = {}
            for record in result:
                community_id = record["communityId"]
                node_title = record["nodeTitle"]
                node_summary = record["nodeSummary"]
                node_keywords = record["nodeKeywords"]
                relationship_type = record["relationshipType"]
                relationship_weight = record["relationshipWeight"]
                target_node_title = record["targetNodeTitle"]
                
                if community_id not in communities:
                    communities[community_id] = {"nodes": [], "relationships": []}
                
                communities[community_id]["nodes"].append({
                    "title": node_title,
                    "summary": node_summary,
                    "keywords": node_keywords
                })
                
                communities[community_id]["relationships"].append({
                    "source": node_title,
                    "target": target_node_title,
                    "type": relationship_type,
                    "weight": relationship_weight
                })

            return communities

    def drop_graph(self) -> None:
        """
        Drops the graph from memory.
        """
        with self.driver.session() as session:
            session.run(f"CALL gds.graph.drop('{self.graph_name}') YIELD graphName")


In [25]:
##################################################
# 2.5 GRAPH COMMUNITIES → COMMUNITY SUMMARIES
##################################################

summary_community_prompt = """
You are an expert summarizer helping to create a concise “community report” from a list of related nodes. Each node has a title, a summary, and keywords. All these nodes belong to the same community, meaning they share a common theme, topic, or set of closely related ideas. Moreover, you are provided with a list of relationships between these nodes, indicating how they are connected or interact with each other.

Here is the list of nodes for this community:

{nodes}

And here are the relationships between these nodes:

{relationships}

Please read through the nodes and relationships and then produce a coherent summary describing:
1. The main topics, themes, or domains covered by the nodes in this community.
2. Any notable or central nodes and why they are important.
3. How the nodes interrelate: highlight significant relationships and mention if there are strong connections (high weight).
4. Overall, what makes this community distinct or interesting?

- Aim for a concise yet informative text, written in a clear paragraph style.
- You may group related nodes together and mention prominent links or patterns.
- Use plain English. Avoid overly technical language unless it is necessary to describe the domain.

4. Provide your final answer in the following **JSON** structure:

```json
{{
    "title": a title capable to summarize the community,
    "community_summary": a single, cohesive summary that helps a reader quickly understand the core content of this community. You do NOT need to repeat every node’s individual summary verbatim. Instead, synthesize the most relevant information into a unified overview.
    "keywords": a list of keywords that capture the main topics or themes of this community.
}}
```

Please **only** output valid JSON in the format described above, without additional commentary so that we can parse it correctly. 

Begin now.
"""

In [26]:
def summarize_communities(
    communities: Dict[int, List[Dict[str, Any]]],
    USE_OPENAI: bool = True,
    local_llm: str = "llama3.1",
) -> Dict[int, str]:
    """
    (Step 2.5) Summarize each community (or sub-community in a hierarchical approach).
    - Gather all element summaries (nodes, edges, covariates) in that community.
    - Summarize them, potentially chunking if they don't fit in an LLM context window.

    Args:
        communities (dict): A dictionary where keys are community IDs and values are lists of nodes with 'title', 'summary', and 'keywords' and a list of relationships with 'source', 'target', 'type', and 'weight'.
        USE_OPENAI (bool): Whether to use OpenAI or a local LLM.
        local_llm (str): The name of the local LLM model.

    Returns:
        dict: Summaries for each community.
    """
    # Select the LLM to use
    if USE_OPENAI:
        llm = ChatOpenAI(model=openai_model, temperature=0.0)
    else:
        llm = ChatOllama(model=local_llm, temperature=0)

    summaries = {}

    for community_id, data in tqdm.tqdm(communities.items()):
        # Extract nodes and relationships
        nodes = data["nodes"]
        relationships = data["relationships"]

        # Prepare prompt content
        node_descriptions = "\n".join([f"Title: {node['title']}, Summary: {node['summary']}, Keywords: {', '.join(node['keywords'])}" for node in nodes])
        relationships = "\n".join([f"Source: {rel['source']}, Target: {rel['target']}, Type: {rel['type']}, Weight: {rel['weight']}" for rel in relationships])
        prompt = summary_community_prompt.format(nodes=node_descriptions, relationships=relationships)

        # Generate summary
        response = llm.invoke([HumanMessage(content=prompt)])
        response = response.content.replace('```json','').replace('```','')

        try:
            summaries[community_id] = eval(response)
        except Exception as e:
            summaries[community_id] = [f"Error generating summary: {str(e)}", response]

    return summaries




In [27]:
##################################################
# MAIN FUNCTION: 2.6 COMMUNITY SUMMARIES → COMMUNITY ANSWERS → GLOBAL ANSWER
###################################################


# -------------------------------------------------------------------------
# UTILITY: Prompt to get partial answer + helpfulness score
# -------------------------------------------------------------------------
PARTIAL_ANSWER_PROMPT = """\
You have a user query:
\"\"\"{user_query}\"\"\"

Below is a chunk of text from a community summary that may or may not be relevant to the query:
\"\"\"{chunk_text}\"\"\"

1) Please provide a concise partial answer (if any) relevant to the query, based on the chunk above.
   If the chunk is irrelevant, you can say "No relevant info here."
2) Provide a helpfulness score from 0 to 100 (integer), indicating how much this chunk helps answer the query. 
   0 = not relevant at all, 100 = extremely helpful.

Output your response **only** in valid JSON format like:
{{
  "partial_answer": "...",
  "helpfulness_score": ...
}}
"""

In [28]:
def answer_query_from_communities(
    user_query: str,
    community_summaries: Dict[int, str],
    USE_OPENAI: bool = True,
    local_llm: str = "llama3.1",
    max_context_tokens: int = 1000
) -> str:
    """
    (Step 2.6) Use the hierarchical community summaries to answer a user query globally.
    
    Args:
        user_query (str): The question asked by the user.
        community_summaries (Dict[int, str]): A dict of {community_id: summary_text}.
        USE_OPENAI (bool): Whether to use OpenAI or a local LLM.
        local_llm (str): The local LLM name if not using OpenAI.
        max_context_tokens (int): Approx token limit for each chunk.

    Returns:
        str: The final global answer.
    
    High-level process:
      1) For each community summary:
         - Chunk the text (so we don't exceed context window).
      2) For each chunk:
         - Ask the LLM for a partial answer + helpfulness score.
      3) Sort partial answers by score.
      4) Combine top partial answers into a final context.
      5) Ask the LLM for a final answer.
    """

    # -------------------------
    # 0) Select which LLM to use
    # -------------------------
    if USE_OPENAI:
        # Example with OpenAI
        llm = ChatOpenAI(model=openai_model, temperature=0.0)
    else:
        # Example with Ollama or a local Hugging Face model
        llm = ChatOllama(model=local_llm, temperature=0)

    # ----------------------------------------------
    # 1) Chunk each community summary
    # ----------------------------------------------
    chunked_texts = []
    for comm_id, summary_text in community_summaries.items():
        # Break down the summary into smaller pieces
        chunks = chunk_text(summary_text, chunk_size=max_context_tokens)
        for c in chunks:
            chunked_texts.append((comm_id, c))
        
    # Shuffl

    # ----------------------------------------------
    # 2) For each chunk, get partial answer + score
    # ----------------------------------------------
    partial_answers = []
    marks = []
    for comm_id, text_chunk in chunked_texts:
        # Build the prompt
        prompt = PARTIAL_ANSWER_PROMPT.format(
            user_query=user_query,
            chunk_text=text_chunk
        )
       
        # LLM call
        response = llm.invoke([HumanMessage(content=prompt)])
        raw_content = response.content.strip()

        # Attempt to parse JSON response
        try:
            # Clean up any trailing text
            first_brace = raw_content.find('{')
            last_brace = raw_content.rfind('}')
            json_str = raw_content[first_brace:last_brace+1]

            parsed = eval(json_str)  # or use json.loads if strictly valid
            partial_answer = parsed.get("partial_answer", "No relevant info here.")
            score = parsed.get("helpfulness_score", 0)
        except Exception as e:
            partial_answer = "Parsing error or no relevant info."
            score = 0

        # Store result
        partial_answers.append((partial_answer, score))
        marks.append((comm_id, score))

    # ----------------------------------------------
    # 3) Sort partial answers by score (descending)
    # ----------------------------------------------
    partial_answers.sort(key=lambda x: x[1], reverse=True)

    # ----------------------------------------------
    # 4) Combine top partial answers into a final context
    #    We'll do a simple cutoff if we have too many
    # ----------------------------------------------
    final_context = []
    used_chars = 0

    for ans, sc in partial_answers:
        # Add some label or delimiter if needed
        if sc==0:
            continue
        
        snippet = f"PartialAnswer(Score={sc}): {ans}\n"
        if used_chars + len(snippet) <= max_context_tokens * 4:
            final_context.append(snippet)
            used_chars += len(snippet)
        else:
            break  # no more space in final context

    # ----------------------------------------------
    # 5) Produce a final answer from the LLM
    # ----------------------------------------------
    final_prompt = f"""\
We have these partial answers and their helpfulness scores:

{''.join(final_context)}

User Query: {user_query}

Based on these partial answers, please produce a single, coherent, and well-structured final answer.
Feel free to synthesize or refine the information. If there's conflicting info, do your best to clarify.

Respond in plain text.
"""

    # Final LLM call
    final_response = llm.invoke([HumanMessage(content=final_prompt)])
    global_answer = final_response.content.strip()

    return global_answer, partial_answers, marks


# Pipeline

In [29]:
##################################################
# 5) INGEST PDF -> STORE IN GRAPH (Putting Steps 2.1 and 2.2+ in context)
##################################################

def ingest_pdf_into_graph(pdf_path: str, doc_id: str, embed_opt: bool = True, BATCH_SIZE: int = 25):
    """
    1) Parse PDF into raw text.
    2) Chunk it (Step 2.1).
    3) Generate embeddings for each chunk.
    4) Store chunk nodes in Neo4j.
    5) For each chunk, call LLM to extract element instances (Step 2.2).
    6) Summarize them into a single descriptive block (Step 2.3).
    7) Store the block in Neo4j for further community detection.
    """
    # Step 1: Parse PDF
    print(f"Parsing PDF at {pdf_path}...")
    raw_text = parse_pdf(pdf_path)
    print(f"Extracted {len(raw_text)} characters from {pdf_path} \n\n")

    # Step 1.1: Retrieve metadata
    print(f"Retrieving metadata for {doc_id}...")
    metadata = find_metadata(doc_id)
    print(f"Metadata: {metadata}\n\n")

    # Step 2: Chunk the text (default chunk_size=600 for improved recall)
    chunks = chunk_text(raw_text)
    print(f"Chunked {len(chunks)} segments from {pdf_path} \n\n")

    # Step 3: Embeddings
    if embed_opt:
        print("Generating embeddings for each chunk...")
        embed_fn = get_hf_embedding_function(USE_OPENAI=False)

        print("Storing chunks in Milvus...")
        vectorstore = add_documents_to_milvus([
            {
                "text": chunk, 
                "metadata": {
                    "doc_id": doc_id, 
                    "chunk": i, 
                    **metadata
                    }
            } for i, chunk in enumerate(chunks)], embed_fn)
        print(f"Stored {len(chunks)} chunks in Milvus under Document {doc_id} \n\n")


    # Step 4: Store chunk nodes in Neo4j
    # Process chunks in parallel with batch size of 25
    for lim in tqdm.tqdm(range(0, len(chunks), BATCH_SIZE), desc="Processing chunks in parallel"):
        extracted_elements = []
        not_parsed_elements = []

        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Submit tasks
            futures = [
                executor.submit(extract_element_instances_from_chunk, chunk_text_str, USE_OPENAI=USE_OPENAI)
                for i, chunk_text_str in enumerate(chunks[lim:lim+BATCH_SIZE])
            ]
            
            # Process results as they complete
            for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Processing chunks"):
                elements_instance, not_parsed_instance = future.result()
                extracted_elements.extend(elements_instance)
                not_parsed_elements.extend(not_parsed_instance)

        print("Parallel processing completed.")
            
        # Step 6: Summarize them (element-level)
        print("Summarizing element instances...")
        element_summary = summarize_element_instances(extracted_elements, USE_OPENAI=USE_OPENAI)

        print(f"Summarized {len(element_summary['summarized_nodes'])} nodes and {len(element_summary['summarized_relationships'])} relationships\n\n")

        print("Storing backup files...")
        # Backup element_summary, extracted_elements and not_parsed_elements
        os.makedirs('backup_extraction_nodes/'+doc_id+'/element_summary', exist_ok=True)
        os.makedirs('backup_extraction_nodes/'+doc_id+'/extracted_elements', exist_ok=True)
        os.makedirs('backup_extraction_nodes/'+doc_id+'/not_parsed_elements', exist_ok=True)

        with open(f'backup_extraction_nodes/{doc_id}/element_summary/{lim}.json', 'w') as f:
            f.write(str(element_summary))
        
        with open(f'backup_extraction_nodes/{doc_id}/extracted_elements/{lim}.json', 'w') as f:
            f.write(str(extracted_elements))
        
        with open(f'backup_extraction_nodes/{doc_id}/not_parsed_elements/{lim}.json', 'w') as f:
            f.write(str(not_parsed_elements))
        
        print("Backup files stored.\n\n")

        # Step 7: Store the summary
        print("Storing element summary in Neo4j...")
        with driver.session() as session:
            # Step 7: Store the summary
            session.execute_write(store_element_summary_in_graph, element_summary, doc_id, (lim, lim+BATCH_SIZE))
    
    print(f"Ingested {len(chunks)} chunks from {pdf_path} into Neo4j under Document {doc_id}")
    return True

    

In [30]:
#############################################
# MAIN EXECUTION EXAMPLE
#############################################

In [31]:
PDF_PATH = 'data/docs/'

# Retrieve all the docs in PDF_PATH
docs=sorted([f for f in os.listdir(PDF_PATH) if f.endswith('.pdf')])

In [32]:
filtered_docs=docs[2:5]

In [33]:
for doc in filtered_docs:
    print(f"Processing document {doc}...")
    doc_id = doc.replace('.pdf', '')

    if ingest_pdf_into_graph(PDF_PATH+doc, doc_id, embed_opt=True):
        print(f"Finished processing document {doc}.")
        continue
    else:
        raise Exception(f"Error processing document {doc}.")
    

Processing document 0710.0845.pdf...
Parsing PDF at data/docs/0710.0845.pdf...
Extracted 91057 characters from data/docs/0710.0845.pdf 


Retrieving metadata for 0710.0845...
Metadata: {'title': 'The nested Chinese restaurant process and Bayesian nonparametric inference of topic hierarchies', 'summary': 'We present the nested Chinese restaurant process (nCRP), a stochastic process\nwhich assigns probability distributions to infinitely-deep,\ninfinitely-branching trees. We show how this stochastic process can be used as\na prior distribution in a Bayesian nonparametric model of document collections.\nSpecifically, we present an application to information retrieval in which\ndocuments are modeled as paths down a random tree, and the preferential\nattachment dynamics of the nCRP leads to clustering of documents according to\nsharing of topics at multiple levels of abstraction. Given a corpus of\ndocuments, a posterior inference algorithm finds an approximation to a\nposterior distribution

  hf_embed = HuggingFaceEmbeddings(model_name=model_name, model_kwargs={'device': device})


Storing chunks in Milvus...
Stored 186 chunks in Milvus under Document 0710.0845 




Processing chunks: 100%|██████████| 25/25 [00:35<00:00,  1.42s/it]s]


Parallel processing completed.
Summarizing element instances...
Summarized 9 nodes and 7 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks in parallel:  12%|█▎        | 1/8 [01:00<07:00, 60.03s/it]

Error parsing LLM output: unterminated string literal (detected at line 1) (<string>, line 1)


Processing chunks: 100%|██████████| 25/25 [00:19<00:00,  1.30it/s]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  25%|██▌       | 2/8 [01:34<04:30, 45.15s/it]

Summarized 7 nodes and 6 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks: 100%|██████████| 25/25 [00:20<00:00,  1.19it/s]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  38%|███▊      | 3/8 [02:08<03:20, 40.05s/it]

Summarized 8 nodes and 4 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...




Error parsing LLM output: unmatched '}' (<string>, line 1)


Processing chunks: 100%|██████████| 25/25 [00:22<00:00,  1.12it/s]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  50%|█████     | 4/8 [02:57<02:53, 43.39s/it]

Summarized 11 nodes and 6 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...




Error parsing LLM output: source code string cannot contain null bytes


Processing chunks: 100%|██████████| 25/25 [00:35<00:00,  1.42s/it]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  62%|██████▎   | 5/8 [03:48<02:18, 46.11s/it]

Summarized 9 nodes and 5 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks: 100%|██████████| 25/25 [00:59<00:00,  2.38s/it]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  75%|███████▌  | 6/8 [05:04<01:52, 56.49s/it]

Summarized 7 nodes and 6 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks: 100%|██████████| 25/25 [00:36<00:00,  1.44s/it]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  88%|████████▊ | 7/8 [05:59<00:55, 55.82s/it]

Summarized 8 nodes and 6 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks: 100%|██████████| 11/11 [00:27<00:00,  2.52s/it]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel: 100%|██████████| 8/8 [06:56<00:00, 52.07s/it]

Summarized 20 nodes and 15 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...
Ingested 186 chunks from data/docs/0710.0845.pdf into Neo4j under Document 0710.0845
Finished processing document 0710.0845.pdf.
Processing document 0801.1223.pdf...
Parsing PDF at data/docs/0801.1223.pdf...





Extracted 46562 characters from data/docs/0801.1223.pdf 


Retrieving metadata for 0801.1223...
Metadata: {'title': 'The c2d Spitzer spectroscopy survey of ices around low-mass young stellar objects, III: CH4', 'summary': 'CH4 is proposed to be the starting point of a rich organic chemistry. Solid\nCH4 abundances have previously been determined mostly toward high mass star\nforming regions. Spitzer/IRS now provides a unique opportunity to probe solid\nCH4 toward low mass star forming regions as well. Infrared spectra from the\nSpitzer Space Telescope are presented to determine the solid CH4 abundance\ntoward a large sample of low mass young stellar objects. 25 out of 52 ice\nsources in the $c2d$ (cores to disks) legacy have an absorption feature at 7.7\num, attributed to the bending mode of solid CH4. The solid CH4 / H2O abundances\nare 2-8%, except for three sources with abundances as high as 11-13%. These\nlatter sources have relatively large uncertainties due to small total ice\ncol

Processing chunks: 100%|██████████| 25/25 [00:19<00:00,  1.26it/s]s]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  25%|██▌       | 1/4 [00:46<02:18, 46.12s/it]

Summarized 9 nodes and 8 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks: 100%|██████████| 25/25 [00:16<00:00,  1.47it/s]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  50%|█████     | 2/4 [01:18<01:16, 38.23s/it]

Summarized 6 nodes and 4 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks: 100%|██████████| 25/25 [00:27<00:00,  1.09s/it]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  75%|███████▌  | 3/4 [02:04<00:41, 41.67s/it]

Summarized 8 nodes and 6 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks: 100%|██████████| 24/24 [01:02<00:00,  2.59s/it]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel: 100%|██████████| 4/4 [03:23<00:00, 50.79s/it]

Summarized 8 nodes and 6 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...
Ingested 99 chunks from data/docs/0801.1223.pdf into Neo4j under Document 0801.1223
Finished processing document 0801.1223.pdf.
Processing document 0804.4409.pdf...
Parsing PDF at data/docs/0804.4409.pdf...





Extracted 34382 characters from data/docs/0804.4409.pdf 


Retrieving metadata for 0804.4409...
Metadata: {'title': 'Thermal conductivity and phase separation of the crust of accreting neutron stars', 'summary': 'Recently, crust cooling times have been measured for neutron stars after\nextended outbursts. These observations are very sensitive to the thermal\nconductivity $\\kappa$ of the crust and strongly suggest that $\\kappa$ is large.\nWe perform molecular dynamics simulations of the structure of the crust of an\naccreting neutron star using a complex composition that includes many\nimpurities. The composition comes from simulations of rapid proton capture\nnucleosynthesys followed by electron captures. We find that the thermal\nconductivity is reduced by impurity scattering. In addition, we find phase\nseparation. Some impurities with low atomic number $Z$ are concentrated in a\nsubregion of the simulation volume. For our composition, the solid crust must\nseparate into regions of

Processing chunks: 100%|██████████| 25/25 [00:19<00:00,  1.26it/s]s]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  33%|███▎      | 1/3 [00:36<01:12, 36.25s/it]

Summarized 6 nodes and 5 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks: 100%|██████████| 25/25 [00:17<00:00,  1.45it/s]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel:  67%|██████▋   | 2/3 [01:15<00:38, 38.02s/it]

Summarized 10 nodes and 7 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...


Processing chunks: 100%|██████████| 17/17 [00:26<00:00,  1.55s/it]


Parallel processing completed.
Summarizing element instances...


Processing chunks in parallel: 100%|██████████| 3/3 [02:00<00:00, 40.22s/it]

Summarized 8 nodes and 5 relationships


Storing backup files...
Backup files stored.


Storing element summary in Neo4j...
Ingested 67 chunks from data/docs/0804.4409.pdf into Neo4j under Document 0804.4409
Finished processing document 0804.4409.pdf.





In [34]:
# Initialize the community detection class
detector = CommunityDetection(driver)

In [35]:
# 2) Community detection & summarization (Steps 2.4–2.5)

# Project the graph for community detection
detector.project_graph()

# Set community IDs in the graph
detector.set_communities()

# Drop the graph from memory
detector.drop_graph()


In [36]:
# Retrieve the communities
communities = detector.retrieve_communities()
community_summaries=summarize_communities(communities, USE_OPENAI=True)

100%|██████████| 59/59 [03:05<00:00,  3.15s/it]


In [37]:
# Store json file with community summaries
os.makedirs('backup_extraction_nodes/', exist_ok=True)
with open(f'backup_extraction_nodes/community_summaries.json', 'w') as f:
    f.write(str(community_summaries))

In [39]:
# Create embeddings for the community summaries and store them in Milvus
embed_fn = get_hf_embedding_function(USE_OPENAI=False)
vectorstore_community_summaries = add_documents_to_milvus([
    {
        "text": summary["community_summary"], 
        "metadata": {
            "doc_id": "community_summaries",
            "community_id": id,
            "title": summary["title"],
            "keywords": ", ".join(summary["keywords"]),
            }
    } for id, summary in community_summaries.items()], embed_fn, collection_name="community_summaries")

0.05$ for a single document of 21 pages with 89 chunks and 10 nodes and 7 relationships extracted. Morevoer this price include the generation of the community summaries of 2 documents.

3 min 22 sec for a single document.
2 min 9 sec for summary extraction

0.20$ for a 3 documents 

12 min 49 sec for a single document.
3 min 5 sec for summary extraction

# Generation

In [26]:
raise Exception("Stop here")

Exception: Stop here

In [27]:
# Retrieve all community summaries

clientMilvus = MilvusClient(
    uri="./vector_db_graphRAG/milvus_ingest.db",
)

community_summaries_retrieved = clientMilvus.query(
    collection_name="community_summaries",
    output_fields=["community_id", "text"],
    limit = 1000
)

In [28]:
# Refactor for generation

community_summaries_ingestion = {}
for el in community_summaries_retrieved:
    community_summaries_ingestion[el['community_id']]=el['text']


In [103]:
# 3) Ask a question (Step 2.6 simplified vs. full approach)
user_query = 'What does the graph indicate about the relationship between \\( \\langle r(t) \\rangle \\) and time (t) during steady-state conditions? Choose between the following options: ' + options 

# Simple QA (RAG):
global_answer, partial_answers, marks  = answer_query_from_communities(user_query, community_summaries_ingestion, USE_OPENAI=True)

In [117]:
vectorstore_community_summaries = init_milvus_db("community_summaries", "./vector_db_graphRAG/milvus_ingest.db", embed_fn)
embed=embed_fn.embed_query(user_query)
search=search_milvus(embed, vectorstore_community_summaries, top_k=5)

In [118]:
for s in search:
    print(s[0].metadata['community_id'],s[1])

8 1.274196982383728
35 1.2917883396148682
37 1.3111110925674438
31 1.3140339851379395


In [106]:

import pandas as pd
pd.DataFrame(marks).sort_values(by=1, ascending=False)

Unnamed: 0,0,1
0,26,0
35,11,0
26,21,0
27,14,0
28,19,0
29,35,0
30,23,0
31,45,0
32,12,0
33,5,0


In [55]:
# Open parquet file 

df= pd.read_parquet('data/test-00000-of-00001.parquet')

In [101]:
df[df['image_filename']=='images/0707.1659_3.jpg']

Unnamed: 0,query,image,image_filename,options,answer,page,model,prompt,source
68,What does the graph indicate about the relatio...,{'bytes': b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x...,images/0707.1659_3.jpg,['A. \\( \\langle r(t) \\rangle \\) decreases ...,B,,gpt4V,,arxiv_qa


In [99]:
options=df[df['image_filename']=='images/0707.1659_3.jpg']['options'].values
options = ', '.join(options)

In [100]:
options

"['A. \\\\( \\\\langle r(t) \\\\rangle \\\\) decreases as time increases.', 'B. \\\\( \\\\langle r(t) \\\\rangle \\\\) increases as time increases.', 'C. \\\\( \\\\langle r(t) \\\\rangle \\\\) remains constant as time increases.', 'D. The relationship between \\\\( \\\\langle r(t) \\\\rangle \\\\) and time cannot be determined from the graph.']"

In [None]:
raise Exception("End of the script")

In [119]:
# the question asked is the one from the database of vidore at document 0707.1659
# The answer is wrong

# the marks of the retrieving are all 0 

print(f"User's question: {user_query}")
print(f"Answer: {global_answer}")

User's question: What does the graph indicate about the relationship between \( \langle r(t) \rangle \) and time (t) during steady-state conditions? Choose between the following options: ['A. \\( \\langle r(t) \\rangle \\) decreases as time increases.', 'B. \\( \\langle r(t) \\rangle \\) increases as time increases.', 'C. \\( \\langle r(t) \\rangle \\) remains constant as time increases.', 'D. The relationship between \\( \\langle r(t) \\rangle \\) and time cannot be determined from the graph.']
Answer: To determine the relationship between \( \langle r(t) \rangle \) and time (t) during steady-state conditions, we need to analyze the graph provided. The options available are:

A. \( \langle r(t) \rangle \) decreases as time increases.  
B. \( \langle r(t) \rangle \) increases as time increases.  
C. \( \langle r(t) \rangle \) remains constant as time increases.  
D. The relationship between \( \langle r(t) \rangle \) and time cannot be determined from the graph.

If the graph shows a t

In [87]:
# the question asked is the one from the database of vidore at document 0704.2547
# The answer is correct

print(f"User's question: {user_query}")
print(f"Answer: {global_answer}")

User's question: What can be inferred about the frequency of state transitions between n-1, n, and n+1? Choose between the following options: ['A) Transitions occur more frequently from n to n-1.', 'B) Transitions occur more frequently from n to n+1.', 'C) Transitions from n-1 to n and n to n+1 occur with equal frequency.', 'D) The figure does not provide information about the frequency of transitions.']
Answer: Based on the provided partial answers, it is clear that they all indicate a lack of specific information regarding the frequency of transitions between the states n-1, n, and n+1. While they discuss probabilistic patterns related to these transitions, none of the answers provide concrete details about how often transitions occur in either direction (from n-1 to n, from n to n+1, or vice versa). Therefore, the most accurate inference we can draw is that the figure does not provide information about the frequency of transitions.

Final Answer: D) The figure does not provide infor

In [54]:
# the question asked is the one from the database of vidore at document 0704.2547
#2 documents refined prompt avoiding all 0 scores

# the top3 retrieving by embedding and by chatbot is the same
print(f"User's question: {user_query}")
print(f"Answer: {global_answer}")

User's question: What can be inferred about the frequency of state transitions between n-1, n, and n+1?
Answer: The text indicates that the frequency of state transitions between n-1, n, and n+1 can be understood through two primary probabilistic patterns: one for moving forward (from n to n+1) and another for moving backward (from n to n-1). These patterns suggest that the transitions are not random but rather governed by specific probabilistic behaviors, which may vary based on the direction of movement. 

Additionally, the development of probability functions is mentioned, which assign likelihoods to base values within a sequence. This implies that the frequency of transitions can be analyzed by considering how previous states influence current transitions. While the text does not provide explicit frequencies, it suggests that understanding these probabilistic patterns and functions can offer insights into the dynamics of state transitions and their varying frequencies. 

In summary

In [49]:
print(f"User's question: {user_query}")
print(f"Answer: {global_answer}")

User's question: What are the main topics of the documents?
Answer: The main topics of the documents encompass a range of themes primarily focused on the intersection of statistical modeling, biological systems, and genetic analysis. Key areas of discussion include:

1. **Statistical and Computational Methods**: The application of Bayesian inference and Monte Carlo simulations is highlighted, particularly in the context of studying free energy functions and understanding biological processes.

2. **DNA and RNA Dynamics**: Several documents explore the role of DNA and RNA in genetic information transfer, gene expression, and the impact of thermal fluctuations on DNA stability. The interaction between RNA and unzipped DNA strands is also a significant focus.

3. **Genetic Sequence Analysis**: Theoretical and mathematical frameworks are employed to analyze genetic sequences, including decay constants, error models, and prediction metrics related to mutations. This includes the interconnec

In [33]:
# the question asked is the one from the database of vidore at document 0704.2547
# 2 documents
print(f"User's question: {user_query}")
print(f"Answer: {global_answer}")

User's question: What can be inferred about the frequency of state transitions between n-1, n, and n+1?
Answer: The frequency of state transitions between n-1, n, and n+1 can be inferred from the identification of two primary probabilistic patterns: one for moving forward (from n to n+1) and another for moving backward (from n to n-1). These patterns suggest that the likelihood of transitioning between these states is influenced by the probabilities associated with each movement direction.

Specifically, the contrasting probabilities of these patterns indicate that the frequency of transitions is not uniform; rather, it may be affected by the likelihood of remaining in a given state versus moving to adjacent states. Additionally, the development of probability functions that assign likelihoods to base values within a sequence implies that historical data and prior values play a role in shaping these transition frequencies.

While the exact frequencies of these transitions are not detai

In [118]:
# the question asked is the one from the database of vidore at document 0704.2547
# Only right document
print(f"User's question: {user_query}")
print(f"Answer: {global_answer}")

User's question: What can be inferred about the frequency of state transitions between n-1, n, and n+1?
Answer: The frequency of state transitions between n-1, n, and n+1 can be influenced by several factors, primarily related to the concepts of entropy and the patterns of movement between states. 

Firstly, the relationship between entropy and sequences suggests that varying levels of entropy can affect the probabilities of state transitions. Higher entropy may indicate a greater likelihood of transitioning between states, while lower entropy could imply a tendency to remain in a particular state. This understanding is crucial for calculating the probabilities of moving from n-1 to n, n to n+1, and vice versa.

Additionally, two primary patterns of state transitions have been identified: Pattern A, which involves moving forward (from n-1 to n and n to n+1), and Pattern B, which involves moving backward (from n+1 to n and n to n-1). The contrasting probabilities of these patterns indic

1 min generation
< 0.01$

In [116]:
# only paper 0704.2547
print(f"User's question: {user_query}")
print(f"Answer: {global_answer}")

User's question: What are the main contributions of the paper?
Answer: The paper makes several significant contributions to the understanding of biological processes, particularly in the context of DNA dynamics and probabilistic modeling. 

Firstly, it enhances the accuracy of predictions by developing and applying probability functions that optimize how probabilities are assigned to base values, emphasizing the influence of prior data on current assessments. This is complemented by the introduction of a Binding Energy Matrix, which illustrates the recursive processes affecting binding energies and serves as a foundational tool for analyzing their cumulative distribution within biological systems.

The paper also delves into the mechanics of DNA unzipping, particularly in relation to the λ-phage sequence. It focuses on the escape probability and average prediction error, exploring their interrelationship in probabilistic modeling and error analysis. This analysis is further enriched by