In [None]:
# !pip install neo4j
# !pip install langchain
# !pip install PyPDF2
# !pip install tiktoken
# !pip install openai  # Only if you want to use the OpenAI API
# !pip install transformers  # For open (HF) models
# !pip install sentence_transformers
# !pip install -U langchain-community
# For advanced community detection with Leiden, you might need external libraries (e.g., igraph, networkx, etc.).


In [1]:
import os
from typing import List, Dict, Any

# -----------------------
# Neo4j Database imports
# -----------------------
from neo4j import GraphDatabase

# -----------------------
# LLM / Embeddings imports
# -----------------------
# If using HuggingFace transformers:
from transformers import pipeline

# If using LangChain for retrieval + QA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.question_answering import load_qa_chain

# If you want to use OpenAI, uncomment:
import openai

# -----------------------
# Load environment variables
import dotenv
dotenv.load_dotenv()

# -----------------------
# PDF Parsing library
# -----------------------
import PyPDF2  # or "pypdf" if needed

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#############################################
# 1) CONFIGURATION: toggle open vs. OpenAI
#############################################

USE_OPENAI = False  # Set to True if you want to switch to OpenAI’s ChatGPT
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# For Neo4j:
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USERNAME")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")

In [None]:
##################################################
# 2) NEO4J CONNECTION AND GRAPH FUNCTIONS
##################################################

driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))


def add_chunk_node(tx, chunk_text: str, chunk_id: str, embedding: List[float]):
    """
    Create or merge a chunk node in Neo4j to represent a piece of text.
    Store its embedding as well.
    """
    embedding_str = ",".join([str(x) for x in embedding])

    query = """
    MERGE (c:Chunk {chunk_id: $chunk_id})
    ON CREATE SET c.text = $chunk_text,
                  c.embedding = $embedding_str
    """
    tx.run(query, chunk_id=chunk_id, chunk_text=chunk_text, embedding_str=embedding_str)


def add_document_relationship(tx, doc_id: str, chunk_id: str):
    """
    Link each chunk node to a parent Document node in Neo4j.
    """
    query = """
    MERGE (d:Document {doc_id: $doc_id})
    MERGE (c:Chunk {chunk_id: $chunk_id})
    MERGE (d)-[:HAS_CHUNK]->(c)
    """
    tx.run(query, doc_id=doc_id, chunk_id=chunk_id)


def add_element_instance(tx, element_data: Dict[str, Any]):
    """
    (Step 2.2) Insert extracted graph node/edge relationships from text chunk.
    Example structure of element_data might be:
    {
        "entity_name": "...",
        "entity_type": "...",
        "entity_description": "...",
        "relationship": {
            "source_entity": "...",
            "target_entity": "...",
            "description": "...",
        },
        ...
    }

    In a real pipeline, you might store these as separate nodes/edges. 
    For demonstration, we store them in a single node with a property that can be parsed.
    """
    query = """
    CREATE (e:ElementInstance {data: $element_data})
    """
    tx.run(query, element_data=str(element_data))


def retrieve_relevant_chunks(tx, user_query: str, limit: int = 5) -> List[Dict[str, Any]]:
    """
    A simplistic retrieval function that returns chunk nodes based on naive text search.
    In a real scenario, you'd have a vector similarity search using embeddings.
    """
    query = """
    MATCH (c:Chunk)
    WHERE c.text CONTAINS $user_query
    RETURN c.chunk_id AS chunk_id, c.text AS text
    LIMIT $limit
    """
    result = tx.run(query, user_query=user_query, limit=limit)
    return [record.data() for record in result]

In [None]:
##################################################
# 2.1 SOURCE DOCUMENTS → TEXT CHUNKS
##################################################

def parse_pdf(pdf_path: str) -> str:
    """
    Extract raw text from a PDF file using PyPDF2.
    """
    text = ""
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text


def chunk_text(text: str, chunk_size: int = 600, chunk_overlap: int = 100) -> List[str]:
    """
    (Step 2.1) Split text into chunks. 
    Following the guidance in 2.1, we use a smaller chunk size (e.g., ~600 tokens).
    This can improve entity recall at the cost of more LLM calls.
    """
    # Note: the chunk_size is in characters by default using this splitter;
    # you may want to adapt to token-based splitting.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    return chunks

In [2]:
import os

PATH_DOCS= "data/docs"

# Retrieve all docs from path
docs_ids = sorted([f.replace('.pdf','') for f in os.listdir(PATH_DOCS) if f.endswith(".pdf")])

In [3]:
import arxiv

# Fetch papers from arXiv
client = arxiv.Client()
search = arxiv.Search(
    id_list=docs_ids[:10]
)

In [4]:
for result in client.results(search):
    display(result.__dict__)
    break

{'entry_id': 'http://arxiv.org/abs/0704.2547v1',
 'updated': datetime.datetime(2007, 4, 19, 14, 45, 29, tzinfo=datetime.timezone.utc),
 'published': datetime.datetime(2007, 4, 19, 14, 45, 29, tzinfo=datetime.timezone.utc),
 'title': 'Inferring DNA sequences from mechanical unzipping data: the large-bandwidth case',
 'authors': [arxiv.Result.Author('Valentina Baldazzi'),
  arxiv.Result.Author('Serena Bradde'),
  arxiv.Result.Author('Simona Cocco'),
  arxiv.Result.Author('Enzo Marinari'),
  arxiv.Result.Author('Remi Monasson')],
 'summary': 'The complementary strands of DNA molecules can be separated when stretched\napart by a force; the unzipping signal is correlated to the base content of the\nsequence but is affected by thermal and instrumental noise. We consider here\nthe ideal case where opening events are known to a very good time resolution\n(very large bandwidth), and study how the sequence can be reconstructed from\nthe unzipping data. Our approach relies on the use of statistic

In [5]:
### Milvus Lite Vectorstore

import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_milvus import Milvus
from langchain_community.embeddings import HuggingFaceEmbeddings


docs = []
for result in client.results(search):
    docs.append(
        {"title": result.title, 
        "summary": result.summary, 
        "url": result.entry_id,
        "authors": ', '.join([a.name for a in result.authors]),
        "categories": ', '.join(result.categories)
        }
    )

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=500, chunk_overlap=50
)
doc_splits = text_splitter.create_documents(
    [doc["summary"] for doc in docs], metadatas=docs
)

print(f"Number of papers: {len(docs)}")
print(f"Number of chunks: {len(doc_splits)}")


# Add to Milvus
vectorstore = Milvus.from_documents(
    documents=doc_splits,
    collection_name="rag_milvus",
    embedding=HuggingFaceEmbeddings(
        model_name="models/all-mpnet-base-v2", 
        model_kwargs={
            'device':"mps"
        }
    ),
    connection_args={"uri": "./vector_db_graphRAG/milvus_ingest.db"},
)
retriever = vectorstore.as_retriever()

USER_AGENT environment variable not set, consider setting it to identify your requests.


Number of papers: 10
Number of chunks: 10


  embedding=HuggingFaceEmbeddings(
I0000 00:00:1735680055.393127  306142 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
2024-12-31 13:20:55,896 [ERROR][handler]: RPC error: [create_index], <MilvusException: (code=65535, message=invalid index type: HNSW, local mode only support FLAT IVF_FLAT AUTOINDEX: )>, <Time:{'RPC start': '2024-12-31 13:20:55.896040', 'RPC error': '2024-12-31 13:20:55.896607'}> (decorators.py:140)


In [None]:
##################################################
# 4) EMBEDDING UTILITIES
##################################################

def get_hf_embedding_function(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """
    Returns a function that can generate embeddings using a HuggingFace model.
    """
    hf_embed = HuggingFaceEmbeddings(model_name=model_name)
    return hf_embed.embed_documents


# If using OpenAI embeddings, uncomment and implement:
# def get_openai_embedding_function(model_name: str = "text-embedding-ada-002"):
#     def _embeddings(texts: List[str]) -> List[List[float]]:
#         response = openai.Embedding.create(
#             input=texts,
#             model=model_name
#         )
#         embeddings = [item["embedding"] for item in response["data"]]
#         return embeddings
#     return _embeddings

In [8]:
# GraphRAG Setup
from langchain_community.graphs import Neo4jGraph
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_core.documents import Document
from langchain_experimental.llms.ollama_functions import OllamaFunctions
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
from langchain_openai import ChatOpenAI
from langchain_ollama import ChatOllama

graph = Neo4jGraph()

graph_llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

graph_transformer = LLMGraphTransformer(
    llm=graph_llm,
    allowed_nodes=["Paper", "Author", "Topic"],
    node_properties=["title", "summary", "url"],
    allowed_relationships=["AUTHORED", "DISCUSSES", "RELATED_TO"],
)

graph_documents = graph_transformer.convert_to_graph_documents(doc_splits)

graph.add_graph_documents(graph_documents)

print(f"Graph documents: {len(graph_documents)}")
print(f"Nodes from 1st graph doc:{graph_documents[0].nodes}")
print(f"Relationships from 1st graph doc:{graph_documents[0].relationships}")

Graph documents: 10
Nodes from 1st graph doc:[Node(id='Dna Molecules', type='Topic', properties={}), Node(id='Unzipping Signal', type='Topic', properties={}), Node(id='Base Content', type='Topic', properties={}), Node(id='Thermal Noise', type='Topic', properties={}), Node(id='Instrumental Noise', type='Topic', properties={}), Node(id='Bayesian Inference', type='Topic', properties={}), Node(id='Viterbi Decoding Algorithm', type='Topic', properties={}), Node(id='Monte Carlo Generated Data', type='Topic', properties={}), Node(id='Unzippings', type='Topic', properties={}), Node(id='Elasticity Parameters', type='Topic', properties={})]
Relationships from 1st graph doc:[Relationship(source=Node(id='Dna Molecules', type='Topic', properties={}), target=Node(id='Unzipping Signal', type='Topic', properties={}), type='RELATED_TO', properties={}), Relationship(source=Node(id='Unzipping Signal', type='Topic', properties={}), target=Node(id='Base Content', type='Topic', properties={}), type='RELATED

In [9]:
# After converting to graph documents
for i, doc in enumerate(graph_documents):
    print(f"Document {i}:")
    print(f"  Nodes: {doc.nodes}")
    print(f"  Relationships: {doc.relationships}")
    print("---")

Document 0:
  Nodes: [Node(id='Dna Molecules', type='Topic', properties={}), Node(id='Unzipping Signal', type='Topic', properties={}), Node(id='Base Content', type='Topic', properties={}), Node(id='Thermal Noise', type='Topic', properties={}), Node(id='Instrumental Noise', type='Topic', properties={}), Node(id='Bayesian Inference', type='Topic', properties={}), Node(id='Viterbi Decoding Algorithm', type='Topic', properties={}), Node(id='Monte Carlo Generated Data', type='Topic', properties={}), Node(id='Unzippings', type='Topic', properties={}), Node(id='Elasticity Parameters', type='Topic', properties={})]
  Relationships: [Relationship(source=Node(id='Dna Molecules', type='Topic', properties={}), target=Node(id='Unzipping Signal', type='Topic', properties={}), type='RELATED_TO', properties={}), Relationship(source=Node(id='Unzipping Signal', type='Topic', properties={}), target=Node(id='Base Content', type='Topic', properties={}), type='RELATED_TO', properties={}), Relationship(sourc

In [None]:
##################################################
# 2.2 TEXT CHUNKS → ELEMENT INSTANCES
##################################################

def extract_element_instances_from_chunk(
    chunk_text: str,
    gleaning_rounds: int = 1
) -> List[Dict[str, Any]]:
    """
    (Step 2.2) Use an LLM prompt to identify entity references, relationships, and covariates.
    - Our default prompt identifies entities (name, type, description) and relationships.
    - We support multiple rounds of "gleanings" to find any missed entities (multi-stage approach).
    For demonstration, this is a stub that returns dummy data.
    """

    # -- Basic LLM extraction prompt (conceptual) --
    # e.g., "Identify all entities and relationships in the following text ..."

    # If gleaning rounds > 1, you’d incorporate prompts that ask:
    #  - "Were any entities missed? (Yes/No) [Logit bias forcing yes/no]"
    #  - "Please glean the missing entities." 
    # This is a simplified representation:

    dummy_data = [
        {
            "entity_name": "ExampleEntity",
            "entity_type": "Person",
            "entity_description": "A person of interest in the text.",
            "relationship": {
                "source_entity": "ExampleEntity",
                "target_entity": "AnotherEntity",
                "description": "A potential relationship described here."
            }
        }
    ]

    # In reality, you'd call your LLM with few-shot examples specialized to your domain.
    # Each gleaning round might add newly discovered entities/relationships.
    # Here, just return dummy data for demonstration.
    return dummy_data

In [None]:
##################################################
# 2.3 ELEMENT INSTANCES → ELEMENT SUMMARIES
##################################################

def summarize_element_instances(element_instances: List[Dict[str, Any]]) -> str:
    """
    (Step 2.3) Summarize extracted nodes/relationships into a single descriptive block of text
    for each chunk. This is an additional LLM-based summarization step, forming "element summaries."
    For demonstration, we simply concatenate them into a single string.
    """
    # In practice, you might use an LLM to produce an abstractive summary that merges
    # repeated references, standardizes entity mention formats, etc.
    summary_parts = []
    for e in element_instances:
        entity_str = (f"Entity: {e.get('entity_name')} ({e.get('entity_type')}). "
                      f"Description: {e.get('entity_description')}")
        rel = e.get("relationship", {})
        relationship_str = (f"Relationship: {rel.get('source_entity')} -> {rel.get('target_entity')}. "
                            f"Desc: {rel.get('description')}")
        summary_parts.append(f"{entity_str}\n{relationship_str}\n")

    return "\n".join(summary_parts)


def store_element_summary_in_graph(tx, summary_text: str):
    """
    For demonstration, store the summary as a node in Neo4j. 
    In a real solution, you might merge it back into existing entity/relationship nodes
    or create a dedicated "Summary" node referencing them.
    """
    query = """
    CREATE (s:ElementSummary {summary_text: $summary_text})
    """
    tx.run(query, summary_text=summary_text)

In [None]:
##################################################
# 2.4 ELEMENT SUMMARIES → GRAPH COMMUNITIES
##################################################

def detect_communities():
    """
    (Step 2.4) Perform community detection on the stored nodes/edges in the graph.
    For demonstration, we omit the full code for Leiden or other algorithms.
    In a real system, you’d gather the graph elements from Neo4j, run community detection,
    and store the results (community IDs, hierarchical structure, etc.) back into Neo4j.
    """
    # Placeholder function
    print("[Community Detection] Placeholder: run Leiden or other community detection.")

In [None]:
##################################################
# 2.5 GRAPH COMMUNITIES → COMMUNITY SUMMARIES
##################################################

def summarize_communities():
    """
    (Step 2.5) Summarize each community (or sub-community in a hierarchical approach).
    - Gather all element summaries (nodes, edges, covariates) in that community.
    - Summarize them, potentially chunking if they don't fit in an LLM context window.
    """
    # Placeholder logic
    print("[Community Summaries] Placeholder: gather summaries and do hierarchical summarization.")

In [None]:
##################################################
# 2.6 COMMUNITY SUMMARIES → COMMUNITY ANSWERS → GLOBAL ANSWER
##################################################

def answer_query_from_communities(user_query: str) -> str:
    """
    (Step 2.6) Use the hierarchical community summaries to answer user queries globally.
    - In an actual implementation, you'd fetch the relevant community summaries, chunk them,
      run partial QA on each chunk, rank answers by helpfulness, and then produce a final answer.
    - Below is a simplified approach that just returns a single, direct LLM-based QA.
    """
    # Placeholder logic
    # If you have multiple community summaries, you'd do partial QA in parallel, rank by
    # self-reported "helpfulness" (0-100), then combine or reduce them into a global answer.

    return f"Global answer to '{user_query}' (placeholder)."

In [None]:
##################################################
# 5) INGEST PDF -> STORE IN GRAPH (Putting Steps 2.1 and 2.2+ in context)
##################################################

def ingest_pdf_into_graph(pdf_path: str, doc_id: str):
    """
    1) Parse PDF into raw text.
    2) Chunk it (Step 2.1).
    3) Generate embeddings for each chunk.
    4) Store chunk nodes in Neo4j.
    5) For each chunk, call LLM to extract element instances (Step 2.2).
    6) Summarize them into a single descriptive block (Step 2.3).
    7) Optionally store the block in Neo4j for further community detection.
    """
    # Step 1: Parse PDF
    raw_text = parse_pdf(pdf_path)

    # Step 2: Chunk the text (default chunk_size=600 for improved recall)
    chunks = chunk_text(raw_text)

    # Step 3: Embeddings
    if USE_OPENAI:
        # Implement an OpenAI embedding function if desired
        raise NotImplementedError("OpenAI embeddings not implemented here.")
    else:
        embed_fn = get_hf_embedding_function()

    embeddings = embed_fn(chunks)

    # Step 4: Store chunk nodes in Neo4j
    with driver.session() as session:
        for i, chunk_text_str in enumerate(chunks):
            chunk_id = f"{doc_id}_chunk_{i}"
            embedding = embeddings[i]
            session.execute_write(add_chunk_node, chunk_text_str, chunk_id, embedding)
            session.execute_write(add_document_relationship, doc_id, chunk_id)

            # Step 5: Extract element instances
            element_instances = extract_element_instances_from_chunk(chunk_text_str)

            # Step 6: Summarize them (element-level)
            element_summary = summarize_element_instances(element_instances)

            # Step 7: Store the summary
            session.execute_write(store_element_summary_in_graph, element_summary)

    print(f"Ingested {len(chunks)} chunks from {pdf_path} into Neo4j under Document {doc_id}")

In [None]:
##################################################
# QA / RAG PIPELINE (Simplified)
##################################################

def perform_qa_with_graph(user_query: str) -> str:
    """
    A simplified RAG approach:
    1) Retrieve relevant chunks from the Neo4j graph (naive text search).
    2) Build a context from those chunks.
    3) Use either an open model or an OpenAI model for generative answer.

    NOTE: This doesn't incorporate full community-based summarization from 2.6.
    For a more complete approach, see `answer_query_from_communities()`.
    """
    with driver.session() as session:
        candidate_chunks = session.read_transaction(retrieve_relevant_chunks, user_query)

    # Build the context
    context = "\n\n".join([c["text"] for c in candidate_chunks])

    # Use a HuggingFace or OpenAI model for generation
    if USE_OPENAI:
        # If using OpenAI ChatCompletion:
        # openai.api_key = OPENAI_API_KEY
        # response = openai.ChatCompletion.create(
        #     model="gpt-3.5-turbo",
        #     messages=[
        #         {"role": "system", "content": "You are a helpful assistant."},
        #         {"role": "user", "content": f"Context: {context}\n\nQuestion: {user_query}"}
        #     ]
        # )
        # answer = response["choices"][0]["message"]["content"]
        raise NotImplementedError("OpenAI ChatCompletion usage not fully implemented here.")
    else:
        # Example with a local HF pipeline
        qa_pipeline = pipeline("text-generation", model="bigscience/bloom-560m")
        prompt = f"Context: {context}\nQuestion: {user_query}\nAnswer:"
        answer_list = qa_pipeline(prompt, max_new_tokens=100, do_sample=True)
        if answer_list:
            answer = answer_list[0]["generated_text"].split("Answer:")[-1].strip()
        else:
            answer = "No answer generated."

    return answer

In [None]:
#############################################
# MAIN EXECUTION EXAMPLE
#############################################

In [None]:
# 1) Ingest an arXiv PDF (Steps 2.1–2.3)
pdf_path = "data/docs/0704.2547.pdf"  # Replace with the path to your local arXiv PDF
doc_id = "0704.2547"           # Arbitrary doc ID for grouping in Neo4j
ingest_pdf_into_graph(pdf_path, doc_id)

In [None]:
# 2) Community detection & summarization (Steps 2.4–2.5)
detect_communities()
summarize_communities()

In [None]:
# 3) Ask a question (Step 2.6 simplified vs. full approach)
user_query = "What are the main contributions of the paper?"
# Simple QA (naive RAG):
answer = perform_qa_with_graph(user_query)
print(f"User's question: {user_query}")
print(f"Answer: {answer}")

In [None]:
# Alternatively, full approach using community-based QA:
# global_answer = answer_query_from_communities(user_query)
# print(f"Global Answer: {global_answer}")