In [None]:
# !pip install neo4j
# !pip install langchain
# !pip install PyPDF2
# !pip install tiktoken
# !pip install openai  # Only if you want to use the OpenAI API
# !pip install transformers  # For open (HF) models
# !pip install sentence_transformers
# !pip install -U langchain-community


In [22]:
import os
from typing import List, Dict, Any

# -----------------------
# Neo4j Database imports
# -----------------------
from neo4j import GraphDatabase

# -----------------------
# LLM / Embeddings imports
# -----------------------
# If using HuggingFace transformers:
from transformers import pipeline

# If using LangChain for retrieval + QA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.question_answering import load_qa_chain

# If you want to use OpenAI, uncomment:
# import openai

# -----------------------
# PDF Parsing library
# -----------------------
import PyPDF2  # or pypdf if needed

In [23]:
#############################################
# 1) CONFIGURATION: toggle open vs. OpenAI
#############################################

USE_OPENAI = False  # Set to True if you want to switch to OpenAI’s ChatGPT
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")

# For Neo4j:
NEO4J_URI = "neo4j+s://97d4d6ef.databases.neo4j.io"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "3bhQDb-56wt4bhRkekO83OCj0YXjD9N2lWn7sQpOnkc"




In [24]:
##################################################
# 2) NEO4J CONNECTION AND GRAPH FUNCTIONS
##################################################

# Connect to Neo4j
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

def add_chunk_to_graph(tx, chunk_text: str, chunk_id: str, embedding: List[float]):
    """
    Create a node in Neo4j for each chunk of text from the document,
    along with the embedding as properties (for demonstration).
    We can store the embedding as a list or a string representation.
    """
    # Convert embedding to a string (for demonstration).
    embedding_str = ",".join([str(x) for x in embedding])

    query = """
    MERGE (c:Chunk {chunk_id: $chunk_id})
    ON CREATE SET c.text = $chunk_text,
                  c.embedding = $embedding_str
    """
    tx.run(query, chunk_id=chunk_id, chunk_text=chunk_text, embedding_str=embedding_str)


def add_document_relationship(tx, doc_id: str, chunk_id: str):
    """
    Link each chunk node to a parent Document node in Neo4j.
    """
    query = """
    MERGE (d:Document {doc_id: $doc_id})
    MERGE (c:Chunk {chunk_id: $chunk_id})
    MERGE (d)-[:HAS_CHUNK]->(c)
    """
    tx.run(query, doc_id=doc_id, chunk_id=chunk_id)


def retrieve_relevant_chunks(tx, user_query: str, limit: int = 5) -> List[Dict[str, Any]]:
    """
    A simplistic retrieval function that returns chunk nodes.
    In a real scenario, you'd have a vector similarity search using
    embeddings. Here, for demonstration, we only do a naive text search
    in the graph. You can integrate with external vector DB or
    vector similarity queries in Neo4j (Graph Data Science).
    """
    query = """
    MATCH (c:Chunk)
    WHERE c.text CONTAINS $user_query
    RETURN c.chunk_id AS chunk_id, c.text AS text
    LIMIT $limit
    """
    result = tx.run(query, user_query=user_query, limit=limit)
    return [record.data() for record in result]



In [25]:
##################################################
# 3) PDF PARSING AND CHUNKING
##################################################

def parse_pdf(pdf_path: str) -> str:
    """
    Extract raw text from a PDF file using PyPDF2.
    """
    text = ""
    with open(pdf_path, 'rb') as f:
        reader = PyPDF2.PdfReader(f)
        for page in reader.pages:
            text += page.extract_text() + "\n"
    return text


def chunk_text_func(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> List[str]:
    """
    Use LangChain's RecursiveCharacterTextSplitter to chunk the text
    for better embeddings & retrieval.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    return chunks

In [26]:
##################################################
# 4) EMBEDDING UTILITIES
##################################################

def get_hf_embedding_function(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """
    Returns a function that can generate embeddings using a HuggingFace model.
    """
    hf_embed = HuggingFaceEmbeddings(model_name=model_name)
    return hf_embed.embed_documents


# If using OpenAI (uncomment if needed):
# def get_openai_embedding_function(model_name: str = "text-embedding-ada-002"):
#     def _embeddings(texts: List[str]) -> List[List[float]]:
#         response = openai.Embedding.create(
#             input=texts,
#             model=model_name
#         )
#         embeddings = [item["embedding"] for item in response["data"]]
#         return embeddings
#     return _embeddings

In [27]:
##################################################
# 5) INGEST PDF -> STORE IN GRAPH
##################################################

def ingest_pdf_into_graph(pdf_path: str, doc_id: str):
    """
    Parse PDF, chunk it, generate embeddings, and store in Neo4j.
    """
    # Step 1: Parse the PDF
    raw_text = parse_pdf(pdf_path)

    # Step 2: Chunk the text
    chunks = chunk_text_func(raw_text)


    # Step 3: Embeddings
    if USE_OPENAI:
        # Example if you have an OpenAI embeddings function
        # embed_fn = get_openai_embedding_function()
        # ...
        raise NotImplementedError("OpenAI Embeddings not implemented in this snippet.")
    else:
        embed_fn = get_hf_embedding_function()

    # Generate embeddings in batch
    embeddings = embed_fn(chunks)

    # Step 4: Store in Neo4j
    with driver.session() as session:
        for i, chunk_text in enumerate(chunks):
            chunk_id = f"{doc_id}_chunk_{i}"
            embedding = embeddings[i]
            session.write_transaction(add_chunk_to_graph, chunk_text, chunk_id, embedding)
            session.write_transaction(add_document_relationship, doc_id, chunk_id)

    print(f"Ingested {len(chunks)} chunks from {pdf_path} into Neo4j under Document {doc_id}")

In [30]:
##################################################
# 6) QA / RAG PIPELINE
##################################################

def perform_qa_with_graph(user_query: str) -> str:
    """
    Retrieves relevant chunks from the Neo4j graph, then performs
    a simple RAG-based answer. Here we demonstrate a naive approach:
    1) Retrieve relevant chunks from the graph
    2) Concatenate them as context
    3) Use either an open model or an OpenAI model for generative answer.
    """
    with driver.session() as session:
        # Retrieve relevant chunks (Naive text-based example)
        candidate_chunks = session.read_transaction(retrieve_relevant_chunks, user_query)
    
    print(f"Found {len(candidate_chunks)} relevant chunks in the graph.")
    print(candidate_chunks)
    
    # Build the context
    context = "\n\n".join([c["text"] for c in candidate_chunks])

    # Use a huggingface or openAI model for generation
    if USE_OPENAI:
        # Example for OpenAI's chatgpt:
        # openai.api_key = OPENAI_API_KEY
        # response = openai.ChatCompletion.create(
        #     model="gpt-3.5-turbo",
        #     messages=[
        #         {"role": "system", "content": "You are a helpful assistant."},
        #         {"role": "user", "content": f"Context: {context}\n\nQuestion: {user_query}"}
        #     ]
        # )
        # answer = response["choices"][0]["message"]["content"]
        raise NotImplementedError("OpenAI ChatCompletion usage not fully implemented here.")
    else:
        # Example with a local HF pipeline
        qa_pipeline = pipeline("text-generation", model="bigscience/bloom-560m")
        prompt = f"Context: {context}\nQuestion: {user_query}\nAnswer:"
        answer_list = qa_pipeline(prompt, max_new_tokens=100, do_sample=True)
        if answer_list:
            answer = answer_list[0]["generated_text"].split("Answer:")[-1].strip()
        else:
            answer = "No answer generated."
    
    return answer

In [None]:
#############################################
# 7) MAIN EXECUTION EXAMPLE
#############################################

# Example usage:
# 1) Ingest an arXiv PDF
pdf_path = "data/docs/0704.2547.pdf"  # Replace with the path to your local arXiv PDF
doc_id = "0704.2547"           # Arbitrary doc ID for grouping in Neo4j
ingest_pdf_into_graph(pdf_path, doc_id)

In [31]:
# 2) Ask a question
user_query = "What are the main contributions of the paper?"
answer = perform_qa_with_graph(user_query)
print(f"User's question: {user_query}")
print(f"Answer: {answer}")


  candidate_chunks = session.read_transaction(retrieve_relevant_chunks, user_query)


Found 0 relevant chunks in the graph.
[]


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


User's question: What are the main contributions of the paper?
Answer: in the paper we show that a class of functions defined by the identity equation in a compact set is a continuous functional from a compact interval into the entire function space. We also show that for this class of functions the function values of (1.7) are the fundamental solutions of the integro-differential equation.
Finally we show that for bounded functions (1.8) a necessary condition for solutions for (1.7) and (1.8) is that its values are contained
