In [1]:
!pip install  pymupdf langchain langchain-core langchain-community \
langchain-text-splitters langchain-huggingface langchain-anthropic \
langchain-experimental neo4j
!pip install --upgrade gradio

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain-anthropic
  Downloading langchain_anthropic-0.3.11-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx

In [1]:
import os
from getpass import getpass

import gradio as gr
import fitz  # PyMuPDF
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Neo4jVector
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.graphs import Neo4jGraph
from langchain_anthropic import ChatAnthropic
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_experimental.graph_transformers import LLMGraphTransformer
from neo4j.exceptions import ServiceUnavailable, AuthError

# Configuration
#
# Secrets are never stored in the notebook. Each one is taken from the
# environment when set; otherwise the user is prompted with hidden input.
# NOTE(review): the Neo4j password and Anthropic API key that used to be
# hardcoded here were exposed with this file and should be rotated.

def _read_secret(env_name, prompt):
    """Return a required secret from the environment or an interactive prompt.

    Args:
        env_name: Environment variable consulted first.
        prompt: Text shown by ``getpass`` when the variable is unset or empty.

    Returns:
        The secret with surrounding whitespace removed.

    Raises:
        RuntimeError: If neither source yields a non-empty value.
    """
    value = (os.environ.get(env_name) or getpass(prompt)).strip()
    if not value:
        raise RuntimeError(f"Missing required setting: {env_name}")
    return value


# Neo4j configuration (URI and user are not secrets; the previous values remain the defaults)
NEO4J_URI = os.environ.get("NEO4J_URI") or "neo4j+s://d6a2c0c3.databases.neo4j.io"
NEO4J_USER = os.environ.get("NEO4J_USERNAME") or "neo4j"
NEO4J_PASS = _read_secret("NEO4J_PASSWORD", "Neo4j password: ")

# Anthropic configuration
ANTHROPIC_API_KEY = _read_secret("ANTHROPIC_API_KEY", "Anthropic API key: ")

# Publish the resolved settings to the process environment.
# ChatAnthropic below is built without an explicit key, so it presumably picks
# the key up from ANTHROPIC_API_KEY - confirm against the library docs.
os.environ["NEO4J_URI"] = NEO4J_URI
os.environ["NEO4J_USERNAME"] = NEO4J_USER
os.environ["NEO4J_PASSWORD"] = NEO4J_PASS
os.environ["ANTHROPIC_API_KEY"] = ANTHROPIC_API_KEY

# Initialize components
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
llm = ChatAnthropic(model_name="claude-3-5-sonnet-20240620", temperature=0.5)

def clear_neo4j():
    """Delete every node (and its relationships) from the configured Neo4j database.

    Connects with the module-level NEO4J_URI / NEO4J_USER / NEO4J_PASS settings
    and issues a single ``DETACH DELETE`` over all nodes.

    Raises:
        gr.Error: When the server is unavailable or rejects the credentials,
            or when connecting/deleting fails for any other reason.
    """
    wipe_everything = "MATCH (n) DETACH DELETE n"
    try:
        Neo4jGraph(
            url=NEO4J_URI,
            username=NEO4J_USER,
            password=NEO4J_PASS,
        ).query(wipe_everything)
    except (ServiceUnavailable, AuthError) as conn_err:
        raise gr.Error(f"Failed to connect to Neo4j: {str(conn_err)}")
    except Exception as other_err:
        raise gr.Error(f"Error clearing Neo4j database: {str(other_err)}")

def process_pdf(pdf_file):
    """Ingest one PDF into Neo4j as a vector index plus an LLM-extracted knowledge graph.

    The PDF is loaded and split first; only when that yields at least one chunk
    is the database wiped and repopulated, so an unreadable or text-less file
    does not destroy previously stored data.

    Args:
        pdf_file: A filesystem path (``str`` / ``os.PathLike``) or an object
            exposing the path as ``.name``.

    Returns:
        tuple: ``(vectorstore, graph)`` - the ``Neo4jVector`` built from the
        chunks and the ``Neo4jGraph`` the graph documents were written to.

    Raises:
        gr.Error: If any stage fails. Errors already raised as ``gr.Error`` by
            an inner stage are propagated unchanged.
    """
    try:
        # Load and process PDF
        try:
            # pathlib.Path also has a `.name` (the basename), so real paths are
            # detected explicitly instead of relying on getattr(..., "name").
            if isinstance(pdf_file, (str, os.PathLike)):
                pdf_path = os.fspath(pdf_file)
            else:
                pdf_path = pdf_file.name
            loader = PyMuPDFLoader(pdf_path)
            documents = loader.load()
        except Exception as e:
            raise gr.Error(f"Error loading PDF file: {str(e)}") from e

        # Split documents
        try:
            splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
            docs = splitter.split_documents(documents)
        except Exception as e:
            raise gr.Error(f"Error splitting document: {str(e)}") from e

        # Nothing to index (e.g. a PDF without extractable text): stop before
        # the destructive step below.
        if not docs:
            raise gr.Error("No extractable text found in the PDF.")

        # Clear existing data
        clear_neo4j()

        # Store in Neo4j
        try:
            vectorstore = Neo4jVector.from_documents(
                documents=docs,
                embedding=embedding,
                url=NEO4J_URI,
                username=NEO4J_USER,
                password=NEO4J_PASS,
                index_name="pdf_chunks",
                node_label="Document",
                text_node_property="text",
                embedding_node_property="embedding"
            )
        except (ServiceUnavailable, AuthError) as e:
            raise gr.Error(f"Failed to connect to Neo4j: {str(e)}") from e
        except Exception as e:
            raise gr.Error(f"Error storing documents in Neo4j: {str(e)}") from e

        # Create knowledge graph
        try:
            GRAPH_PROMPT = ChatPromptTemplate.from_template("""
            Extract knowledge graph entities and relationships from the following text.
            Return only the entities and relationships in a structured format.

            Text: {input}
            """)

            transformer = LLMGraphTransformer(llm=llm, prompt=GRAPH_PROMPT)
            graph_documents = transformer.convert_to_graph_documents(docs)

            graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASS)
            graph.add_graph_documents(graph_documents)
        except Exception as e:
            raise gr.Error(f"Error creating knowledge graph: {str(e)}") from e

        return vectorstore, graph
    except gr.Error:
        # Already user-facing; re-wrapping would replace the original error
        # object (same policy as upload_file).
        raise
    except Exception as e:
        raise gr.Error(str(e)) from e

def create_qa_chain(vectorstore, graph):
    """Build the runnable that answers a question from vector hits plus graph facts.

    Args:
        vectorstore: Vector store to search; must provide ``as_retriever``.
        graph: Graph handle; must provide ``query(cypher, params=...)``.

    Returns:
        A LangChain runnable. Invoke it with ``{"question": <str>}``; the result
        is the chat model's response message (callers read ``.content``).

    Raises:
        gr.Error: If assembling the chain fails.
    """
    try:
        # Vector similarity retriever
        vector_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

        # Properties tried, in order, as a node's searchable/display text.
        # The previous implementation picked ONE property by inspecting a single
        # arbitrary node and applied it to every node, so nodes that keep their
        # text under a different key could never match. coalesce() lets each
        # node use whichever of these it actually has.
        # NOTE(review): "id" is included on the assumption that entity nodes
        # written by add_graph_documents carry their name there - confirm
        # against the data in Neo4j.
        text_props = ("text", "name", "title", "content", "value", "id")

        def display_text(var):
            """Return a Cypher expression for the first non-null text property of ``var``."""
            # Backticks quote the property names so they are always parsed as identifiers.
            candidates = ", ".join(f"{var}.`{prop}`" for prop in text_props)
            return f"toString(coalesce({candidates}))"

        def query_knowledge_graph(question):
            """Return a markdown summary of nodes/relationships mentioning the question's entities.

            Best-effort: any failure is printed and replaced by a fallback
            sentence so the chain can still answer from the document context.
            """
            try:
                # Extract entities
                extraction_prompt = """Extract the main entities from this question that would be relevant
                for querying a knowledge graph. Return them as a comma-separated list.

                Question: {question}
                Entities:"""

                entities_response = llm.invoke(extraction_prompt.format(question=question))
                entities = [e.strip() for e in entities_response.content.split(",") if e.strip()]

                if not entities:
                    return "No entities extracted from question"

                # Entities are passed as a query parameter, never interpolated.
                params = {"entities": entities}
                graph_info = []

                # Query nodes
                node_query = f"""
                MATCH (n)
                WITH n, {display_text('n')} AS text
                WHERE ANY(entity IN $entities WHERE toLower(text) CONTAINS toLower(entity))
                RETURN text, labels(n) AS types
                LIMIT 5
                """
                nodes = graph.query(node_query, params=params) or []

                # Query relationships
                rel_query = f"""
                MATCH (e1)-[r]->(e2)
                WITH r, {display_text('e1')} AS source, {display_text('e2')} AS target
                WHERE ANY(entity IN $entities WHERE
                         toLower(source) CONTAINS toLower(entity) OR
                         toLower(target) CONTAINS toLower(entity))
                RETURN source, type(r) AS relationship, target
                LIMIT 5
                """
                relationships = graph.query(rel_query, params=params) or []

                # Format results (`or` also covers keys that are present but null)
                if nodes:
                    graph_info.append("### Found Nodes:")
                    for node in nodes:
                        node_text = node.get('text') or 'Unknown'
                        types = node.get('types') or ['Unknown']
                        graph_info.append(f"- {node_text} ({', '.join(types)})")

                if relationships:
                    graph_info.append("\n### Found Relationships:")
                    for rel in relationships:
                        source = rel.get('source') or 'Unknown'
                        target = rel.get('target') or 'Unknown'
                        relationship = rel.get('relationship') or 'related_to'
                        graph_info.append(f"- {source} --[{relationship}]--> {target}")

                return "\n".join(graph_info) if graph_info else "No relevant knowledge graph information found"

            except Exception as e:
                print(f"Error querying knowledge graph: {e}")
                return "No knowledge graph information available"

        # Prompt template
        COMBINED_PROMPT = ChatPromptTemplate.from_template("""
        Answer the question based on the following information:

        ### Document Context:
        {context}

        ### Knowledge Graph Information:
        {graph_info}

        ### Question:
        {question}

        Provide a short answer combining both sources .
        If there are contradictions, mention them and explain which source you're prioritizing.
        """)

        def format_docs(docs):
            """Join the retrieved chunks' text, separated by blank lines."""
            return "\n\n".join(d.page_content for d in docs)

        # RunnablePassthrough.assign only accepts dict input, so the question is
        # read directly. `invoke` replaces the deprecated
        # `get_relevant_documents` retriever call.
        chain = (
            RunnablePassthrough.assign(
                context=lambda x: format_docs(vector_retriever.invoke(x["question"])),
                graph_info=lambda x: query_knowledge_graph(x["question"]),
            )
            | COMBINED_PROMPT
            | llm
        )

        return chain
    except Exception as e:
        raise gr.Error(f"Error creating QA chain: {str(e)}") from e

# Module-level state shared by the Gradio callbacks below:
#   qa_chain       - runnable built for the most recently processed PDF
#   processed_file - path of that PDF
# Both are None until a PDF has been processed.
qa_chain, processed_file = None, None

def upload_file(file):
    """Gradio upload handler: ingest the uploaded PDF and rebuild the QA chain.

    Args:
        file: The uploaded file, either as a path (``str`` / ``os.PathLike``)
            or as an object exposing the path as ``.name``; ``None`` when
            nothing was uploaded.

    Returns:
        str: Status message naming the processed file.

    Raises:
        gr.Error: If no file was given or if processing fails. ``gr.Error``s
            raised by the helpers are propagated unchanged.
    """
    global qa_chain, processed_file
    if file is None:
        raise gr.Error("Please upload a PDF file first.")

    # Ingestion clears the Neo4j database, so a chain built for the previous
    # PDF must not survive a failed upload: drop it up front and only publish
    # the new one once every step has succeeded.
    qa_chain = None
    processed_file = None
    try:
        file_path = os.fspath(file) if isinstance(file, (str, os.PathLike)) else file.name

        # Process the PDF and create the QA chain
        vectorstore, graph = process_pdf(file)
        new_chain = create_qa_chain(vectorstore, graph)
    except gr.Error:
        # Already user-facing; bare `raise` keeps the original traceback.
        raise
    except Exception as e:
        # Convert other exceptions to Gradio errors
        raise gr.Error(f"Error processing file: {str(e)}") from e

    qa_chain = new_chain
    processed_file = file_path
    return f"Successfully processed {file_path}. You can now ask questions."

def answer_question(question):
    """Gradio handler: answer ``question`` with the chain built for the current PDF.

    Args:
        question: Text typed by the user; may be ``None`` or blank.

    Returns:
        The ``content`` of the chain's response.

    Raises:
        gr.Error: If no PDF has been processed yet, if the question is empty or
            only whitespace, or if the chain fails.
    """
    if qa_chain is None:
        raise gr.Error("Please upload a PDF file first.")

    # Whitespace-only input is treated as empty so it never reaches the LLM.
    cleaned = (question or "").strip()
    if not cleaned:
        raise gr.Error("Please enter a question.")

    try:
        response = qa_chain.invoke({"question": cleaned})
        return response.content
    except gr.Error:
        # Already user-facing; bare `raise` keeps the original traceback.
        raise
    except Exception as e:
        raise gr.Error(f"Error answering question: {str(e)}") from e

# Gradio UI: Soft theme with blue / indigo / slate hues.
app_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    spacing_size="sm",
    radius_size="lg",
    font="default",
    text_size="sm",
)

with gr.Blocks(theme=app_theme) as demo:
    # Page header
    gr.Markdown("# PDF Question Answering System")
    gr.Markdown("Upload a PDF and ask questions about its content.")

    with gr.Row():
        # Narrow left column: upload control and its status line
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            upload_status = gr.Textbox(label="Upload Status", interactive=False)

        # Wide right column: answer on top, question box and button underneath
        with gr.Column(scale=3):
            answer_output = gr.Textbox(label="Answer", interactive=False, lines=10)
            with gr.Row():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="Ask a question about the PDF...",
                    scale=4,
                )
                submit_btn = gr.Button("Ask", variant="primary", scale=1)

    # Uploading a file triggers ingestion and reports the result in the status box.
    file_input.upload(fn=upload_file, inputs=[file_input], outputs=[upload_status])

    # Pressing Enter in the question box and clicking "Ask" run the same handler.
    question_input.submit(fn=answer_question, inputs=[question_input], outputs=[answer_output])
    submit_btn.click(fn=answer_question, inputs=[question_input], outputs=[answer_output])

# Launch the app when this code runs as the main program.
# debug=True keeps the cell running so errors and logs stay visible (see the
# "This cell will run indefinitely" notice in the recorded output);
# share=True additionally serves the app through a temporary public
# gradio.live URL.
# NOTE(review): no authentication is configured on launch(), and every upload
# clears the Neo4j database - confirm a public link is really intended.
if __name__ == "__main__":
    demo.launch(debug=True, share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://261ee5d7dd9f66bc42.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


  graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASS)
  context=lambda x: vector_retriever.get_relevant_documents(


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://261ee5d7dd9f66bc42.gradio.live
