In [32]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.1


In [36]:
import os
from sentence_transformers import SentenceTransformer, util
from langchain.text_splitter import RecursiveCharacterTextSplitter
from neo4j import GraphDatabase
import fitz  # PyMuPDF for PDF text extraction

# Neo4j Integration
class Neo4jHandler:
    """Small wrapper around a Neo4j driver that persists text chunks as a graph.

    Chunks are written as ``Chunk`` nodes keyed by their ``id`` property, and
    relationships are written as directed ``RELATED`` edges between two
    ``Chunk`` nodes looked up by ``id``.

    The handler can be used as a context manager, in which case the driver is
    closed on exit; calling :meth:`close` explicitly works as well.
    """

    def __init__(self, uri, user, password, database):
        """Create the driver.

        Args:
            uri: Connection URI passed to ``GraphDatabase.driver``.
            user: User name used for authentication.
            password: Password used for authentication.
            database: Name of the database every session is opened against.
        """
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        self.database = database

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always release the driver; returning False lets any exception propagate.
        self.close()
        return False

    def store_chunks(self, chunks, relationships):
        """Store chunks as nodes and establish relationships.

        Args:
            chunks: Mapping of chunk id to chunk text.
            relationships: Iterable of dicts with ``"source"`` and ``"target"``
                keys, each holding a chunk id.
        """
        with self.driver.session(database=self.database) as session:
            for chunk_id, chunk_text in chunks.items():
                # MERGE only on the identifying property. Putting ``text`` in
                # the MERGE pattern would create a second node whenever an
                # existing id is stored again with different text, and the
                # id-based MATCH below would then hit both nodes.
                session.run(
                    """
                    MERGE (c:Chunk {id: $chunk_id})
                    SET c.text = $chunk_text
                    """,
                    chunk_id=chunk_id,
                    chunk_text=chunk_text,
                )
            for rel in relationships:
                # Link the two chunks; MERGE keeps the edge unique on re-runs.
                # If either id matches no node, the query creates nothing.
                session.run(
                    """
                    MATCH (c1:Chunk {id: $source_id})
                    MATCH (c2:Chunk {id: $target_id})
                    MERGE (c1)-[:RELATED]->(c2)
                    """,
                    source_id=rel["source"],
                    target_id=rel["target"],
                )

# PDF Text Extraction
def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path`` as one string.

    The pages are read in iteration order and their texts are concatenated
    without any separator.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

# Text Splitting and Similarity
def split_and_analyze_document(
    file_path,
    chunk_size=300,
    chunk_overlap=50,
    similarity_threshold=0.8,
    model_name="all-MiniLM-L6-v2",
):
    """Split a PDF into text chunks and relate chunks with similar embeddings.

    Args:
        file_path: Path of the PDF, passed to ``extract_text_from_pdf``.
        chunk_size: ``chunk_size`` given to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` given to the same splitter.
        similarity_threshold: Two chunks are related when their cosine
            similarity is strictly greater than this value.
        model_name: Name of the ``SentenceTransformer`` model used to embed
            the chunks.

    Returns:
        A ``(chunk_dict, relationships)`` tuple. ``chunk_dict`` maps
        ``"chunk_<index>"`` to the chunk text. ``relationships`` is a list of
        ``{"source": id, "target": id}`` dicts, one per ordered pair of
        distinct chunks whose similarity exceeds the threshold. If the
        document yields no chunks, both are empty and no model is loaded.
    """
    # Extract text from the PDF
    document = extract_text_from_pdf(file_path)

    # Split document into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(document)

    # Assign unique IDs to chunks
    chunk_dict = {f"chunk_{i}": chunk for i, chunk in enumerate(chunks)}

    # Nothing to embed: skip loading the model and encoding an empty batch.
    if not chunk_dict:
        return chunk_dict, []

    # Use Sentence Transformers for embeddings
    model = SentenceTransformer(model_name)
    embeddings = model.encode(list(chunk_dict.values()), convert_to_tensor=True)

    # One pairwise call gives the full similarity matrix (row i = chunk i
    # against every chunk), instead of one call per chunk inside the loop.
    similarity_rows = util.cos_sim(embeddings, embeddings).tolist()

    # Keep every ordered pair above the threshold, skipping self-matches.
    relationships = [
        {"source": f"chunk_{i}", "target": f"chunk_{j}"}
        for i, row in enumerate(similarity_rows)
        for j, score in enumerate(row)
        if i != j and score > similarity_threshold
    ]

    return chunk_dict, relationships

# Initialize Neo4j handler.
# Connection settings are read from the environment so that no credential is
# stored in the notebook. NOTE(review): any password that was previously
# committed in this cell should be treated as leaked and rotated.
neo4j_password = os.environ.get("NEO4J_PASSWORD")
if not neo4j_password:
    raise RuntimeError(
        "Set the NEO4J_PASSWORD environment variable before running this cell."
    )

neo4j_handler = Neo4jHandler(
    uri=os.environ.get("NEO4J_URI", "bolt://3.235.154.204"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    password=neo4j_password,
    database=os.environ.get("NEO4J_DATABASE", "neo4j"),
)

try:
    # Process the document and store in Neo4j
    file_path = "/content/Redacted.pdf"
    chunks, relationships = split_and_analyze_document(file_path)
    neo4j_handler.store_chunks(chunks, relationships)
finally:
    # Close Neo4j connection even if processing or storing fails
    neo4j_handler.close()
