In [1]:
import json
import os
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Neo4jVector
from langchain.graphs import Neo4jGraph
from langchain.chains import GraphCypherQAChain
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from neo4j import GraphDatabase
import pandas as pd
import numpy as np

from __future__ import annotations
import os, json, time, random, hashlib, re, traceback
from pathlib import Path
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Iterator, Tuple
from collections import defaultdict

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
import json
import re
import time  # Import the time module
import random # Import the random module
from typing import Dict, Any, List, Optional

In [None]:
# Complete Guide: Building Knowledge Graphs from PDFs using Docling, LangChain, and Neo4j
# =====================================================================================

# 1. INSTALLATION REQUIREMENTS
# ============================

# Install required packages:
"""
pip install langchain-docling
pip install neo4j
pip install langchain-experimental
pip install langchain-openai
pip install docling
pip install neo4j-graphrag-python
"""

# 2. IMPORT REQUIRED LIBRARIES
# ============================

import os
from pathlib import Path
from typing import List, Dict, Any
import asyncio

# Neo4j and GraphRAG
from neo4j import GraphDatabase
from neo4j_graphrag.embeddings import OpenAIEmbeddings
from neo4j_graphrag.experimental.pipeline.kg_builder import SimpleKGPipeline
from neo4j_graphrag.llm import OpenAILLM

# LangChain components
from langchain_docling import DoclingLoader
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_openai import ChatOpenAI
from langchain.text_splitter import TokenTextSplitter
from langchain_core.documents import Document
from langchain_neo4j import Neo4jGraph

# Docling specific
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker

# 3. SETUP CONFIGURATION
# ======================

class KnowledgeGraphConfig:
    def __init__(self):
        # Neo4j configuration
        self.neo4j_uri = "bolt://localhost:7687"  # or your Neo4j Aura URL
        self.neo4j_username = "neo4j"
        self.neo4j_password = "your_password"
        
        # OpenAI configuration
        self.openai_api_key = "your_openai_api_key"
        
        # LLM model settings
        self.llm_model = "gpt-4o"
        self.embedding_model = "text-embedding-3-large"
        
        # Text processing settings
        self.chunk_size = 1000
        self.chunk_overlap = 200
        
        # Knowledge graph schema
        self.allowed_nodes = [
            "Person", "Organization", "Location", "Event", 
            "Concept", "Technology", "Document", "Date"
        ]
        self.allowed_relationships = [
            "WORKS_AT", "LOCATED_IN", "PART_OF", "RELATED_TO", 
            "MENTIONS", "OCCURRED_ON", "CREATED_BY", "CONTAINS"
        ]

# 4. MAIN KNOWLEDGE GRAPH BUILDER CLASS
# =====================================

class PDFKnowledgeGraphBuilder:
    def __init__(self, config: KnowledgeGraphConfig):
        self.config = config
        self.setup_connections()
        self.setup_models()
    
    def setup_connections(self):
        """Initialize Neo4j connection"""
        self.driver = GraphDatabase.driver(
            self.config.neo4j_uri,
            auth=(self.config.neo4j_username, self.config.neo4j_password)
        )
        
        # Verify connection
        self.driver.verify_connectivity()
        print("✓ Connected to Neo4j successfully")
    
    def setup_models(self):
        """Initialize LLM and embedding models"""
        # Set OpenAI API key
        os.environ["OPENAI_API_KEY"] = self.config.openai_api_key
        
        # Initialize LLM for graph transformation
        self.llm = ChatOpenAI(
            temperature=0,
            model_name=self.config.llm_model
        )
        
        # Initialize graph transformer
        self.graph_transformer = LLMGraphTransformer(
            llm=self.llm,
            allowed_nodes=self.config.allowed_nodes,
            allowed_relationships=self.config.allowed_relationships,
            node_properties=True,
            relationship_properties=True
        )
        
        # Initialize embeddings
        self.embedder = OpenAIEmbeddings(
            model=self.config.embedding_model
        )
        
        print("✓ Models initialized successfully")
    
    def load_pdf_with_docling(self, pdf_path: str) -> List[Document]:
        """Load and process PDF using Docling"""
        print(f"Loading PDF: {pdf_path}")
        
        # Method 1: Using DoclingLoader (LangChain integration)
        loader = DoclingLoader(
            file_path=pdf_path,
            chunker=HybridChunker(
                tokenizer=self.config.embedding_model
            )
        )
        documents = loader.load()
        
        # Method 2: Alternative using DocumentConverter directly
        # converter = DocumentConverter()
        # result = converter.convert(pdf_path)
        # markdown_text = result.document.export_to_markdown()
        # documents = [Document(page_content=markdown_text)]
        
        print(f"✓ Loaded {len(documents)} document chunks")
        return documents
    
    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split documents into smaller chunks for processing"""
        text_splitter = TokenTextSplitter(
            chunk_size=self.config.chunk_size,
            chunk_overlap=self.config.chunk_overlap
        )
        
        chunks = text_splitter.split_documents(documents)
        print(f"✓ Created {len(chunks)} text chunks")
        return chunks
    
    def extract_graph_from_chunks(self, chunks: List[Document]) -> List[Any]:
        """Extract graph documents from text chunks using LLM"""
        print("Extracting entities and relationships from text...")
        
        # Convert chunks to graph documents
        graph_documents = self.graph_transformer.convert_to_graph_documents(chunks)
        
        print(f"✓ Extracted {len(graph_documents)} graph documents")
        return graph_documents
    
    def store_in_neo4j(self, graph_documents: List[Any]):
        """Store extracted graph data in Neo4j"""
        print("Storing graph data in Neo4j...")
        
        # Initialize Neo4j graph
        graph = Neo4jGraph(
            url=self.config.neo4j_uri,
            username=self.config.neo4j_username,
            password=self.config.neo4j_password
        )
        
        # Store graph documents
        graph.add_graph_documents(
            graph_documents,
            baseEntityLabel=True,  # Add __Entity__ label to all nodes
            include_source=True    # Link nodes to source documents
        )
        
        print("✓ Graph data stored in Neo4j successfully")
        
        # Print graph schema
        print("\nGraph Schema:")
        print(graph.schema)
    
    def create_vector_index(self):
        """Create vector index for similarity search"""
        print("Creating vector index...")
        
        with self.driver.session() as session:
            # Create vector index on chunk embeddings
            session.run('''
                CREATE VECTOR INDEX chunk_embeddings IF NOT EXISTS
                FOR (c:Chunk)
                ON c.embedding
                OPTIONS {
                    indexConfig: {
                        `vector.dimensions`: 3072,
                        `vector.similarity_function`: 'cosine'
                    }
                }
            ''')
        
        print("✓ Vector index created")
    
    def build_knowledge_graph(self, pdf_path: str):
        """Main method to build knowledge graph from PDF"""
        print(f"\n🚀 Building Knowledge Graph from: {pdf_path}")
        print("=" * 60)
        
        try:
            # Step 1: Load PDF with Docling
            documents = self.load_pdf_with_docling(pdf_path)
            
            # Step 2: Chunk documents
            chunks = self.chunk_documents(documents)
            
            # Step 3: Extract graph structure
            graph_documents = self.extract_graph_from_chunks(chunks)
            
            # Step 4: Store in Neo4j
            self.store_in_neo4j(graph_documents)
            
            # Step 5: Create vector index
            self.create_vector_index()
            
            print("\n✅ Knowledge Graph built successfully!")
            
        except Exception as e:
            print(f"❌ Error building knowledge graph: {str(e)}")
            raise
    
    def close_connections(self):
        """Close database connections"""
        if hasattr(self, 'driver'):
            self.driver.close()
            print("✓ Database connections closed")

# 5. ALTERNATIVE APPROACH USING NEO4J GRAPHRAG PIPELINE
# ====================================================

class SimpleKGBuilder:
    def __init__(self, config: KnowledgeGraphConfig):
        self.config = config
        self.setup_pipeline()
    
    def setup_pipeline(self):
        """Setup Neo4j GraphRAG pipeline"""
        # Initialize database connection
        self.driver = GraphDatabase.driver(
            self.config.neo4j_uri,
            auth=(self.config.neo4j_username, self.config.neo4j_password)
        )
        
        # Initialize LLM
        llm = OpenAILLM(
            model_name=self.config.llm_model,
            model_params={
                "max_tokens": 2000,
                "response_format": {"type": "json_object"},
                "temperature": 0,
            }
        )
        
        # Initialize embeddings
        embedder = OpenAIEmbeddings(model=self.config.embedding_model)
        
        # Create pipeline
        self.kg_pipeline = SimpleKGPipeline(
            llm=llm,
            driver=self.driver,
            embedder=embedder,
            entities=self.config.allowed_nodes,
            relations=self.config.allowed_relationships,
            from_pdf=True,
            on_error="IGNORE"
        )
    
    def build_from_pdf(self, pdf_path: str):
        """Build knowledge graph using SimpleKGPipeline"""
        print(f"Building KG using SimpleKGPipeline from: {pdf_path}")
        
        # Run async pipeline
        asyncio.run(self.kg_pipeline.run_async(file_path=pdf_path))
        
        print("✅ Knowledge Graph built using SimpleKGPipeline!")
    
    def close(self):
        self.driver.close()

# 6. QUERYING THE KNOWLEDGE GRAPH
# ===============================

class KnowledgeGraphQuerier:
    def __init__(self, config: KnowledgeGraphConfig):
        self.config = config
        self.driver = GraphDatabase.driver(
            config.neo4j_uri,
            auth=(config.neo4j_username, config.neo4j_password)
        )
    
    def get_graph_statistics(self):
        """Get basic statistics about the knowledge graph"""
        with self.driver.session() as session:
            # Count nodes by label
            node_counts = session.run('''
                MATCH (n)
                RETURN labels(n) as labels, count(n) as count
                ORDER BY count DESC
            ''').data()
            
            # Count relationships by type
            rel_counts = session.run('''
                MATCH ()-[r]->()
                RETURN type(r) as relationship_type, count(r) as count
                ORDER BY count DESC
            ''').data()
            
            print("Node Counts:")
            for record in node_counts:
                print(f"  {record['labels']}: {record['count']}")
            
            print("\nRelationship Counts:")
            for record in rel_counts:
                print(f"  {record['relationship_type']}: {record['count']}")
    
    def find_similar_concepts(self, concept: str, limit: int = 5):
        """Find concepts similar to the given concept"""
        with self.driver.session() as session:
            result = session.run('''
                MATCH (n:Concept {name: $concept})-[r]-(connected)
                RETURN connected.name as related_concept, 
                       type(r) as relationship,
                       count(*) as strength
                ORDER BY strength DESC
                LIMIT $limit
            ''', concept=concept, limit=limit)
            
            return result.data()
    
    def close(self):
        self.driver.close()

# 7. ADVANCED FEATURES
# ===================

class AdvancedKGFeatures:
    @staticmethod
    def setup_custom_schema():
        """Example of setting up custom schema for specific domains"""
        # Medical domain schema
        medical_schema = {
            "allowed_nodes": [
                "Disease", "Symptom", "Treatment", "Medication", 
                "Patient", "Doctor", "Hospital", "Research"
            ],
            "allowed_relationships": [
                "HAS_SYMPTOM", "TREATED_WITH", "PRESCRIBED", 
                "DIAGNOSED_WITH", "WORKS_AT", "RESEARCHES"
            ]
        }
        
        # Legal domain schema
        legal_schema = {
            "allowed_nodes": [
                "Case", "Law", "Court", "Judge", "Lawyer", 
                "Client", "Document", "Precedent"
            ],
            "allowed_relationships": [
                "CITED_IN", "RULED_BY", "REPRESENTED_BY", 
                "APPEALS_TO", "REFERENCES", "GOVERNED_BY"
            ]
        }
        
        return {"medical": medical_schema, "legal": legal_schema}
    
    @staticmethod
    def batch_process_pdfs(pdf_directory: str, config: KnowledgeGraphConfig):
        """Process multiple PDFs in batch"""
        pdf_files = list(Path(pdf_directory).glob("*.pdf"))
        
        kg_builder = PDFKnowledgeGraphBuilder(config)
        
        for pdf_file in pdf_files:
            print(f"Processing: {pdf_file.name}")
            try:
                kg_builder.build_knowledge_graph(str(pdf_file))
            except Exception as e:
                print(f"Error processing {pdf_file.name}: {e}")
        
        kg_builder.close_connections()

# 8. USAGE EXAMPLE
# ================

def main():
    # Initialize configuration
    config = KnowledgeGraphConfig()
    
    # Update with your actual credentials
    config.neo4j_uri = "bolt://localhost:7687"  # or Neo4j Aura URI
    config.neo4j_username = "neo4j"
    config.neo4j_password = "your_neo4j_password"
    config.openai_api_key = "your_openai_api_key"
    
    # PDF file path
    pdf_path = "path/to/your/document.pdf"
    
    # Method 1: Using detailed approach with Docling + LangChain
    print("Method 1: Detailed approach")
    kg_builder = PDFKnowledgeGraphBuilder(config)
    kg_builder.build_knowledge_graph(pdf_path)
    kg_builder.close_connections()
    
    print("\n" + "="*60 + "\n")
    
    # Method 2: Using SimpleKGPipeline
    print("Method 2: SimpleKGPipeline approach")
    simple_builder = SimpleKGBuilder(config)
    simple_builder.build_from_pdf(pdf_path)
    simple_builder.close()
    
    # Query the knowledge graph
    print("\n" + "="*60 + "\n")
    print("Querying Knowledge Graph:")
    querier = KnowledgeGraphQuerier(config)
    querier.get_graph_statistics()
    querier.close()

if __name__ == "__main__":
    main()

In [2]:
web_doc_path = r"I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_production_web_documents.json"

web_doc_index_path =r"I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_document_index.json"

In [5]:
# New cell — Repack `documents` as {section: [docs...]} within a single file

from pathlib import Path
from urllib.parse import urlparse
from collections import defaultdict
import json, re

# Uses your existing paths
src_path = Path(web_doc_path)

# Choose whether to overwrite the original file or write a new sibling file
overwrite_existing = False
dest_path = src_path if overwrite_existing else src_path.with_name(src_path.stem + ".sectioned.json")

# Load source JSON
with open(src_path, "r", encoding="utf-8") as f:
    data = json.load(f)

documents = data.get("documents", [])
stats = data.get("statistics", {}) or {}
meta = data.get("metadata", {}) or {}
doc_count_total = len(documents)

def section_from_url(url: str) -> str:
    """First segment after '/about/' else first path segment; fallback 'unknown'."""
    if not url:
        return "unknown"
    parsed = urlparse(url)
    parts = [p for p in parsed.path.strip("/").split("/") if p]
    if not parts:
        return "unknown"
    try:
        i = parts.index("about")
        return parts[i + 1] if i + 1 < len(parts) else "unknown"
    except ValueError:
        return parts[0]

# Group full docs by section
section_to_docs = defaultdict(list)
skipped = 0
for doc in documents:
    url = doc.get("url") or ""
    if not url:
        skipped += 1
        continue
    sec = section_from_url(url)
    section_to_docs[sec].append({
        "url": doc.get("url"),
        "status": doc.get("status"),
        "metadata": doc.get("metadata"),
        "content": doc.get("content") or ""
    })

# Build new data object (single-file, sectioned)
section_counts = {k: len(v) for k, v in section_to_docs.items()}
sections_sorted = sorted(section_to_docs.keys())

# Prefer provided total_content_characters else compute
total_chars = stats.get("total_content_characters")
if total_chars is None:
    total_chars = sum(len((d.get("content") or "")) for docs in section_to_docs.values() for d in docs)

new_data = {
    "metadata": meta,
    "statistics": {
        **stats,
        "total_documents": doc_count_total,         # original total
        "documents_grouped": sum(section_counts.values()),  # after skipping any malformed rows
        "skipped_documents": skipped,
        "sections": sections_sorted,
        "section_counts": section_counts,
        "total_content_characters": total_chars
    },
    # Key change: documents is now a dict keyed by section
    "documents": section_to_docs
}

# Save as a single file
with open(dest_path, "w", encoding="utf-8") as f:
    json.dump(new_data, f, ensure_ascii=False, indent=2)

print(f"Wrote sectioned file: {dest_path}")
print(f"Sections: {len(section_to_docs)} | Docs total: {doc_count_total} | Grouped: {sum(section_counts.values())} | Skipped: {skipped}")

# Lightweight helpers to consume the single file by section
def load_section_from_file(path: str | Path, section: str):
    with open(path, "r", encoding="utf-8") as f:
        d = json.load(f)
    docs_by_sec = d.get("documents", {})
    if section not in docs_by_sec:
        raise KeyError(f"Section '{section}' not found. Available (sample): {list(docs_by_sec.keys())[:10]}")
    return docs_by_sec[section]


Wrote sectioned file: I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_production_web_documents.sectioned.json
Sections: 9 | Docs total: 235 | Grouped: 235 | Skipped: 0


In [6]:
web_doc_section_path = r"I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_production_web_documents.sectioned.json"

In [None]:

# Example:
# terms_docs = load_section_from_file(dest_path, "terms-conditions")
# print(len(terms_docs), "docs in terms-conditions")

### Processing PDFs

In [7]:
verizon_quaterly_pdf_path = r"I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\downloaded_verizon_financial_pdfs\downloaded_verizon_quarterly_pdfs"

In [10]:
# New cell — Build year/quarter PDF index and save to output

from pathlib import Path
import re, json
from collections import defaultdict

# Base directory from your notebook (already defined earlier)
base_quarter_dir = Path(verizon_quaterly_pdf_path)  # e.g., ...\downloaded_verizon_quarterly_pdfs
output_dir = Path(r"I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output")
output_dir.mkdir(parents=True, exist_ok=True)
quarterly_index_json_path = output_dir / "verizon_quarterly_pdfs_index.json"

# Helpers
YEAR_RE = re.compile(r"^\d{4}$")
Q_RE = re.compile(r"\bq(?:uarter)?\s*([1-4])\b", re.IGNORECASE)
Q_PATTERNS = [
    re.compile(r"\bq(?:uarter)?\s*([1-4])\b", re.IGNORECASE),  # Q1, Quarter 1
    re.compile(r"\b([1-4])\s*q(?:uarter)?\b", re.IGNORECASE),  # 1 Q, 1 Quarter
    re.compile(r"\b([1-4])q\b", re.IGNORECASE),                # 1Q
    re.compile(r"\bq([1-4])\b", re.IGNORECASE),                # Q1
]

def extract_quarter_key(name: str) -> str | None:
    n = name.lower()
    # Normalize common separators
    n = re.sub(r"[_\-]+", " ", n)
    for pat in Q_PATTERNS:
        m = pat.search(n)
        if m:
            num = m.group(1)
            return f"q{num}"
    return None

def classify_pdf(file_name: str) -> str:
    """
    Classify into 'text' (transcript), 'infographic' (presentation/deck/slides/infographic),
    else 'tables'.
    """
    n = file_name.lower()
    if "transcript" in n:
        return "text"
    if ("presentation" in n) or ("deck" in n) or ("slides" in n) or ("infographic" in n):
        return "infographic"
    return "tables"

def ensure_bucket() -> dict:
    return {"text": [], "infographic": [], "tables": []}

# Build index
index: dict[str, dict[str, dict[str, list[str]]]] = {}
skipped_year_dirs: list[str] = []
skipped_quarter_dirs: list[str] = []

if not base_quarter_dir.exists():
    raise FileNotFoundError(f"Quarterly base dir not found: {base_quarter_dir}")

for year_dir in sorted([p for p in base_quarter_dir.iterdir() if p.is_dir()], key=lambda p: p.name):
    if not YEAR_RE.match(year_dir.name):
        skipped_year_dirs.append(year_dir.name)
        continue
    year_key = year_dir.name
    year_entry: dict[str, dict[str, list[str]]] = {}
    # Accept any subfolder that yields a quarter key
    for q_dir in sorted([p for p in year_dir.iterdir() if p.is_dir()], key=lambda p: p.name):
        q_key = extract_quarter_key(q_dir.name)
        if not q_key:
            skipped_quarter_dirs.append(str(q_dir.relative_to(base_quarter_dir)))
            continue
        buckets = ensure_bucket()
        # Collect PDFs recursively inside the quarter folder
        for pdf_path in q_dir.rglob("*.pdf"):
            cat = classify_pdf(pdf_path.name)
            buckets[cat].append(str(pdf_path))
        # Only add quarter if there is at least one file or you want empty buckets always
        if any(buckets.values()):
            year_entry[q_key] = buckets
        else:
            # Still include empty quarter buckets for consistency; comment out if not desired
            year_entry[q_key] = buckets
    if year_entry:
        index[year_key] = year_entry

# Save JSON
with open(quarterly_index_json_path, "w", encoding="utf-8") as f:
    json.dump(index, f, ensure_ascii=False, indent=2)

# Summary
total_files = sum(len(files) for y in index.values() for q in y.values() for files in q.values())
quarters_count = sum(len(y) for y in index.values())
print(f"Wrote: {quarterly_index_json_path}")
print(f"Years: {len(index)} | Quarters: {quarters_count} | Total PDFs indexed: {total_files}")
if skipped_year_dirs:
    print("Skipped non-year dirs:", skipped_year_dirs[:10], "..." if len(skipped_year_dirs) > 10 else "")
if skipped_quarter_dirs:
    print("Skipped non-quarter dirs (sample):", skipped_quarter_dirs[:20], "..." if len(skipped_quarter_dirs) > 10 else "")

# Loader helpers
def load_quarter_index(path: str | Path = quarterly_index_json_path) -> dict:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def get_pdfs(index_dict: dict, year: str, quarter: str, category: str | None = None) -> list[str]:
    """
    quarter: 'q1'|'q2'|'q3'|'q4' (case-insensitive)
    category: None for all, or 'text'|'infographic'|'tables'
    """
    q = quarter.lower()
    year_map = index_dict.get(year, {})
    q_map = year_map.get(q, {})
    if category is None:
        # All categories
        return q_map.get("text", []) + q_map.get("infographic", []) + q_map.get("tables", [])
    return q_map.get(category, [])

Wrote: I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_quarterly_pdfs_index.json
Years: 9 | Quarters: 32 | Total PDFs indexed: 209


In [None]:
# Loader helpers
def load_quarter_index(path: str | Path = quarterly_index_json_path) -> dict:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def get_pdfs(index_dict: dict, year: str, quarter: str, category: str | None = None) -> list[str]:
    """
    quarter: 'q1'|'q2'|'q3'|'q4' (case-insensitive)
    category: None for all, or 'text'|'infographic'|'tables'
    """
    q = quarter.lower()
    year_map = index_dict.get(year, {})
    q_map = year_map.get(q, {})
    if category is None:
        # All categories
        return q_map.get("text", []) + q_map.get("infographic", []) + q_map.get("tables", [])
    return q_map.get(category, [])

In [11]:
idx = load_quarter_index()
get_pdfs(idx, "2019", "q1", "text")
get_pdfs(idx, "2019", "q1", "infographic")
get_pdfs(idx, "2019", "q1", "tables")

['I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2019\\1Q\\Download Financial statements PDF_b1cbfcb5.pdf',
 'I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2019\\1Q\\Download Financial & Operating information PDF_90b20eb4.pdf',
 'I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2019\\1Q\\Download Non-GAAP reconciliations PDF_1dc6bdd4.pdf']

## Extracting and Building knowledge graph

You know what, let's do this, first we'll extract the pdf using langchain-docling into a json, whole format will be:
{
"2025":{
    "q1:{
        "text":{
            1: {
                path: "file path",
                "content":{
                    "The extracted contect from langchain-docling in markdown format, 
                }
            }
        },
        "tables":{
            1: {
                path: "file path",
                "content":{
                    "The extracted contect from langchain-docling in markdown format, 
                }
            }
        }
        "infographic:{
            .....
        }
    },
    "q2: {
        ....
    },

}

"2024":{
    ....
}
}


In [14]:
q1_2025_tables = get_pdfs(idx, "2025", "q1", "tables")

In [15]:
q1_2025_tables

['I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2025\\1Q\\Download Financial statements PDF_5a9b6fa4.pdf',
 'I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2025\\1Q\\Download Financial & Operating information PDF_3dedf514.pdf',
 'I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2025\\1Q\\Download Non-GAAP reconciliations PDF_d419afb0.pdf']

In [17]:
q1_2025_tables[0]

'I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2025\\1Q\\Download Financial statements PDF_5a9b6fa4.pdf'

In [18]:
from langchain_docling import DoclingLoader

pdf_file_path = q1_2025_tables[0]
loader = DoclingLoader(file_path=pdf_file_path)
docs = loader.load()  # Each doc is a LangChain Document; text and table chunks included


Token indices sequence length is longer than the specified maximum sequence length for this model (2341 > 512). Running this sequence through the model will result in indexing errors


In [19]:

from langchain_docling.loader import ExportType
loader = DoclingLoader(
    file_path=pdf_file_path,
    export_type=ExportType.MARKDOWN  # Preserves tables as markdown in content
)
docs = loader.load()
with open("output.md", "w", encoding="utf-8") as f:
    for doc in docs:
        f.write(doc.page_content)

In [23]:
docs[0]

Document(metadata={'source': 'I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2025\\1Q\\Download Financial statements PDF_5a9b6fa4.pdf'}, page_content="## Verizon Communications Inc.\n\n1Q25 VZ Financial Statements\n\n## Condensed Consolidated Statements of Income\n\n|                                                          | (dollars in millions, except per share amounts)   | (dollars in millions, except per share amounts)   | (dollars in millions, except per share amounts)   |\n|----------------------------------------------------------|---------------------------------------------------|---------------------------------------------------|---------------------------------------------------|\n| Unaudited                                                | 3 Mos. Ended 3/31/25                              | 3 Mos. Ended 3/31/24                              | % Change                                     

In [35]:
# One-file aggregator for all years/quarters/categories
# Writes: I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_quaterly_pdfs_extracted.json

from pathlib import Path
import json
from typing import Dict, Any, List, Literal



from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType


from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings as docling_settings
from docling.document_converter import DocumentConverter, PdfFormatOption


# Optional header-based splitting
try:
    from langchain_text_splitters import MarkdownHeaderTextSplitter
except ImportError:
    MarkdownHeaderTextSplitter = None  # Use split_mode="none" if not installed

AGG_OUT_DIR = r"I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output"
AGG_OUT_PATH = Path(AGG_OUT_DIR) / "verizon_quaterly_pdfs_extracted.json"  # keep your exact filename

# Global converter configured with accelerator; when set, we use it
DOC_CONVERTER: DocumentConverter | None = None

def setup_docling_accelerator(
    device: AcceleratorDevice | str = AcceleratorDevice.CPU,  # or AcceleratorDevice.AUTO/CUDA/MPS
    num_threads: int = 8,
    do_ocr: bool = True,
    do_table_structure: bool = True,
    do_cell_matching: bool = True,
    profile_timings: bool = True,
) -> DocumentConverter:
    """
    Configure a DocumentConverter with accelerator options and set it globally.
    """
    global DOC_CONVERTER

    if isinstance(device, str):
        # Accept "CPU", "AUTO", "CUDA", "MPS"
        device = getattr(AcceleratorDevice, device.upper())

    accel = AcceleratorOptions(num_threads=num_threads, device=device)

    pipe = PdfPipelineOptions()
    pipe.accelerator_options = accel
    pipe.do_ocr = do_ocr
    pipe.do_table_structure = do_table_structure
    pipe.table_structure_options.do_cell_matching = do_cell_matching

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipe)
        }
    )

    # Optional profiling
    docling_settings.debug.profile_pipeline_timings = profile_timings

    DOC_CONVERTER = converter
    return converter


def extract_markdown_from_pdf(
    pdf_path: str,
    export_type: ExportType = ExportType.MARKDOWN,
    split_mode: Literal["none", "headers"] = "none",
) -> Dict[str, Any]:
    """
    Returns a JSON-serializable content dict:
      - split_mode='none'   -> {'markdown': '<full markdown>'}
      - split_mode='headers'-> {'splits': [{'headers': {...}, 'content': '...'}, ...]}
    """
    loader = DoclingLoader(file_path=pdf_path, export_type=export_type)
    docs = loader.load()  # list[Document]; page_content has markdown

    if split_mode == "none":
        full_md = "".join(doc.page_content or "" for doc in docs)
        return {"markdown": full_md}

    if split_mode == "headers":
        if MarkdownHeaderTextSplitter is None:
            raise RuntimeError("langchain-text-splitters not installed. Install it or use split_mode='none'.")
        splitter = MarkdownHeaderTextSplitter(
            headers_to_split_on=[
                ("#", "Header_1"),
                ("##", "Header_2"),
                ("###", "Header_3"),
            ]
        )
        splits: List[Dict[str, Any]] = []
        for doc in docs:
            parts = splitter.split_text(doc.page_content or "")
            # parts are LangChain Documents; make them JSON-serializable
            for part in parts:
                splits.append({
                    "headers": dict(part.metadata or {}),
                    "content": part.page_content or ""
                })
        return {"splits": splits}

    raise ValueError(f"Unexpected split_mode: {split_mode}")


# Reuse earlier extractor if present; otherwise define a minimal one here
try:
    extract_markdown_from_pdf  # type: ignore # noqa: F821
except NameError:
    def extract_markdown_from_pdf(
    pdf_path: str,
    export_type: ExportType = ExportType.MARKDOWN,
    split_mode: Literal["none", "headers"] = "none",
) -> Dict[str, Any]:
        """
        If a global DOC_CONVERTER is configured, use it for conversion (accelerated).
        Otherwise fall back to DoclingLoader(export_type=MARKDOWN).
        """
        # Prefer the accelerator-backed converter
        if DOC_CONVERTER is not None:
            result = DOC_CONVERTER.convert(pdf_path)
            markdown_text = result.document.export_to_markdown()

            if split_mode == "none":
                return {"markdown": markdown_text}

            if split_mode == "headers":
                if MarkdownHeaderTextSplitter is None:
                    return {"markdown": markdown_text, "note": "header splitter not installed"}
                splitter = MarkdownHeaderTextSplitter(
                    headers_to_split_on=[("#", "Header_1"), ("##", "Header_2"), ("###", "Header_3")]
                )
                parts = splitter.split_text(markdown_text)
                splits = [{**dict(part.metadata or {}), "content": part.page_content or ""} for part in parts]
                return {"splits": splits}

            raise ValueError(f"Unexpected split_mode: {split_mode}")

        # Fallback: original loader path
        loader = DoclingLoader(file_path=pdf_path, export_type=export_type)
        docs = loader.load()

        if split_mode == "none":
            full_md = "".join(doc.page_content or "" for doc in docs)
            return {"markdown": full_md}

        if split_mode == "headers":
            if MarkdownHeaderTextSplitter is None:
                full_md = "".join(doc.page_content or "" for doc in docs)
                return {"markdown": full_md, "note": "header splitter not installed"}
            splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=[("#", "Header_1"), ("##", "Header_2"), ("###", "Header_3")]
            )
            splits: List[Dict[str, Any]] = []
            for doc in docs:
                for part in splitter.split_text(doc.page_content or ""):
                    splits.append({**dict(part.metadata or {}), "content": part.page_content or ""})
            return {"splits": splits}

        raise ValueError(f"Unexpected split_mode: {split_mode}")


def _ensure_nested(d: Dict[str, Any], *keys: str) -> Dict[str, Any]:
    cur = d
    for k in keys:
        if k not in cur or not isinstance(cur[k], dict):
            cur[k] = {}
        cur = cur[k]
    return cur

def _next_index(bucket: Dict[str, Any]) -> int:
    # Find next numeric index for keys like "1", "2", ...
    numeric_keys = [int(k) for k in bucket.keys() if isinstance(k, str) and k.isdigit()]
    return (max(numeric_keys) + 1) if numeric_keys else 1



def _safe_load_json(path: Path) -> Dict[str, Any]:
    if not path.exists() or path.stat().st_size == 0:
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            txt = f.read().strip()
        if not txt:
            return {}
        return json.loads(txt)
    except Exception as e:
        # Backup the corrupted file and start fresh
        try:
            backup = path.with_suffix(path.suffix + ".bak")
            path.replace(backup)
            print(f"Backed up invalid JSON to: {backup}")
        except Exception:
            pass
        return {}



def upsert_extractions_for_list(
    pdf_paths: List[str],
    year: str,
    quarter_key: str,                 # e.g., "q1", "q2"
    category_key: str,                # "text" | "tables" | "infographic"
    split_mode: Literal["none", "headers"] = "none",
    export_type: ExportType = ExportType.MARKDOWN,
    agg_path: Path = AGG_OUT_PATH,
) -> Path:
    agg_path.parent.mkdir(parents=True, exist_ok=True)

    # Load current aggregator (or start new)
    agg: Dict[str, Any] = _safe_load_json(agg_path)

    # Ensure nested structure exists
    bucket = _ensure_nested(agg, year, quarter_key, category_key)
    start_idx = _next_index(bucket)

    idx = start_idx
    for p in pdf_paths:
        try:
            content = extract_markdown_from_pdf(p, export_type=export_type, split_mode=split_mode)
            bucket[str(idx)] = {
                "path": p,
                "content": content  # either {"markdown": "..."} or {"splits": [...]}
            }
            idx += 1
        except Exception as e:
            bucket[f"error_{idx}"] = {"path": p, "error": str(e)}
            idx += 1  # move on to the next index even if error

    # Atomic-ish write: write to temp then replace
    tmp = agg_path.with_suffix(agg_path.suffix + ".tmp")
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(agg, f, ensure_ascii=False, indent=2)
    tmp.replace(agg_path)

    print(f"Upserted {idx - start_idx} item(s) under {year} -> {quarter_key} -> {category_key}")
    print(f"Wrote: {agg_path}")
    return agg_path

In [36]:
# Optional: inspect current file state
print(AGG_OUT_PATH, AGG_OUT_PATH.exists(), (AGG_OUT_PATH.stat().st_size if AGG_OUT_PATH.exists() else None))

# Configure accelerator: CPU, 5 threads (safe default on Windows)
setup_docling_accelerator(device=AcceleratorDevice.CPU, num_threads=5)

# Optional: see converter timings later in result.timings if you inspect via DOC_CONVERTER.convert(...)
print("Accelerator ready:", DOC_CONVERTER is not None)

# Uses your already defined q1_2025_tables list
upsert_extractions_for_list(
    pdf_paths=q1_2025_tables,
    year="2025",
    quarter_key="q1",
    category_key="tables",        # or "text"/"infographic" for other buckets
    split_mode="headers",            # use "headers" to split by markdown headers
    export_type=ExportType.MARKDOWN,
)

I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_quaterly_pdfs_extracted.json True 0
Accelerator ready: True




Upserted 3 item(s) under 2025 -> q1 -> tables
Wrote: I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_quaterly_pdfs_extracted.json


WindowsPath('I:/My Drive/M. Tech AI ML/AIML SEM 4/Dissertation/Project/output/verizon_quaterly_pdfs_extracted.json')

In [37]:
upsert_extractions_for_list(
    pdf_paths=get_pdfs(idx, "2025", "q1", "text"),
    year="2025",
    quarter_key="q1",
    category_key="text",        # or "text"/"infographic" for other buckets
    split_mode="headers",            # use "headers" to split by markdown headers
    export_type=ExportType.MARKDOWN,
)



Upserted 2 item(s) under 2025 -> q1 -> text
Wrote: I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_quaterly_pdfs_extracted.json


WindowsPath('I:/My Drive/M. Tech AI ML/AIML SEM 4/Dissertation/Project/output/verizon_quaterly_pdfs_extracted.json')

In [39]:
upsert_extractions_for_list(
    pdf_paths=get_pdfs(idx, "2025", "q2", "tables"),
    year="2025",
    quarter_key="q2",
    category_key="tables",        # or "text"/"infographic" for other buckets
    split_mode="headers",            # use "headers" to split by markdown headers
    export_type=ExportType.MARKDOWN,
)



Upserted 3 item(s) under 2025 -> q2 -> tables
Wrote: I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_quaterly_pdfs_extracted.json


WindowsPath('I:/My Drive/M. Tech AI ML/AIML SEM 4/Dissertation/Project/output/verizon_quaterly_pdfs_extracted.json')

In [40]:
upsert_extractions_for_list(
    pdf_paths=get_pdfs(idx, "2025", "q2", "text"),
    year="2025",
    quarter_key="q2",
    category_key="text",        # or "text"/"infographic" for other buckets
    split_mode="headers",            # use "headers" to split by markdown headers
    export_type=ExportType.MARKDOWN,
)



Upserted 1 item(s) under 2025 -> q2 -> text
Wrote: I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_quaterly_pdfs_extracted.json


WindowsPath('I:/My Drive/M. Tech AI ML/AIML SEM 4/Dissertation/Project/output/verizon_quaterly_pdfs_extracted.json')

In [None]:
# upsert_extractions_for_list(
#     pdf_paths=get_pdfs(idx, "2024", "q1", "tables"),
#     year="2024",
#     quarter_key="q1",
#     category_key="tables",        # or "text"/"infographic" for other buckets
#     split_mode="headers",            # use "headers" to split by markdown headers
#     export_type=ExportType.MARKDOWN,
# )

In [None]:
# upsert_extractions_for_list(
#     pdf_paths=get_pdfs(idx, "2024", "q2", "tables"),
#     year="2024",
#     quarter_key="q2",
#     category_key="tables",        # or "text"/"infographic" for other buckets
#     split_mode="headers",            # use "headers" to split by markdown headers
#     export_type=ExportType.MARKDOWN,
# )

In [None]:
# upsert_extractions_for_list(
#     pdf_paths=get_pdfs(idx, "2024", "q3", "tables"),
#     year="2024",
#     quarter_key="q3",
#     category_key="tables",        # or "text"/"infographic" for other buckets
#     split_mode="headers",            # use "headers" to split by markdown headers
#     export_type=ExportType.MARKDOWN,
# )

In [None]:
# upsert_extractions_for_list(
#     pdf_paths=get_pdfs(idx, "2024", "q4", "tables"),
#     year="2024",
#     quarter_key="q4",
#     category_key="tables",        # or "text"/"infographic" for other buckets
#     split_mode="headers",            # use "headers" to split by markdown headers
#     export_type=ExportType.MARKDOWN,
# )

In [None]:
# upsert_extractions_for_list(
#     pdf_paths=get_pdfs(idx, "2024", "q1", "text"),
#     year="2024",
#     quarter_key="q1",
#     category_key="text",        # or "text"/"infographic" for other buckets
#     split_mode="headers",            # use "headers" to split by markdown headers
#     export_type=ExportType.MARKDOWN,
# )

In [None]:
# upsert_extractions_for_list(
#     pdf_paths=get_pdfs(idx, "2024", "q2", "text"),
#     year="2024",
#     quarter_key="q2",
#     category_key="text",        # or "text"/"infographic" for other buckets
#     split_mode="headers",            # use "headers" to split by markdown headers
#     export_type=ExportType.MARKDOWN,
# )

In [None]:
# upsert_extractions_for_list(
#     pdf_paths=get_pdfs(idx, "2024", "q3", "text"),
#     year="2024",
#     quarter_key="q3",
#     category_key="text",        # or "text"/"infographic" for other buckets
#     split_mode="headers",            # use "headers" to split by markdown headers
#     export_type=ExportType.MARKDOWN,
# )

In [None]:
# upsert_extractions_for_list(
#     pdf_paths=get_pdfs(idx, "2024", "q4", "text"),
#     year="2024",
#     quarter_key="q4",
#     category_key="text",        # or "text"/"infographic" for other buckets
#     split_mode="headers",            # use "headers" to split by markdown headers
#     export_type=ExportType.MARKDOWN,
# )

In [None]:
terms_docs = load_section_from_file(dest_path, "terms-conditions")
print(len(terms_docs), "docs in terms-conditions")

In [41]:
from pathlib import Path
import json

web_doc_section_path = r"I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_production_web_documents.sectioned.json"

# 1) Load sectioned JSON
p = Path(web_doc_section_path)
with open(p, "r", encoding="utf-8") as f:
    web_data = json.load(f)

docs_by_sec = web_data.get("documents", {})

# 2) Print available sections and counts
sections = sorted(docs_by_sec.keys())
print("Available sections:", ", ".join(sections))
print("Counts per section:")
for sec in sections:
    print(f"  - {sec}: {len(docs_by_sec.get(sec, []))}")


Available sections: about, consumer-safety, international, investors, our-company, parenting, privacy, responsibility, terms-conditions
Counts per section:
  - about: 3
  - consumer-safety: 5
  - international: 29
  - investors: 39
  - our-company: 18
  - parenting: 44
  - privacy: 12
  - responsibility: 39
  - terms-conditions: 46


In [42]:

# 3) Choose sections to filter and how many from each
# NOTE: key is 'terms-conditions' (not 'terms-and conditions')
target_sections = ["our-company", "investors", "terms-conditions"]
k = 4  # number of docs per section

# 4) Build filtered dictionary: top-k by content length for each section
filtered_web_samples = {}
for sec in target_sections:
    docs = docs_by_sec.get(sec, []) or []
    # Enrich with content length for sorting
    enriched = []
    for i, d in enumerate(docs, 1):
        content = d.get("content") or ""
        enriched.append({
            "index_in_section": i,
            "url": d.get("url"),
            "status": d.get("status"),
            "metadata": d.get("metadata"),
            "content_length": len(content),
            "content": content,
        })
    # Sort by length desc and take top-k
    topk = sorted(enriched, key=lambda x: x["content_length"], reverse=True)[:k]
    filtered_web_samples[sec] = topk

# 5) Brief summary
total_selected = sum(len(v) for v in filtered_web_samples.values())
print(f"\nSelected {total_selected} docs across sections:")
for sec in target_sections:
    picked = filtered_web_samples.get(sec, [])
    max_len = picked[0]["content_length"] if picked else 0
    print(f"  - {sec}: {len(picked)} picked (max_len={max_len})")


Selected 12 docs across sections:
  - our-company: 4 picked (max_len=11063)
  - investors: 4 picked (max_len=5153)
  - terms-conditions: 4 picked (max_len=144087)


In [43]:
filtered_web_samples

{'our-company': [{'index_in_section': 16,
   'url': 'https://www.verizon.com/about/our-company/retiree-information',
   'status': 'success',
   'metadata': {'type': 'html',
    'extraction_method': 'trafilatura',
    'content_length': 11063,
    'processing_timestamp': '2025-07-20 21:04:52',
    'agent': 'SitemapDataIngestionAgent'},
   'content_length': 11063,
   'content': 'Retiree\n\ninformation\n\ninformation\n\n## Here’s everything you need to know, all in one place.\n\nThis page provides former Verizon employees with information about finances, employment verification, retiree discounts, health and life events and more.\n\nA “retiree” is a former Verizon employee who left Verizon after having attained the relevant milestone for retirement eligibility.\n\n\n## Health and life events\n\nWe know that comprehensive, quality and affordable healthcare is important to you. Learn about resources available to you through BenefitsConnection, Anytime Enrollment and WellConnect.\n\nAll the i

In [45]:
# Given path (relative to your notebook cwd); change to absolute if you prefer
quaterly_pdf_extracted_path = r"output\verizon_quaterly_pdfs_extracted.json"

p = Path(quaterly_pdf_extracted_path)
if not p.is_absolute():
    p = (Path.cwd() / p).resolve()

# Safe load into a variable
if p.exists() and p.stat().st_size > 0:
    with open(p, "r", encoding="utf-8") as f:
        txt = f.read().strip()
    quaterly_pdf_extracted = json.loads(txt) if txt else {}
else:
    quaterly_pdf_extracted = {}

# Quick peek
print("Loaded file:", p)
print("Top-level keys (sample):", list(quaterly_pdf_extracted.keys())[:5])

Loaded file: I:\My Drive\M. Tech AI ML\AIML SEM 4\Dissertation\Project\output\verizon_quaterly_pdfs_extracted.json
Top-level keys (sample): ['2025']


In [46]:
quaterly_pdf_extracted

{'2025': {'q1': {'tables': {'1': {'path': 'I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2025\\1Q\\Download Financial statements PDF_5a9b6fa4.pdf',
     'content': {'splits': [{'headers': {'Header_2': 'Verizon Communications Inc.'},
        'content': '1Q25 VZ Financial Statements'},
       {'headers': {'Header_2': 'Condensed Consolidated Statements of Income'},
        'content': '|                                                          | (dollars in millions, except per share amounts)   | (dollars in millions, except per share amounts)   | (dollars in millions, except per share amounts)   |\n|----------------------------------------------------------|---------------------------------------------------|---------------------------------------------------|---------------------------------------------------|\n| Unaudited                                                | 3 Mos. Ended 3/31/25          

In [47]:
# Chunk web samples with RecursiveCharacterTextSplitter (optionally preceded by MarkdownHeaderTextSplitter)

from langchain.text_splitter import RecursiveCharacterTextSplitter
try:
    from langchain_text_splitters import MarkdownHeaderTextSplitter
    HAVE_HDR = True
except Exception:
    HAVE_HDR = False
    MarkdownHeaderTextSplitter = None

from langchain_core.documents import Document

# Config
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 200
USE_HYBRID = True  # True = headers first (if available), then recursive; False = recursive only

# Recursive splitter (tune separators to favor Markdown structure when present)
rec_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    separators=["\n### ", "\n## ", "\n# ", "\n\n", "\n", " ", ""],
)

hdr_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header_1"), ("##", "Header_2"), ("###", "Header_3")]
) if (USE_HYBRID and HAVE_HDR) else None

assert isinstance(filtered_web_samples, dict), "filtered_web_samples must be defined."

web_chunk_docs = []
web_chunk_stats = {}

for section, items in filtered_web_samples.items():
    section_chunk_count = 0
    for item in (items or []):
        text = (item.get("content") or "").strip()
        if not text:
            continue

        base_meta = {
            "source_type": "web",
            "section": section,
            "source_url": item.get("url"),
            "status": item.get("status"),
            "index_in_section": item.get("index_in_section"),
            "original_content_length": item.get("content_length", 0),
            "chunk_strategy": f"{'headers+' if hdr_splitter else ''}recursive[{CHUNK_SIZE},{CHUNK_OVERLAP}]",
        }

        segments = [text]
        if hdr_splitter:
            parts = hdr_splitter.split_text(text)
            segments = [p.page_content or "" for p in parts if (p.page_content or "").strip()]

        # Apply recursive splitter per segment, carrying metadata through
        per_item_chunks = []
        for seg in segments:
            docs = rec_splitter.create_documents([seg], metadatas=[base_meta])
            per_item_chunks.extend(docs)

        # Add chunk indices
        total = len(per_item_chunks)
        for i, d in enumerate(per_item_chunks, start=1):
            d.metadata["chunk_index"] = i
            d.metadata["chunk_total_for_item"] = total

        web_chunk_docs.extend(per_item_chunks)
        section_chunk_count += total

    web_chunk_stats[section] = section_chunk_count

# Summary
print("Chunked web samples (recursive):")
for sec, cnt in web_chunk_stats.items():
    print(f"  - {sec}: {cnt} chunks")
print("Total chunks:", len(web_chunk_docs))
if web_chunk_docs:
    print("\nPreview:", {k: web_chunk_docs[0].metadata[k] for k in ["section","source_url","chunk_index","chunk_total_for_item"]})
    print(web_chunk_docs[0].page_content[:250], "...")

Chunked web samples (recursive):
  - our-company: 41 chunks
  - investors: 25 chunks
  - terms-conditions: 428 chunks
Total chunks: 494

Preview: {'section': 'our-company', 'source_url': 'https://www.verizon.com/about/our-company/retiree-information', 'chunk_index': 1, 'chunk_total_for_item': 14}
Retiree  
information  
information ...


In [48]:
web_chunk_docs

[Document(metadata={'source_type': 'web', 'section': 'our-company', 'source_url': 'https://www.verizon.com/about/our-company/retiree-information', 'status': 'success', 'index_in_section': 16, 'original_content_length': 11063, 'chunk_strategy': 'headers+recursive[1200,200]', 'chunk_index': 1, 'chunk_total_for_item': 14}, page_content='Retiree  \ninformation  \ninformation'),
 Document(metadata={'source_type': 'web', 'section': 'our-company', 'source_url': 'https://www.verizon.com/about/our-company/retiree-information', 'status': 'success', 'index_in_section': 16, 'original_content_length': 11063, 'chunk_strategy': 'headers+recursive[1200,200]', 'chunk_index': 2, 'chunk_total_for_item': 14}, page_content='This page provides former Verizon employees with information about finances, employment verification, retiree discounts, health and life events and more.  \nA “retiree” is a former Verizon employee who left Verizon after having attained the relevant milestone for retirement eligibility.

In [49]:
# Chunk quarterly PDF extractions by preserving Docling "splits"
# Assumes: `quaterly_pdf_extracted` dict is already loaded, like:
# quaterly_pdf_extracted = json.load(open(quaterly_pdf_extracted_path, "r", encoding="utf-8"))

from langchain_core.documents import Document

pdf_chunk_docs = []      # list[Document]
pdf_chunk_stats = {}     # nested counts per year -> quarter -> category

assert isinstance(quaterly_pdf_extracted, dict), "Load quaterly_pdf_extracted before running."

for year, quarters in quaterly_pdf_extracted.items():
    for quarter_key, categories in (quarters or {}).items():
        for category_key, items_dict in (categories or {}).items():
            bucket_count = 0
            # items_dict is like {"1": {"path": "...", "content": {"splits": [...]}}}
            for doc_idx_key, payload in (items_dict or {}).items():
                src_path = payload.get("path")
                content = payload.get("content") or {}

                # Prefer preserving structure when splits are available
                splits = content.get("splits")
                if isinstance(splits, list):
                    for split_i, split_item in enumerate(splits, start=1):
                        text = (split_item.get("content") or "").strip()
                        if not text:
                            continue
                        headers = split_item.get("headers") or {}
                        pdf_chunk_docs.append(
                            Document(
                                page_content=text,
                                metadata={
                                    "source_type": "pdf",
                                    "year": year,
                                    "quarter": quarter_key,
                                    "category": category_key,   # tables | text | infographic
                                    "source_path": src_path,
                                    "doc_index": doc_idx_key,   # "1", "2", ...
                                    "split_index": split_i,
                                    "headers": headers,         # keep original header context
                                    "chunk_strategy": "docling-headers-split",  # no further splitting
                                },
                            )
                        )
                        bucket_count += 1

                # If no splits (e.g., markdown only), keep a single doc per item
                elif isinstance(content.get("markdown"), str):
                    md = content["markdown"].strip()
                    if md:
                        pdf_chunk_docs.append(
                            Document(
                                page_content=md,
                                metadata={
                                    "source_type": "pdf",
                                    "year": year,
                                    "quarter": quarter_key,
                                    "category": category_key,
                                    "source_path": src_path,
                                    "doc_index": doc_idx_key,
                                    "split_index": 1,
                                    "headers": {},
                                    "chunk_strategy": "docling-markdown",
                                },
                            )
                        )
                        bucket_count += 1

            # record stats
            pdf_chunk_stats.setdefault(year, {}).setdefault(quarter_key, {})[category_key] = bucket_count

# Summary
total_chunks = len(pdf_chunk_docs)
print("Quarterly PDF chunks summary:")
for y, qmap in pdf_chunk_stats.items():
    for q, cmap in qmap.items():
        counts = ", ".join(f"{cat}:{cnt}" for cat, cnt in cmap.items())
        print(f"  - {y} {q} -> {counts}")
print("Total PDF chunks:", total_chunks)

# Peek
if pdf_chunk_docs:
    d0 = pdf_chunk_docs[0]
    print("\nPreview meta:", {k: d0.metadata[k] for k in ["year","quarter","category","doc_index","split_index"]})
    print("Preview headers:", d0.metadata.get("headers"))
    print("Preview content:\n", d0.page_content[:400], "...")

Quarterly PDF chunks summary:
  - 2025 q1 -> tables:67, text:64
  - 2025 q2 -> tables:72, text:59
Total PDF chunks: 262

Preview meta: {'year': '2025', 'quarter': 'q1', 'category': 'tables', 'doc_index': '1', 'split_index': 1}
Preview headers: {'Header_2': 'Verizon Communications Inc.'}
Preview content:
 1Q25 VZ Financial Statements ...


In [50]:
pdf_chunk_docs

[Document(metadata={'source_type': 'pdf', 'year': '2025', 'quarter': 'q1', 'category': 'tables', 'source_path': 'I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2025\\1Q\\Download Financial statements PDF_5a9b6fa4.pdf', 'doc_index': '1', 'split_index': 1, 'headers': {'Header_2': 'Verizon Communications Inc.'}, 'chunk_strategy': 'docling-headers-split'}, page_content='1Q25 VZ Financial Statements'),
 Document(metadata={'source_type': 'pdf', 'year': '2025', 'quarter': 'q1', 'category': 'tables', 'source_path': 'I:\\My Drive\\M. Tech AI ML\\AIML SEM 4\\Dissertation\\Project\\downloaded_verizon_financial_pdfs\\downloaded_verizon_quarterly_pdfs\\2025\\1Q\\Download Financial statements PDF_5a9b6fa4.pdf', 'doc_index': '1', 'split_index': 2, 'headers': {'Header_2': 'Condensed Consolidated Statements of Income'}, 'chunk_strategy': 'docling-headers-split'}, page_content='|                                       

In [58]:
import os, math, time
from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_neo4j import Neo4jGraph
from langchain.vectorstores import Neo4jVector

# 0) Env/config (match app_streamlit.py conventions)
load_dotenv()  # expects GOOGLE_API_KEY, NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD
GOOGLE_CHAT_MODEL  = os.getenv("GOOGLE_CHAT_MODEL", "gemini-2.5-flash-lite")
GOOGLE_EMBED_MODEL = os.getenv("GOOGLE_EMBED_MODEL", "models/gemini-embedding-001")

NEO4J_URI      = os.getenv("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER     = os.getenv("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "neo4j_verizon")

import os, json, hashlib
from collections import Counter

os.makedirs("output", exist_ok=True)
KG_JSONL_PATH = "output/kg_batches.jsonl"
KG_JSON_PATH  = "output/knowledge_graph.json"  # final deduped snapshot


def _is_primitive(x):
    return x is None or isinstance(x, (str, int, float, bool))

def _sanitize_value(x):
    # Neo4j allows only primitives or arrays of primitives as property values
    if _is_primitive(x):
        return x
    if isinstance(x, (list, tuple, set)):
        return [_sanitize_value(v) for v in x]
    if isinstance(x, dict):
        # Flatten nested maps to a JSON string
        return json.dumps({str(k): _sanitize_value(v) for k, v in x.items()}, ensure_ascii=False)
    # Node-like objects or anything else → stringify
    if hasattr(x, "id"):
        try:
            return str(getattr(x, "id"))
        except Exception:
            pass
    return str(x)

def _sanitize_props(props: dict | None):
    props = props or {}
    return {str(k): _sanitize_value(v) for k, v in props.items()}

def sanitize_graph_documents(gdocs):
    # Mutates in-place
    for gd in gdocs or []:
        for n in getattr(gd, "nodes", []) or []:
            n.properties = _sanitize_props(getattr(n, "properties", None))
        for r in getattr(gd, "relationships", []) or []:
            r.properties = _sanitize_props(getattr(r, "properties", None))
    return gdocs

def _to_json_primitive(x):
    # Pass through JSON-native types
    if x is None or isinstance(x, (str, int, float, bool)):
        return x
    # Sequences
    if isinstance(x, (list, tuple, set)):
        return [_to_json_primitive(v) for v in x]
    # Mappings
    if isinstance(x, dict):
        return {str(k): _to_json_primitive(v) for k, v in x.items()}
    # Node-like objects with id
    if hasattr(x, "id"):
        try:
            return _to_json_primitive(getattr(x, "id"))
        except Exception:
            pass
    # Fallback: stringify
    return str(x)

def _node_to_dict(n):
    props = getattr(n, "properties", None) or {}
    nid = getattr(n, "id", None) or props.get("id") or props.get("name")
    ntype = getattr(n, "type", None) or props.get("type") or "Entity"
    return {
        "id": _to_json_primitive(nid),
        "type": _to_json_primitive(ntype),
        "properties": _to_json_primitive(props),
    }

def _rel_to_dict(r):
    props = getattr(r, "properties", None) or {}
    src = getattr(r, "source", None)
    tgt = getattr(r, "target", None)

    # If endpoints are Node-like, pick their ids; otherwise stringify safely
    if hasattr(src, "id"):
        src = getattr(src, "id")
    if hasattr(tgt, "id"):
        tgt = getattr(tgt, "id")

    return {
        "source": _to_json_primitive(src),
        "type": _to_json_primitive(getattr(r, "type", None) or "RELATED_TO"),
        "target": _to_json_primitive(tgt),
        "properties": _to_json_primitive(props),
    }

def export_graph_documents_batch_jsonl(graph_docs, jsonl_path="output/kg_batches.jsonl"):
    os.makedirs(os.path.dirname(jsonl_path), exist_ok=True)
    with open(jsonl_path, "a", encoding="utf-8") as f:
        for gd in graph_docs:
            doc_nodes = [_node_to_dict(n) for n in (getattr(gd, "nodes", []) or [])]
            doc_rels  = [_rel_to_dict(r) for r in (getattr(gd, "relationships", []) or [])]
            f.write(json.dumps({"nodes": doc_nodes, "relationships": doc_rels}, ensure_ascii=False) + "\n")

def build_dedup_kg_from_jsonl(jsonl_path="output/kg_batches.jsonl", out_path="output/knowledge_graph.json"):
    nodes_map = {}  # (type,id) -> node
    rels_seen = set()
    rels = []

    if not os.path.exists(jsonl_path):
        print(f"No JSONL found at {jsonl_path}")
        return

    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            for nd in rec.get("nodes", []):
                key = (nd.get("type"), nd.get("id"))
                if key not in nodes_map:
                    nodes_map[key] = {
                        "id": nd.get("id"),
                        "type": nd.get("type"),
                        "properties": dict(nd.get("properties") or {}),
                    }
                else:
                    nodes_map[key]["properties"].update(nd.get("properties") or {})
            for rd in rec.get("relationships", []):
                rk = (rd.get("source"), rd.get("type"), rd.get("target"), tuple(sorted((rd.get("properties") or {}).items())))
                if rk not in rels_seen:
                    rels_seen.add(rk)
                    rels.append(rd)

    kg = {
        "nodes": list(nodes_map.values()),
        "relationships": rels,
        "schema": {
            "node_types": sorted({n["type"] for n in nodes_map.values()}),
            "relationship_types": dict(Counter(r["type"] for r in rels)),
        },
    }
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(kg, f, ensure_ascii=False, indent=2)
    print(f"Wrote dedup KG JSON: {out_path} (nodes={len(kg['nodes'])}, rels={len(kg['relationships'])})")


# Progress tracking + resume
import json, hashlib

PROCESSED_IDS_PATH = "output/processed_doc_ids.json"

def stable_doc_id(doc: Document) -> str:
    h = hashlib.sha1()
    h.update((doc.page_content or "").encode("utf-8", errors="ignore"))
    h.update(json.dumps(doc.metadata or {}, sort_keys=True, ensure_ascii=False).encode("utf-8", errors="ignore"))
    return h.hexdigest()

def load_processed_ids(path: str = PROCESSED_IDS_PATH) -> set[str]:
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
            return set(data if isinstance(data, list) else [])
    except FileNotFoundError:
        return set()
    except Exception:
        # Backup corrupted file and start fresh
        try:
            os.replace(path, path + ".bak")
        except Exception:
            pass
        return set()

def save_processed_ids(ids: set[str], path: str = PROCESSED_IDS_PATH):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    tmp = path + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(sorted(ids), f, ensure_ascii=False, indent=2)
    os.replace(tmp, path)

# Build (id, doc) pairs and compute pending set
ALL_PAIRS = [(stable_doc_id(d), d) for d in all_docs]
ALL_IDS = [pid for pid, _ in ALL_PAIRS]
processed_ids = load_processed_ids()

already = sum(1 for pid in ALL_IDS if pid in processed_ids)
pending_pairs = [(pid, d) for pid, d in ALL_PAIRS if pid not in processed_ids]

print(f"Total docs: {len(all_docs)}")
print(f"Already processed: {already}")
print(f"Pending this run: {len(pending_pairs)}")

def batched_pairs(pairs, size):
    for i in range(0, len(pairs), size):
        yield pairs[i:i+size]

# 1) Merge docs
all_docs: list[Document] = []
if "web_chunk_docs" in globals() and isinstance(web_chunk_docs, list):
    all_docs += web_chunk_docs
if "pdf_chunk_docs" in globals() and isinstance(pdf_chunk_docs, list):
    all_docs += pdf_chunk_docs

print("Docs to ingest:", len(pending_pairs), "of", len(all_docs))
assert all_docs, "No docs found. Build web_chunk_docs/pdf_chunk_docs first."

# 2) LLM for auto KG; no allowed_nodes/relationships → let LLM infer
llm = ChatGoogleGenerativeAI(model=GOOGLE_CHAT_MODEL, temperature=0)
graph_transformer = LLMGraphTransformer(
    llm=llm,
    node_properties=True,
    relationship_properties=True,
)

# Optional: batch conversion to avoid huge prompts
def batched(seq, size):
    for i in range(0, len(seq), size):
        yield seq[i:i+size]

# KG ingestion with resume + progress
graph_docs_total = 0
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

BATCH_SIZE = 50  # tune if you hit rate/size limits
for i, pair_batch in enumerate(batched_pairs(pending_pairs, BATCH_SIZE), start=1):
    batch_ids  = [pid for pid, _ in pair_batch]
    batch_docs = [d for _, d in pair_batch]

    print(f"[KG] Converting batch {i} ({len(batch_docs)} docs)…")
    gdocs = graph_transformer.convert_to_graph_documents(batch_docs)

    # Sanitize to ensure Neo4j-friendly properties
    sanitize_graph_documents(gdocs)

    print(f"[KG] Pushing {len(gdocs)} graph docs…")
    graph.add_graph_documents(
        gdocs,
        baseEntityLabel=True,
        include_source=False
    )

    # Only after successful push, append to JSONL and mark processed
    export_graph_documents_batch_jsonl(gdocs, jsonl_path=KG_JSONL_PATH)

    processed_ids.update(batch_ids)
    save_processed_ids(processed_ids)

    graph_docs_total += len(gdocs)
    print(f"[KG] Progress: {len(processed_ids)}/{len(all_docs)} processed")

print(f"KG ingestion complete. Graph documents pushed: {graph_docs_total}")

# Final dedup snapshot
build_dedup_kg_from_jsonl(jsonl_path=KG_JSONL_PATH, out_path=KG_JSON_PATH)


Total docs: 756
Already processed: 600
Pending this run: 156
Docs to ingest: 156 of 756
[KG] Converting batch 1 (50 docs)…
[KG] Converting batch 1 (50 docs)…
[KG] Pushing 50 graph docs…
[KG] Pushing 50 graph docs…
[KG] Progress: 650/756 processed
[KG] Converting batch 2 (50 docs)…
[KG] Progress: 650/756 processed
[KG] Converting batch 2 (50 docs)…
[KG] Pushing 50 graph docs…
[KG] Pushing 50 graph docs…
[KG] Progress: 700/756 processed
[KG] Converting batch 3 (50 docs)…
[KG] Progress: 700/756 processed
[KG] Converting batch 3 (50 docs)…
[KG] Pushing 50 graph docs…
[KG] Pushing 50 graph docs…
[KG] Progress: 750/756 processed
[KG] Converting batch 4 (6 docs)…
[KG] Progress: 750/756 processed
[KG] Converting batch 4 (6 docs)…
[KG] Pushing 6 graph docs…
[KG] Pushing 6 graph docs…
[KG] Progress: 756/756 processed
KG ingestion complete. Graph documents pushed: 156
[KG] Progress: 756/756 processed
KG ingestion complete. Graph documents pushed: 156
Wrote dedup KG JSON: output/knowledge_graph.js

In [57]:
# Run after you have all_docs and your GOOGLE_EMBED_MODEL
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Reuse your embedder or re-init
embedder = GoogleGenerativeAIEmbeddings(model=GOOGLE_EMBED_MODEL)

# Use langchain-chroma if present, else community fallback
try:
    from langchain_chroma import Chroma
except Exception:
    from langchain_community.vectorstores import Chroma  # fallback

import hashlib, json

# Stable IDs to avoid duplicates across runs
def doc_id(d):
    h = hashlib.sha1()
    h.update((d.page_content or "").encode("utf-8", errors="ignore"))
    h.update(json.dumps(d.metadata or {}, sort_keys=True, ensure_ascii=False).encode("utf-8", errors="ignore"))
    return h.hexdigest()

DOC_IDS = [doc_id(d) for d in all_docs]

CHROMA_PERSIST_DIR = "vector_db"
CHROMA_COLLECTION  = "verizon_chunks_gemini"

# Optional: start clean (uncomment to drop the collection)
# import chromadb
# chroma_client = chromadb.PersistentClient(path=CHROMA_PERSIST_DIR)
# try:
#     chroma_client.delete_collection(CHROMA_COLLECTION)
#     print(f"Dropped existing collection: {CHROMA_COLLECTION}")
# except Exception:
#     pass

print("Creating/upserting vector index in Chroma…")
chroma_vs = Chroma(
    collection_name=CHROMA_COLLECTION,
    embedding_function=embedder,
    persist_directory=CHROMA_PERSIST_DIR,
)

# Add documents; if you re-run with same IDs, Chroma may upsert or error depending on version.
# If you see duplicate-id errors, either drop the collection (above) or change CHROMA_COLLECTION.
chroma_vs.add_documents(documents=all_docs, ids=DOC_IDS)
try:
    chroma_vs.persist()
except Exception:
    pass
print("Chroma vector ingestion complete.")

# Quick search smoke test
cres = chroma_vs.similarity_search("quarterly financial statements tables", k=3)
for d in cres:
    src = d.metadata.get("source_url") or d.metadata.get("source_path")
    print("-", d.metadata.get("source_type"), src, f"(len={len(d.page_content)})")

Creating/upserting vector index in Chroma…


ValueError: Expected metadata value to be a str, int, float, bool, or None, got {'Header_2': 'Verizon Communications Inc.'} which is a dict in upsert.

Try filtering complex metadata from the document using langchain_community.vectorstores.utils.filter_complex_metadata.