In [3]:
!pip install boto3 langchain_aws mypy_boto3_bedrock_runtime neo4j



In [2]:
!pip uninstall numpy
!pip install numpy==1.23.5 --force-reinstall

Found existing installation: numpy 1.23.5
Uninstalling numpy-1.23.5:
  Would remove:
    /usr/local/bin/f2py
    /usr/local/bin/f2py3
    /usr/local/bin/f2py3.11
    /usr/local/lib/python3.11/dist-packages/numpy-1.23.5.dist-info/*
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libgfortran-040039e1.so.5.0.0
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libopenblas64_p-r0-742d56dc.3.20.so
    /usr/local/lib/python3.11/dist-packages/numpy.libs/libquadmath-96973f99.so.0.0.0
    /usr/local/lib/python3.11/dist-packages/numpy/*
Proceed (Y/n)? y
  Successfully uninstalled numpy-1.23.5
Collecting numpy==1.23.5
  Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the sou

In [4]:
from pydantic import BaseModel, Field
from enum import Enum

class ModelId(str, Enum):
    """Model identifiers that ``get_bedrock_model`` passes to ChatBedrock as ``model_id``.

    The ``str`` mixin makes each member usable as a plain string as well as
    an enum member.
    """

    CLAUDE_3_HAIKU = "us.anthropic.claude-3-haiku-20240307-v1:0"
    CLAUDE_3_SONNET = "us.anthropic.claude-3-sonnet-20240229-v1:0"
    # This line used to end with a stray comma, which turned the assigned
    # value into a one-element tuple; it is now a plain string like its siblings.
    CLAUDE_3_OPUS = "us.anthropic.claude-3-opus-20240229-v1:0"
    CLAUDE_3_5_SONNET = "us.anthropic.claude-3-5-sonnet-20240620-v1:0"
    LLAMA_3_3_70B = "us.meta.llama3-3-70b-instruct-v1:0"

class ModelKwargsClaude(BaseModel):
    # Generation settings for Claude models. get_bedrock_model forwards the
    # instance's fields to ChatBedrock as ``model_kwargs``.
    # Each Field declares a default plus inclusive lower/upper bounds (ge/le).
    temperature: float = Field(default=0.1, ge=0, le=1)
    max_tokens: int = Field(default=2048, ge=1, le=4096)
    top_p: float = Field(default=0.999, ge=0, le=1)
    # top_k is declared only on the Claude settings, not on ModelKwargsLlama.
    top_k: int = Field(default=0, ge=0, le=500)

class ModelKwargsLlama(BaseModel):
    # Generation settings for Llama models. Same fields, defaults and bounds
    # as ModelKwargsClaude except that there is no top_k field.
    temperature: float = Field(default=0.1, ge=0, le=1)
    max_tokens: int = Field(default=2048, ge=1, le=4096)
    top_p: float = Field(default=0.999, ge=0, le=1)

In [5]:
import boto3
from langchain_aws.chat_models import ChatBedrock
from langchain_core.callbacks import StreamingStdOutCallbackHandler
from mypy_boto3_bedrock_runtime.client import BedrockRuntimeClient
from typing import Union, List
# AWS settings read by get_bedrock_client(). The key values below are
# placeholders; do not commit real credentials into the notebook.
aws_creds = {
    "access_key": "YOUR_ACCESS_KEY",
    "secret_key": "YOUR_SECRET_KEY",
    "region": "us-east-1"
}
def get_bedrock_client():
    """Create a ``bedrock-runtime`` client for the region in ``aws_creds``.

    Explicit keys from ``aws_creds`` are used only when both have actually
    been filled in. While either one is empty or still a ``YOUR_...``
    placeholder, no keys are passed and boto3 resolves credentials through
    its default provider chain (environment variables, shared credentials
    file, attached role), so real secrets never have to be written into the
    notebook.

    Returns:
        The boto3 client for the ``bedrock-runtime`` service.
    """
    placeholders = {"", "YOUR_ACCESS_KEY", "YOUR_SECRET_KEY"}
    access_key = aws_creds.get("access_key")
    secret_key = aws_creds.get("secret_key")

    session_kwargs = {"region_name": aws_creds.get("region")}
    # boto3 rejects a partial key pair, so pass either both keys or neither.
    if access_key and secret_key and not ({access_key, secret_key} & placeholders):
        session_kwargs["aws_access_key_id"] = access_key
        session_kwargs["aws_secret_access_key"] = secret_key

    return boto3.Session(**session_kwargs).client("bedrock-runtime")

def get_bedrock_model(
    client: BedrockRuntimeClient,
    model_id: ModelId,
    model_kwargs: Union[ModelKwargsClaude, ModelKwargsLlama],
    streaming: bool = False,
    verbose: bool = False,
) -> ChatBedrock:
    """Wrap a Bedrock runtime client in a LangChain ``ChatBedrock`` model.

    Args:
        client: Bedrock runtime client the model should call through.
        model_id: Which model to use; its ``.value`` string is forwarded.
        model_kwargs: Settings object whose instance ``__dict__`` is forwarded
            as ``model_kwargs``.
        streaming: When true, streaming is enabled and a stdout streaming
            callback handler is attached.
        verbose: Forwarded unchanged to ``ChatBedrock``.

    Returns:
        The configured ``ChatBedrock`` instance.
    """
    if streaming:
        handlers = [StreamingStdOutCallbackHandler()]
    else:
        handlers = []

    return ChatBedrock(
        client=client,
        model_id=model_id.value,
        model_kwargs=model_kwargs.__dict__,
        streaming=streaming,
        verbose=verbose,
        callbacks=handlers,
    )

In [6]:
# Schema for one extracted entity; all three fields are required (``...``).
# NOTE: a class named Entity is defined again in a later cell of this
# notebook; once that cell has run, the name refers to that later definition.
class Entity(BaseModel):
    entity_name: str = Field(..., description="The capitalized name of the entity (e.g., 'Federated Learning')")
    entity_type: str = Field(..., description="The type or category of the entity (e.g., 'large language model', 'dataset')")
    entity_description: str = Field(..., description="A detailed description of the entity, explaining its attributes and activities")

# Schema for one relationship between two entities; all fields are required.
# NOTE: a class named Relationship is defined again in a later cell of this
# notebook; once that cell has run, the name refers to that later definition.
class Relationship(BaseModel):
    source_entity: str = Field(..., description="The name of the source entity in the relationship")
    target_entity: str = Field(..., description="The name of the target entity in the relationship")
    relationship_description: str = Field(..., description="A description explaining why and how the entities are related")
    # ge/le make the 1-10 range part of the field's validation, not just its description.
    relationship_strength: int = Field(..., ge=1, le=10, description="An integer score between 1 and 10 indicating the strength of the relationship")

# Top-level result schema: the entities and relationships found in a text.
# NOTE: a class named DocumentAnalysis is defined again in a later cell of
# this notebook; once that cell has run, the name refers to that later definition.
class DocumentAnalysis(BaseModel):
    entities: List[Entity] = Field(..., description="A list of identified entities with their names, types, and descriptions")
    relationships: List[Relationship] = Field(..., description="A list of relationships between the identified entities, including source and target entities, relationship description, and relationship strength")


In [17]:
import json
import asyncio
from typing import List, Dict, Any, Optional
from pydantic import BaseModel, Field
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from neo4j import GraphDatabase
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np


# Schema for one extracted entity. This redefines the Entity class from an
# earlier cell of the notebook, with shorter field descriptions.
class Entity(BaseModel):
    entity_name: str = Field(description="Name of the entity (capitalized)")
    entity_type: str = Field(description="Type of the entity from predefined categories")
    entity_description: str = Field(description="Detailed description of the entity")

# Schema for one relationship between two entities. This redefines the
# Relationship class from an earlier cell of the notebook.
class Relationship(BaseModel):
    source_entity: str = Field(description="Name of the source entity")
    target_entity: str = Field(description="Name of the target entity")
    relationship_description: str = Field(description="Description of the relationship")
    # NOTE(review): unlike the earlier Relationship definition, no ge/le bounds
    # are declared here, so the 1-10 range exists only in the description text.
    # Confirm whether dropping the bounds was intentional.
    relationship_strength: int = Field(description="Strength of relationship (1-10)")

# Top-level result schema handed to JsonOutputParser in
# KnowledgeGraph.extract_entities_and_relationships. This redefines the
# DocumentAnalysis class from an earlier cell of the notebook.
class DocumentAnalysis(BaseModel):
    entities: List[Entity] = Field(description="List of extracted entities")
    relationships: List[Relationship] = Field(description="List of relationships between entities")

class KnowledgeGraph:
    def __init__(self, client, claude_model, neo4j_uri="your_db",
                 neo4j_user="neo4j", neo4j_password="password"):
        """Set up the text splitter, the Neo4j connection and the in-memory indexes.

        Args:
            client: Stored on the instance as ``self.client``.
            claude_model: Chat model used by the extraction chain.
            neo4j_uri: Connection URI handed to ``GraphDatabase.driver``.
            neo4j_user: Neo4j user name.
            neo4j_password: Neo4j password.
        """
        self.client = client
        self.claude_model = claude_model

        # Separators as passed to the splitter, from paragraph breaks down to
        # the empty string.
        split_points = ["\n\n", "\n", ". ", "! ", "? ", " ", ""]
        self.text_splitter = RecursiveCharacterTextSplitter(
            separators=split_points,
            length_function=len,
            chunk_overlap=200,
            chunk_size=4000,
        )

        credentials = (neo4j_user, neo4j_password)
        self.driver = GraphDatabase.driver(neo4j_uri, auth=credentials)
        # One session is opened here and reused by the query methods.
        self.session = self.driver.session()
        self._initialize_database()

        # Local bookkeeping of what has been merged so far.
        self.all_entities = {}
        self.all_relationships = []

    def _initialize_database(self):
        """Create constraints and indexes for better performance"""
        try:
            self.session.run("CREATE CONSTRAINT entity_name_unique IF NOT EXISTS FOR (e:Entity) REQUIRE e.name IS UNIQUE")

            self.session.run("CREATE INDEX entity_type_index IF NOT EXISTS FOR (e:Entity) ON (e.type)")

        except Exception as e:
            print(f"Database initialization warning: {e}")

    async def extract_entities_and_relationships(self, texts):
        """Ask the chat model to extract entities and relationships from a text chunk.

        Args:
            texts: The text substituted into the ``{texts}`` slot of the prompt.

        Returns:
            The parsed chain output. If any exception is raised while invoking
            the chain or decoding its output, the error is printed and
            ``{"entities": [], "relationships": []}`` is returned instead.
        """
        # The parser supplies the format instructions injected into the prompt below.
        parser = JsonOutputParser(pydantic_object=DocumentAnalysis)

        # Two-message prompt: a system role description plus the task text,
        # with {texts} and {format_instructions} filled in at invoke time.
        messages = [
            ("system", """You are an expert assistant specializing in analyzing textual insights and extracting business-related topics, entities, and relationships.
            You excel at identifying technical concepts, tools, methodologies, and their interconnections in complex documents."""),
            ("human", """
            Please analyze the following text and perform the following tasks:

            1. Identify all entities from the provided text. For each identified entity, extract:
                - entity_name: Name of the entity (capitalize properly)
                - entity_type: Determine the most appropriate type based on context (e.g., "Technology", "Methodology", "Framework", "Tool", "Algorithm", "Concept", "Person", "Organization", "Standard", "Protocol", etc.)
                - entity_description: Provide a detailed description (2-3 sentences)

            2. For each pair of related entities, identify relationships:
                - source_entity: Name of the source entity
                - target_entity: Name of the target entity
                - relationship_description: Clear description of the relationship
                - relationship_strength: Rate 1-10 (10 being strongest)

            3. Focus on meaningful entities and relationships, not trivial ones.
            4. Use your knowledge to determine the most appropriate entity type for each entity based on its context and meaning.
            5. Return results in the specified JSON format.

            Text to analyze:
            '''{texts}'''

            {format_instructions}""")
        ]

        # prompt -> model -> JSON parser
        chat_prompt = ChatPromptTemplate.from_messages(messages)
        chain = chat_prompt | self.claude_model | parser

        try:
            response = await chain.ainvoke({
                "texts": texts,
                "format_instructions": parser.get_format_instructions()
            })

            # Decode again if the chain still produced a raw string.
            if isinstance(response, str):
                response = json.loads(response)

            return response
        except Exception as e:
            # Best effort: a failed chunk yields an empty result rather than an exception.
            print(f"Error extracting entities: {e}")
            return {"entities": [], "relationships": []}

    def merge_entities(self, new_entities):
        """Merge new entities with existing ones, handling duplicates"""
        for entity in new_entities:
            entity_name = entity["entity_name"].strip().title()

            query = """
            MATCH (e:Entity {name: $name})
            RETURN e
            """
            result = self.session.run(query, name=entity_name)

            if result.single():
                update_query = """
                MATCH (e:Entity {name: $name})
                SET e.description = e.description + ' ' + $new_description
                """
                self.session.run(update_query, name=entity_name, new_description=entity['entity_description'])
            else:
                create_query = """
                CREATE (e:Entity {
                    name: $name,
                    type: $type,
                    description: $description
                })
                """
                self.session.run(create_query,
                               name=entity_name,
                               type=entity.get('entity_type', ''),
                               description=entity.get('entity_description', ''))

            self.all_entities[entity_name] = entity

    def merge_relationships(self, new_relationships):
        """Merge new relationships with existing ones"""
        for rel in new_relationships:
            source = rel["source_entity"].strip().title()
            target = rel["target_entity"].strip().title()


            check_query = """
            MATCH (s:Entity {name: $source})-[r:RELATED_TO]-(t:Entity {name: $target})
            RETURN r
            """
            result = self.session.run(check_query, source=source, target=target)

            if result.single():

                update_query = """
                MATCH (s:Entity {name: $source})-[r:RELATED_TO]-(t:Entity {name: $target})
                SET r.strength = CASE WHEN r.strength < $strength THEN $strength ELSE r.strength END,
                    r.description = r.description + ' ' + $description
                """
                self.session.run(update_query,
                               source=source,
                               target=target,
                               strength=rel["relationship_strength"],
                               description=rel["relationship_description"])
            else:
                create_query = """
                MATCH (s:Entity {name: $source}), (t:Entity {name: $target})
                CREATE (s)-[r:RELATED_TO {
                    strength: $strength,
                    description: $description
                }]->(t)
                """
                self.session.run(create_query,
                               source=source,
                               target=target,
                               strength=rel["relationship_strength"],
                               description=rel["relationship_description"])

            self.all_relationships.append(rel)

    async def process_large_document(self, document_text: str, chunk_size: int = 4000):
        """Process a large document by splitting it into chunks"""
        print("Splitting document into chunks...")
        chunks = self.text_splitter.split_text(document_text)
        print(f"Document split into {len(chunks)} chunks")

        batch_size = 5
        for i in range(0, len(chunks), batch_size):
            batch = chunks[i:i+batch_size]
            print(f"Processing batch {i//batch_size + 1}/{(len(chunks) + batch_size - 1)//batch_size}")

            tasks = [self.extract_entities_and_relationships(chunk) for chunk in batch]
            results = await asyncio.gather(*tasks)

            for result in results:
                if result and "entities" in result and "relationships" in result:
                    self.merge_entities(result["entities"])
                    self.merge_relationships(result["relationships"])

        print(f"Extraction complete! Found {len(self.all_entities)} entities and {len(self.all_relationships)} relationships")

    def analyze_graph(self):
        """Analyze the knowledge graph using Neo4j queries"""
        analysis = {}

        result = self.session.run("MATCH (n:Entity) RETURN count(n) as total")
        analysis["total_entities"] = result.single()["total"]


        result = self.session.run("MATCH ()-[r:RELATED_TO]->() RETURN count(r) as total")
        analysis["total_relationships"] = result.single()["total"]

        query = """
        MATCH (e:Entity)
        OPTIONAL MATCH (e)-[r]-(connected)
        WITH e, count(connected) as degree
        RETURN e.name as entity, e.type as type, degree
        ORDER BY degree DESC
        LIMIT 10
        """
        result = self.session.run(query)
        analysis["most_connected_entities"] = [{"entity": record["entity"],
                                               "type": record["type"],
                                               "degree": record["degree"]}
                                              for record in result]

        query = """
        MATCH (e:Entity)
        RETURN e.type as type, count(e) as count
        ORDER BY count DESC
        """
        result = self.session.run(query)
        analysis["entity_types_distribution"] = {record["type"]: record["count"] for record in result}

        query = """
        MATCH (e:Entity)
        OPTIONAL MATCH (e)-[r]-(connected)
        WITH e, count(connected) as degree
        RETURN avg(degree) as avg_degree
        """
        result = self.session.run(query)
        analysis["average_degree"] = result.single()["avg_degree"]


        return analysis


    def export_graph(self):
        """Export the knowledge graph from Neo4j"""
        entities_query = "MATCH (e:Entity) RETURN e.name as name, e.type as type, e.description as description"
        entities_result = self.session.run(entities_query)
        entities = {record["name"]: {"entity_type": record["type"],
                                   "entity_description": record["description"]}
                   for record in entities_result}

        relations_query = """
        MATCH (s:Entity)-[r:RELATED_TO]->(t:Entity)
        RETURN s.name as source, t.name as target, r.strength as strength, r.description as description
        """
        relations_result = self.session.run(relations_query)
        relationships = [{"source_entity": record["source"],
                         "target_entity": record["target"],
                         "relationship_strength": record["strength"],
                         "relationship_description": record["description"]}
                        for record in relations_result]

        return {
            "entities": entities,
            "relationships": relationships,
            "graph_analysis": self.analyze_graph()
        }

    def search_entities(self, query: str, limit: int = 10):
        """Search for entities by name or description using Neo4j full-text search"""
        try:
            self.session.run("CALL db.index.fulltext.createNodeIndex('entitySearch', ['Entity'], ['name', 'description'])")
        except:
            pass  #

        search_query = """
        CALL db.index.fulltext.queryNodes('entitySearch', $query)
        YIELD node, score
        RETURN node.name as name, node.type as type, node.description as description
        ORDER BY score DESC
        LIMIT $limit
        """
        try:
            result = self.session.run(search_query, query=query, limit=limit)
            return [{"name": record["name"],
                    "type": record["type"],
                    "description": record["description"]}
                   for record in result]
        except:
            fallback_query = """
            MATCH (e:Entity)
            WHERE toLower(e.name) CONTAINS toLower($query)
               OR toLower(e.description) CONTAINS toLower($query)
            RETURN e.name as name, e.type as type, e.description as description
            LIMIT $limit
            """
            result = self.session.run(fallback_query, query=query, limit=limit)
            return [{"name": record["name"],
                    "type": record["type"],
                    "description": record["description"]}
                   for record in result]

    def get_entity_relationships(self, entity_name: str):
        """Get all relationships for a specific entity"""
        query = """
        MATCH (e:Entity {name: $name})-[r:RELATED_TO]-(other:Entity)
        RETURN other.name as related_entity, r.strength as strength, r.description as description
        """
        result = self.session.run(query, name=entity_name.strip().title())
        return [{"related_entity": record["related_entity"],
                "relationship_strength": record["strength"],
                "relationship_description": record["description"]}
               for record in result]

    def detect_and_store_communities(self):
      """Run Leiden community detection and store results in the graph"""
      graph_name = "community_detection_graph"
      try:

          with self.driver.session() as session:
              session.run("CALL gds.aura.api.credentials()")

              try:
                  session.run(f"CALL gds.graph.drop('{graph_name}')")
              except:
                  pass

              create_graph_query = f"""
              CALL gds.graph.project(
                  '{graph_name}',
                  'Entity',
                  'RELATED_TO',
                  {{
                      relationshipProperties: 'strength'
                  }}
              )
              """
              session.run(create_graph_query)


              leiden_write_query = f"""
              CALL gds.leiden.write('{graph_name}', {{
                  relationshipWeightProperty: 'strength',
                  writeProperty: 'communityId',
                  randomSeed: 42,
                  maxLevels: 10
              }})
              YIELD nodePropertiesWritten, modularity, ranLevels
              """
              result = session.run(leiden_write_query)
              stats = result.single()

              session.run(f"CALL gds.graph.drop('{graph_name}')")

              return {
                  "nodes_updated": stats["nodePropertiesWritten"],
                  "modularity": stats["modularity"],
                  "levels_run": stats["ranLevels"]
              }

      except Exception as e:
          print(f"Error in community detection: {e}")
          return None


    def get_communities(self):
        """Get all communities and their members"""
        query = """
        MATCH (e:Entity)
        WHERE e.communityId IS NOT NULL
        WITH e.communityId as communityId, collect(e) as members
        RETURN communityId,
                [member IN members | {name: member.name, type: member.type}] as members,
                size(members) as size
        ORDER BY size DESC
        """
        result = self.session.run(query)

        communities = {}
        for record in result:
            comm_id = f"Community_{record['communityId']}"
            communities[comm_id] = {
                "members": record["members"],
                "size": record["size"]
            }

        return communities

    def get_entity_community(self, entity_name: str):
        """Get the community ID for a specific entity"""
        query = """
        MATCH (e:Entity {name: $name})
        RETURN e.communityId as communityId
        """
        result = self.session.run(query, name=entity_name.strip().title())
        record = result.single()
        return record["communityId"] if record else None

    def get_community_summary(self):
        """Get summary statistics about communities"""
        query = """
        MATCH (e:Entity)
        WHERE e.communityId IS NOT NULL
        WITH e.communityId as communityId, count(e) as size
        RETURN min(size) as min_size,
                max(size) as max_size,
                avg(size) as avg_size,
                count(*) as total_communities
        """
        result = self.session.run(query)
        return dict(result.single())

    def close(self):
        """Close Neo4j connection"""
        self.session.close()
        self.driver.close()

    def __del__(self):
        """Cleanup when object is destroyed"""
        try:
            self.close()
        except:
            pass

    def close(self):
        """Close Neo4j connection"""
        self.session.close()
        self.driver.close()

    def __del__(self):
        """Cleanup when object is destroyed"""
        try:
            self.close()
        except:
            pass

In [8]:
# Bedrock runtime client built from the aws_creds settings.
client = get_bedrock_client()

# Chat model used by the rest of the notebook: Claude 3.5 Sonnet with the
# default ModelKwargsClaude settings; streaming and verbose keep their
# defaults (False).
claude_model = get_bedrock_model(
    client=client,
    model_id=ModelId.CLAUDE_3_5_SONNET,
    model_kwargs=ModelKwargsClaude(),
)

In [9]:
document_text = """International Journal of Research In Computer Applications and Information Technology (IJRCAIT) Volume 7, Issue 2, July-December 2024, pp. 98-110, Article ID: IJRCAIT_07_02_007 Available online at https://iaeme.com/Home/issue/IJRCAIT?Volume=7&Issue=2 ISSN Print: 2348-0009; ISSN Online: 2347-5099; Journal ID: 0497-2547 Impact Factor (2024): 14.56 (Based on Google Scholar Citation) DOI: https://doi.org/10.5281/zenodo.13908615 © IAEME Publication GRAPHRAG AND ROLE OF GRAPH DATABASES IN ADVANCING AI Ravi Kiran Magham Osmania University, India ABSTRACT This article explores the synergy between GraphRAG and graph databases in advancing AI capabilities. Graph RAG, a novel approach to query-focused summarization that combines knowledge graph generation, retrieval-augmented generation (RAG), and query-focused summarization (QFS) to support human sensemaking over large text corpora. While traditional RAG methods excel at answering specific questions, they struggle with global queries about entire datasets. Graph RAG addresses this limitation by using an LLM to build a graph-based text index in two stages: first deriving an entity knowledge graph from source documents, then pre-generating community summaries for groups of related entities. Given a query, each community summary generates a partial response, which is then synthesized into a final answer.  editor@iaeme.com https://iaeme.com/Home/journal/IJRCAIT 98 Graphrag and Role of Graph Databases in Advancing AI Evaluations on datasets in the 1 million token range show that Graph RAG substantially improves the comprehensiveness and diversity of answers compared to naïve RAG baselines, while also demonstrating favorable performance against global text summarization approaches. The hierarchical nature of the graph index allows for efficient querying at different levels of granularity. 
This approach offers a scalable solution for global sensemaking tasks over large private document collections, with potential applications in scientific discovery, intelligence analysis, and other domains requiring complex information synthesis. Keywords: GraphRAG, Graph Databases, Artificial Intelligence, Knowledge Representation, Large Language Models Cite this Article: Ravi Kiran Magham. (2024). Graphrag and Role of Graph Databases in Advancing AI. International Journal of Research in Computer Applications and Information Technology (IJRCAIT), 7(2), 98-110.  https://iaeme.com/MasterAdmin/Journal_uploads/IJRCAIT/VOLUME_7_ISSUE_2/IJRCAIT_07_02_007.pdf I. INTRODUCTION The landscape of Artificial Intelligence (AI) is rapidly evolving, with recent advancements in large language models (LLMs) and retrieval-augmented generation (RAG) pushing the boundaries of what's possible in natural language processing (NLP) tasks. Concurrently, graph databases have emerged as a powerful tool for representing and querying complex, interconnected data. The fusion of these technologies – which we term GraphRAG – presents a promising frontier in AI research and applications. Large language models, such as those described by Brown et al. [1], have demonstrated remarkable few-shot learning capabilities, allowing them to adapt to new tasks with minimal examples. However, these models often struggle with knowledge-intensive tasks that require access to large, structured datasets. To address this limitation, Lewis et al. [2] introduced the concept of retrieval-augmented generation, which combines the generative power of language models with the ability to retrieve and incorporate external knowledge. While RAG has shown significant promise, there's still room for improvement, particularly in handling complex, interconnected information. This is where graph databases come into play. As explained by Robinson et al. 
[3], graph databases excel at managing and querying highly connected data, making them ideal for representing the intricate relationships often found in real-world knowledge bases. The concept of GraphRAG builds upon these foundations, leveraging the strengths of graph databases to enhance the retrieval and reasoning capabilities of RAG systems. By representing knowledge in a graph structure, we can capture nuanced relationships between entities and concepts, potentially leading to more accurate and contextually relevant information retrieval. Moreover, the integration of graph databases into AI systems opens up new possibilities for explainable AI. The explicit representation of relationships in a graph allows for easier tracing of reasoning paths, providing insights into how an AI system arrives at its conclusions. This transparency is crucial for building trust in AI systems, especially in high-stakes domains like healthcare or finance. As we delve deeper into the synergies between graph databases and advanced AI techniques like RAG, we stand at the cusp of a new era in artificial intelligence. GraphRAG has the potential to not only improve the performance of AI systems on knowledge-intensive tasks but also to enhance their interpretability and reliability. https://iaeme.com/Home/journal/IJRCAIT 99 editor@iaeme.com Ravi Kiran Magham In the following sections, we will explore the theoretical foundations of GraphRAG, discuss its potential applications, and examine the challenges and opportunities that lie ahead in this exciting field of research. Fig 1: Comparative Capabilities of AI Technologies in Knowledge-Intensive Tasks [1-3] II. GRAPHRAG: ENHANCING LLM REASONING GraphRAG represents a significant leap forward in leveraging the power of graph databases to enhance the reasoning capabilities of Large Language Models (LLMs). 
By seamlessly integrating structured knowledge from graphs with the generative prowess of LLMs, GraphRAG empowers AI systems to navigate complex relationships, answer intricate queries, and unlock deeper insights from vast datasets. At its core, GraphRAG operates through a series of interconnected stages, each designed to optimize the interaction between LLMs and graph-structured knowledge. Let's explore these key stages: 1. Graph-Based Indexing  The journey begins with the creation of a graph-based index, a structured representation of knowledge that facilitates efficient retrieval. This stage involves: Knowledge Graph Construction: • Leverage existing open-source knowledge graphs (e.g., Wikidata, Freebase, DBpedia) or domain-specific ones (e.g., CMeKG for biomedical data). • Construct custom knowledge graphs from textual or other data sources, extracting entities and their relationships. editor@iaeme.com https://iaeme.com/Home/journal/IJRCAIT 100 Graphrag and Role of Graph Databases in Advancing AI Graph Data Preprocessing: • Clean and normalize the graph data to ensure consistency and accuracy. • Perform entity linking and relation extraction to enrich the graph with semantic information. Indexing Methods: • Graph Indexing: Preserves the entire graph structure for efficient traversal and complex query processing. • Text Indexing: Converts graph elements into textual descriptions for seamless integration with LLMs. • Vector Indexing: Transforms graph data into vector representations for similarity-based retrieval. • Hybrid Indexing: Combines multiple methods for comprehensive and adaptable retrieval. [5,6] 2. Graph-Guided Retrieval Once the graph is indexed, GraphRAG employs intelligent retrieval mechanisms to pinpoint the most relevant information in response to user queries. This stage encompasses: Query Processing: • Analyze and expand user queries to improve retrieval accuracy and capture nuanced intent. 
• Implement query decomposition to break down complex questions into manageable sub-queries. Retriever Selection: • Non-parametric Retrievers: Utilize rule-based or traditional graph search algorithms for deterministic retrieval. • LM-based Retrievers: Leverage the semantic understanding of language models for context-aware retrieval.[5,6]  • GNN-based Retrievers: Employ graph neural networks to capture structural patterns and dependencies within the graph. Retrieval Paradigms: • Once Retrieval: Perform a single-pass retrieval for efficiency. • Iterative Retrieval: Refine retrieval results through multiple iterations for improved accuracy. • Multi-Stage Retrieval: Combine different retrieval methods in a pipeline for enhanced flexibility. Retrieval Granularity & Enhancement: • Support retrieval at various levels of granularity: nodes, triplets (subject-predicateobject), paths, subgraphs, or hybrid combinations. • Employ query enhancement (e.g., query expansion) and knowledge enhancement (e.g., merging and pruning retrieved information) techniques to further refine the retrieval process. https://iaeme.com/Home/journal/IJRCAIT 101 editor@iaeme.com Ravi Kiran Magham 3. Graph-Enhanced Generation The retrieved graph data is then transformed into a format that LLMs can readily understand, and the generation process is augmented to produce informative and contextually relevant responses.  This stage involves: Graph Format Conversion: • Convert the retrieved graph data into LLM-compatible formats, such as adjacency tables, natural language descriptions, or code-like representations. • Explore various graph languages and representations to optimize the integration with LLMs. Generator Selection: • GNNs: Leverage graph neural networks for tasks that require explicit reasoning over the graph structure. • LMs: Utilize language models for natural language generation and response formulation. 
• Hybrid Models: Combine GNNs and LMs in cascaded or parallel paradigms to leverage the strengths of both. Generation Enhancement: • Pre-Generation: Refine the input to the LLM before generation. • Mid-Generation: Guide the generation process using techniques like constrained decoding. • Post-Generation: Refine and combine the generated outputs for improved coherence and relevance. 4. Training and Optimization To achieve peak performance, GraphRAG systems undergo training and optimization procedures that fine-tune both the retrieval and generation components. This can involve: Retriever Training: • Explore training-free approaches for closed-source LLMs. • Implement training-based methods for custom retrievers using techniques like distant supervision and self-supervised pre-training. [5] Generator Training: • Adapt training strategies based on the chosen generator type (GNN, LM, or hybrid). • Implement supervised fine-tuning for generative LLMs to align them with the specific task and domain. Joint Training: Develop techniques for end-to-end optimization of both retrieval and generation components to ensure seamless collaboration. editor@iaeme.com https://iaeme.com/Home/journal/IJRCAIT 102 Graphrag and Role of Graph Databases in Advancing AI 5. Evaluation and Monitoring Rigorous evaluation and continuous monitoring are essential to ensure the effectiveness and reliability of GraphRAG systems. This involves: Task-Specific Metrics: • Implement relevant evaluation metrics tailored to the specific downstream tasks (e.g., Exact Match, F1 score for knowledge base question answering). Retrieval Quality Metrics: • Assess the relevance and coverage of the retrieved information. • Evaluate the consistency and reliability of retrieval results. Generation Quality Metrics: • Evaluate the accuracy, coherence, and fluency of the generated outputs. • Utilize standard metrics like BLEU, ROUGE-L, and METEOR for natural language generation tasks. 
This elaborated reference architecture provides a comprehensive framework for building GraphRAG systems, incorporating the latest advancements in graph-based retrieval and generation techniques as discussed in the cited literature. It allows for flexibility in component selection and optimization based on specific use cases and requirements, while addressing key challenges in areas such as scalability, efficiency, and domain adaptation. Fig 2: GraphRAG Architecture: Relative Significance of Core Elements [4-6] III. BENEFITS OF GRAPHRAG GraphRAG offers significant advantages over traditional Retrieval-Augmented Generation (RAG) systems, enhancing AI capabilities in several key areas: https://iaeme.com/Home/journal/IJRCAIT 103 editor@iaeme.com Ravi Kiran Magham 1. Improved accuracy and contextual relevance of responses: By leveraging knowledge graphs, GraphRAG provides LLMs with a more structured and comprehensive context. This results in responses that are not only more accurate but also more contextually appropriate, as the system can better understand the relationships between different pieces of information. Hu et al. [7] demonstrated that GraphRAG systems achieved a 15% improvement in response accuracy compared to traditional RAG methods when tested on complex question-answering tasks. 2. Better handling of complex, interconnected information: The graph-based approach allows GraphRAG to efficiently navigate and utilize intricate relationships between entities and concepts. This is particularly beneficial in domains with highly interconnected data, such as scientific research or complex business environments, where traditional RAG systems might struggle to capture the full context. Zhang et al. [8] showed that GraphRAG outperformed conventional RAG systems by 20% in tasks involving multi-hop reasoning on biomedical knowledge graphs. 3. 
Enhanced ability to answer global questions about large datasets: GraphRAG excels at synthesizing information from across large knowledge domains. Its graph-based structure allows for efficient traversal of vast datasets, enabling the system to answer broad, overarching questions that might require integrating information from multiple sources or domains. This capability is particularly valuable for tasks such as trend analysis and pattern recognition in large-scale data. 4. Reduced token usage and improved scalability: By storing information in a structured graph format, GraphRAG can more efficiently retrieve relevant context without needing to process large amounts of raw text. This leads to reduced token usage when generating prompts for LLMs, resulting in faster processing times and lower computational costs. Additionally, the graph structure allows for easier scaling as the knowledge base grows, maintaining performance even with large datasets. 5. Dynamic knowledge integration: Unlike traditional RAG systems that often work with static document collections, GraphRAG can easily incorporate new information into its knowledge graph. This allows the system to stay up-to-date with the latest information without requiring a complete reindexing of the dataset. This dynamic approach is particularly valuable in rapidly evolving fields such as current events analysis or technological research. 6. Improved reasoning capabilities: The graph structure enables more sophisticated reasoning, allowing GraphRAG to make inferences and connections that might not be immediately apparent in a flat text representation. This leads to more insightful and nuanced responses, particularly for complex queries. For instance, GraphRAG systems have shown superior performance in tasks requiring causal reasoning and relationship inference compared to traditional RAG methods [8]. 
These benefits make GraphRAG a powerful tool for enhancing AI systems' ability to work with domain-specific knowledge, handle complex queries, and provide more accurate and contextually relevant responses. As organizations increasingly deal with large, interconnected datasets, GraphRAG's capabilities offer a significant advantage over traditional RAG systems in developing more sophisticated and effective AI applications. IV. GRAPH DATABASES: FOUNDATIONS FOR ADVANCED AI Graph databases have emerged as a crucial technology in the AI landscape, offering unique capabilities for storing and managing highly connected data. Unlike traditional relational databases, graph databases are specifically designed to efficiently handle complex relationships between entities, making them ideal for supporting advanced AI applications like GraphRAG. editor@iaeme.com https://iaeme.com/Home/journal/IJRCAIT 104 Graphrag and Role of Graph Databases in Advancing AI A. Fundamentals of Graph Databases Graph databases are optimized for storing and querying interconnected data. They represent information as nodes (entities) and edges (relationships), allowing for efficient traversal and analysis of complex data structures. The property graph model, which is widely used in graph databases, extends this basic structure by allowing properties to be attached to both nodes and edges, providing a rich and flexible way to represent real-world data [9]. This model offers several advantages: 1. Intuitive representation: The node-edge structure closely mirrors how humans conceptualize relationships, making it easier to model and understand complex domains. 2. Performance optimization: By directly storing relationships, graph databases can perform queries that would be computationally expensive in relational databases much more efficiently. 3. 
Scalability: Graph databases can handle large-scale, highly connected data without significant performance degradation, making them suitable for big data applications. B. Key features of graph databases include: 1. Native graph storage: Data is stored in a graph structure, eliminating the need for complex joins and improving query performance. This approach allows for constanttime traversals per relationship, regardless of the total size of the graph. In large-scale AI applications, this can lead to orders of magnitude improvement in query performance compared to relational databases [9]. 2. Index-free adjacency: Relationships are physically stored as connections between nodes, enabling rapid traversal of the graph. This feature is particularly beneficial for applications that require real-time analysis of complex relationships, such as recommendation systems or fraud detection algorithms. 3. Flexible schema: Graph databases can easily adapt to changing data requirements without the need for extensive restructuring. This flexibility is crucial in AI applications where data models may evolve as new insights are gained or as the application's scope expands. 4. Powerful query languages: Graph-specific query languages like Cypher and Gremlin allow for expressive and efficient querying of graph data. These languages are designed to express complex patterns and traversals in a more intuitive and compact manner than traditional SQL, making it easier for developers to implement sophisticated AI algorithms [9]. Additional features that make graph databases particularly well-suited for handling complex, interconnected data in real-time applications include: 1. Native graph processing: Many graph databases offer built-in algorithms for common graph operations like shortest path finding, centrality calculation, and community detection. These can be leveraged directly in AI applications without the need for separate processing engines. 2. 
Multi-model capabilities: Some modern graph databases support multiple data models (e.g., document, key-value) alongside the graph model, allowing for greater flexibility in data representation and querying. 3. Distributed processing: To handle large-scale graphs, many graph databases offer distributed storage and processing capabilities, enabling them to scale horizontally across multiple machines. https://iaeme.com/Home/journal/IJRCAIT 105 editor@iaeme.com Ravi Kiran Magham Feature Graph Databases (Score 1-10) Complex relationship handling 10 Relational Databases (Score 1-10) Query performance for connected data 5 9 Scalability for large datasets 9 6 Intuitive data representation 7 9 Flexibility of schema 6 8 Native graph processing 5 10 Powerful query languages for graph operations 3 9 Index-free adjacency 9 5 Multi-model capabilities 3 7 Distributed processing 5 8 Table 2: Comparative Analysis: Graph Databases vs. Relational Databases for AI Applications [9] 7 V. THE SYNERGY BETWEEN GRAPHRAG AND GRAPH DATABASES The effectiveness of GraphRAG is closely tied to the capabilities of graph databases. This synergy manifests in several ways, enhancing AI systems' overall performance and capabilities that leverage both technologies. 1. Efficient Storage and Retrieval Graph databases provide the ideal infrastructure for storing and querying the knowledge graphs used in GraphRAG. Their optimized structure allows for rapid retrieval of relevant information when generating enhanced prompts for LLMs [10]. This efficiency is crucial for real-time applications where response time is critical. The native graph storage model employed by graph databases enables direct traversal of relationships between entities, eliminating the need for expensive join operations typically required in relational databases. This approach significantly reduces query complexity and improves retrieval speed, especially for highly interconnected data. 
Furthermore, graph databases often implement advanced indexing techniques specifically designed for graph structures. These indexes can dramatically improve the performance of common graph operations, such as neighborhood searches or path finding, which are essential for GraphRAG's context retrieval process. 2. Complex Query Processing Graph databases' ability to efficiently process complex queries enables GraphRAG to analyze relationships between entities and concepts quickly. This capability is crucial for generating comprehensive and context-aware prompts [10]. Graph databases excel at handling queries that involve multiple hops or complex patterns, which are common in knowledge graph exploration. Many graph databases support specialized query languages, such as Cypher or SPARQL, designed specifically for graph traversal and pattern matching. These languages allow for expressive and intuitive formulation of complex queries that would be cumbersome or inefficient to express in traditional SQL. Additionally, graph databases often implement advanced query optimization techniques tailored for graph structures. These optimizations can include intelligent query planning, adaptive runtime strategies, and caching mechanisms that further enhance the performance of complex queries. https://iaeme.com/Home/journal/IJRCAIT 106 editor@iaeme.com Graphrag and Role of Graph Databases in Advancing AI 3. Scalability and Performance As knowledge graphs grow in size and complexity, graph databases offer the scalability and performance necessary to maintain GraphRAG's effectiveness in real-world applications [10]. Modern graph databases are designed to handle billions of nodes and relationships, ensuring that even large-scale knowledge domains can be efficiently managed. To achieve this scalability, graph databases employ various strategies: • Distributed Storage: Many graph databases support horizontal scaling, partitioning the graph across multiple machines. 
This enables the system to handle graphs that are too large to fit on a single server. • Parallel Processing: Advanced graph databases leverage parallel processing techniques to distribute query workloads across multiple cores or machines, significantly improving performance for large-scale graphs. • Caching and In-memory Processing: Some graph databases offer in-memory processing capabilities or intelligent caching mechanisms to speed up frequently accessed parts of the graph. • Adaptive Indexing: As the graph evolves, some databases can automatically adjust their indexing strategies to maintain optimal performance. These scalability features ensure that GraphRAG can continue to operate efficiently as knowledge graphs expand, maintaining low latency and high throughput even for large and complex datasets. 1. Context Preservation: Graph databases excel at preserving the context of data relationships, which is essential for GraphRAG's operation. By maintaining the intricate connections between entities, graph databases allow GraphRAG to generate more informative and contextually relevant prompts. This context preservation enables the system to understand and utilize complex relationships that might be lost in traditional data storage systems. 2. Multi-hop Reasoning Capabilities: The combination of GraphRAG and graph databases significantly enhances multi-hop reasoning capabilities. Graph databases can efficiently perform queries that involve multiple steps or "hops" between entities, which is crucial for complex reasoning tasks. GraphRAG can leverage this capability to explore indirect relationships and draw insights that would be difficult or impossible with traditional data storage and retrieval methods. 
Capability GraphRAG Contribution Efficient Storage and Retrieval Graph Database Contribution Knowledge Graph Creation Context Retrieval Native Graph Storage Advanced Indexing Complex Query Processing Relationship Analysis Context-Aware Prompts Specialized Query Languages Query Optimization Techniques Scalability Dynamic Information Updates Large-Scale Knowledge Domains Distributed Storage Parallel Processing Caching and In-memory Processing Adaptive Indexing Real-time Performance Prompt Generation Semantic Reasoning Context Understanding Low Latency Queries Pattern Matching Table 2: Performance Metrics of Integrated GraphRAG and Graph Database Systems [10] https://iaeme.com/Home/journal/IJRCAIT 107 editor@iaeme.com Ravi Kiran Magham The combination of efficient storage and retrieval, powerful query processing, and robust scalability provided by graph databases creates a strong foundation for GraphRAG. This synergy enables the development of more sophisticated AI systems capable of reasoning over vast amounts of interconnected data, opening up new possibilities for applications in fields such as personalized medicine, complex system modeling, and advanced decision support systems. VI. CHALLENGES AND FUTURE DIRECTIONS While GraphRAG demonstrates significant potential, several challenges and areas for future research remain: 1. Scalability: As knowledge graphs expand, efficient retrieval and reasoning become increasingly challenging. The development of scalable algorithms for graph operations is crucial. Recent work by Galkin et al. [11] has proposed a foundation model for knowledge graph reasoning, which shows promise in handling large-scale graphs. However, further research is needed to address the exponential growth of potential subgraphs in massive datasets. Future work should focus on developing advanced indexing techniques and parallel processing methods to maintain performance as graph sizes increase. 2. 
Knowledge Graph Quality: The performance of GraphRAG systems is heavily dependent on the quality and completeness of the underlying knowledge graph. Techniques for automatic knowledge graph construction and maintenance are important areas of research. Zhu et al. [12] have explored using large language models for knowledge graph construction and reasoning, which shows potential for improving graph quality. However, ensuring consistency and accuracy in automatically constructed graphs remains a significant challenge. Future research should explore methods for continuous validation and refinement of knowledge graphs, potentially leveraging user feedback and multi-source information integration. 3. Integration with Continuous Learning: Exploring ways to update both the knowledge graph and the LLM components of GraphRAG in real-time could lead to more adaptive and up-to-date systems. This is particularly important in domains with rapidly changing information, such as current events or scientific research. Future work should investigate techniques for efficient incremental learning and graph updates while maintaining system coherence and performance. This might involve developing methods for dynamic knowledge integration and real-time model fine-tuning. 4. Multimodal GraphRAG: Extending the GraphRAG approach to include non-textual data (e.g., images, videos) in the knowledge graph could open up new possibilities for multimodal reasoning. This would enable more comprehensive and context-rich information retrieval and generation. While some initial work has been done on multimodal knowledge graphs, integrating these with large language models for retrieval and generation tasks remains an open challenge. Future research should focus on developing unified representations for diverse data types and designing retrieval mechanisms that can effectively leverage multimodal information. 
Addressing these challenges will be crucial for advancing the field of GraphRAG and realizing its full potential across various applications. As research progresses, we can expect to see more robust, scalable, and versatile GraphRAG systems that can handle increasingly complex reasoning tasks across diverse domains and data types. CONCLUSION The integration of GraphRAG and graph databases represents a significant advancement in AI's ability to reason with complex, interconnected information. This synergy enables the development of more sophisticated AI systems capable of understanding and utilizing vast amounts of structured and unstructured data.  editor@iaeme.com https://iaeme.com/Home/journal/IJRCAIT 108 Graphrag and Role of Graph Databases in Advancing AI By leveraging the strengths of graph-based knowledge representation and advanced language models, GraphRAG overcomes limitations of traditional RAG systems, offering improved accuracy, contextual relevance, and scalability. As research in this field progresses, we can anticipate more robust and versatile AI applications across diverse domains such as personalized medicine, complex system modeling, and advanced decision support systems. While challenges remain, particularly in scalability and knowledge graph quality, the potential of GraphRAG to push the boundaries of AI reasoning and real-world applications is substantial. As these technologies continue to evolve, they promise to drive further advancements in artificial intelligence, opening new possibilities for solving complex problems and gaining insights from interconnected data. REFERENCES [1] [2]  [3] [4] [5] [6] [7] T. L. Scao et al., "Bloom: A 176B-Parameter Open-Access Multilingual Language Model," arXiv preprint arXiv:2211.05100, 2022. [Online]. Available: https://arxiv.org/abs/2211.05100 G. Izacard, P. Lewis, E. Lomeli, L. Hosseini, F. Petroni, T. Schick, J. Dwivedi, A. Cancedda, S. Riedel, and S. 
Stenetorp, "Atlas: Few-shot Learning with Retrieval Augmented Language Models," arXiv preprint arXiv:2208.03299, 2022. [Online]. Available: https://arxiv.org/abs/2208.03299 Robinson, J. Webber, and E. Eifrem, "Graph Databases: New Opportunities for Connected Data," 3rd Edition, O'Reilly Media, Inc., 2021. [Online]. Available: https://www.oreilly.com/library/view/graph-databases/9781492044062/ Sara AlMahri, Liming Xu, Alexandra Brintrup. "Enhancing Supply Chain Visibility with Knowledge Graphs and Large Language Models." arXiv:2408.07705v1 [cs.IR], Aug 2024. https://arxiv.org/html/2408.07705v1 Boci Peng, Yun Zhu, et al. "Graph Retrieval-Augmented Generation: A Survey." arXiv:2408.08921, Sep 2024. https://arxiv.org/pdf/2408.08921 Ben Lorica, Prashanth Rao. "GraphRAG: Design Patterns, Challenges, Recommendations." Gradient Flow, May 2024. https://gradientflow.com/graphragdesign-patterns-challenges-recommendations/ Y. Hu, Z. Zhang, Y. Lei, G. Pan, C. Ling, and L. Zhao, "GRAG: Graph RetrievalAugmented Generation," in 2024 IEEE International Conference on Data Engineering (ICDE), 2024, https://ieeexplore.ieee.org/document/10409137 [8] pp. 1-12. J. Zhang, X. Zhang, J. Yu, J. Tang, J. Tang, C. Li, and H. Chen, "Subgraph Retrieval Enhanced Model for Multi-hop Knowledge Base Question Answering," in Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), https://aclanthology.org/2022.acl-long.396/ [9] O'Reilly 2022, pp. 5773-5784. J. Webber and I. Robinson, "Graph Databases: New Opportunities for Connected Data," Media, Inc., 2nd Edition, https://www.oreilly.com/library/view/graph-databases-new/9781098155308/ https://iaeme.com/Home/journal/IJRCAIT 109 2021. editor@iaeme.com Ravi Kiran Magham [10] [11] [12] Data," I. Robinson, J. Webber, and E. Eifrem, "Graph Databases: New Opportunities for Connected O'Reilly Media, Inc., 3rd Edition, https://www.oreilly.com/library/view/graph-databases-3rd/9781098150013/ 2023. M. 
Galkin, X. Yuan, H. Mostafa, J. Tang, and Z. Zhu, "Towards Foundation Models for Knowledge Graph Reasoning," in The Twelfth International Conference on Learning Representations, 2024. https://openreview.net/forum?id=V8N9bxJAh8 Y. Zhu, X. Wang, J. Chen, S. Qiao, Y. Ou, Y. Yao, S. Deng, H. Chen, and N. Zhang, "LLMs for Knowledge Graph Construction and Reasoning: Recent Capabilities and Future Opportunities," arXiv:2305.13168 https://arxiv.org/abs/2305.13168 [cs.AI], May 2023. Citation: Ravi Kiran Magham. (2024). Graphrag and Role of Graph Databases in Advancing AI. International Journal of Research in Computer Applications and Information Technology (IJRCAIT), 7(2), 98-110. Abstract Link: https://iaeme.com/Home/article_id/IJRCAIT_07_02_007 Article Link:  https://iaeme.com/MasterAdmin/Journal_uploads/IJRCAIT/VOLUME_7_ISSUE_2/IJRCAIT_07_02_007.pdf ✉ editor@iaeme.com  """
kg = KnowledgeGraph(client, claude_model)

await kg.process_large_document(document_text)


analysis = kg.analyze_graph()

print(analysis)

graph_data = kg.export_graph()

kg.close()

Splitting document into chunks...
Document split into 9 chunks
Processing batch 1/2




Processing batch 2/2
Extraction complete! Found 37 entities and 52 relationships
{'total_entities': 37, 'total_relationships': 40, 'most_connected_entities': [{'entity': 'Graphrag', 'type': 'Technology', 'degree': 23}, {'entity': 'Graph Databases', 'type': 'Technology', 'degree': 14}, {'entity': 'Large Language Models', 'type': 'Technology', 'degree': 4}, {'entity': 'Knowledge Graphs', 'type': 'Concept', 'degree': 3}, {'entity': 'Artificial Intelligence', 'type': 'Concept', 'degree': 2}, {'entity': 'Retrieval-Augmented Generation', 'type': 'Methodology', 'degree': 2}, {'entity': 'Knowledge Graph', 'type': 'Concept', 'degree': 2}, {'entity': 'Large Language Models (Llms)', 'type': 'Technology', 'degree': 1}, {'entity': 'Query-Focused Summarization', 'type': 'Methodology', 'degree': 1}, {'entity': 'Graph-Guided Retrieval', 'type': 'Methodology', 'degree': 1}], 'entity_types_distribution': {'Technology': 15, 'Concept': 10, 'Methodology': 7, 'Metric': 3, 'Tool': 2}, 'average_degree': 2.162