In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

## Local Search Example

Local search method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [2]:
INPUT_DIR = "../graphfleet/output/graphindex/artifacts"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

#### Read entities

In [3]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 2208


Unnamed: 0,level,title,type,description,source_id,degree,human_readable_id,id,graph_embedding,community,size,entity_type,top_level_node_id,x,y
0,0,DARREN EDGE,PERSON,"Darren Edge is an author of the paper ""From Lo...",e8d83e6e7a7c0f57b218cef24976b745,8,0,b45241d70f0e43fca764df95b2b81f77,,,,,b45241d70f0e43fca764df95b2b81f77,,
1,0,HA TRINH,PERSON,"Ha Trinh is an author of the paper ""From Local...",e8d83e6e7a7c0f57b218cef24976b745,8,1,4119fd06010c494caa07f439b333f4c5,,,,,4119fd06010c494caa07f439b333f4c5,,
2,0,NEWMAN CHENG,PERSON,"Newman Cheng is an author of the paper ""From L...",e8d83e6e7a7c0f57b218cef24976b745,8,2,d3835bf3dda84ead99deadbeac5d0d7d,,,,,d3835bf3dda84ead99deadbeac5d0d7d,,
3,0,JOSHUA BRADLEY,PERSON,"Joshua Bradley is an author of the paper ""From...",e8d83e6e7a7c0f57b218cef24976b745,8,3,077d2820ae1845bcbb1803379a3d1eae,,,,,077d2820ae1845bcbb1803379a3d1eae,,
4,0,ALEX CHAO,PERSON,"Alex Chao is an author of the paper ""From Loca...",e8d83e6e7a7c0f57b218cef24976b745,8,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,,,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,,


#### Read relationships

In [4]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 1179


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,DARREN EDGE,HA TRINH,1.0,Darren Edge and Ha Trinh co-authored the paper...,[e8d83e6e7a7c0f57b218cef24976b745],28b7457ca5dc4a38a488946a3f8e207e,0,8,8,16
1,DARREN EDGE,NEWMAN CHENG,1.0,Darren Edge and Newman Cheng co-authored the p...,[e8d83e6e7a7c0f57b218cef24976b745],8029a14d15404e6db95ddf5e2bf9fc15,1,8,8,16
2,DARREN EDGE,JOSHUA BRADLEY,1.0,Darren Edge and Joshua Bradley co-authored the...,[e8d83e6e7a7c0f57b218cef24976b745],389314ca89d445888c8d4985864dd733,2,8,8,16
3,DARREN EDGE,ALEX CHAO,1.0,Darren Edge and Alex Chao co-authored the pape...,[e8d83e6e7a7c0f57b218cef24976b745],87fe1462b9064d5692641ab48e826301,3,8,8,16
4,DARREN EDGE,APURVA MODY,1.0,Darren Edge and Apurva Mody co-authored the pa...,[e8d83e6e7a7c0f57b218cef24976b745],a55175ac57014df696ca09d0def9604b,4,8,8,16


In [5]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

Claim records: 201


#### Read community reports

In [6]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 102


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,100,# Natural Language Processing and Information ...,2,8.5,Natural Language Processing and Information Re...,The rating is high due to the significant impa...,The community of 'Natural Language Processing ...,[{'explanation': 'Community summaries play a p...,"{\n ""title"": ""Natural Language Processing a...",8c20fd3b-1450-4841-a64a-18429d05d5d3
1,101,# Natural Language Processing and Information ...,2,9.0,Natural Language Processing and Information Re...,The text is highly significant and impactful i...,The community of Natural Language Processing (...,[{'explanation': 'Recent advancements in trans...,"{\n ""title"": ""Natural Language Processing a...",cea459eb-b1cf-46cd-aabb-0ce4f3c184b7
2,71,# Natural Language Processing and Information ...,2,9.0,Natural Language Processing and Information Re...,The text is highly significant and impactful i...,The community of 'Natural Language Processing ...,[{'explanation': 'The 'NEWS ARTICLES' dataset ...,"{\n ""title"": ""Natural Language Processing a...",1458f6ef-6da1-486e-94d1-3070e4e280ff
3,72,# Natural Language Processing and Information ...,2,9.0,Natural Language Processing and Information Re...,The text is highly significant and impactful i...,The community of 'Natural Language Processing ...,[{'explanation': 'The 'PODCAST TRANSCRIPTS' da...,"{\n ""title"": ""Natural Language Processing a...",bcb2afbf-6b77-4e7c-ad5f-845d873cd10d
4,73,# Graph RAG and Community Summarization in NLP...,2,9.0,Graph RAG and Community Summarization in NLP a...,The rating is high due to the significant impa...,The community revolves around the Graph RAG sy...,[{'explanation': 'Graph RAG is a pivotal syste...,"{\n ""title"": ""Graph RAG and Community Summa...",6a14f684-f45d-4563-88f7-18a2cac9b5cb


#### Read text units

In [7]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 30


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,e8d83e6e7a7c0f57b218cef24976b745,From Local to Global: A Graph RAG Approach to\...,512,[0668cddc5f873265ba50da5a0a06edad],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[28b7457ca5dc4a38a488946a3f8e207e, 8029a14d154...","[d267bd27-b88d-41e7-a314-c9a8e873da73, 5a6063e..."
1,f0306814bf64f5c9e79603fc6a52f4ea,.\n1 Introduction\nHuman endeavors across a r...,512,[0668cddc5f873265ba50da5a0a06edad],"[d91a266f766b4737a06b0fda588ba40b, 1943f245ee4...","[a7c2a64e06374091adce74adb36801ab, 107568a67ca...","[a7270549-cf82-4e3e-a640-f740c5576df7, 2e4e078..."
2,fb3c48579608fa28be585ceb6cd2f0fe,"et al., 2006b) by asking questions of a globa...",512,[0668cddc5f873265ba50da5a0a06edad],"[c9632a35146940c2a86167c7726d35e9, 3d0dcbc8971...","[f770bc07cecf4aba8fe2d2c33fdc5542, 7cea9903153...",[b41beeef-ef1a-4e75-aa5b-67120c0d2fbf]
3,21e52bc06a82796b1f4bcd73edda1f2a,a new RAG approach specifically targeting glo...,512,[0668cddc5f873265ba50da5a0a06edad],"[de988724cfdf45cebfba3b13c43ceede, 96aad7cb4b7...","[192a6d23595045f38b0d46a3d8e52fd6, a6ae1d99330...",[701f3b3d-d6e1-47b6-99da-a52d61caac96]
4,bc9e2c9e369c4108cf4f6dd5f60960f4,intermediate- and low-level community summari...,512,[0668cddc5f873265ba50da5a0a06edad],"[96aad7cb4b7d40e9b7e13b94a67af206, c9632a35146...","[5174cdabb6024de0975762d3a80b059f, e379fba9011...",[da077ccf-d904-45e5-9948-00fd86435b7d]


In [8]:
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
api_base = os.environ["GRAPHRAG_API_BASE"]
api_version = os.environ["GRAPHRAG_API_VERSION"]


llm = ChatOpenAI(
    api_key=api_key,
    api_base=api_base,
    api_version=api_version,
    model=llm_model,
    api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=api_base,
    api_version=api_version,
    api_type=OpenaiApiType.AzureOpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

### Create local search context builder

In [9]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [11]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [12]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [14]:
result = await search_engine.asearch("What is GraphRAG be short")
print(result.response)



### Overview of GraphRAG

GraphRAG, short for Graph Retrieval-Augmented Generation, is an advanced method that leverages the modularity of graphs to enhance text summarization and question-answering tasks. Developed by NebulaGraph, this approach integrates multiple stages and concepts, including knowledge graph generation, retrieval-augmented generation (RAG), and query-focused summarization (QFS), to support comprehensive and structured overviews of text corpora [Data: Entities (13, 386); Relationships (37, 185)].

### Key Features

1. **Graph-Based Text Indexing**: GraphRAG uses a graph-based text index to organize and retrieve information efficiently. This index is self-generated from the source documents and includes various elements extracted using Large Language Model (LLM) prompts [Data: Entities (119); Relationships (161)].

2. **Community Summaries**: The method employs community detection algorithms to partition graphs into modular communities. Summaries of these communities 

In [15]:
question = "What is the purpose of GraphRAG?"
result = await search_engine.asearch(question)
print(result.response)



### Purpose of GraphRAG

GraphRAG, or Graph Retrieval-Augmented Generation, is a sophisticated method designed to enhance the capabilities of text summarization and question answering over large datasets. Its primary purpose is to leverage the natural modularity of graphs to partition data, facilitating global summarization tasks. This approach integrates multiple stages and concepts, including knowledge graph generation, retrieval-augmented generation (RAG), and query-focused summarization (QFS), to support human sensemaking over text corpora [Data: Entities (13, 407); Relationships (37)].

### Enhancing Text Summarization

One of the key purposes of GraphRAG is to improve the comprehensiveness and diversity of generated answers compared to traditional methods. By using a graph-based text index, GraphRAG can efficiently manage and retrieve information, allowing for more detailed and varied responses. This is particularly useful in providing comprehensive and structured overviews of pu

#### Inspecting the context data used to generate the response

In [16]:
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,13,GRAPH RAG,Graph RAG (Retrieval-Augmented Generation) is ...,90,True
1,42,GRAPH RAG PIPELINE,Graph RAG pipeline is a process using an LLM-d...,7,True
2,407,GLOBAL APPROACH TO GRAPH RAG,A method that combines knowledge graph generat...,2,True
3,11,RAG,RAG (Retrieval-Augmented Generation) is a deve...,31,True
4,387,GRAPHRAG,A method that can create and reason over knowl...,1,True


In [17]:
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,37,RAG,GRAPH RAG,Graph RAG is a specific implementation of RAG ...,7.0,121,1,True
1,161,GRAPH RAG,GRAPH INDEX,Graph RAG utilizes a self-generated graph inde...,3.0,108,2,True
2,183,GRAPH RAG,LOCAL GRAPH RAG APPROACHES,Graph RAG includes local graph RAG approaches,1.0,91,1,True
3,59,RAG,GRAPH INDEX,A graph index is a data structure used in RAG ...,2.0,49,2,True
4,67,LLM,GRAPH RAG,Graph RAG utilizes Large Language Models (LLMs...,4.0,122,3,True


In [19]:
result.context_data["reports"].head()

KeyError: 'reports'

In [20]:
result.context_data["sources"].head()

Unnamed: 0,id,text
0,3,a new RAG approach specifically targeting glo...
1,18,these chunks into a vector space in which sim...
2,20,"dataset sizes, as well as to validate our sens..."
3,17,C0) require dramatically fewer tokens per que...
4,0,From Local to Global: A Graph RAG Approach to\...


In [21]:
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

   id entity object_id status start_date end_date  \
0  48    RAG      NONE   TRUE       NONE     NONE   

                                         description  in_context  
0  RAG is identified as a developing research are...        True  


### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [22]:
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [23]:
question_history = [
    "Tell me about Agent Mercer",
    "What happens in Dulce military base?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- What are the key contributions of Andrés Morales Esquivel to the work acknowledged in the document?', '- How do Ranade and Joshi utilize large language models in intelligence analysis?', '- What are the main findings of Kuratov et al. (2024) and Liu et al. (2023) regarding recall degradation in longer context windows?', '- What advancements have been made in multimodal models like the Gemini series?', '- How do LLM prompts enhance the performance of large language models in information retrieval tasks?']
