In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [2]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

## Local Search Example

Local search method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [3]:
INPUT_DIR = "../graphfleet/output/20240828-113421/artifacts"
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

#### Read entities

In [4]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 9384


[2024-08-28T12:59:12Z WARN  lance::dataset] No existing dataset at /Volumes/Samsung-SSD-T7/Qredence/GraphFleet/GraphFleet/notebook/../graphfleet/output/20240828-113421/artifacts/lancedb/entity_description_embeddings.lance, it will be created


Unnamed: 0,level,title,type,description,source_id,degree,human_readable_id,id,graph_embedding,community,size,entity_type,top_level_node_id,x,y
0,0,DARREN EDGE,PERSON,"Darren Edge is an author of the paper ""From Lo...",0c932f7def033fa2b1bf210fbb771e7d,7,0,b45241d70f0e43fca764df95b2b81f77,,,,,b45241d70f0e43fca764df95b2b81f77,,
1,0,HA TRINH,PERSON,"Ha Trinh is an author of the paper ""From Local...",0c932f7def033fa2b1bf210fbb771e7d,7,1,4119fd06010c494caa07f439b333f4c5,,,,,4119fd06010c494caa07f439b333f4c5,,
2,0,NEWMAN CHENG,PERSON,"Newman Cheng is an author of the paper ""From L...",0c932f7def033fa2b1bf210fbb771e7d,7,2,d3835bf3dda84ead99deadbeac5d0d7d,,,,,d3835bf3dda84ead99deadbeac5d0d7d,,
3,0,JOSHUA BRADLEY,PERSON,"Joshua Bradley is an author of the paper ""From...",0c932f7def033fa2b1bf210fbb771e7d,7,3,077d2820ae1845bcbb1803379a3d1eae,,,,,077d2820ae1845bcbb1803379a3d1eae,,
4,0,ALEX CHAO,PERSON,"Alex Chao is an author of the paper ""From Loca...",0c932f7def033fa2b1bf210fbb771e7d,7,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,,,,,3671ea0dd4e84c1a9b02c5ab2c8f4bac,,


#### Read relationships

In [5]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 2232


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,DARREN EDGE,HA TRINH,16.0,Darren Edge and Ha Trinh co-authored the paper...,[0c932f7def033fa2b1bf210fbb771e7d],b66b4a2ce0e944fd98496859cf6a4a90,0,7,7,14
1,DARREN EDGE,NEWMAN CHENG,16.0,Darren Edge and Newman Cheng co-authored the p...,[0c932f7def033fa2b1bf210fbb771e7d],33cbbd7dc94d4e018649baaf0b1b3975,1,7,7,14
2,DARREN EDGE,JOSHUA BRADLEY,16.0,Darren Edge and Joshua Bradley co-authored the...,[0c932f7def033fa2b1bf210fbb771e7d],2739678ac40f4916ae90ab0e26481d0e,2,7,7,14
3,DARREN EDGE,ALEX CHAO,16.0,Darren Edge and Alex Chao co-authored the pape...,[0c932f7def033fa2b1bf210fbb771e7d],5b2a7279941d43f59d9c59aa3bf81297,3,7,7,14
4,DARREN EDGE,APURVA MODY,16.0,Darren Edge and Apurva Mody co-authored the pa...,[0c932f7def033fa2b1bf210fbb771e7d],6f30c405b4554c50967a5cf1e963a829,4,7,7,14


In [6]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

Claim records: 306


#### Read community reports

In [7]:
def read_indexer_entities(entity_df, entity_embedding_df, community_level):
    # Create an explicit copy if needed
    entity_df = entity_df.copy()
    
    # Use .loc to modify the DataFrame
    entity_df.loc[:, "community"] = entity_df["community"].fillna(-1)
    entity_df.loc[:, "community"] = entity_df["community"].astype(int)
    
    # ... rest of the function implementation
    
    return entities  # Assuming this function returns entities

# Outside the function
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()
    # ... rest of the function

Report records: 263


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,149,# LATS and its Impact on AI and ML Benchmarks\...,2,9.0,LATS and its Impact on AI and ML Benchmarks,The report provides a comprehensive and insigh...,The community revolves around the LATS (Langua...,[{'explanation': 'LATS (Language Agent Tree Se...,"{\n ""title"": ""LATS and its Impact on AI and...",7df98cc7-daf6-401b-9dcf-ae5c8f5f607c
1,150,"# Search Algorithms in AI and ML: A*, DFS, and...",2,8.5,"Search Algorithms in AI and ML: A*, DFS, and LATS",The rating is high due to the significant impa...,The community revolves around the study and ap...,[{'explanation': 'A* is a search algorithm tha...,"{\n ""title"": ""Search Algorithms in AI and M...",db392895-4869-439a-b953-337446d24da6
2,151,# LATS Algorithm and Its Action Space\n\nThe c...,2,8.5,LATS Algorithm and Its Action Space,The rating is high due to the detailed insight...,The community revolves around the LATS algorit...,[{'explanation': 'The LATS algorithm is a pivo...,"{\n ""title"": ""LATS Algorithm and Its Action...",f6f9c187-6d63-4775-a0ff-9b42259b3235
3,152,# Monte Carlo Tree Search (MCTS) and its Role ...,2,9.0,Monte Carlo Tree Search (MCTS) and its Role in...,The rating is high due to the significant impa...,The community revolves around Monte Carlo Tree...,[{'explanation': 'Monte Carlo Tree Search (MCT...,"{\n ""title"": ""Monte Carlo Tree Search (MCTS...",7eed378a-8347-4466-9180-88d87cbfb6bc
4,153,# Sampling and Stochastic Nature in Language M...,2,8.5,Sampling and Stochastic Nature in Language Mod...,The rating is high due to the critical role th...,The community focuses on the interplay between...,[{'explanation': 'Sampling is a crucial proces...,"{\n ""title"": ""Sampling and Stochastic Natur...",24afb751-6e1d-4143-8b3d-ef0e3768ef2e


In [8]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 263


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,149,# LATS and its Impact on AI and ML Benchmarks\...,2,9.0,LATS and its Impact on AI and ML Benchmarks,The report provides a comprehensive and insigh...,The community revolves around the LATS (Langua...,[{'explanation': 'LATS (Language Agent Tree Se...,"{\n ""title"": ""LATS and its Impact on AI and...",7df98cc7-daf6-401b-9dcf-ae5c8f5f607c
1,150,"# Search Algorithms in AI and ML: A*, DFS, and...",2,8.5,"Search Algorithms in AI and ML: A*, DFS, and LATS",The rating is high due to the significant impa...,The community revolves around the study and ap...,[{'explanation': 'A* is a search algorithm tha...,"{\n ""title"": ""Search Algorithms in AI and M...",db392895-4869-439a-b953-337446d24da6
2,151,# LATS Algorithm and Its Action Space\n\nThe c...,2,8.5,LATS Algorithm and Its Action Space,The rating is high due to the detailed insight...,The community revolves around the LATS algorit...,[{'explanation': 'The LATS algorithm is a pivo...,"{\n ""title"": ""LATS Algorithm and Its Action...",f6f9c187-6d63-4775-a0ff-9b42259b3235
3,152,# Monte Carlo Tree Search (MCTS) and its Role ...,2,9.0,Monte Carlo Tree Search (MCTS) and its Role in...,The rating is high due to the significant impa...,The community revolves around Monte Carlo Tree...,[{'explanation': 'Monte Carlo Tree Search (MCT...,"{\n ""title"": ""Monte Carlo Tree Search (MCTS...",7eed378a-8347-4466-9180-88d87cbfb6bc
4,153,# Sampling and Stochastic Nature in Language M...,2,8.5,Sampling and Stochastic Nature in Language Mod...,The rating is high due to the critical role th...,The community focuses on the interplay between...,[{'explanation': 'Sampling is a crucial proces...,"{\n ""title"": ""Sampling and Stochastic Natur...",24afb751-6e1d-4143-8b3d-ef0e3768ef2e


#### Read text units

In [9]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 82


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,0c932f7def033fa2b1bf210fbb771e7d,From Local to Global: A Graph RAG Approach to\...,1200,[0668cddc5f873265ba50da5a0a06edad],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[b66b4a2ce0e944fd98496859cf6a4a90, 33cbbd7dc94...",[fc0a7b90-6ae9-4f44-b28d-34c758da02ea]
1,64476a39d7d8b87b399e3bd3cead79c7,on\nthe state-of-the-art for all such summari...,1200,[0668cddc5f873265ba50da5a0a06edad],"[de988724cfdf45cebfba3b13c43ceede, d91a266f766...","[0938a5e373784d429ca1be153385ae2a, e2bd6ef1a4c...","[0001fef1-20a7-4cc4-ba0b-8172d343fae8, cf53919..."
2,e66ed885a08f92cc69f4895302c33047,examples provided to the LLM for in-context l...,1200,[0668cddc5f873265ba50da5a0a06edad],"[d91a266f766b4737a06b0fda588ba40b, bc0e3f075a4...","[2f573394724c454ba65edaf1779629c2, 8c9d4c12234...",[0fcb646f-5a75-423b-97a9-853622a73881]
3,4930fce6da868f894757a9da465807ba,which reveals internal structure within these...,1200,[0668cddc5f873265ba50da5a0a06edad],"[04dbbb2283b845baaeac0eaf0c34c9da, 32ee140946e...","[a830d753ebd242ab9801c4393891f41a, ddd7cfa40b6...",[fd0c379c-e127-4d61-9eab-5cace159e794]
4,26b2dad01a219bc034ac7d6a32d07582,"understanding of dataset contents, and not th...",1200,[0668cddc5f873265ba50da5a0a06edad],"[de988724cfdf45cebfba3b13c43ceede, b462b94ce47...","[780c50ba20de4a0b8fb5b8a5c73bd06e, a78008a389c...","[e25a84d2-fcf2-40f9-bb0a-a0eb4870d8d2, df680f4..."


In [10]:
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
api_base = os.environ["GRAPHRAG_API_BASE"]
api_version = os.environ["GRAPHRAG_API_VERSION"]


llm = ChatOpenAI(
    api_key=api_key,
    api_base=api_base,
    api_version=api_version,
    model=llm_model,
    api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    max_retries=20,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=api_base,
    api_version=api_version,
    api_type=OpenaiApiType.AzureOpenAI,
    model=embedding_model,
    deployment_name=embedding_model,
    max_retries=20,
)

### Create local search context builder

In [11]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [13]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [14]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [15]:
result = await search_engine.asearch("What is GraphRAG be short")
print(result.response)



### Overview of GraphRAG

GraphRAG is a sophisticated system designed to create and reason over knowledge graphs, particularly in the NebulaGraph format. It leverages the natural modularity of graphs to partition data for global summarization, making it highly efficient for handling large datasets and complex queries [Data: Entities (11, 217); Relationships (88, 98, 45)].

### Key Features

1. **Graph Indexing**: GraphRAG uses a self-generated graph index to organize and retrieve information. This index is built using generic prompts for entity and relationship extraction, tailored to the specific domain of the data [Data: Claims (24)].
   
2. **Global Summarization**: The system excels in global summarization of source texts, achieving competitive performance at a fraction of the token cost compared to other methods [Data: Claims (43, 47)].

3. **Integration with LLMs**: GraphRAG incorporates large language models (LLMs) to enhance its retrieval and generation processes. This integrat

In [16]:
question = "What is the purpose of GraphRAG?"
result = await search_engine.asearch(question)
print(result.response)



### Purpose of GraphRAG

GraphRAG is designed to create and reason over knowledge graphs, specifically in the NebulaGraph format. This system leverages the natural modularity of graphs to partition data for global summarization, making it particularly effective for handling large and complex datasets [Data: Entities (229); Relationships (88, 98)].

### Key Features

1. **Knowledge Graph Generation**: GraphRAG excels in generating knowledge graphs, which are structured representations of information. These graphs are used to enhance the retrieval and reasoning processes, making the system a powerful tool for various AI and ML tasks [Data: Entities (62); Relationships (46)].

2. **Global Summarization**: One of the primary purposes of GraphRAG is to achieve global summarization of source texts. By using a graph index, the system can partition data efficiently, allowing for comprehensive and coherent summaries [Data: Entities (11); Relationships (45, 98)].

3. **Scalability**: GraphRAG de

#### Inspecting the context data used to generate the response

In [17]:
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,11,GRAPH RAG,Graph RAG is a method that leverages graph com...,86,True
1,229,GRAPHRAG,GraphRAG is a system designed to create and re...,2,True
2,71,GRAPH RAG PIPELINE,Graph RAG Pipeline is the implementation of th...,1,True
3,51,GRAPH RAG APPROACH,Graph RAG Approach is a high-level data flow a...,3,True
4,274,GRAPH-BASED RAG APPLICATIONS,Graph-based RAG applications are systems that ...,4,True


In [18]:
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,88,GRAPH RAG,GRAPHRAG,Graph RAG's ability to create and reason over ...,7.0,88,1,True
1,49,GRAPH RAG,GRAPH RAG PIPELINE,Graph RAG Pipeline is the implementation of th...,9.0,87,1,True
2,98,GRAPH RAG,GRAPH INDEX,Graph RAG uses a graph index to partition data...,9.0,87,1,True
3,122,GRAPH RAG,RODRIGO RACANICCI,Rodrigo Racanicci contributed to the work on G...,8.0,87,1,True
4,45,GRAPH RAG,GLOBAL SUMMARIZATION,Graph RAG is an approach based on global summa...,9.0,90,2,True


In [20]:
result.context_data["reports"].head()

KeyError: 'reports'

In [21]:
result.context_data["sources"].head()

Unnamed: 0,id,text
0,8,"Index, 2024) libraries,\nwhile a more general ..."
1,7,"win rates of 57% and 64%, respectively. Diver..."
2,6,\n502028252121\n805044413836\n725650525452\n75...
3,5,Directors [...]Public Figures in Controversy ...


In [22]:
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

   id     entity object_id status           start_date             end_date  \
0  24  GRAPH RAG      NONE   TRUE                 NONE                 NONE   
1  40  GRAPH RAG      NONE   TRUE  2024-01-01T00:00:00  2024-12-31T00:00:00   
2  42  GRAPH RAG      NONE   TRUE  2024-01-01T00:00:00  2024-12-31T00:00:00   
3  43  GRAPH RAG      NONE   TRUE  2024-01-01T00:00:00  2024-12-31T00:00:00   
4  44  GRAPH RAG      NONE   TRUE  2024-01-01T00:00:00  2024-12-31T00:00:00   

                                         description  in_context  
0  Graph RAG uses a graph index created with gene...        True  
1  Graph RAG is mentioned as a system that uses t...        True  
2  Graph RAG is mentioned as achieving the best h...        True  
3  Graph RAG is mentioned as performing competiti...        True  
4  Graph RAG is mentioned as having many possibil...        True  


### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [23]:
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [24]:
question_history = [
    "Tell me about Language Agent Tree Search?",
    "What is the best way to create a knowledge graph agent ?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- How does LATS improve decision-making in language models?', '- What benchmarks has LATS been evaluated on, and what were the results?', '- How does LATS integrate with advanced language models like GPT-4 and GPT-3.5?', '- What are the key components of the LATS framework?', '- How does LATS address the shortcomings of previous techniques like CoT, ToT, and ReAct?']
