In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [1]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore

## Local Search Example

The local search method generates answers by combining relevant data from the AI-extracted knowledge graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g., "What are the healing properties of chamomile?").

### Load text units and graph data tables as context for local search

- In this example we first load the indexing outputs from parquet files into dataframes, then convert these dataframes into collections of data objects that align with the knowledge model.

### Load tables to dataframes

In [4]:
# --- Where the indexing artifacts live ---------------------------------------
# Directory holding the parquet outputs of one indexing run.
INPUT_DIR = "../graphfleet/output/20240826-060116/artifacts"
# LanceDB location used for the entity description embeddings (inside INPUT_DIR).
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

# --- Community hierarchy level handed to the entity/report readers -----------
COMMUNITY_LEVEL = 2

# --- Parquet table names (file name without the ".parquet" suffix) -----------
# Graph structure: the nodes table is read for community and degree data, the
# entities table is read alongside it when building entity objects.
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
# Supporting context: community reports, claims (covariates) and text units.
COMMUNITY_REPORT_TABLE = "create_final_community_reports"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"

#### Read entities

In [5]:
# Read the nodes table to get community and degree data, plus the entities
# table (ENTITY_EMBEDDING_TABLE) that is passed to the reader alongside it.
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

# Build the entity objects from both tables at the configured COMMUNITY_LEVEL.
entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# Load description embeddings into a LanceDB vector store.
# NOTE: the store is connected to LANCEDB_URI, a path under INPUT_DIR, so the
# collection is created on local disk rather than held purely in memory.
# To connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

# NOTE: the count below is the number of rows in the nodes table (entity_df),
# not len(entities).
print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 3867


[2024-08-26T04:21:20Z WARN  lance::dataset] No existing dataset at /Volumes/Samsung-SSD-T7/Qredence/GraphFleet/GraphFleet/notebook/../graphfleet/output/20240826-060116/artifacts/lancedb/entity_description_embeddings.lance, it will be created


Unnamed: 0,level,title,type,description,source_id,entity_type,community,degree,human_readable_id,id,size,graph_embedding,top_level_node_id,x,y
0,0,GRAPH RAG,"APPROACH, METHOD",### Analysis:\n\nGRAPH RAG is a novel method f...,"0c932f7def033fa2b1bf210fbb771e7d,26b2dad01a219...","APPROACH, METHOD",4,62,0,b45241d70f0e43fca764df95b2b81f77,62.0,"[0.04181117191910744, -0.05127706006169319, 0....",b45241d70f0e43fca764df95b2b81f77,-11.10175,-0.566615
1,0,QUERY-FOCUSED SUMMARIZATION (QFS),"TASK, METHOD",Analysis:\nQUERY-FOCUSED SUMMARIZATION (QFS) i...,"0c932f7def033fa2b1bf210fbb771e7d,ac21ebe9a9d70...","TASK, METHOD",10,6,1,4119fd06010c494caa07f439b333f4c5,6.0,"[0.04077928513288498, -0.034901514649391174, -...",4119fd06010c494caa07f439b333f4c5,-4.729228,23.263651
2,0,RETRIEVAL-AUGMENTED GENERATION (RAG),"TECHNIQUE, METHOD",A technique that retrieves relevant informatio...,0c932f7def033fa2b1bf210fbb771e7d,"TECHNIQUE, METHOD",4,2,2,d3835bf3dda84ead99deadbeac5d0d7d,2.0,"[0.027234233915805817, -0.03883614018559456, -...",d3835bf3dda84ead99deadbeac5d0d7d,-10.848432,0.336364
3,0,LARGE LANGUAGE MODELS (LLMS),"TECHNOLOGY, MODEL",Analysis:\n\nLarge Language Models (LLMs) are ...,"0c932f7def033fa2b1bf210fbb771e7d,6fe27f9eb76cf...","TECHNOLOGY, MODEL",18,7,3,077d2820ae1845bcbb1803379a3d1eae,7.0,"[0.06786838918924332, -0.0716349184513092, -0....",077d2820ae1845bcbb1803379a3d1eae,1.621845,11.211926
4,0,COMMUNITY DETECTION,"TECHNIQUE, METHOD",### Analysis:\n\nCommunity detection is a proc...,"0c932f7def033fa2b1bf210fbb771e7d,aa79049289e65...","TECHNIQUE, METHOD",12,7,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,7.0,"[-0.061953868716955185, -0.050750549882650375,...",3671ea0dd4e84c1a9b02c5ab2c8f4bac,15.824801,4.987949


#### Read relationships

In [6]:
# Load the relationships table and convert it into relationship objects;
# `relationships` is later handed to the local search context builder.
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

# Report the number of rows in the table and preview the first few.
print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 606


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,GRAPH RAG,QUERY-FOCUSED SUMMARIZATION (QFS),4.0,### Analysis:\n\nGraph RAG is an innovative me...,"[0c932f7def033fa2b1bf210fbb771e7d, ac21ebe9a9d...",d6f67aa7ef0e4a19bf5830e777aafea5,0,62,6,68
1,GRAPH RAG,RETRIEVAL-AUGMENTED GENERATION (RAG),2.0,Graph RAG aims to address the limitations of R...,[0c932f7def033fa2b1bf210fbb771e7d],bbf61f9cd3e14f46a010d704e86be008,1,62,2,64
2,GRAPH RAG,LARGE LANGUAGE MODELS (LLMS),2.0,Graph RAG uses large language models to build ...,[0c932f7def033fa2b1bf210fbb771e7d],5d34e587bd2f41dba285e9178f179577,2,62,7,69
3,GRAPH RAG,COMMUNITY DETECTION,2.0,Graph RAG uses community detection to partitio...,[0c932f7def033fa2b1bf210fbb771e7d],901b491be7344401b4544ff05e591a0e,3,62,7,69
4,GRAPH RAG,SENSEMAKING,4.0,### Analysis:\n\nGraph RAG is a sophisticated ...,"[0c932f7def033fa2b1bf210fbb771e7d, ac21ebe9a9d...",ecacbf62b81d485396a56e1730e75a04,4,62,4,66


In [7]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
#
# Because covariate extraction is optional, the covariates table may not exist
# for a given indexing run. Only read it when the file is present; otherwise
# fall back to `covariates = None`, which is the value the context builder
# expects when covariates were not produced during indexing.
covariate_path = f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet"

if os.path.exists(covariate_path):
    covariate_df = pd.read_parquet(covariate_path)

    claims = read_indexer_covariates(covariate_df)

    print(f"Claim records: {len(claims)}")
    covariates = {"claims": claims}
else:
    # No claims were extracted for this run: continue without them instead of
    # failing the whole notebook on a missing file.
    claims = []
    covariates = None
    print(f"No covariates table found at {covariate_path}; continuing without claims")

Claim records: 220


#### Read community reports

In [8]:
# Load the community reports table and convert it into report objects.
# The nodes table (entity_df) and COMMUNITY_LEVEL are passed to the reader
# together with the reports.
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

# NOTE: the count below is the number of rows in report_df, not len(reports).
print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 110


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,100,# MIRAGE Datasets and Their Evaluation in NLP\...,2,9.0,MIRAGE Datasets and Their Evaluation in NLP,The MIRAGE datasets and their associated evalu...,The community revolves around the MIRAGE datas...,[{'explanation': 'The MIRAGE datasets serve as...,"{\n ""title"": ""MIRAGE Datasets and Their Eva...",1130a53b-d692-49b6-a573-13d5bcf45335
1,101,# Graph RAG and Community Summaries in NLP\n\n...,2,9.0,Graph RAG and Community Summaries in NLP,The text is highly relevant and significant to...,The community focuses on the use of Graph Retr...,[{'explanation': 'Community summaries are comp...,"{\n ""title"": ""Graph RAG and Community Summa...",eac2c84e-2c96-4462-a427-657a909225dd
2,102,# Advancements in Computational Linguistics an...,2,9.0,Advancements in Computational Linguistics and NLP,The text is highly relevant and impactful in a...,The community of Computational Linguistics and...,[{'explanation': 'Recent advancements in retri...,"{\n ""title"": ""Advancements in Computational...",839c3851-2561-4c60-be78-d1bdc38691ea
3,103,# Advancements in Computational Linguistics an...,2,9.0,Advancements in Computational Linguistics and NLP,The text is highly relevant and impactful in a...,The community of Computational Linguistics and...,[{'explanation': 'GLOBAL SUMMARIZATION is a me...,"{\n ""title"": ""Advancements in Computational...",b88a3c23-e191-483d-8451-e94622d6fe92
4,104,# Advancements in Graph-Based Retrieval-Augmen...,2,9.0,Advancements in Graph-Based Retrieval-Augmente...,The rating is high due to the significant impa...,The community focuses on the development and a...,[{'explanation': 'The GRAPH INDEX is an advanc...,"{\n ""title"": ""Advancements in Graph-Based R...",39ce50ff-6b80-4a28-83d2-8e7ca17f035e


#### Read text units

In [9]:
# Load the text units table (chunks of the raw documents) and convert it into
# text unit objects for the context builder.
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

# Report the number of rows in the table and preview the first few.
print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 32


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,0c932f7def033fa2b1bf210fbb771e7d,From Local to Global: A Graph RAG Approach to\...,1200,[0668cddc5f873265ba50da5a0a06edad],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[d6f67aa7ef0e4a19bf5830e777aafea5, bbf61f9cd3e...","[035a81f3-b6cc-4792-a689-8ac48ab97121, ff476d4..."
1,64476a39d7d8b87b399e3bd3cead79c7,on\nthe state-of-the-art for all such summari...,1200,[0668cddc5f873265ba50da5a0a06edad],"[b45241d70f0e43fca764df95b2b81f77, 254770028d7...","[e38eb1698900424bb7392a74ff0f3351, 855c57eecf2...","[5592cbac-8860-4ec3-8474-aa693be845ba, 163a5e0..."
2,e66ed885a08f92cc69f4895302c33047,examples provided to the LLM for in-context l...,1200,[0668cddc5f873265ba50da5a0a06edad],"[3671ea0dd4e84c1a9b02c5ab2c8f4bac, e2f5735c7d7...","[cac3f76fbc11413e92cdfd3064d56ece, f120d98b793...",[5f520244-fd53-4677-9228-81697fa02084]
3,4930fce6da868f894757a9da465807ba,which reveals internal structure within these...,1200,[0668cddc5f873265ba50da5a0a06edad],"[e2f5735c7d714423a2c4f61ca2644626, deece7e64b2...","[97038fe907af4710859c3daeb13972e9, e0595082eb9...",[40f2bf60-24b2-4c06-b59a-490b1d630c9f]
4,26b2dad01a219bc034ac7d6a32d07582,"understanding of dataset contents, and not th...",1200,[0668cddc5f873265ba50da5a0a06edad],"[b45241d70f0e43fca764df95b2b81f77, e69dc259edb...","[8f10c11ecb5142029869025521c73431, 4dd086fcba7...","[9703d941-662a-4a61-931d-2ce933dc3543, 20f0afe..."


In [10]:
# Credentials, endpoint and model names all come from the environment.
# Each lookup raises KeyError if its variable is not set.
api_key = os.environ["GRAPHRAG_API_KEY"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
api_base = os.environ["GRAPHRAG_API_BASE"]
api_version = os.environ["GRAPHRAG_API_VERSION"]

# Connection settings shared by the chat model and the embedding model.
_shared_client_kwargs = {
    "api_key": api_key,
    "api_base": api_base,
    "api_version": api_version,
    # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    "api_type": OpenaiApiType.AzureOpenAI,
    "max_retries": 20,
}

# Chat model used to generate answers and follow-up questions.
llm = ChatOpenAI(model=llm_model, **_shared_client_kwargs)

# Tokenizer handed to the context builder and search engine.
token_encoder = tiktoken.get_encoding("cl100k_base")

# Embedding model; the same name is used for the model and the deployment.
text_embedder = OpenAIEmbedding(
    model=embedding_model,
    deployment_name=embedding_model,
    **_shared_client_kwargs,
)

### Create local search context builder

In [11]:
# Assemble the local search context builder from the objects loaded earlier in
# the notebook: community reports, text units, entities, relationships,
# covariates (claims), the entity description vector store, the embedding
# model and the tokenizer.
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    covariates=covariates,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [12]:
# Settings for building the context window; passed to the search engine and
# the question generator as `context_builder_params`.
local_context_params = {
    # Proportion of the context window dedicated to related text units.
    "text_unit_prop": 0.5,
    # Proportion of the context window dedicated to community reports.
    # The remaining proportion is dedicated to entities and relationships,
    # so text_unit_prop + community_prop should be <= 1.
    "community_prop": 0.1,
    # Maximum number of turns to include in the conversation history.
    "conversation_history_max_turns": 5,
    # If True, only include user queries in the conversation history.
    "conversation_history_user_turns_only": True,
    # Number of related entities to retrieve from the entity description
    # embedding store.
    "top_k_mapped_entities": 10,
    # Controls the number of out-of-network relationships to pull into the
    # context window.
    "top_k_relationships": 10,
    # If True, include the entity rank in the entity table in the context
    # window. Default entity rank = node degree.
    "include_entity_rank": True,
    # If True, include the relationship weight in the context window.
    "include_relationship_weight": True,
    # If True, include the community rank in the context window.
    "include_community_rank": False,
    # If True, return a set of dataframes containing all candidate
    # entity/relationship/covariate records that could be relevant. Not all of
    # these records will be included in the context window; the "in_context"
    # column in these dataframes indicates whether a record is included.
    "return_candidate_context": False,
    # Set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity
    # title as ids.
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,
    # Maximum number of tokens to use for the context window. Change this based
    # on the token limit of your model (for a model with an 8k limit, a good
    # setting could be 5000).
    "max_tokens": 12_000,
}

# Settings for the LLM call itself; passed as `llm_params`.
llm_params = {
    # Change this based on the token limit of your model (for a model with an
    # 8k limit, a good setting could be 1000-1500).
    "max_tokens": 2_000,
    "temperature": 0.0,
}

In [13]:
# Create the local search engine from the chat model, the context builder and
# the tokenizer. `llm_params` and `local_context_params` are the dictionaries
# defined in the previous cell.
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [14]:
# `asearch` is awaited directly at the top level of the cell, so this needs a
# kernel that supports top-level await (as Jupyter/IPython does).
result = await search_engine.asearch("What is GraphRAG be short")
# Print only the generated answer text.
print(result.response)

### Overview of Graph RAG

Graph RAG (Retrieval-Augmented Generation) is an advanced method for question answering over large text corpora. It integrates retrieval-augmented generation with query-focused summarization and knowledge graph generation to enhance the comprehensiveness and diversity of generated answers. This approach is particularly useful for applications requiring precise and contextually appropriate information retrieval and summarization [Data: Graph RAG and its Contributors (41)].

### Key Features

1. **Graph-Based Text Index**: Graph RAG uses a self-generated graph index to partition data, which allows for efficient global summarization. This index is created by a large language model (LLM) and includes nodes (entities), edges (relationships), and covariates (claims) [Data: Entities (0); Relationships (18)].

2. **Community Summaries**: The system generates community summaries for groups of closely-related entities within the graph index. These summaries are used to

In [15]:
# Run a second query. `result` is rebound here, so the context-inspection
# cells that follow describe this query rather than the previous one.
question = "What is the purpose of GraphRAG?"
result = await search_engine.asearch(question)
print(result.response)

# Purpose of Graph RAG

Graph RAG (Retrieval-Augmented Generation) is an advanced technique designed to enhance the process of generating relevant and concise summaries from large text corpora. It integrates retrieval-augmented generation (RAG) with query-focused summarization (QFS) and knowledge graph generation to improve the comprehensiveness and diversity of generated answers. This approach is particularly useful for applications requiring precise and contextually appropriate information retrieval and summarization [Data: Graph RAG and its Contributors (41)].

## Enhancing Query-Focused Summarization

The primary purpose of Graph RAG is to address the limitations of traditional RAG methods by incorporating query-focused summarization. This integration allows Graph RAG to generate more accurate and relevant content, making it suitable for handling complex queries and generating high-quality summaries. By leveraging the capabilities of both RAG and QFS, Graph RAG represents a signifi

#### Inspecting the context data used to generate the response

In [16]:
# Preview the entity records recorded in the context data of the last search.
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,0,GRAPH RAG,### Analysis:\n\nGRAPH RAG is a novel method f...,62,True
1,27,OPEN-SOURCE IMPLEMENTATION,"An open-source, Python-based implementation of...",1,True
2,26,GLOBAL GRAPH RAG,A variant of the Graph RAG approach that scale...,1,True
3,262,GRAPH RAG INDEX,A self-generated graph index used in Graph RAG...,1,True
4,316,RODRIGO RACANICCI,A contributor to the work on Graph RAG,1,True


In [17]:
# Preview the relationship records recorded in the context data of the last search.
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,6,GRAPH RAG,MICROSOFT STRATEGIC MISSIONS AND TECHNOLOGIES,Microsoft Strategic Missions and Technologies ...,2.0,63,1,True
1,9,GRAPH RAG,HA TRINH,Ha Trinh contributed to the development of the...,2.0,63,1,True
2,13,GRAPH RAG,APURVA MODY,Apurva Mody contributed to the development of ...,2.0,63,1,True
3,14,GRAPH RAG,STEVEN TRUITT,Steven Truitt contributed to the development o...,2.0,63,1,True
4,21,GRAPH RAG,LOCAL GRAPH RAG,Local Graph RAG is a variant of the Graph RAG ...,2.0,63,1,True


In [18]:
# Preview the community reports recorded in the context data of the last search.
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,41,Graph RAG and its Contributors,# Graph RAG and its Contributors\n\nThe commun...


In [19]:
# Preview the source text units recorded in the context data of the last search.
result.context_data["sources"].head()

Unnamed: 0,id,text
0,7,"Index, 2024) libraries,\nwhile a more general ..."
1,0,From Local to Global: A Graph RAG Approach to\...
2,6,"win rates of 57% and 64%, respectively. Diver..."
3,1,on\nthe state-of-the-art for all such summari...


In [20]:
# Claims are optional (covariates are turned off by default during indexing),
# so only preview them when the key is present in the context data.
if "claims" in result.context_data:
    # Explicit print: an expression nested inside `if` is not auto-displayed.
    print(result.context_data["claims"].head())

   id                                         entity object_id status  \
0  30                                      GRAPH RAG      NONE   TRUE   
1  11                                  STEVEN TRUITT      NONE   TRUE   
2   6                                       HA TRINH      NONE   TRUE   
3   5                                    DARREN EDGE      NONE   TRUE   
4   3  MICROSOFT STRATEGIC MISSIONS AND TECHNOLOGIES      NONE   TRUE   

            start_date             end_date  \
0                 NONE                 NONE   
1  2024-04-24T00:00:00  2024-04-24T00:00:00   
2  2024-04-24T00:00:00  2024-04-24T00:00:00   
3  2024-04-24T00:00:00  2024-04-24T00:00:00   
4  2024-04-24T00:00:00  2024-04-24T00:00:00   

                                         description  in_context  
0  Graph RAG is compared against six different co...        True  
1  Steven Truitt has contributed to research by c...        True  
2  Ha Trinh has contributed to research by co-aut...        True  
3  Darren 

### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [21]:
# Build the question generator with the same chat model, context builder,
# tokenizer and parameter dictionaries that the search engine uses.
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [22]:
# Previous user queries from which the next candidate questions are generated.
question_history = [
    "Tell me about creating a knowledge graph agent",
    "What is the best way to create a knowledge graph agent ?",
]
# Ask for 5 candidate follow-up questions; no pre-built context data is passed
# (context_data=None). Awaited at the top level of the cell.
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
# The response is printed as a list of question strings.
print(candidate_questions.response)

['- What are the key steps involved in constructing a knowledge graph for NLP applications?', '- How does modularity enhance the structure and organization of knowledge graphs?', '- What role do community detection algorithms play in the analysis of knowledge graphs?', '- How can knowledge graphs be integrated with various systems for improved information retrieval?', '- What contributions have Trajanoska et al. (2023) and Yao et al. (2023) made to the field of knowledge graph creation and completion?']
