In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

In [12]:
import os

import pandas as pd
import tiktoken

from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey
from graphrag.query.indexer_adapters import (
    read_indexer_covariates,
    read_indexer_entities,
    read_indexer_relationships,
    read_indexer_reports,
    read_indexer_text_units,
)
from graphrag.query.input.loaders.dfs import (
    store_entity_semantic_embeddings,
)
from graphrag.query.llm.oai.chat_openai import ChatOpenAI
from graphrag.query.llm.oai.embedding import OpenAIEmbedding
from graphrag.query.llm.oai.typing import OpenaiApiType
from graphrag.query.question_gen.local_gen import LocalQuestionGen
from graphrag.query.structured_search.local_search.mixed_context import (
    LocalSearchMixedContext,
)
from graphrag.query.structured_search.local_search.search import LocalSearch
from graphrag.vector_stores.lancedb import LanceDBVectorStore
from dotenv import load_dotenv

_ = load_dotenv()

## Local Search Example

Local search method generates answers by combining relevant data from the AI-extracted knowledge-graph with text chunks of the raw documents. This method is suitable for questions that require an understanding of specific entities mentioned in the documents (e.g. What are the healing properties of chamomile?).

### Load text units and graph data tables as context for local search

- In this test we first load indexing outputs from parquet files to dataframes, then convert these dataframes into collections of data objects aligning with the knowledge model.

### Load tables to dataframes

In [25]:
FOLDER = "20240903-181408"
INPUT_DIR = f"../bin/output/{FOLDER}/artifacts" 
LANCEDB_URI = f"{INPUT_DIR}/lancedb"

COMMUNITY_REPORT_TABLE = "create_final_community_reports"
ENTITY_TABLE = "create_final_nodes"
ENTITY_EMBEDDING_TABLE = "create_final_entities"
RELATIONSHIP_TABLE = "create_final_relationships"
COVARIATE_TABLE = "create_final_covariates"
TEXT_UNIT_TABLE = "create_final_text_units"
COMMUNITY_LEVEL = 2

#### Read entities

In [26]:
# read nodes table to get community and degree data
entity_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_TABLE}.parquet")
entity_embedding_df = pd.read_parquet(f"{INPUT_DIR}/{ENTITY_EMBEDDING_TABLE}.parquet")

entities = read_indexer_entities(entity_df, entity_embedding_df, COMMUNITY_LEVEL)

# load description embeddings to an in-memory lancedb vectorstore
# to connect to a remote db, specify url and port values.
description_embedding_store = LanceDBVectorStore(
    collection_name="entity_description_embeddings",
)
description_embedding_store.connect(db_uri=LANCEDB_URI)
entity_description_embeddings = store_entity_semantic_embeddings(
    entities=entities, vectorstore=description_embedding_store
)

print(f"Entity count: {len(entity_df)}")
entity_df.head()

Entity count: 6344


Unnamed: 0,level,title,type,description,source_id,community,degree,human_readable_id,id,size,graph_embedding,entity_type,top_level_node_id,x,y
0,0,MARKET,MARKET,"The ""MARKET"" is the arena of commercial dealin...","01f8ac021603c0416588ca16b8e25f91,0c43e8f182247...",5,48,0,b45241d70f0e43fca764df95b2b81f77,48,"[0.07086440175771713, -0.0688755065202713, -0....",,b45241d70f0e43fca764df95b2b81f77,0,0
1,0,BUSINESS,COMPANY,A **BUSINESS** is a commercial entity that aim...,"20642aed7debb52ac6a1fbb51c1d37cb,5044305935cc2...",25,26,1,4119fd06010c494caa07f439b333f4c5,26,"[0.014605529606342316, -0.004586625378578901, ...",,4119fd06010c494caa07f439b333f4c5,0,0
2,0,PRODUCT DEVELOPMENT,PRODUCT,PRODUCT DEVELOPMENT is the process of creating...,"01f8ac021603c0416588ca16b8e25f91,20642aed7debb...",25,4,2,d3835bf3dda84ead99deadbeac5d0d7d,4,"[0.057798705995082855, 0.06357988715171814, -0...",,d3835bf3dda84ead99deadbeac5d0d7d,0,0
3,0,PITCH,PITCH,"""PITCH"" is a structured presentation or speech...","20642aed7debb52ac6a1fbb51c1d37cb,2186a012e56a7...",5,21,3,077d2820ae1845bcbb1803379a3d1eae,21,"[-0.11226696521043777, 0.015231314115226269, 0...",,077d2820ae1845bcbb1803379a3d1eae,0,0
4,0,INVESTORS,INVESTMENT,**INVESTORS** are individuals or entities that...,"01f8ac021603c0416588ca16b8e25f91,07aa3332020a5...",28,43,4,3671ea0dd4e84c1a9b02c5ab2c8f4bac,43,"[-0.09893009066581726, 0.068280428647995, -0.0...",,3671ea0dd4e84c1a9b02c5ab2c8f4bac,0,0


#### Read relationships

In [27]:
relationship_df = pd.read_parquet(f"{INPUT_DIR}/{RELATIONSHIP_TABLE}.parquet")
relationships = read_indexer_relationships(relationship_df)

print(f"Relationship count: {len(relationship_df)}")
relationship_df.head()

Relationship count: 2226


Unnamed: 0,source,target,weight,description,text_unit_ids,id,human_readable_id,source_degree,target_degree,rank
0,MARKET,BUSINESS,2.0,A business operates within a market to sell it...,"[20642aed7debb52ac6a1fbb51c1d37cb, 91268bea004...",32d785e275be458fb7178ad2021ecdfc,0,48,26,74
1,MARKET,ANALYSIS AND RESEARCH ONLINE,1.0,Analysis and research online help understand t...,[3b59766cf7ab030a10e237fc8af3b657],0757f97d1fbf49748169ba696a364e4c,1,48,1,49
2,MARKET,TALK TO PEOPLE IN THE INDUSTRY,1.0,Talking to industry veterans or consultants he...,[3b59766cf7ab030a10e237fc8af3b657],ca9a355bf38b452cbde62dba747ec65f,2,48,2,50
3,MARKET,MARKET RESEARCH TOOLS,1.0,Market research tools help analyze various asp...,[3b59766cf7ab030a10e237fc8af3b657],ba297c67512447e4b86f0cbc39fbc301,3,48,2,50
4,MARKET,PRODUCT CATEGORIES,1.0,Product categories are part of the overall mar...,[3b59766cf7ab030a10e237fc8af3b657],00a9c8745b404b659c76a694dba9851c,4,48,1,49


In [28]:
# NOTE: covariates are turned off by default, because they generally need prompt tuning to be valuable
# Please see the GRAPHRAG_CLAIM_* settings
covariate_df = pd.read_parquet(f"{INPUT_DIR}/{COVARIATE_TABLE}.parquet")

claims = read_indexer_covariates(covariate_df)

print(f"Claim records: {len(claims)}")
covariates = {"claims": claims}

Claim records: 327


In [37]:
covariate_df

Unnamed: 0,id,human_readable_id,covariate_type,type,description,subject_id,subject_type,object_id,object_type,status,start_date,end_date,source_text,text_unit_id,document_ids,n_tokens
0,6aefddc9-e00e-495e-bb20-c564836405bc,1,claim,,,,,,,,,,,20642aed7debb52ac6a1fbb51c1d37cb,[66d616da48ddb30febdf69d0],1200
1,147a20ac-8ceb-4a60-9358-a096c5f32abb,2,claim,SUCCESSFUL PROPOSITION,Facebook had a successful proposition early in...,FACEBOOK,,INVESTORS,,TRUE,NONE,NONE,Without Facebook having such a successful prop...,20642aed7debb52ac6a1fbb51c1d37cb,[66d616da48ddb30febdf69d0],1200
2,d479df17-7be5-4618-9a14-cb189ea81c10,3,claim,,,There are no named entities that match the pre...,,,,,,,,b10026f1630f7cfc6a45f11b08542343,[66d616da48ddb30febdf69d0],1200
3,3ca907d6-0132-4610-a386-1f1f3c07a3c5,4,claim,,,,,,,,,,,198cc9c7876047173850cf0fbd262fd5,[66d616da48ddb30febdf69d0],1200
4,70f83dbd-953b-4884-b46e-be23deb230d4,5,claim,,,,,,,,,,,3b59766cf7ab030a10e237fc8af3b657,[66d616da48ddb30febdf69d0],463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322,8fd685b4-b4ec-4b3b-b238-109b2273af15,323,claim,SUCCESSFUL D2C BRAND,"The Honest Company, founded by Jessica Alba, f...",THE HONEST COMPANY,,NONE,,TRUE,NONE,NONE,If you’ve ever wondered where Jessica Alba tur...,83ab2fad3c97c23e719e3b703d33982c,[66d616e048ddb30febdf69f5],1155
323,0ce9d707-d368-4f13-9de8-836c4c3d9bc4,324,claim,DISRUPTIVE COMPANY,Uber is a hugely disruptive company that has p...,UBER,,NONE,,TRUE,NONE,NONE,Everyone knows that Uber is a hugely disruptiv...,83ab2fad3c97c23e719e3b703d33982c,[66d616e048ddb30febdf69f5],1155
324,dd54ad22-7e5e-4c95-b784-e9ade5b5a94d,325,claim,,,,,,,,,,,72d5c68cc70917432e845486770e8523,[66d616e048ddb30febdf69f5],55
325,08acf124-e6cc-4110-8f4a-ab65ddc93da6,326,claim,MENTORSHIP OPPORTUNITIES,"Antler provides mentorship, networking, and VC...",ANTLER,,NONE,,TRUE,NONE,NONE,"The mentorship, networking and VC funding oppo...",72d5c68cc70917432e845486770e8523,[66d616e048ddb30febdf69f5],55


#### Read community reports

In [29]:
report_df = pd.read_parquet(f"{INPUT_DIR}/{COMMUNITY_REPORT_TABLE}.parquet")
reports = read_indexer_reports(report_df, entity_df, COMMUNITY_LEVEL)

print(f"Report records: {len(report_df)}")
report_df.head()

Report records: 297


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].fillna(-1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  entity_df["community"] = entity_df["community"].astype(int)


Unnamed: 0,community,full_content,level,rank,title,rank_explanation,summary,findings,full_content_json,id
0,293,# MVP and Pre-Seed Investors in Startup Ecosys...,3,9.0,MVP and Pre-Seed Investors in Startup Ecosystems,The rating is high due to the critical role of...,The community revolves around the development ...,[{'explanation': 'The Minimum Viable Product (...,"{\n ""title"": ""MVP and Pre-Seed Investors in...",cb068c41-6437-4381-9e3f-a4ce4a4e1361
1,294,# Early Adopters and Marketing Strategy in Sta...,3,9.0,Early Adopters and Marketing Strategy in Start...,The rating is high due to the significant impa...,The community revolves around the critical rol...,[{'explanation': 'Early adopters are pivotal i...,"{\n ""title"": ""Early Adopters and Marketing ...",48490d91-e49b-4e3a-93ad-c37e03355889
2,295,# Pricing Strategy and Competitive Landscape i...,3,9.0,Pricing Strategy and Competitive Landscape in ...,The rating is high due to the significant impa...,The community revolves around the critical ele...,[{'explanation': 'Pricing strategy is a fundam...,"{\n ""title"": ""Pricing Strategy and Competit...",1d8ac395-f047-4291-a929-bdc8c39bb817
3,296,# Subscription Revenue Model and Recurring Rev...,3,9.0,Subscription Revenue Model and Recurring Revenue,The rating is high due to the significant impa...,The community revolves around the Subscription...,[{'explanation': 'The Subscription Revenue Mod...,"{\n ""title"": ""Subscription Revenue Model an...",275dc09b-f490-4ae8-bacf-2d884f440814
4,159,# AXE and the #PraiseUp Campaign\n\nThe commun...,2,9.0,AXE and the #PraiseUp Campaign,The rating is high due to the significant impa...,The community revolves around AXE and its #Pra...,[{'explanation': 'AXE has demonstrated a highl...,"{\n ""title"": ""AXE and the #PraiseUp Campaig...",78ad3323-9456-4bdf-bad4-0d7bb8322227


#### Read text units

In [30]:
text_unit_df = pd.read_parquet(f"{INPUT_DIR}/{TEXT_UNIT_TABLE}.parquet")
text_units = read_indexer_text_units(text_unit_df)

print(f"Text unit records: {len(text_unit_df)}")
text_unit_df.head()

Text unit records: 127


Unnamed: 0,id,text,n_tokens,document_ids,entity_ids,relationship_ids,covariate_ids
0,20642aed7debb52ac6a1fbb51c1d37cb,"# TAM, SAM & SOM: How To Calculate The Size Of...",1200,[66d616da48ddb30febdf69d0],"[b45241d70f0e43fca764df95b2b81f77, 4119fd06010...","[32d785e275be458fb7178ad2021ecdfc, 00cb0db6e46...","[6aefddc9-e00e-495e-bb20-c564836405bc, 147a20a..."
1,b10026f1630f7cfc6a45f11b08542343,"its TAM,\nbut few startups would want to dedi...",1200,[66d616da48ddb30febdf69d0],"[077d2820ae1845bcbb1803379a3d1eae, 1fd3fa8bb5a...","[20be7b3222174d31839fac6a278f8b61, 35688e258b0...",[d479df17-7be5-4618-9a14-cb189ea81c10]
2,198cc9c7876047173850cf0fbd262fd5,"love a âGoldilocksâ business, as far as th...",1200,[66d616da48ddb30febdf69d0],"[3671ea0dd4e84c1a9b02c5ab2c8f4bac, 19a7f254a5d...","[9a27717e1a1b499981031fd69c58aff1, 4efbe8fc23a...",[3ca907d6-0132-4610-a386-1f1f3c07a3c5]
3,3b59766cf7ab030a10e237fc8af3b657,their own business will fit in.\n\n### **1) A...,463,[66d616da48ddb30febdf69d0],"[b45241d70f0e43fca764df95b2b81f77, 19a7f254a5d...","[0757f97d1fbf49748169ba696a364e4c, ca9a355bf38...",[70f83dbd-953b-4884-b46e-be23deb230d4]
4,b4e3a5127b17e2d9ecc232f9491d5d45,# Unicorn Companies Explained: How Startups Ac...,1200,[66d616da48ddb30febdf69d1],"[1fd3fa8bb5a2408790042ab9573779ee, bf4e255cdac...","[e7ac741e4aa4433ca5f2379726f90b33, 43645eb9258...","[ebd0c996-e91f-4434-bec3-458b057601be, 5b27eea..."


In [31]:
api_key = os.environ["AZURE_OPENAI_API_KEY"]
api_base = os.environ["GRAPHRAG_API_BASE"]
api_version = os.environ["GRAPHRAG_API_VERSION"]
llm_model = os.environ["GRAPHRAG_LLM_MODEL"]
llm_deployment_name = os.environ["GRAPHRAG_LLM_DEPLOYMENT_NAME"]
embedding_model = os.environ["GRAPHRAG_EMBEDDING_MODEL"]
embedding_deployment_name = os.environ["GRAPHRAG_EMBEDDING_DEPLOYMENT_NAME"]
embedding_version = os.environ["GRAPHRAG_EMBEDDING_VERSION"]

llm = ChatOpenAI(
    api_key=api_key,
    model=llm_model,
    api_type=OpenaiApiType.AzureOpenAI,  # OpenaiApiType.OpenAI or OpenaiApiType.AzureOpenAI
    deployment_name=llm_deployment_name,
    max_retries=20,
    api_base=api_base,
    api_version=api_version,
)

token_encoder = tiktoken.get_encoding("cl100k_base")

text_embedder = OpenAIEmbedding(
    api_key=api_key,
    api_base=api_base,
    api_type=OpenaiApiType.AzureOpenAI,
    model=embedding_model,
    deployment_name=embedding_deployment_name,
    max_retries=20,
    api_version=api_version,
)

### Create local search context builder

In [32]:
context_builder = LocalSearchMixedContext(
    community_reports=reports,
    text_units=text_units,
    entities=entities,
    relationships=relationships,
    # if you did not run covariates during indexing, set this to None
    covariates=covariates,
    # covariates=None,
    entity_text_embeddings=description_embedding_store,
    embedding_vectorstore_key=EntityVectorStoreKey.ID,  # if the vectorstore uses entity title as ids, set this to EntityVectorStoreKey.TITLE
    text_embedder=text_embedder,
    token_encoder=token_encoder,
)

### Create local search engine

In [33]:
# text_unit_prop: proportion of context window dedicated to related text units
# community_prop: proportion of context window dedicated to community reports.
# The remaining proportion is dedicated to entities and relationships. Sum of text_unit_prop and community_prop should be <= 1
# conversation_history_max_turns: maximum number of turns to include in the conversation history.
# conversation_history_user_turns_only: if True, only include user queries in the conversation history.
# top_k_mapped_entities: number of related entities to retrieve from the entity description embedding store.
# top_k_relationships: control the number of out-of-network relationships to pull into the context window.
# include_entity_rank: if True, include the entity rank in the entity table in the context window. Default entity rank = node degree.
# include_relationship_weight: if True, include the relationship weight in the context window.
# include_community_rank: if True, include the community rank in the context window.
# return_candidate_context: if True, return a set of dataframes containing all candidate entity/relationship/covariate records that
# could be relevant. Note that not all of these records will be included in the context window. The "in_context" column in these
# dataframes indicates whether the record is included in the context window.
# max_tokens: maximum number of tokens to use for the context window.


local_context_params = {
    "text_unit_prop": 0.5,
    "community_prop": 0.1,
    "conversation_history_max_turns": 5,
    "conversation_history_user_turns_only": True,
    "top_k_mapped_entities": 10,
    "top_k_relationships": 10,
    "include_entity_rank": True,
    "include_relationship_weight": True,
    "include_community_rank": False,
    "return_candidate_context": False,
    "embedding_vectorstore_key": EntityVectorStoreKey.ID,  # set this to EntityVectorStoreKey.TITLE if the vectorstore uses entity title as ids
    "max_tokens": 12_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 5000)
}

llm_params = {
    "max_tokens": 2_000,  # change this based on the token limit you have on your model (if you are using a model with 8k limit, a good setting could be 1000=1500)
    "temperature": 0.0,
}

In [34]:
search_engine = LocalSearch(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
    response_type="multiple paragraphs",  # free form text describing the response type and format, can be anything, e.g. prioritized list, single paragraph, multiple paragraphs, multiple-page report
)

### Run local search on sample queries

In [36]:
result = await search_engine.asearch("Tell me about Antler")
print(result.response)

# Antler: A Comprehensive Overview

## Introduction to Antler

Antler is a global startup generator and early-stage venture capital firm that plays a pivotal role in the entrepreneurial ecosystem. The firm supports founders from the pre-seed, pre-team, and even pre-idea stages, emphasizing the importance of investor-founder fit. Antler provides a comprehensive suite of resources, including mentorship, networking, and first principles coaching, to help founders develop and validate their ideas [Data: Entities (136); Claims (199, 307); Relationships (975, 980)].

## Comprehensive Support System

Antler's support system is designed to ensure that entrepreneurs have a solid foundation for growth. This includes a wide range of services such as resources, network, coaching, and mentoring. The firm’s emphasis on first principles coaching helps founders break down complex problems and rebuild them from the ground up, fostering innovation and long-term success [Data: Entities (136, 1136); Claim

In [17]:
# result = await search_engine.asearch("Tell me about Antler")
# print(result.response)

# Antler: A Global Startup Generator and Early-Stage Venture Capital Firm

Antler is a prominent global startup generator and early-stage venture capital firm that plays a crucial role in supporting founders from the pre-seed, pre-team, and even pre-idea stages. The firm is dedicated to helping visionary individuals build successful companies by providing extensive resources, mentorship, and financial backing [Data: Entities (136); Relationships (975, 978, 972, 973, 974)].

## Comprehensive Suite of Resources

Antler offers a comprehensive suite of resources designed to assist entrepreneurs in developing and validating their ideas. These resources include mentorship, networking opportunities, and first principles coaching. The mentorship programs provide valuable guidance from experienced professionals, while networking opportunities help entrepreneurs build relationships that foster collaboration and innovation. First principles coaching helps founders break down complex problems and 

#### Inspecting the context data used to generate the response

In [18]:
result.context_data["entities"].head()

Unnamed: 0,id,entity,description,number of relationships,in_context
0,136,ANTLER,ANTLER is a global startup generator and early...,20,True
1,1584,CITIES,25 locations around the world where Antler's s...,1,True
2,1582,ANTLER RESIDENCY,"Antler residency offers mentorship, networking...",1,True
3,676,MENTORSHIP,MENTORSHIP: Antler offers mentorship opportuni...,4,True
4,1465,RESIDENCY PROGRAM,Programs offered by Antler to support founders...,1,True


In [19]:
result.context_data["relationships"].head()

Unnamed: 0,id,source,target,description,weight,rank,links,in_context
0,988,ANTLER,NETWORKING,Antler offers networking opportunities to help...,2.0,28,2,True
1,987,ANTLER,MENTORSHIP,Antler offers mentorship opportunities for ent...,2.0,24,1,True
2,981,ANTLER,FUNDRAISING CONCIERGE SERVICE,Antler developed and offers a fundraising conc...,2.0,22,1,True
3,971,ANTLER,UNICORN FOUNDER,Antler supports founders aiming to become unic...,1.0,21,1,True
4,972,ANTLER,PRE-SEED,Antler supports founders from the pre-seed stage,2.0,21,1,True


In [20]:
result.context_data["reports"].head()

Unnamed: 0,id,title,content
0,231,Antler and its Global Entrepreneurial Ecosystem,# Antler and its Global Entrepreneurial Ecosys...
1,231,Antler and its Global Entrepreneurial Ecosystem,# Antler and its Global Entrepreneurial Ecosys...


In [21]:
result.context_data["sources"].head()

Unnamed: 0,id,text
0,85,like Antler that go well beyond the money. Wi...
1,124,"+ and Apple TV sprung up,\nNetflix knew it nee..."
2,84,# How To Raise A Pre-Seed Round \n **_TLDR:_**...
3,125,idency](https://www.antler.co/)! The mentorshi...
4,6,unicorn disruption. There was a real market\n...


In [22]:
if "claims" in result.context_data:
    print(result.context_data["claims"].head())

### Question Generation

This function takes a list of user queries and generates the next candidate questions.

In [23]:
question_generator = LocalQuestionGen(
    llm=llm,
    context_builder=context_builder,
    token_encoder=token_encoder,
    llm_params=llm_params,
    context_builder_params=local_context_params,
)

In [24]:
question_history = [
    "Tell me about Antler",
    "What is the best approach for early stage startups to raise capital?",
]
candidate_questions = await question_generator.agenerate(
    question_history=question_history, context_data=None, question_count=5
)
print(candidate_questions.response)

['- How does Antler support founders in the pre-seed stage?', '- What resources does Antler provide to help entrepreneurs develop and validate their ideas?', "- How do Antler's residency programs assist entrepreneurs in finding co-founders and growing their businesses?", '- What is the importance of investor-founder fit according to Antler?', "- How does Antler's global reach benefit entrepreneurs?"]
