In [1]:
import os
import openai
from llama_index.llms import OpenAI
from llama_index.embeddings import OpenAIEmbedding
from llama_index import(
    ServiceContext,
    StorageContext,
    SimpleDirectoryReader,
    LangchainEmbedding,
    VectorStoreIndex,
    load_index_from_storage,
    load_graph_from_storage,
    LLMPredictor,
    PromptHelper,
    KnowledgeGraphIndex,
    LLMPredictor,
    )

# upload model
from llama_index.llms import LangChainLLM
from llama_index.graph_stores import SimpleGraphStore
from llama_index import (KnowledgeGraphIndex)
from llama_index.storage.storage_context import StorageContext
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.llms import LlamaCpp
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.graph_stores import SimpleGraphStore

from IPython.display import Markdown, display

# SECURITY: an API key must never be written into the notebook itself, because the
# saved file (and its history) is shared with everyone who can read it.
# Any key that was previously committed here should be revoked and rotated.
#
# The key is taken from the OPENAI_API_KEY environment variable; when it is not
# set, it is requested interactively so it never ends up in the saved notebook.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass  # local import: only needed for the interactive fallback

    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

openai.api_key = os.environ["OPENAI_API_KEY"]

In [2]:
from llama_index import download_loader

# Obtain the Wikipedia reader class through download_loader and instantiate it.
WikipediaReader = download_loader("WikipediaReader")
loader = WikipediaReader()

# Load the single page used as the corpus for both indexes built in this notebook.
wiki_pages = ["2023 in science"]
documents = loader.load_data(pages=wiki_pages, auto_suggest=False)

In [3]:
# LLM wrapper handed to the service context.
# NOTE(review): `model_name` and `streaming` look like LangChain-style arguments;
# confirm that llama_index's OpenAI class honours them (it may expect `model=`),
# otherwise a default model could be in use instead of text-davinci-002.
llm_predictor = LLMPredictor(llm=OpenAI(temperature=0, model_name="text-davinci-002", streaming=True))

# Embedding model configured with embed_batch_size=10.
embed_model = OpenAIEmbedding(embed_batch_size=10)
# Alternative kept for reference: a local HuggingFace embedding model on CPU.
#embed_model = LangchainEmbedding(HuggingFaceEmbeddings(model_name = "sentence-transformers/all-MiniLM-L6-v2",model_kwargs = {'device': 'cpu'}))

# Pass embed_model explicitly: previously it was created but never handed to the
# service context, so swapping in another embedder above had no effect.
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    embed_model=embed_model,
    chunk_size=512,
)

In [4]:
# Graph schema settings passed to both the graph store and the knowledge-graph index.
space_name = "llamaindex"

# Defaults; these can be omitted when creating from an empty knowledge graph.
edge_types = ["relationship"]
rel_prop_names = ["relationship"]

tags = ["entity"]

In [5]:
# The same four schema settings are passed to the graph store and to the index
# builder, so gather them once and unpack them in both calls.
# NOTE(review): these names look like settings for a database-backed graph store;
# confirm SimpleGraphStore / KnowledgeGraphIndex actually use them.
graph_schema = {
    "space_name": space_name,
    "edge_types": edge_types,
    "rel_prop_names": rel_prop_names,
    "tags": tags,
}

graph_store = SimpleGraphStore(**graph_schema)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

# Build the knowledge-graph index from the loaded documents, with a progress bar.
kg_index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
    max_triplets_per_chunk=10,
    **graph_schema,
    include_embeddings=True,
    show_progress=True,
)

  from .autonotebook import tqdm as notebook_tqdm
Parsing documents into nodes: 100%|██████████| 1/1 [00:00<00:00,  9.17it/s]
Generating embeddings: 100%|██████████| 8/8 [00:01<00:00,  6.51it/s]
Generating embeddings: 100%|██████████| 11/11 [00:02<00:00,  4.22it/s]
Generating embeddings: 100%|██████████| 10/10 [00:01<00:00,  9.36it/s]
Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 11.86it/s]
Generating embeddings: 100%|██████████| 11/11 [00:01<00:00,  9.43it/s]
Generating embeddings: 100%|██████████| 13/13 [00:01<00:00,  8.26it/s]
Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 10.89it/s]
Generating embeddings: 100%|██████████| 11/11 [00:02<00:00,  5.49it/s]
Generating embeddings: 100%|██████████| 8/8 [00:01<00:00,  6.13it/s]
Generating embeddings: 100%|██████████| 8/8 [00:01<00:00,  6.69it/s]
Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 11.06it/s]
Generating embeddings: 100%|██████████| 10/10 [00:00<00:00, 12.35it/s]
Generating embeddings: 100%|

In [6]:
# Vector index over the same documents, sharing the service context used for the
# knowledge-graph index.
vector_index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [7]:
# import QueryBundle
from llama_index import QueryBundle

# import NodeWithScore
from llama_index.schema import NodeWithScore

# Retrievers
from llama_index.retrievers import BaseRetriever, VectorIndexRetriever, KGTableRetriever

from typing import List


class CustomRetriever(BaseRetriever):
    """Hybrid retriever that runs both a vector search and a knowledge-graph search.

    The two result lists are merged by node id:

    * ``"OR"``  (default) keeps every node returned by either retriever;
    * ``"AND"`` keeps only nodes returned by both retrievers.

    When a node id is returned by both retrievers, the knowledge-graph result
    object is the one kept for that id.
    """

    def __init__(
        self,
        vector_retriever: VectorIndexRetriever,
        kg_retriever: KGTableRetriever,
        mode: str = "OR",
    ) -> None:
        """Store the two retrievers and the merge mode.

        Args:
            vector_retriever: retriever queried for vector-search hits.
            kg_retriever: retriever queried for knowledge-graph hits.
            mode: ``"AND"`` (intersection) or ``"OR"`` (union) of the two result sets.

        Raises:
            ValueError: if ``mode`` is neither ``"AND"`` nor ``"OR"``.
        """
        if mode not in ("AND", "OR"):
            raise ValueError("Invalid mode.")

        self._vector_retriever = vector_retriever
        self._kg_retriever = kg_retriever
        self._mode = mode
        # Let the base class set up whatever state it defines; this is a no-op when
        # the base class has no initialiser of its own.
        super().__init__()

    def _retrieve(self, query_bundle: QueryBundle) -> List[NodeWithScore]:
        """Retrieve nodes for ``query_bundle`` from both retrievers and merge them.

        Returns:
            The merged results in a deterministic order: vector-search order first,
            followed by knowledge-graph results not already seen.
        """
        vector_nodes = self._vector_retriever.retrieve(query_bundle)
        kg_nodes = self._kg_retriever.retrieve(query_bundle)

        vector_ids = {n.node.node_id for n in vector_nodes}
        kg_ids = {n.node.node_id for n in kg_nodes}

        # A dict keeps insertion order: vector hits first, then KG-only hits.
        # update() replaces the value for ids present in both, so the KG result
        # object wins for shared ids while keeping the position of the vector hit.
        combined_dict = {n.node.node_id: n for n in vector_nodes}
        combined_dict.update({n.node.node_id: n for n in kg_nodes})

        if self._mode == "AND":
            retrieve_ids = vector_ids & kg_ids
        else:
            retrieve_ids = vector_ids | kg_ids

        # Walk the ordered dict rather than the id set: iterating a set of string
        # ids yields an order that can change between interpreter runs, which made
        # the node order (and therefore the synthesized answer) non-reproducible.
        return [
            node_with_score
            for node_id, node_with_score in combined_dict.items()
            if node_id in retrieve_ids
        ]


In [8]:
from llama_index import get_response_synthesizer
from llama_index.query_engine import RetrieverQueryEngine

# Build the two underlying retrievers, then wrap them in the hybrid retriever
# (its default merge mode is used).
vector_retriever = VectorIndexRetriever(index=vector_index)
kg_retriever = KGTableRetriever(index=kg_index, retriever_mode="keyword", include_text=False)
custom_retriever = CustomRetriever(
    vector_retriever=vector_retriever,
    kg_retriever=kg_retriever,
)

# Response synthesizer for the hybrid engine, using the "tree_summarize" mode.
response_synthesizer = get_response_synthesizer(service_context=service_context, response_mode="tree_summarize")


In [9]:
# Three engines are built so their answers to the same question can be compared.

# 1) Hybrid: the custom retriever plus the tree_summarize synthesizer.
custom_query_engine = RetrieverQueryEngine(retriever=custom_retriever, response_synthesizer=response_synthesizer)

# 2) Vector-only, with the index's default query-engine settings.
vector_query_engine = vector_index.as_query_engine()

# 3) Knowledge-graph keyword search.
# include_text=False uses the raw triplets instead of adding the text from the
# corresponding nodes.
kg_engine_options = {
    "include_text": False,
    "retriever_mode": "keyword",
    "response_mode": "tree_summarize",
}
kg_keyword_query_engine = kg_index.as_query_engine(**kg_engine_options)

In [10]:
# Ask the knowledge-graph keyword engine and render its answer in bold.
kg_question = "Tell me events about NASA"
response = kg_keyword_query_engine.query(kg_question)
display(Markdown(f"<b>{response}</b>"))

<b>
NASA has been involved in many events, including the first human spaceflight in 1961, the first moon landing in 1969, the launch of the Hubble Space Telescope in 1990, and the launch of the Mars Curiosity Rover in 2011.</b>

In [11]:
# Same question, answered by the vector-only engine; rendered in bold.
response = vector_query_engine.query(
    "Tell me events about NASA"
)
vector_answer_md = Markdown(f"<b>{response}</b>")
display(vector_answer_md)

<b>
NASA is expected to receive increased budgets in 2023, which will be used to fund various research topics and agencies. On 12 July, astronomers reported considerable success of the James Webb Space Telescope (JWST) after its first year of operations. On 14 July, the Indian Space Research Organisation (ISRO) successfully launched its Chandrayaan-3 spacecraft towards the Moon. On 19 July, astronomers reported the discovery of a bizarre 'two-faced' star, with one side made up of hydrogen and the other consisting of helium. On 25 July, a study published in Nature found that a collapse of the Atlantic meridional overturning circulation (AMOC) is highly likely this century, and may occur as early as 2025. On 26 July, DARPA, in collaboration with NASA, began work on the first in-orbit demonstration of a nuclear thermal rocket engine.</b>

In [12]:
# Same question once more, answered by the hybrid (vector + knowledge graph) engine.
hybrid_question = "Tell me events about NASA"
response = custom_query_engine.query(hybrid_question)
display(
    Markdown(f"<b>{response}</b>")
)

<b>
NASA is scheduled to launch a Venus probe in October 2023, which will partly search for signs of life on Venus. Additionally, NASA has been provided with an increased budget for various fields, research topics, and agencies, including the new Advanced Research Projects Agency for Health (ARPA-H).</b>

In [None]:
## create graph
from pyvis.network import Network

# Export the knowledge graph as a networkx graph.
# NOTE(review): 200 is passed as a positional limit; confirm what it bounds.
g = kg_index.get_networkx_graph(200)

# Render it as a directed pyvis network with resources inlined into the HTML.
net = Network(
    notebook=True,
    cdn_resources="in_line",
    directed=True,
)
net.from_nx(g)

# Kept as the last expression so the notebook shows whatever show() returns.
graph_html_path = "2023_Science_Wikipedia_KnowledgeGraph.html"
net.show(graph_html_path)

2023_Science_Wikipedia_KnowledgeGraph.html
