# KG generation with Zephyr

In [36]:
import os
import requests
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
from llama_index.prompts import PromptTemplate
from transformers import BitsAndBytesConfig
from IPython.display import Markdown, display
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
    BitsAndBytesConfig
)
from typing import Optional, List, Mapping, Any, Tuple
from llama_index.prompts.base import PromptTemplate
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index import (
    ServiceContext, 
    SimpleDirectoryReader, 
#     LangchainEmbedding, 
#     ListIndex,
    KnowledgeGraphIndex
)
from llama_index.callbacks import CallbackManager
from llama_index.llms import (
    CustomLLM, 
    CompletionResponse, 
    CompletionResponseGen,
    LLMMetadata,
)
from llama_index.storage.storage_context import StorageContext
from llama_index.graph_stores import NebulaGraphStore
from llama_index.llms.base import llm_completion_callback

## 1. Customize LLM class

In [25]:
class ZephyrEndpointLLM(CustomLLM):
    api_endpoint: str
    endpoint_path: str = "/v1/models/model:predict"

    context_window: int = 2048
    num_output: int = 256
    model_name: str = "HuggingFaceH4/zephyr-7b-beta"

    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.num_output,
            model_name=self.model_name
        )

    @llm_completion_callback()
    def complete(
        self, prompt: str, 
        stop: Optional[List[str]] = [],
        temperature: float = 0.5,
        max_new_tokens: int = 1024,
        **kwargs: Any) -> CompletionResponse:
        # prompt_length = len(prompt)
        # response = pipeline(prompt, max_new_tokens=self.num_output)[0]["generated_text"]

        # # only return newly generated tokens
        # text = response[prompt_length:]
        data = {
            "prompt": prompt,
            "temperature": temperature,
            "max_new_tokens": max_new_tokens,
            "stop": stop or [],
        }
        try:
            response = requests.post(self.api_endpoint + self.endpoint_path, json=data)
            if response.status_code == 200:
                text = dict(response.json())['data']['generated_text']
            else:
                raise ValueError(f'The response status code was: {response.status_code}, '
                                 'expected: 200')
        except requests.exceptions.RequestException as e:
            raise SystemExit(e)

        return CompletionResponse(text=text)
    
    @llm_completion_callback()
    def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen:
        raise NotImplementedError()

Prompt completion

In [26]:
DEFAULT_KG_TRIPLET_EXTRACT_TMPL = (
    "Some text is provided below. Given the text, extract up to "
    "{max_knowledge_triplets} "
    "knowledge triplets in the form of (subject, predicate, object). Avoid stopwords.\n"
    "---------------------\n"
    "Example:"
    "Text: Alice is Bob's mother."
    "Triplets:\n(Alice, is mother of, Bob)\n"
    "Text: Philz is a coffee shop founded in Berkeley in 1982.\n"
    "Triplets:\n"
    "(Philz, is, coffee shop)\n"
    "(Philz, founded in, Berkeley)\n"
    "(Philz, founded in, 1982)\n"
    "---------------------\n"
    # "Text: {text}\n"
    # "Triplets:\n"
)
user_prompt = (
    "Text: {text}\n"
    "Triplets:\n"
)

In [27]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
messages = [
    {
        "role": "system",
        "content": DEFAULT_KG_TRIPLET_EXTRACT_TMPL,
    },
    {"role": "user", "content": user_prompt},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


<|system|>
Some text is provided below. Given the text, extract up to {max_knowledge_triplets} knowledge triplets in the form of (subject, predicate, object). Avoid stopwords.
---------------------
Example:Text: Alice is Bob's mother.Triplets:
(Alice, is mother of, Bob)
Text: Philz is a coffee shop founded in Berkeley in 1982.
Triplets:
(Philz, is, coffee shop)
(Philz, founded in, Berkeley)
(Philz, founded in, 1982)
---------------------
</s>
<|user|>
Text: {text}
Triplets:
</s>
<|assistant|>



In [37]:
from llama_index.prompts.prompt_type import PromptType
KG_TRIPLET_EXTRACT_PROMPT = PromptTemplate(
    prompt, prompt_type=PromptType.KNOWLEDGE_TRIPLET_EXTRACT
)

In [28]:
# define our LLM
llm = ZephyrEndpointLLM(api_endpoint="http://127.0.0.1:8080")

embed_model = HuggingFaceBgeEmbeddings(model_name="dangvantuan/sentence-camembert-large")

context_window = 2048
# set number of output tokens
num_output = 1024
chunk_size = 512

service_context = ServiceContext.from_defaults(
    llm=llm, 
    embed_model=embed_model,
    context_window=context_window, 
    chunk_size=chunk_size,
    num_output=num_output
)

No sentence-transformers model found with name /home/xli/.cache/torch/sentence_transformers/dangvantuan_sentence-camembert-large. Creating a new one with MEAN pooling.


## 2. Connect with nebula graph

In [29]:
import time
from nebula3.gclient.net import Connection
from nebula3.gclient.net.SessionPool import SessionPool
from nebula3.Config import SessionPoolConfig
from nebula3.common.ttypes import ErrorCode

In [30]:
os.environ['NEBULA_USER'] = "root"
os.environ['NEBULA_PASSWORD'] = "nebula"
os.environ["GRAPHD_HOST"] = "127.0.0.1"
os.environ["GRAPHD_PORT"] = "9669"
os.environ['NEBULA_ADDRESS'] = "127.0.0.1:9669"
space_name = "Digital_Safety"

In [31]:
config = SessionPoolConfig()

# prepare space
conn = Connection()
conn.open(os.environ["GRAPHD_HOST"], os.environ["GRAPHD_PORT"], 1000)
auth_result = conn.authenticate(os.environ["NEBULA_USER"], os.environ["NEBULA_PASSWORD"])
assert auth_result.get_session_id() != 0
resp = conn.execute(
    auth_result._session_id,
    "CREATE SPACE IF NOT EXISTS "+space_name+"(vid_type=FIXED_STRING(256), partition_num=1, replica_factor=1);",
)
assert resp.error_code == ErrorCode.SUCCEEDED
# insert data need to sleep after create schema
time.sleep(10)

session_pool = SessionPool(os.environ["NEBULA_USER"], os.environ["NEBULA_PASSWORD"], space_name, [(os.environ["GRAPHD_HOST"], os.environ["GRAPHD_PORT"])])
assert session_pool.init(config)

# add schema
resp = session_pool.execute(
    'CREATE TAG IF NOT EXISTS entity(name string);'
    'CREATE EDGE IF NOT EXISTS relationship(relationship string);'
    'CREATE TAG INDEX IF NOT EXISTS entity_index ON entity(name(256));'
)

In [32]:
os.environ['NEBULA_USER'] = os.environ["NEBULA_USER"]
os.environ['NEBULA_PASSWORD'] = os.environ["NEBULA_PASSWORD"]
os.environ['NEBULA_ADDRESS'] = os.environ["NEBULA_ADDRESS"]

edge_types, rel_prop_names = ["relationship"], ["relationship"]
tags = ["entity"]

graph_store = NebulaGraphStore(
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)
storage_context = StorageContext.from_defaults(graph_store=graph_store)

## 3. Load documents

In [41]:
# Load the your data
documents = SimpleDirectoryReader("../../../data/").load_data()

In [42]:
documents

[Document(id_='9967d40e-fc12-4227-b1c4-57ff9fda1af7', embedding=None, metadata={'page_label': '1', 'file_name': 'Digital Safety_Livrable1_Etat de l_art_RCO-5.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='f19a671fc007c757599848b2bbf57dfc0acde5e68f204fe4125c3c09509371e5', text=" NOTE TECHNIQUE   Etat de l’art sur la cartographie automatique et dynamique des relations   intras et inters documentaires des documents composant un rapport de sûreté pour le Projet I1 « Digital Safety » \n5  \nNOTE TECHNIQUE – CARTOGRAPHIE AUTOMATIQUE ET DYNAMIQUE DES RELATIONS INTERNES ET ENTRE LES DOCUMENTS Contexte Dans le cadre du projet I1 « Digital Safety », ASSYSTEM souhaite développer une solution qui s’appuierait sur une Intelligence Artificielle (IA) pour cartographier de façon automatique et dynamique les relations intras et inters documentaires des documents composant un rapport de sûreté. Cette note technique présente un état de l’art sur les algorit

In [43]:
kg_index = KnowledgeGraphIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
    kg_triple_extract_template=KG_TRIPLET_EXTRACT_PROMPT,
    max_triplets_per_chunk=10,
    space_name=space_name,
    edge_types=edge_types,
    rel_prop_names=rel_prop_names,
    tags=tags,
)

1. Project I1 "Digital Safety" - seeks - AI solution for automated and dynamic mapping of intra and interdocument relations in safety reports
2. Safety report - is - a complex, voluminous, and detailed document describing all aspects related to nuclear safety for an installation
3. Safety report - consists of - a multitude of documents, ranging from thousands to tens of thousands of pages, depending on the size, complexity, and type of the nuclear installation
4. Project I1 "Digital Safety" - aims to develop - an AI-based solution for automated and dynamic mapping of intra and interdocument relations in safety reports
5. AI-based solution - can - automate and dynamically map intra and interdocument relations in safety reports
6. Safety report - includes - a multitude of interdependent documents
7. Project I1 "Digital Safety" - requires - an AI-based solution for automated and dynamic mapping of intra and interdocument relations in safety reports due to the complexity and rigor involved

In [44]:
from pyvis.network import Network

g = kg_index.get_networkx_graph()
net = Network(notebook=True, cdn_resources="in_line", directed=True)
net.from_nx(g)
net.show('./example.html')

./example.html
