# Knowledge graph extraction 

In this notebook, we use `outlines` to extract a knowledge graph from a book snippet.

We use the following libraries:
 - `outlines`: to get structured outputs during generation
 - `pydantic`: to define the classes that describe the structured outputs
 - `rich`: for nicer console output
 - `llama_cpp`: to run LLMs

And the following model:
 - `NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF` (llama cpp)

In [2]:
import json
from pathlib import Path

import llama_cpp
import outlines
from outlines import Generator, from_transformers, Template
from pydantic import BaseModel, Field
from rich import print as rprint
from rich.console import Console
from rich.json import JSON
from rich.panel import Panel
from rich.text import Text

In [3]:
# Helpers
def highlight_words(text, words, style="bold red"):
    """Return `text` as a rich `Text` with every occurrence of each word styled.

    Matching is case-insensitive and non-overlapping: after a match, the
    search for the same word resumes right after the matched span.

    Args:
        text: The string to search; it is also the content of the returned `Text`.
        words: Iterable of strings to highlight. Empty strings are skipped.
        style: Style handed to `Text.stylize` for every match.

    Returns:
        The `Text` built from `text`, with `style` applied to each match.
    """
    rich_text = Text(text)
    # Lower-case the haystack once instead of on every loop iteration.
    haystack = text.lower()
    for word in words:
        if not word:
            # An empty needle is "found" at `start` without ever advancing it,
            # which would make the search loop below spin forever.
            continue
        needle = word.lower()
        start = 0
        while (pos := haystack.find(needle, start)) != -1:
            end = pos + len(word)
            rich_text.stylize(style, pos, end)
            start = end
    return rich_text

In [4]:
# Load the quantized Hermes-2-Pro weights with llama.cpp and wrap them for outlines.
# `llama_cpp`, `outlines` and `Path` come from the notebook's import cell.
#
# wget https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
# then move inside the `scratch/` folder
model_dir = Path.home() / 'scratch'

llm = llama_cpp.Llama(
    str(model_dir / "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf"),
    # Tokenizer is taken from the matching Hugging Face repository.
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "NousResearch/Hermes-2-Pro-Llama-3-8B"
    ),
    # Per the llama-cpp-python docs, -1 offloads every layer to the GPU.
    n_gpu_layers=-1,
    flash_attn=True,
    n_ctx=8192,
    verbose=False
)
# Outlines model wrapper used by the generation cell further down.
model = outlines.from_llamacpp(llm)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# This text is longer on purpose, and requires some handling.
# It contains boilerplate text too, but we don't doubt the tokens are accurate.
# Passage the knowledge graph is extracted from, written as adjacent string
# literals that concatenate to a single line (no newlines are introduced).
text = (
    "Like Marshall McLuhan, with whom he was often compared "
    "(“the other eminent Catholic-electronic prophet, ” said a scornful Frank Kermode), "
    "Ong had the misfortune to make his visionary assessments of a new age "
    "just before it actually arrived."
)

In [6]:
# 1. Describe pydantic class
class Node(BaseModel):
    """Node of the Knowledge Graph"""

    # All three fields are required (no default is given). The descriptions
    # end up in the JSON schema produced by `model_json_schema()`.
    id: int = Field(description="Unique identifier of the node")
    label: str = Field(description="Label of the node")
    property: str = Field(description="Property of the node")

class Edge(BaseModel):
    """Edge of the Knowledge Graph"""
    # `source` and `target` are integer references to `Node.id`. The
    # descriptions say so explicitly because they are part of the JSON schema
    # shown to the model, which otherwise has no hint linking edges to nodes.
    source: int = Field(..., description="Identifier of the source node of the edge")
    target: int = Field(..., description="Identifier of the target node of the edge")
    label: str = Field(..., description="Label of the edge")
    property: str = Field(..., description="Property of the edge")

class KnowledgeGraph(BaseModel):
    """Generated Knowledge Graph"""

    # Top-level container: both lists are required fields.
    nodes: list[Node] = Field(description="List of nodes of the knowledge graph")
    edges: list[Edge] = Field(description="List of edges of the knowledge graph")


# JSON schema of the container model; it is interpolated into the prompt below.
schema = KnowledgeGraph.model_json_schema()

In [7]:
# 2. Describe prompt
# ChatML-style prompt with two placeholders: `schema` (system turn) and
# `text` (user turn). The assistant turn is left open for generation.
HERMES_PROMPT_SOURCE = """
    <|im_start|>system
    You are a world class AI model who answers questions in JSON
    Here's the json schema you must adhere to:
    <schema>
    {{ schema }}
    </schema>
    <|im_end|>
    <|im_start|>user
    {{ text }}
    <|im_end|>
    <|im_start|>assistant
    <schema>
    """

# Callable template: `generate_hermes_prompt(schema=..., text=...)` renders the prompt.
generate_hermes_prompt = Template.from_string(HERMES_PROMPT_SOURCE)

In [8]:
# 3. Generate the knowledge graph.
# `Generator` is already imported in the notebook's first code cell, so it is
# not re-imported here.
generator = Generator(model, KnowledgeGraph)  # output constrained to the KnowledgeGraph schema
prompt = generate_hermes_prompt(schema=schema, text=text)
# temperature=0 with a fixed seed is intended to make the run repeatable.
result = generator(prompt, max_tokens=1024, temperature=0, seed=42)



In [9]:
# Wrap the generated output in rich's JSON renderable, then print it.
rendered_graph = JSON(result)
rprint(rendered_graph)