# Tokenizing


In [None]:
import tiktoken

from knwl.base import QueryParam

ENCODER = None


def encode_string_by_tiktoken(content: str, model_name: str = "gpt-4o") -> list[int]:
    """Encode ``content`` into token ids using the tiktoken encoding of ``model_name``.

    The encoder is cached in the module-level ``ENCODER`` so repeated calls
    reuse it. The cached encoder is only reused when its encoding matches the
    one tiktoken maps ``model_name`` to; otherwise it is replaced, so a call
    for a different model never silently uses a stale encoding.

    Args:
        content: Text to tokenize.
        model_name: Model name that tiktoken can map to an encoding.

    Returns:
        The token ids produced by the encoder.
    """
    global ENCODER
    # Resolve the encoding name first so a cached encoder for another model
    # is detected and replaced instead of being reused.
    encoding_name = tiktoken.encoding_name_for_model(model_name)
    if ENCODER is None or ENCODER.name != encoding_name:
        ENCODER = tiktoken.encoding_for_model(model_name)
    return ENCODER.encode(content)


def decode_tokens_by_tiktoken(tokens: list[int], model_name: str = "gpt-4o") -> str:
    """Decode token ids back into text using the tiktoken encoding of ``model_name``.

    The encoder is cached in the module-level ``ENCODER``. The cached encoder
    is only reused when its encoding matches the one tiktoken maps
    ``model_name`` to; otherwise it is replaced, so tokens are never decoded
    with an encoding that belongs to a different model.

    Args:
        tokens: Token ids to decode.
        model_name: Model name that tiktoken can map to an encoding.

    Returns:
        The decoded text.
    """
    global ENCODER
    # Resolve the encoding name first so a cached encoder for another model
    # is detected and replaced instead of being reused.
    encoding_name = tiktoken.encoding_name_for_model(model_name)
    if ENCODER is None or ENCODER.name != encoding_name:
        ENCODER = tiktoken.encoding_for_model(model_name)
    return ENCODER.decode(tokens)


def chunking_by_token_size(
        content: str, overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o"
):
    """Split ``content`` into overlapping chunks measured in tokens.

    The text is tokenized, then a window of ``max_token_size`` tokens is moved
    forward by ``max_token_size - overlap_token_size`` tokens at a time. Each
    window is decoded back to text.

    Args:
        content: Text to split.
        overlap_token_size: Number of tokens shared by consecutive chunks.
        max_token_size: Maximum number of tokens per chunk.
        tiktoken_model: Model name used to select the tokenizer.

    Returns:
        A list of dicts, one per chunk, with the keys ``"tokens"`` (token
        count of the chunk), ``"content"`` (decoded text with surrounding
        whitespace stripped) and ``"index"`` (position of the chunk).

    Raises:
        ValueError: If the sizes do not satisfy
            ``0 <= overlap_token_size < max_token_size``. Without this check a
            zero step fails with an unrelated ``range()`` error, a negative
            step silently yields no chunks, and a negative overlap silently
            skips tokens between chunks.
    """
    if not 0 <= overlap_token_size < max_token_size:
        raise ValueError(
            "chunk sizes must satisfy 0 <= overlap_token_size < max_token_size, "
            f"got overlap_token_size={overlap_token_size}, max_token_size={max_token_size}"
        )
    step = max_token_size - overlap_token_size
    tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
    results = []
    for index, start in enumerate(range(0, len(tokens), step)):
        window = tokens[start: start + max_token_size]
        chunk_content = decode_tokens_by_tiktoken(window, model_name=tiktoken_model)
        results.append(
            {
                # The last window may be shorter than max_token_size.
                "tokens": len(window),
                "content": chunk_content.strip(),
                "index": index,
            }
        )
    return results


In [None]:
# Read the sample book and split it into chunks of at most 100 tokens
# where consecutive chunks share 10 tokens.
with open("./books/relativity.txt", "r") as book_file:
    content = book_file.read()
toks = chunking_by_token_size(content, overlap_token_size=10, max_token_size=100)

In [None]:
# Number of chunks that chunking_by_token_size produced for the book.
len(toks)

In [None]:
# Show the text of the first chunk.
print(toks[0]["content"])

In [None]:
# Show the text of the second chunk.
print(toks[1]["content"])

# Logging

Note: the logging configuration defined inside the package can't be hijacked (i.e. overridden from the notebook).

# Data classes

In [None]:
from dataclasses import dataclass
from typing import Any


@dataclass
class Stuff:
    """Small record with two fields whose instances can be called.

    Calling an instance ignores every argument and returns the ``a`` field.
    """

    a: int
    b: str

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """Return ``a``; positional and keyword arguments are accepted but unused."""
        value = self.a
        return value


In [None]:
# Build an instance and call it; Stuff.__call__ returns the `a` field.
c = Stuff(a=1654, b="hello")
c()

# Stuff


In [None]:

from typing import Union

import re


def locate_json_string_body_from_string(content: str) -> Union[str, None]:
    """Extract the brace-delimited body from ``content``.

    The match is greedy and spans newlines, so the result runs from the first
    ``{`` to the last ``}`` in the text.

    Args:
        content: Text that may contain a JSON object.

    Returns:
        The matched substring including both braces, or ``None`` when the
        text contains no ``{...}`` span.
    """
    found = re.search(r"{.*}", content, re.DOTALL)
    return None if found is None else found.group(0)


# Quick check: free text followed by a JSON object; the function should
# return the part from the opening "{" to the closing "}".
locate_json_string_body_from_string(
    """
    A lot of text here
    =====

    {
        "a": 1,
        "b": 2
    }
    """)

In [None]:

from hashlib import md5


def compute_args_hash(*args):
    """Return the hexadecimal MD5 digest of the string form of the argument tuple.

    The positional arguments are converted with ``str(args)``, so the digest
    depends on the ``repr`` of every argument and on their order.
    """
    serialized = str(args)
    digest = md5(serialized.encode())
    return digest.hexdigest()


# Example: hash a mix of string, int and bool arguments.
compute_args_hash("hello", 1, 2, 3, True)

In [None]:
import ollama

from knwl.storage import JsonStorage

ollama_client = ollama.AsyncClient()
hashing_kv: JsonStorage = JsonStorage()


async def ollama_chat_with_cache(prompt, model, history_messages=None, **kwargs):
    """Ask an ollama model a question, reusing a stored answer when one exists.

    The original cell used ``return`` at module level and referenced the
    undefined names ``prompt``, ``model`` and ``kwargs``; wrapping the logic
    in a function makes those explicit parameters and makes the cell runnable.

    Args:
        prompt: Text sent as the final user message.
        model: Name of the ollama model to query.
        history_messages: Optional earlier chat messages placed before the
            prompt. The list is copied, never mutated.
        **kwargs: Extra keyword arguments forwarded to ``ollama_client.chat``.

    Returns:
        The content of the model reply, either freshly generated or taken
        from the ``"return"`` field of the cached record.
    """
    messages = list(history_messages or [])
    messages.append({"role": "user", "content": prompt})

    args_hash = None
    if hashing_kv is not None:
        # The cache key covers the model and the full message list.
        args_hash = compute_args_hash(model, messages)
        cached = await hashing_kv.get_by_id(args_hash)
        if cached is not None:
            return cached["return"]

    response = await ollama_client.chat(model=model, messages=messages, **kwargs)
    result = response["message"]["content"]

    if hashing_kv is not None:
        await hashing_kv.upsert({args_hash: {"return": result, "model": model}})

    return result

In [None]:
import ollama

# Send three user messages in one conversation and ask the model to recall
# the name given in the first one. The list is not called `input`, because
# that would shadow the builtin for the rest of the notebook session.
user_inputs = ["My name is Swa", "The weather is nice today", "What is my name? (explain your answer)"]
msgs = [{"role": "user", "content": text} for text in user_inputs]
ollama.chat("llama3.2", msgs)

In [None]:
from knwl.utils import split_string_by_multi_markers

# Try the splitter on a string that contains both markers ("," and ";").
split_string_by_multi_markers('"hello","world;this is a test"', [",", ";"])

In [None]:
# Sample extraction result: entities keyed by name and relationships keyed by
# a (source, target) tuple, each mapping to a list of records.
a = {
    'entities': {
        'JOHN': [
            {
                'description': 'John is a person who knows Maria.',
                'entity_name': 'JOHN',
                'entity_type': 'PERSON',
                'source_id': 'a',
            },
        ],
        'MARIA': [
            {
                'description': 'Maria is known by John.',
                'entity_name': 'MARIA',
                'entity_type': 'PERSON',
                'source_id': 'a',
            },
        ],
    },
    'relationships': {
        ('JOHN', 'MARIA'): [
            {
                'description': 'John and Maria are acquainted with each other.',
                'keywords': 'acquaintance, social connection',
                'source_id': 'a',
                'src_id': 'JOHN',
                'tgt_id': 'MARIA',
                'weight': 5.0,
            },
        ],
    },
}

In [None]:
import json


def _with_json_keys(value):
    """Return a copy of ``value`` whose dict keys are all accepted by ``json.dumps``.

    ``json.dumps`` raises ``TypeError`` for dict keys that are not ``str``,
    ``int``, ``float``, ``bool`` or ``None``. The sample data uses a tuple as
    a relationship key, so such keys are replaced by their ``str()`` form.
    Nested dicts, lists and tuples are processed recursively.
    """
    if isinstance(value, dict):
        converted = {}
        for key, item in value.items():
            if key is not None and not isinstance(key, (str, int, float, bool)):
                key = str(key)
            converted[key] = _with_json_keys(item)
        return converted
    if isinstance(value, (list, tuple)):
        return [_with_json_keys(item) for item in value]
    return value


# Serialize the sample data; tuple keys are stringified first because
# json.dumps(a) on the raw dict fails with TypeError.
json.dumps(_with_json_keys(a))

In [None]:
# Display the sample extraction result.
a

In [None]:
from knwl.storage import VectorStorage

# Open the vector storage for the "edges" namespace and show what count() reports.
store = VectorStorage(namespace="edges")
await store.count()

In [None]:
# Display what to_dataframe() returns for the store.
await store.to_dataframe()

In [None]:
await  store.query("sadfs sad")

In [None]:
from knwl.prompt import PROMPTS
from knwl.llm import llm
q = PROMPTS["keywords_extraction"].format(query="John and Maria are acquainted with each other.")
await  llm.ask(q)


In [None]:
query = "John and Maria are acquainted with each other."
keywords_prompt = PROMPTS["keywords_extraction"].format(query=query)
# Stand-in for the raw text reply of the LLM. It has to be a string: the
# clean-up below uses str.replace / str.split and json.loads, all of which
# fail on a dict.
result = json.dumps(
    {'high_level_keywords': ['Acquaintance', 'Relationship'],
     'low_level_keywords': ['John', 'Maria', 'Know each other']}
)
# Drop an echoed prompt and role markers from the reply.
result = (
    result.replace(keywords_prompt[:-1], "")
    .replace("user", "")
    .replace("model", "")
    .strip()
)
# Keep only the text between the first "{" and the next "}".
result = "{" + result.split("{")[1].split("}")[0] + "}"

keywords_data = json.loads(result)
low_keywords = keywords_data.get("low_level_keywords", [])
low_keywords = ", ".join(low_keywords)
low_keywords

In [None]:
from knwl.simple import Simple, QueryParam

# Ask a sample question through Simple using QueryParam(mode="local")
# and print whatever the query returns.
s = Simple()
found = await s.query("Who is John?", QueryParam(mode="local"))
print(found)

In [5]:
# Entity/relationship extraction prompt template. The placeholders
# {entity_types}, {tuple_delimiter}, {record_delimiter}, {completion_delimiter}
# and {input_text} are filled in by the str.format call below.
# The entity format line now ends with ")" so that it matches the
# relationship and content_keywords formats.
prompt = """-Goal-
Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.

-Steps-
1. Identify all entities. For each identified entity, extract the following information:
- entity_name: Name of the entity, capitalized
- entity_type: One of the following types: [{entity_types}]
- entity_description: Comprehensive description of the entity's attributes and activities
Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>)

2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
For each pair of related entities, extract the following information:
- source_entity: name of the source entity, as identified in step 1
- target_entity: name of the target entity, as identified in step 1
- relationship_description: explanation as to why you think the source entity and the target entity are related to each other
- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity
- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details
Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)

3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)

4. Return output in English as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.

5. When finished, output {completion_delimiter}
######################
-Real Data-
######################
Entity_types: {entity_types}
Text: {input_text}
######################
Output:

"""

# Fill the template with a one-sentence sample text and simple delimiters.
prompt = prompt.format(input_text="John met Maria on the way to London.", entity_types="PERSON, LOCATION, ORGANIZATION", record_delimiter="|||", completion_delimiter="DONE", tuple_delimiter="|")

import ollama
from IPython.display import display, Markdown
# Send the formatted prompt as a single user message to the qwen2.5:7b model.
r = ollama.chat("qwen2.5:7b", [{"role": "user", "content": prompt}])
# The reply text is read from r["message"]["content"].
print(r["message"]["content"])

("entity"|John|PERSON|"A person named John who met someone.")
("entity"|Maria|PERSON|"A person named Maria who met John.")
("entity"|London|LOCATION|"The city where John and Maria were heading towards during their meeting.")

("relationship"|John|Maria|"John and Maria met each other."|"Meeting"|1)

("content_keywords"|Meeting, Traveling, London)

DONE


In [4]:
# Print the text content of the chat response held in `r`.
print(r["message"]["content"])

("entity"|John|PERSON|Person named John who interacted with another individual in a specific event.)
("entity"|Maria|PERSON|Person named Maria who interacted with another individual in a specific event.)
("entity"|London|LOCATION|A city that serves as the destination for a journey mentioned in the text.)
("relationship"|John|Maria|John met Maria, indicating an interaction between them.|interaction|1)
("relationship"|John|London|John is traveling to London, suggesting a goal or destination.|travel|2)
("relationship"|Maria|London|Maria is also on her way to London, implying she has the same goal as John.|travel|2)

("content_keywords"|interaction, travel, destination)

DONE
