# Parse snippets
This notebook was intended to build a knowledge graph over the documentation.
## Current maps
- Raw synonym map over docs
- TOLK AST node types mapped to the documents that explain them
- Raw predicates map over statements in documentation
- Instructions documentation generated from tvm-specification.json

## Postponed
Because building a knowledge graph is an extremely time-consuming (manual labor)
approach, it is currently postponed in favor of LLM-powered intent extraction to speed up prototyping.
However, I still believe this is a valid approach that can bring massive benefit
compared to LLM-powered retrieval.

In [None]:
#pip uninstall -y tree-sitter

In [None]:
pip install tree_sitter==0.21.3 rank-bm25 ipywidgets

In [None]:
import pathlib
import sys
# Treat the parent of the current working directory as the project root and put
# it first on sys.path so that modules located there can be imported from here.
root_path = pathlib.Path.cwd().parent.resolve()
sys.path.insert(0, str(root_path))

In [None]:
from tree_sitter import Parser, Language

In [None]:
# Build the Tolk grammar found in ../tree-sitter-tolk/ into a loadable library.
# NOTE(review): the output goes to "tolk-tree-sitter.so" in the current directory,
# while a later cell loads "../tolk-tree-sitter.so" — confirm the two paths agree.
Language.build_library("tolk-tree-sitter.so", ["../tree-sitter-tolk/"])

In [None]:
# Let's vectorize all the tolk related documents first
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.retrievers import BM25Retriever

def create_doc(doc):
    """Wrap a raw record in a langchain ``Document``.

    The record must provide the ``page_content``, ``id`` and ``metadata`` keys;
    a missing key raises ``KeyError``.
    """
    fields = {key: doc[key] for key in ("page_content", "id", "metadata")}
    return Document(**fields)
    
# Embedding model shared by the FAISS indexes built in later cells; model files
# are cached in the ".models" folder under the project root.
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", cache_folder= str(root_path / ".models"))
# Disabled alternative: a dedicated embedder for code snippets.
#code_embedder = HuggingFaceEmbeddings(model_name="microsoft/codebert-base",
#                                      encode_kwargs={"normalize_embeddings": True},   # L2‑normalize for FAISS
#                                     )



In [None]:
import json
def load_json_dump(path: str):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        path: Location of a UTF-8 file holding one JSON document per line.

    Returns:
        The decoded records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf8") as data_file:
        # Blank lines (for example a trailing empty line at the end of the dump)
        # carry no record and would otherwise make json.loads fail.
        return [json.loads(line) for line in data_file if line.strip()]



In [None]:
# NOTE: this import rebinds load_json_dump, replacing the notebook-local
# definition from the previous cell with the project's implementation.
from utils.json import load_json_dump
# Raw records: code snippets and documentation chunks, one JSON object per line.
parsed_snippets = load_json_dump("../rag-data/latest_snippets.jsonl")
parsed_docs = load_json_dump("../rag-data/latest_docs.jsonl")

In [None]:
# Documentation chunks that come from the Tolk language section, changelog excluded.
tolk_documents = [
    create_doc(raw_doc)
    for raw_doc in parsed_docs
    if "languages/tolk" in raw_doc["metadata"]["from"]
    and "changelog" not in raw_doc["metadata"]["from"]
]
# Code snippets whose declared language is Tolk.
tolk_snippets = [
    create_doc(raw_snippet)
    for raw_snippet in parsed_snippets
    if raw_snippet["metadata"]["lang"] == "tolk"
]
print(len(tolk_snippets))

In [None]:
# Load the compiled Tolk grammar and smoke-test the parser on a single contract.
# NOTE(review): an earlier cell builds the library as "tolk-tree-sitter.so" in the
# current directory, but it is loaded here from the parent directory — confirm.
TOLK_LANG = Language("../tolk-tree-sitter.so", "tolk")
tolk_parser = Parser()
tolk_parser.set_language(TOLK_LANG)
# Decode explicitly as UTF-8 instead of relying on the platform default encoding,
# since the text is re-encoded as UTF-8 before being handed to the parser.
with open("../tolk-contracts/contracts_Tolk/03_notcoin/jetton-utils.tolk", "r", encoding="utf8") as test_file:
    ast = tolk_parser.parse(bytes(test_file.read(), "utf8"))
print(ast.root_node)
# Last expression of the cell: shows the tree as an S-expression.
ast.root_node.sexp()

In [None]:
# Distinct breadcrumb trails (header hierarchies) of the Tolk documents.
concept_set = {
    tolk_doc.metadata["crumbs"]
    for tolk_doc in tolk_documents
    if "tolk" in tolk_doc.metadata["from"]
}
concept_set

In [None]:
import json
from binascii import hexlify
import unicodedata
# This was born in a collaborative effort of an LLM and good old manual labor.
# The idea is that from the documents' crumbs (header hierarchy) we create semantic labels
# that will later be used in the knowledge graph.
with open("../rag-data/tolk-semantic-labels.json", "r", encoding="utf8") as heading_semantic:
    # The whole label file is NFKC-normalized on load, so its keys are normalized too.
    heading = json.loads(unicodedata.normalize('NFKC', heading_semantic.read()))
for doc in tolk_documents:
    crumbs = doc.metadata["crumbs"]
    # Look up with the same normalization that was applied to the keys; otherwise
    # crumbs containing compatibility characters can never match.
    heading_data = heading.get(unicodedata.normalize('NFKC', crumbs))
    if heading_data is not None:
        doc.metadata["label"] = heading_data["summary"]
        doc.metadata["short_desc"] = heading_data["reason"]
    else:
        # The raw bytes are printed as well to help spot look-alike Unicode characters.
        # Single quotes inside the f-string keep this valid before Python 3.12.
        print(f"{crumbs} ({hexlify(bytes(crumbs, 'utf8'))})not found!")

In [None]:
pip install nltk

In [None]:
# One small document per header trail, pointing back to the document it came from.
doc_headers = [
    Document(page_content=source_doc.metadata['crumbs'], metadata={'doc_id': source_doc.id})
    for source_doc in tolk_documents
]
# Alternative kept for reference:
# BM25Retriever.from_documents(doc_headers, preprocess_func=word_tokenize, k=2)
headers_index = FAISS.from_documents(doc_headers, embedding=embedder)

In [None]:
from langchain_community.vectorstores.utils import DistanceStrategy
from nltk.tokenize import word_tokenize
# Two indexes over the same Tolk documents: a FAISS vector store and a BM25
# retriever that tokenizes with nltk's word_tokenize.
docs_vectors = FAISS.from_documents(tolk_documents, embedding=embedder)
docs_bm25 = BM25Retriever.from_documents(tolk_documents, preprocess_func=word_tokenize)
# Snippets are indexed with BM25 only, using the retriever's default preprocessing.
code_storage = BM25Retriever.from_documents(tolk_snippets)
# Disabled experiments: dumping the BM25 model and a FAISS index over code embeddings.
#with open("bm25_dump.json", "w", encoding="utf8") as bm25_out:
#    bm25_out.write(docs_bm25.model_dump_json())
#code_storage = FAISS.from_documents(tolk_snippets, embedding=code_embedder, normalize_L2=False, distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT)

In [None]:
# Sanity check: run one query against every index and print the raw results.
test_query = "output specification"
#docs_vectors.save_local('tolk_only')
vector_res = docs_vectors.similarity_search_with_relevance_scores(test_query, k=10)
bm25_res = docs_bm25.invoke(test_query)
print(f"Vec:{vector_res}\n\n")
print(f"Headers:{headers_index.similarity_search(test_query)}\n\n")
print(f"BM25:{bm25_res}\n\n")
# NOTE(review): the "CodeVec" label is misleading — code_storage is a BM25 retriever.
print(f"CodeVec:{code_storage.invoke(test_query)}\n\n")

In [None]:
import pathlib
import ipywidgets as w
from IPython.display import clear_output
import asyncio
from IPython.display import display
# What we're going to do here is parse all the TOLK contracts files out there and
# Label each unique AST node type to the documentation concept

tolk_semantic_map: dict = {}
def node_type2query(node_type: str):
    """Turn an AST node type name into a plain-text search query.

    Every underscore becomes a single space, so ``function_declaration``
    yields ``function declaration``.
    """
    words = node_type.split('_')
    return ' '.join(words)
async def prompt_human(prompt_text: list[str], candidates: list[str]):
    """Show a multi-select widget and wait until the user submits a choice.

    Args:
        prompt_text: HTML fragments; each one is rendered as its own paragraph
            above the selection list.
        candidates: Options offered to the user. A 'Skip' option is always
            appended after them.

    Returns:
        The list of option strings selected at submit time (may contain 'Skip').
    """
    paragraphs = "".join(f"<p>{paragraph}</p>" for paragraph in prompt_text)

    prompt = w.HTML(
        value=f"""
    {paragraphs}
    <i>You may select more than one or 'Skip' the labeling. Use ⌘ (Mac) / Ctrl (Win/Linux) to add to the selection.</i>
    """,
        layout=w.Layout(margin='0 0 5px 0'),
    )
    multi = w.SelectMultiple(
        options=[*candidates, 'Skip'],
        description="Select one or more candidate",
        layout=w.Layout(width='800px'),
        rows=20,
    )
    # The button stays disabled until at least one option is selected.
    submit = w.Button(description="Submit label", button_style='success', disabled=True)

    def _on_selection_change(change):
        submit.disabled = len(change['new']) == 0

    multi.observe(_on_selection_change, names='value')

    out = w.Output()
    box = w.VBox([prompt, multi, submit, out])
    display(box)

    # We are inside a coroutine, so the running loop is the one to attach to.
    loop = asyncio.get_running_loop()
    done_fut = loop.create_future()

    def _resolve(selection):
        # Only the first click wins; a second click must neither extend the
        # selection nor try to resolve an already finished future.
        if not done_fut.done():
            done_fut.set_result(selection)

    def _on_click(_):
        # Errors raised by the handler are captured in the output widget.
        with out:
            loop.call_soon_threadsafe(_resolve, list(multi.value))

    submit.on_click(_on_click)
    selected_value = await done_fut
    box.close()
    clear_output(wait=False)
    print(f"{selected_value}")
    return selected_value

def add_label(node_type: str, label: str, label_map: dict):
    """Record ``label`` for ``node_type`` in ``label_map`` without duplicates.

    ``label_map`` maps a node type to the list of its labels and is updated
    in place; labels keep their insertion order.
    """
    known_labels = label_map.setdefault(node_type, [])
    if label not in known_labels:
        known_labels.append(label)
        
async def label_node(node, label_map: dict, min_score: float = 0.25, min_auto_labels: int = 4):
    """Attach documentation labels to the AST type of ``node``.

    Node types already present in ``label_map`` are left untouched. Otherwise
    the type name is turned into a query against the documentation vector
    index, and every hit scoring at least ``min_score`` contributes its label
    automatically. When fewer than ``min_auto_labels`` hits qualified, the user
    is asked to pick from the vector and BM25 candidates.

    Args:
        node: Syntax-tree node; only its ``type`` and ``text`` are read.
        label_map: Mapping of node type to list of labels, updated in place.
        min_score: Minimum relevance score for an automatic label.
        min_auto_labels: Number of automatic hits that makes the human prompt
            unnecessary.
    """
    import html
    import unicodedata

    node_type = node.type
    if node_type in label_map:
        return

    query = node_type2query(node_type)
    print(f"Checking query: {query}")
    candidates = docs_vectors.similarity_search_with_relevance_scores(query, k=10)

    candidates_added = 0
    for candidate_doc, score in candidates:
        if score < min_score:
            continue
        auto_label = candidate_doc.metadata.get("label")
        if auto_label is None:
            # Documents whose crumbs had no entry in the label file carry no
            # "label"; skip them instead of failing with a KeyError.
            continue
        add_label(node_type, auto_label, label_map)
        candidates_added += 1
        print(f"Added auto label {node_type}:{auto_label}")
    if candidates_added >= min_auto_labels:
        return

    vector_picks = [
        f"(score: {float(score):.2f}){candidate_doc.metadata['crumbs']}"
        for candidate_doc, score in candidates
    ]
    bm25_picks = ["(bm25)" + hit.metadata['crumbs'] for hit in docs_bm25.invoke(query)]

    # The node text is source code shown inside an HTML widget: decode the bytes
    # and escape characters such as "<" so the snippet is displayed verbatim.
    raw_text = node.text
    if isinstance(raw_text, bytes):
        raw_text = raw_text.decode("utf8", errors="replace")
    node_text = html.escape(raw_text or "")

    hand_picked = await prompt_human(
        [f"<b>Node ast type</b>:{node_type}", f"<b>Text</b>:{node_text}"],
        vector_picks + bm25_picks,
    )
    for picked in hand_picked:
        if picked == "Skip":
            # Recorded as a label so this node type is not asked about again.
            add_label(node_type, picked, label_map)
            continue
        # Drop the "(score: …)" / "(bm25)" prefix to recover the crumbs.
        picked_crumbs = picked.partition(")")[2]
        heading_data = heading.get(picked_crumbs)
        if heading_data is None:
            # The label file is NFKC-normalized on load, so retry with a normalized key.
            heading_data = heading.get(unicodedata.normalize("NFKC", picked_crumbs))
        if heading_data is None:
            print(f"No semantic label found for crumbs: {picked_crumbs}")
            continue
        add_label(node_type, heading_data["summary"], label_map)
            
async def label_tree(node, label_map: dict):
    """Label ``node`` and all of its named descendants, parents before children."""
    pending = [node]
    while pending:
        current = pending.pop()
        await label_node(current, label_map)
        # Pushed in reverse so that children are popped in their original order.
        pending.extend(reversed(current.named_children))
async def label_tolk_semantics(contracts_dir: str = "../tolk-contracts/contracts_Tolk/"):
    """Parse every ``*.tolk`` file under ``contracts_dir`` and label its AST node types.

    Labels are collected in the module-level ``tolk_semantic_map``.

    Args:
        contracts_dir: Root folder that is searched recursively. The default
            matches the "../tolk-contracts/" location used for the parser
            smoke test elsewhere in this notebook.

    Raises:
        FileNotFoundError: If ``contracts_dir`` is not an existing directory
            (previously a wrong path silently processed nothing).
        Exception: Whatever parsing or labeling raises, after the failing file
            has been reported.
    """
    contracts_path = pathlib.Path(contracts_dir)
    if not contracts_path.is_dir():
        raise FileNotFoundError(f"Contracts directory not found: {contracts_path}")

    # Sorted so that repeated labeling sessions visit the files in the same order.
    for source_path in sorted(contracts_path.rglob("*.tolk")):
        print(f"Processing {source_path}")
        try:
            # The parser works on bytes, so the file is read without decoding.
            tree = tolk_parser.parse(source_path.read_bytes())
            await label_tree(tree.root_node, tolk_semantic_map)
        except Exception as err:
            print(f"Failed to process file {source_path}: {err}")
            raise

In [None]:
await label_tolk_semantics()

In [None]:
tolk_semantic_map

In [None]:
import json
# Persist the node type -> labels map collected by the labeling session.
with open("../rag-data/tolk_syntax_semantic.json", "w", encoding="utf8") as semantic_out:
    json.dump(tolk_semantic_map, semantic_out)

In [None]:
def load_json_obj(path: str):
    """Read the UTF-8 file at ``path`` and return its decoded JSON content."""
    with open(path, "r", encoding="utf8") as source:
        payload = json.load(source)
    return payload

In [None]:
from query_api import chat_completion

In [None]:
def query_triplets(text: str):
    """Ask the chat model for subject/predicate/object relations found in ``text``.

    The system prompt requests a raw JSON list of ``subj``/``pred``/``obj``
    objects. The value returned by ``chat_completion`` (called with model
    "gpt-4o-mini" and 0.0 as third argument) is passed through unchanged.
    """
    system_prompt = """
You are an expert knowledge‑graph builder.
Read the following passage and output every *explicit* factual relation
you can find, as a JSON list of objects with the keys:
  - "subj": the subject noun (Should be directly identifiable no It/They/Them/etc)
  - "pred": the predicate (single verb in present tense, e.g. "USES", "EXPLAINS" keep the upper case single word)
  - "obj": the single noun should be also directly identifiable
  
Reply with a raw parsable json (don't enclose response into markdown tags)
If a sentence contains more than one relation, output them all.
If you cannot find a clear relation, output an empty list [].
"""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": f"Passage: {text}"}
    return chat_completion([system_message, user_message], "gpt-4o-mini", 0.0)


In [None]:
async def query_instruction(instruction):
    """Ask the chat model for a markdown description of one TVM instruction.

    ``instruction`` is serialized with ``json.dumps`` and sent as the user
    message. The value returned by ``chat_completion`` (model "gpt-4o-mini",
    0.0 as third argument) is passed through unchanged.
    """
    system_prompt = """
You are an expert in TVM assembly language.
Based only on the input json, provide detailed instruction description.
Use markdown headers to separate the object keys.
Summarize long and short description together.
If json key is empty, don't mention it.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": json.dumps(instruction)}
    return chat_completion([system_message, user_message], "gpt-4o-mini", 0.0)

In [None]:
async def query_synonyms(text: str):
    """Ask the chat model for groups of words in ``text`` that name the same entity.

    The system prompt requests raw JSON: a list of objects with an ``entity``
    and its ``mentions``. The value returned by ``chat_completion`` (model
    "gpt-4o-mini", 0.0 as third argument) is passed through unchanged.
    """
    system_prompt = """
You are an expert synonym extraction assistant.
Read the following passage and output every group of words that
refer to the same entity.
Return the result as a JSON list, where each entry has:
{
  "entity":  "<canonical singular name you choose>",
  "mentions": ["<first occurrence>", "<second occurrence>", ...]
}

**Rules**
1️ Use only the exact words that appear in the input, but **convert each word to its singular form** (e.g. “collections” → “collection”, “set methods” → “set method”).  
2️ The **entity** should be the **shortest singular mention** in the group (if there is a tie, pick the one that appears first).  
3️ Preserve the original spelling/casing *except* for the plural‑to‑singular change.
4️ Do **not** invent synonyms that are not present in the text.  
5️ Return **pure JSON** – no introductory text, markdown fences, or explanations.  
6️ If a term appears more than once in the same form, list it only once inside the “mentions” array.
   Provide response in a form of raw json
"""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": f"Passage: {text}"}
    return chat_completion([system_message, user_message], "gpt-4o-mini", 0.0)

In [None]:
query_synonyms("Contract getters (get methods) are used to retrieve the on-chain information off-chain")

In [None]:
import json
def load_predicates(path: str):
    """Load a JSON cache file, falling back to an empty dict when unavailable.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON content, or ``{}`` when the file is missing,
        unreadable or does not hold valid JSON. A message describing the
        reason is printed in those cases.
    """
    try:
        with open(path, "r", encoding="utf8") as predicate_map:
            return json.load(predicate_map)
    except FileNotFoundError:
        print("Predicates not found!")
    except (OSError, ValueError) as err:
        # json.JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
        # Report them distinctly so a corrupt cache is not mistaken for a missing one.
        print(f"Could not load {path}: {err}")
    return {}

In [None]:
# NOTE(review): unlike the other data files, this path is relative to the current
# directory rather than "../" — confirm. load_predicates returns {} when the file
# cannot be read, which would surface later as a KeyError on "instructions".
instructions_parsed = load_predicates("tvm-spec/tvm-specification.json")

In [None]:
# Group the TVM instructions by category. Bulky fields are left out of the
# grouped copies so the LLM is not overloaded with complexity.
_DROPPED_KEYS = ("category", "sub_category", "layout")
instructions_map = {}
for instruction in instructions_parsed["instructions"]:
    instruction_cat = instruction.get("category", "Uncategorized")
    # Build a trimmed copy instead of deleting keys in place: entries that lack
    # one of the keys no longer raise KeyError, and the parsed specification
    # stays intact so this cell can be re-run.
    trimmed = {key: value for key, value in instruction.items() if key not in _DROPPED_KEYS}
    instructions_map.setdefault(instruction_cat, []).append(trimmed)

        

In [None]:
instruction_desc_map = {}
from tqdm.notebook import tqdm

for cat in instructions_map:
    for instruction in tqdm(instructions_map[cat]):
        if cat not in instruction_desc_map:
            instruction_desc_map[cat] = {}
        try:
            instruction_desc = await query_instruction(instruction)
            instruction_desc_map[cat][instruction["name"]] = instruction_desc
        except Exception as e:
            print(e)
        
        

In [None]:
from utils.json import save_json_dict
# Persist the generated instruction descriptions.
save_json_dict(instruction_desc_map, "../rag-data/instructions_desc.json")

In [None]:
# Extract relation triplets for every Tolk document, reusing cached results.
predicates = load_predicates("../rag-data/raw_predicates.json")
i = 0
for doc in tolk_documents:
    if doc.id in predicates:
        print(f"Document {doc.id} is cached!")
        continue
    if doc.metadata["token_count"] >= 1024:
        # Documents of 1024 tokens or more are not sent to the model.
        continue
    predicate_res = query_triplets(doc.page_content)
    try:
        # The reply text sits in the last choice of the completion response.
        raw_reply = predicate_res['choices'][-1]['message']['content']
        pred_json = json.loads(raw_reply)
        predicates[doc.id] = pred_json
        i += 1
        print(f"Processed doc: {doc.id} {i}")
    except Exception as e:
        print(f"Error while processsing {doc.id} {e}")

In [None]:
synonyms = load_predicates("../rag-data/raw_synonims.json")
i = 0
for doc in tolk_documents:
    if doc.id in synonyms:
        print(f"Document {doc.id} is cached!")
        continue
    
    llm_res = query_synonyms(doc.page_content)
    #print(predicate_res)
    try:
        synonym_json = json.loads(llm_res)
        synonyms[doc.id] = synonym_json
        i = i + 1
        print(f"Processed doc: {doc.id} {i}")
    except Exception as e:
        print(f"Error while processsing {doc.id} {e}")
        raise e

In [None]:
synonyms

In [None]:
# Write the predicate cache back to disk.
with open("../rag-data/raw_predicates.json", "w", encoding="utf8") as predicates_file:
    json.dump(predicates, predicates_file)

In [None]:
# Write the synonym cache back to disk.
with open("../rag-data/raw_synonyms.json", "w", encoding="utf8") as predicates_file:
    json.dump(synonyms, predicates_file)

In [None]:
# Reload the triplets from disk (an empty dict if the file cannot be read).
tolk_predicates = load_predicates("../rag-data/raw_predicates.json")
def map_by_key(pred: dict, key):
    """Group all triplets in ``pred`` by the value they hold under ``key``.

    ``pred`` maps a document id to a list of triplet dicts. The result maps
    each distinct ``triplet[key]`` value to the triplets carrying it, in
    encounter order. A triplet without ``key`` raises ``KeyError``.
    """
    grouped = {}
    for triplets in pred.values():
        for triplet in triplets:
            grouped.setdefault(triplet[key], []).append(triplet)
    return grouped

In [None]:
# Index the same triplets twice: once by subject and once by predicate.
subject_map = map_by_key(tolk_predicates, "subj")
pred_map = map_by_key(tolk_predicates, "pred")

In [None]:
subject_map

In [None]:
pred_map