# Stage 4: Build a Similarity Map Using Shortest Paths Between Sections and TextChunks in Technical Documents
This stage is not intended to produce a TLDR; rather, it provides a way to compare relationships between Content (specifically technical documents that have been enriched with Section and TextChunk nodes).

The result of this step includes:
- A network of "Similarity Nodes" that pairwise link Content nodes, scored from the shortest paths between their Sections (or, optionally, their TextChunks).

In [None]:
import logging

## Parameters
OpenTLDR workflows use the notebook block tagged as "parameters" to inject variables (for example to change the recommendation thresholds).

> **Do Not Change Variable Names in the Parameters Block.** You are welcome to change the values of these parameter variables, but please do not change their names. They are used elsewhere in the notebook and in other workflow processes.

In [None]:
#Parameters
# Minimum score a pair of Content nodes must reach before a Similarity node is created for it
similarity_threshold = 0.70

# When True, every existing Similarity node is deleted before new ones are built
delete_existing_similarities = True

# Logging level ranges are (from least to most verbose): ERROR, WARN, INFO, DEBUG
logging_level = logging.INFO

# List of the Content nodes to start from (None means all Content nodes in the KG are processed)
list_of_uids = None

# level of unnecessary output
verbose = True


## Setup

In [None]:
# Apply the verbosity chosen in the parameters block to the "OpenTLDR" logger
logging.getLogger("OpenTLDR").setLevel(logging_level)

import opentldr.Domain as domain
from opentldr.Domain import Request, Content, Recommendation

# NOTE(review): this rebinds the alias `domain`, which was bound to opentldr.Domain
# two lines above; after this line `domain` refers to opentldr.ContentEnrichment.
import opentldr.ContentEnrichment as domain
from opentldr.ContentEnrichment import TechnicalPaper, Section, TextChunk, Enrichment, Similarity

# Knowledge graph handle shared by every cell below; closed in the last cell
from opentldr import KnowledgeGraph
kg=KnowledgeGraph()


In [None]:
# Split the Content uids into the ones to process now ("new") and the ones
# that only serve as comparison targets ("previous").
new_uids = []
prev_uids = []

if list_of_uids is None:
    # No explicit selection: every Content node in the KG is treated as new.
    new_uids = kg.get_all_node_uids_by_tag('Content')
else:
    if isinstance(list_of_uids, str):
        # A single uid was injected instead of a list of uids.
        new_uids = [list_of_uids]
    else:
        # extend (not append) so each uid is its own entry rather than one nested list
        new_uids.extend(list_of_uids)
    # Everything else in the KG is "previous". The selected uids are left out so
    # that no pair of Content nodes is compared (and linked) twice.
    prev_uids = [uid for uid in kg.get_all_node_uids_by_tag('Content') if uid not in new_uids]

if verbose:
    print("Found {} New nodes to process and {} Previous nodes.".format(len(new_uids),len(prev_uids)))

In [None]:
# Optionally start from a clean slate: remove every Similarity node together
# with its relationships (DETACH DELETE) before new ones are computed.
if delete_existing_similarities:
    kg.cypher_query("MATCH (s:Similarity) DETACH DELETE s ")

# KRAG - a pre-computable similarity matrix in the KG that can be used for RAG queries
This is a simple implementation of a RAG database constructed within the KG that doesn't depend on embedding vectors.

In [None]:
def get_score(kg:"KnowledgeGraph", from_content_uid, to_content_uid, min_steps:int=3, max_steps:int=10, decay_rate:float=0.2) -> float:
    """Score how closely two nodes are connected in the knowledge graph.

    Runs a Cypher ``shortestPath`` query between the nodes whose ``uid``
    properties equal ``from_content_uid`` and ``to_content_uid``. Paths that
    pass through Recommendation, Tldr, Summary, Feedback, Source, User,
    EvalKey or Similarity nodes are excluded. A path with ``min_steps`` nodes
    scores 1.0 and every additional node lowers the score by ``decay_rate``.

    Args:
        kg: Knowledge graph handle; only its ``neomodel_query`` method is used.
        from_content_uid: uid of the node the path starts at.
        to_content_uid: uid of the node the path ends at.
        min_steps: Number of nodes on a path that still yields a score of 1.0.
        max_steps: Maximum number of relationships the path may have.
        decay_rate: Score deducted for each node beyond ``min_steps``.

    Returns:
        The score, or 0.0 when no path is found, when the query fails, or
        when the computed score is not positive. The value is not capped, so a
        path with fewer than ``min_steps`` nodes scores above 1.0.
    """
    # NOTE(review): the uids are interpolated straight into the query text. That
    # is fine for uids generated by the KG, but this should become a
    # parameterized query if neomodel_query supports parameters - confirm.
    shortest_path_cypher = """
            MATCH path=shortestPath((s)-[*..{max_steps}]-(e))
            WHERE s.uid='{start_id}'
            AND e.uid='{end_id}'
            AND NONE(n IN nodes(path) WHERE 'Recommendation' IN LABELS(n))
            AND NONE(n IN nodes(path) WHERE 'Tldr' IN LABELS(n))
            AND NONE(n IN nodes(path) WHERE 'Summary' IN LABELS(n))
            AND NONE(n IN nodes(path) WHERE 'Feedback' IN LABELS(n))
            AND NONE(n IN nodes(path) WHERE 'Source' IN LABELS(n))
            AND NONE(n IN nodes(path) WHERE 'User' IN LABELS(n))
            AND NONE(n IN nodes(path) WHERE 'EvalKey' IN LABELS(n))
            AND NONE(n IN nodes(path) WHERE 'Similarity' IN LABELS(n))
            RETURN path
            """.format(start_id=from_content_uid, end_id=to_content_uid, max_steps=max_steps)

    try:
        rows = kg.neomodel_query(shortest_path_cypher)[0]
        if not rows:
            # No path between the two nodes: this is the expected "unrelated" case.
            return 0.0
        path = rows[0][0]
        score = 1.0-((len(path.nodes)-min_steps)*decay_rate)
    except Exception:
        # Stay best-effort (a failing pair counts as unrelated) but leave a trace
        # instead of silently hiding query or driver errors.
        logging.getLogger("OpenTLDR").warning(
            "Shortest path query failed for %s -> %s",
            from_content_uid, to_content_uid, exc_info=True)
        return 0.0

    if score <= 0.0:
        return 0.0

    return score

### Build KRAG similarity relationships using TextChunk
Note: this is pretty slow because it averages the pairwise shortest paths between all text chunks stemming from a document (and there are a lot of them).

In [None]:
def get_uid_of_textchunks_from_content(kg, content_uid:str) -> list[str]:
    """Query the uids of the TextChunks that belong to one Content node.

    The query walks from the Content with the given uid to the TechnicalPaper
    that enriches it, on to the Sections the paper contains, and finally to
    the TextChunks those Sections contain.

    Args:
        kg: Knowledge graph handle; only its ``cypher_query`` method is used.
        content_uid: uid of the Content node to start from.

    Returns:
        Whatever ``kg.cypher_query`` returns for the query (expected to be the
        TextChunk uids).
    """
    query_template = '''
            MATCH (c:Content) where c.uid="{content_uid}"
            MATCH (p:TechnicalPaper)
            MATCH (s:Section)
            MATCH (t:TextChunk)
            MATCH (c)<-[:Enriches]-(p)-[:CONTAINS]->(s)-[:CONTAINS]->(t)
            RETURN t.uid '''
    textchunk_query = query_template.format(content_uid=content_uid)
    return kg.cypher_query(textchunk_query)

In [None]:
def krag_create_by_textchunk(kg:KnowledgeGraph, new_uids:list[str], previous_uids:list[str], threshold:float=0.0):
    """Create Similarity nodes between Content nodes by comparing their TextChunks.

    Each Content in ``new_uids`` is compared against every not yet processed
    Content in ``new_uids`` and against every Content in ``previous_uids``.
    The score of a pair is the mean of ``get_score`` over all
    (from-chunk, to-chunk) combinations. When that mean is at least
    ``threshold``, a Similarity node holding the score is saved and connected
    to both Content nodes through ``similar_to``.

    Args:
        kg: Knowledge graph used to look up Content nodes and run queries.
        new_uids: uids of the Content nodes to process.
        previous_uids: uids of Content nodes that are comparison targets only.
        threshold: Minimum mean score for a Similarity node to be created.
    """
    # De-duplicate (order preserved) so that an overlap between the two lists
    # cannot produce the same pair, and thus the same Similarity, twice.
    source_uids = list(dict.fromkeys(new_uids))
    candidate_uids = list(dict.fromkeys(source_uids + list(previous_uids or [])))
    processed_uids = set()

    for from_content_uid in source_uids:
        from_content_node = kg.get_content_by_uid(from_content_uid)
        print("FROM: {}".format(from_content_node.title.replace("\n","")))
        # Marking the source as processed keeps it from being paired with
        # itself and keeps later sources from re-creating this pair in reverse.
        processed_uids.add(from_content_uid)

        from_textchunks_uids = get_uid_of_textchunks_from_content(kg,from_content_uid)
        if not from_textchunks_uids:
            print ("No Text Chunks in Content: {}".format(from_content_uid))
            continue

        for to_content_uid in candidate_uids:
            if to_content_uid in processed_uids:
                continue

            to_textchunks_uids = get_uid_of_textchunks_from_content(kg,to_content_uid)
            if not to_textchunks_uids:
                # Nothing to compare against; skipping also avoids dividing by zero below.
                continue

            pair_count = 0
            acc = 0.0
            for from_textchunk_uid in from_textchunks_uids:
                for to_textchunk_uid in to_textchunks_uids:
                    pair_count += 1
                    acc += get_score(kg, from_textchunk_uid, to_textchunk_uid, min_steps=3, max_steps=8, decay_rate=0.2)

            score = acc / pair_count

            if score >= threshold:
                to_content_node = kg.get_by_uid(to_content_uid)
                print("\tTO ({:.3f}): {}".format(score,to_content_node.title.replace("\n","")))
                sim_node = Similarity()
                sim_node.score=score
                sim_node.save()
                sim_node.similar_to.connect(from_content_node)
                sim_node.similar_to.connect(to_content_node)


### Build KRAG similarities from Sections
Note: this essentially uses the best textchunks for each section instead of pairwise text chunks. It still does a pairwise comparison of Sections, but there are many fewer of those than textchunks.

In [None]:
def get_uid_of_sections_from_content(kg, content_uid:str) -> list[str]:
    """Query the uids of the Sections that belong to one Content node.

    The query walks from the Content with the given uid to the TechnicalPaper
    that enriches it and on to the Sections that paper contains.

    Args:
        kg: Knowledge graph handle; only its ``cypher_query`` method is used.
        content_uid: uid of the Content node to start from.

    Returns:
        Whatever ``kg.cypher_query`` returns for the query (expected to be the
        Section uids).
    """
    query_template = '''
            MATCH (c:Content) where c.uid="{content_uid}"
            MATCH (p:TechnicalPaper)
            MATCH (s:Section)
            MATCH (c)<-[:Enriches]-(p)-[:CONTAINS]->(s)
            RETURN s.uid '''
    section_query = query_template.format(content_uid=content_uid)
    return kg.cypher_query(section_query)

In [None]:
def krag_create_by_section(kg:KnowledgeGraph, new_uids:list[str], previous_uids:list[str], threshold:float=0.0):
    """Create Similarity nodes between Content nodes by comparing their Sections.

    Each Content in ``new_uids`` is compared against every not yet processed
    Content in ``new_uids`` and against every Content in ``previous_uids``.
    The score of a pair is the mean of ``get_score`` over all
    (from-section, to-section) combinations. When that mean is at least
    ``threshold``, a Similarity node holding the score is saved and connected
    to both Content nodes through ``similar_to``.

    Args:
        kg: Knowledge graph used to look up Content nodes and run queries.
        new_uids: uids of the Content nodes to process.
        previous_uids: uids of Content nodes that are comparison targets only.
        threshold: Minimum mean score for a Similarity node to be created.
    """
    # De-duplicate (order preserved) so that an overlap between the two lists
    # cannot produce the same pair, and thus the same Similarity, twice.
    source_uids = list(dict.fromkeys(new_uids))
    candidate_uids = list(dict.fromkeys(source_uids + list(previous_uids or [])))
    processed_uids = set()

    for from_content_uid in source_uids:
        from_content_node = kg.get_content_by_uid(from_content_uid)
        print("FROM: {}".format(from_content_node.title.replace("\n","")))
        # Marking the source as processed keeps it from being paired with
        # itself and keeps later sources from re-creating this pair in reverse.
        processed_uids.add(from_content_uid)

        from_section_uids = get_uid_of_sections_from_content(kg,from_content_uid)
        if not from_section_uids:
            print ("No Sections in Content: {}".format(from_content_uid))
            continue

        for to_content_uid in candidate_uids:
            if to_content_uid in processed_uids:
                continue

            to_section_uids = get_uid_of_sections_from_content(kg,to_content_uid)
            if not to_section_uids:
                # Nothing to compare against; skipping also avoids dividing by zero below.
                continue

            pair_count = 0
            acc = 0.0
            for from_section_uid in from_section_uids:
                for to_section_uid in to_section_uids:
                    pair_count += 1
                    acc += get_score(kg, from_section_uid, to_section_uid, min_steps=4, max_steps=10, decay_rate=0.2)

            score = acc / pair_count

            if score >= threshold:
                to_content_node = kg.get_by_uid(to_content_uid)
                print("\tTO ({:.3f}): {}".format(score,to_content_node.title.replace("\n","")))
                sim_node = Similarity()
                sim_node.score=score
                sim_node.save()
                sim_node.similar_to.connect(from_content_node)
                sim_node.similar_to.connect(to_content_node)

        # Blank lines separate the output of consecutive source Contents
        print("\n")


In [None]:
# Build the Similarity nodes. The Section based variant is the active one; the
# TextChunk based variant is kept as a commented-out alternative.
#krag_create_by_textchunk(kg,new_uids,prev_uids,threshold=similarity_threshold)
krag_create_by_section(kg,new_uids,prev_uids,threshold=similarity_threshold)

## KRAG Query


In [None]:
def krag_query(kg:KnowledgeGraph, content_node:Content, limit:int=-1) -> list[Enrichment]:
    """Look up the Content nodes linked to ``content_node`` through a Similarity node.

    Results are ordered by the Similarity score, highest first.

    Args:
        kg: Knowledge graph handle; only its ``cypher_query`` method is used.
        content_node: Content node whose ``uid`` anchors the query.
        limit: Maximum number of results; values of 0 or less mean no limit.

    Returns:
        Whatever ``kg.cypher_query`` returns for the query, which selects the
        matched Content nodes ``r``.
    """
    # A LIMIT clause is only emitted for a positive limit
    limit_clause = " LIMIT {} ".format(limit) if limit>0 else ""

    similar_content_query = """
        MATCH (q:Content) WHERE q.uid="{uid_id}"
        MATCH (r:Content)
        MATCH (s:Similarity)
        MATCH (q)<-[x:SIMILAR_TO]-(s)-[y:SIMILAR_TO]->(r)
        RETURN r ORDER BY s.score DESC {limit_clause} """.format(uid_id=content_node.uid, limit_clause=limit_clause)

    return kg.cypher_query(similar_content_query)

In [None]:
def krag_query_and_print(kg:KnowledgeGraph, content_node:Content, limit:int=-1) -> list[Enrichment]:
    """Print the Similarity nodes that link ``content_node`` to other Content.

    The Similarity uids are fetched ordered by score, highest first. Each
    node is then loaded with ``kg.get_by_uid`` and its ``to_text()`` output is
    printed. Despite the return annotation, nothing is returned.

    Args:
        kg: Knowledge graph used for the query and the node lookups.
        content_node: Content node whose ``uid`` anchors the query.
        limit: Maximum number of results; values of 0 or less mean no limit.
    """
    # A LIMIT clause is only emitted for a positive limit
    limit_clause = " LIMIT {} ".format(limit) if limit>0 else ""

    similarity_query = """
        MATCH (q:Content) WHERE q.uid="{uid_id}"
        MATCH (r:Content)
        MATCH (s:Similarity)
        MATCH (q)<-[x:SIMILAR_TO]-(s)-[y:SIMILAR_TO]->(r)
        RETURN s.uid ORDER BY s.score DESC {limit_clause} """.format(uid_id=content_node.uid, limit_clause=limit_clause)

    similarity_uids = kg.cypher_query(similarity_query)
    for similarity_uid in similarity_uids:
        similarity_node = kg.get_by_uid(similarity_uid)
        print("-> {}".format(similarity_node.to_text()))

In [None]:
# For every processed Content, print its title followed by its three highest
# scoring Similarity nodes.
for uid in new_uids:
    content_node=kg.get_content_by_uid(uid)
    print ("Query: {}".format(content_node.title.replace("\n","")))

    krag_query_and_print(kg, content_node,3)

# Alternative output: list the titles of the similar Content nodes instead
#    for other in krag_query(kg,content_node,5):
#        print("\t-> {}".format(other.title.replace("\n","")))

#  Close the KG

In [None]:
# Close the knowledge graph handle opened in the Setup cell
kg.close()