# Stage 3: Entity Cosine Similarity
 
This notebook attempts to resolve the Entities mentioned in the Content and Request nodes against the Reference Data Objects.

The result of this step includes:
- Existing Entity nodes connected to ReferenceNodes with a REFERS_TO relationship
- New ReferenceNodes (with Hypothesized==True)

In [None]:
import logging

## Parameters
OpenTLDR workflows use the notebook block tagged as "parameters" to inject variables (for example to use different embedding models).

> **Do Not Change Variable Names in the Parameters Block.** You are welcome to change the values of these parameter variables, but please do not change their names. They are used elsewhere in the notebook and in other workflow processes.

In [None]:
# Parameters
# Name of the sentence-transformers model loaded later in this notebook to embed entity text.
sentence_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"

# The Connect Threshold sets when an active data entity refers to a reference data object.
# This only affects how similar text must be to KNOWN entities of the same type as detected in NER:
# the best-matching reference node is linked only when its similarity is strictly greater than this value.
connect_threshold = 0.25

# The Hypothesize Threshold sets how similar previously unknown active data entities need to be
# (strictly greater than this value) for them to be grouped under newly invented reference data.
hypothesize_threshold = 0.9

# Standard Parameters

# Logging level ranges are (from least to most verbose): ERROR, WARN, INFO, DEBUG
logging_level = logging.INFO

# Level of unnecessary output: when True, each new link to existing reference data is printed.
verbose = True


## Setup OpenTLDR


In [None]:
# Apply the verbosity chosen in the parameters cell to the "OpenTLDR" logger.
logging.getLogger("OpenTLDR").setLevel(logging_level)

from opentldr.Domain import Entity, ReferenceNode
from opentldr import KnowledgeGraph

# Knowledge-graph handle shared by every query and write below; it is closed in the final cell.
kg=KnowledgeGraph()

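## Cosine Similarity of Semantic Embeddings
- Takes two strings and computes an embedding for each
- Computes the cosine similarity between the two embeddings
- Returns this score, rounded to four decimal places, as the similarity metric (1 means identical text; scores are usually between 0 and 1, although cosine similarity can be slightly negative)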

In [None]:
# Note: on a mac I needed to install this on the commandline
#!{sys.executable} -m pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer, util
# Load the embedding model named in the parameters cell once; the similarity functions below reuse it.
model = SentenceTransformer(sentence_embedding_model)

# Maps a node's uid to the embedding of its text so each node is encoded at most once per kernel session.
embedding_cache = {}

def cached_cosin_similarity(node_1, node_2):
    """Return the cosine similarity between the text of two graph nodes.

    Embeddings are memoized in the module-level ``embedding_cache`` keyed by
    each node's ``uid``, so a node's text is only encoded the first time it
    takes part in a comparison.

    Args:
        node_1: Object exposing ``text`` and ``uid`` attributes.
        node_2: Object exposing ``text`` and ``uid`` attributes.

    Returns:
        A built-in ``float`` rounded to 4 decimal places. Exactly ``1.0`` when
        the two texts are equal (no embedding is computed in that case).
    """
    # Identical text is a perfect match; skip the model entirely.
    if node_1.text == node_2.text:
        return 1.0

    embedding_1 = _cached_node_embedding(node_1)
    embedding_2 = _cached_node_embedding(node_2)

    # cos_sim yields a 1x1 matrix for two single embeddings; take its only cell.
    similarity = util.cos_sim(embedding_1, embedding_2).cpu().numpy()[0][0]

    # Convert the numpy scalar to a plain float so both return paths have the
    # same type and the score can be stored as an edge confidence without
    # carrying a numpy type into the graph layer.
    return round(float(similarity), 4)


def _cached_node_embedding(node):
    """Return the embedding for ``node.text``, encoding and caching it on first use."""
    if node.uid not in embedding_cache:
        embedding_cache[node.uid] = model.encode(node.text, convert_to_tensor=True)
    return embedding_cache[node.uid]



def cosin_similarity(string_1:str, string_2:str) -> float:
    """Return the cosine similarity between the embeddings of two strings.

    Unlike ``cached_cosin_similarity`` this works on raw text and encodes both
    strings on every call (nothing is cached).

    Args:
        string_1: First text to compare.
        string_2: Second text to compare.

    Returns:
        A built-in ``float`` rounded to 4 decimal places. Exactly ``1.0`` when
        the two strings are equal (no embedding is computed in that case).
    """
    # Identical text is a perfect match; skip the model entirely.
    if string_1 == string_2:
        return 1.0

    # Compute the embeddings for each string.
    embedding_1 = model.encode(string_1, convert_to_tensor=True)
    embedding_2 = model.encode(string_2, convert_to_tensor=True)

    # cos_sim yields a 1x1 matrix for two single embeddings; take its only cell.
    similarity = util.cos_sim(embedding_1, embedding_2).cpu().numpy()[0][0]

    # Convert the numpy scalar to a plain float so both return paths have the same type.
    return round(float(similarity), 4)

## Entity Resolution to Reference Data
Attempt to associate new entities with known objects specified as reference data

In [None]:
# Query KG for Entities from Articles that have not been connected with a REFERS_TO edge.
unreferred_entities = kg.cypher_query("MATCH (a:Entity) WHERE NOT (a)-[:REFERS_TO]->() RETURN (a)","a")
print ("Query found {count} Entity nodes that did not have REFER_TO edges.".format(count=len(unreferred_entities)))

# Query KG for Reference Nodes that might be appropriate to add a REFERS_TO edge.
reference_node_list = kg.get_all_reference_nodes()
print ("Query found {count} Reference nodes those entities might match.".format(count=len(reference_node_list)))

# Number of REFERS_TO edges created by this cell.
count=0
for entity in unreferred_entities:

    # For each entity find the most semantically similar Reference Node of the same type.
    max_score=0.0
    max_record=None
    for ref_node in reference_node_list:
        if entity.type == ref_node.type:
            this_score=cached_cosin_similarity(entity,ref_node)
            # The first same-type candidate is always taken (even with a score <= 0)
            # so max_record is only None when no reference node shares the entity's type.
            if this_score > max_score or max_record is None:
                max_score = this_score
                max_record = ref_node

    # If a candidate exists and is above the threshold, add a REFERS_TO edge.
    # The explicit None check keeps a zero or negative connect_threshold from
    # dereferencing a missing candidate.
    if max_record is not None and max_score > connect_threshold:
        if verbose:
            print ("Linking:\t{entity}\t-[REFERS_TO ({score})]->\t{reference}".format(
                score=max_score,entity=entity.text,reference=max_record.text))

        kg.add_refers_to_edge(entity=entity, reference=max_record, confidence=max_score)
        count+=1

print ("Discovered {count} new REFER_TO edges.".format(count=count))

### Entity Discovery and Resolution to Hypothesized Entities
Attempt to associate any unmatched entities with each other by hypothesizing new objects to which some of them may refer.

In [None]:

# Query KG for Entities from Articles that have not been connected with a REFERS_TO edge.
# The result variable "a" is passed explicitly, matching the identical query in the
# previous cell. NOTE(review): confirm cypher_query's default variable name; this file
# does not show it.
remaining_entities_results = kg.cypher_query("MATCH (a:Entity) WHERE NOT (a)-[:REFERS_TO]->() RETURN (a)","a")
print ("Query found {count} Entity nodes that did not have REFER_TO edges.".format(count=len(remaining_entities_results)))

# uids of entities already placed in a group (as the seed or as a member).
# A set keeps the membership checks in the nested loop constant-time.
resolved = set()
for entity_1 in remaining_entities_results:
    if entity_1.uid in resolved: # skip if it has already been aggregated elsewhere
        continue

    # entity_1 seeds a new candidate group and matches itself with full confidence.
    to_aggregate = [(entity_1, 1.0)]
    resolved.add(entity_1.uid)

    # Collect every still-unresolved entity of the same type that is similar enough to the seed.
    for entity_2 in remaining_entities_results:
        if entity_1.type == entity_2.type and entity_2.uid not in resolved:
            this_score=cached_cosin_similarity(entity_1,entity_2)
            if this_score > hypothesize_threshold:
                to_aggregate.append((entity_2, this_score))
                resolved.add(entity_2.uid)

    # Only invent a reference node when at least one other entity joined the seed.
    if len(to_aggregate) > 1:
        hypothesis=kg.add_reference_node(text=entity_1.text,type=entity_1.type,hypothesized=True)
        print("Asserting:\t{text}: {type}".format(text=hypothesis.text,type=hypothesis.type))

        # Link the seed and every grouped entity to the hypothesized node.
        for entity, score in to_aggregate:
            print ("Hypothesizing:\t{entity}\t-[REFERS_TO ({score})]->\t{reference}".format(score=score,entity=entity.text,reference=hypothesis.text))
            kg.add_refers_to_edge(entity=entity, reference=hypothesis, confidence=score)

# Close down any remote connections

In [None]:
# Close the KnowledgeGraph handle created in the setup cell now that this stage is finished.
kg.close()