# Stage 4: Recommendations using an Embedding Model to Compute Relevance Between Requests and Articles
Attempts to determine how relevant each Content node is to each Request node, and builds Recommendation nodes to store that score for the highly relevant pairs.

The result of this step includes:
- Recommendation nodes, connected to Content nodes with a RECOMMENDS relationship and to Request nodes with a RELATES_TO relationship

In [None]:
import logging
from sentence_transformers import SentenceTransformer, util


## Parameters
OpenTLDR workflows use the notebook block tagged as "parameters" to inject variables (for example to change the recommendation thresholds).

> **Do Not Change Variable Names in the Parameters Block:** you are welcome to change the values of these parameter variables, but please do not change their names. They are used elsewhere in the notebook and in other workflow processes.

In [None]:
#Parameters

# configuration of embedding model
# sentence_embedding_model is the model name handed to SentenceTransformer(...)
sentence_embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
# embedding_similarity_alg selects the scoring function: "dot_product" or "cosine_sim"
embedding_similarity_alg = "dot_product"
#embedding_similarity_alg = "cosine_sim"

# configuration for general filtering
# recommendation_threshold: a recalibrated score must be >= this value to become a candidate
recommendation_threshold = 0.60
# recommendation_topk: maximum number of ranked candidates recommended per Request (-1 keeps all)
recommendation_topk = 5
# use_click_prediction: when True, content is also scored against previously clicked TLDR entries
use_click_prediction = False

# only set to true if you are going to recompute all the recommendations (e.g. debugging)
# when True, every existing recommendation is deleted before processing starts
delete_existing_recommendations = True

# Logging level ranges are (from least to most verbose): ERROR, WARN, INFO, DEBUG
logging_level = logging.INFO

# List of the Requests to process recommendations for
# (None means: process every Request node found in the knowledge graph)
list_of_uids = None

# level of unnecessary output
# (True prints the per-item candidate decisions and progress messages)
verbose = True


## Setup

### Connect to Knowledge Graph (where Requests, Users, and Content reside)

In [None]:
# Apply the verbosity chosen in the parameters block to the "OpenTLDR" logger.
logging.getLogger("OpenTLDR").setLevel(logging_level)

# Domain classes; used below as type annotations for the helper functions.
import opentldr.Domain as domain
from opentldr.Domain import Request, Content, Recommendation, User, Feedback, TldrEntry

from opentldr import KnowledgeGraph
# Single knowledge graph handle shared by all later cells; it is closed in the final cell.
kg=KnowledgeGraph()

### Determine which Requests we need to process (defaults to all)

In [None]:
# No explicit list was supplied by the parameters block: fall back to the uid of
# every node tagged 'Request' in the knowledge graph.
if list_of_uids is None:
    list_of_uids = kg.get_all_node_uids_by_tag('Request')

# Report how many Requests are about to be processed.
if verbose:
    print("Found {} Request nodes to process.".format(len(list_of_uids)))

### Check if we should remove existing recommendations

In [None]:
# Start from a clean slate when requested; otherwise recommendations created
# below are added on top of whatever is already stored in the knowledge graph.
if delete_existing_recommendations:
    kg.delete_all_recommendations()

# Candidate Generation
- This notebook follows the "Two Towers" model for candidate generation.
- It uses the same Text Embedding Model for each "Tower".
- The similarity (either 'cosine_sim' or 'dot_product') compares the embeddings.
- If the similarity score is greater than or equal to the provided threshold, the item becomes a candidate.


## Embedding Model(s)
In this case, the same simple SBERT/Sentence Transformer model is used for both the Content and Request text.

In [None]:
# One shared embedding model, loaded once and used by every "tower" function below.
model = SentenceTransformer(sentence_embedding_model)

# Tower One
# Items in OpenTLDR are Content nodes - their title and text are what gets embedded for recommending.

content_embedding_cache = {}    # content uid -> embedding, so each Content node is encoded at most once

def item_embedding(content:Content):
    """Return the embedding tensor for a Content node, encoding it only on first use.

    The text that gets embedded is the node's title and text joined by a newline.
    Results are memoized in ``content_embedding_cache`` keyed by ``content.uid``,
    because the same Content nodes are scored again for every Request.
    """
    uid = content.uid
    if uid not in content_embedding_cache:
        content_embedding_cache[uid] = model.encode(content.title+"\n"+content.text, convert_to_tensor=True)
    return content_embedding_cache[uid]

# Tower Two
# We don't use "user" because the "request" separates interests, and users can have multiple very different requests.

def user_embedding(user:User, request:Request):
    """Return the embedding tensor of the Request's text.

    ``user`` is accepted for symmetry with the two-tower terminology but is not
    read here; only ``request.text`` is encoded.
    """
    request_text = request.text
    return model.encode(request_text, convert_to_tensor=True)


# Another, hey wait, that's Three Towers...
# This is used to get embeddings of summaries that the user has previously
# clicked on in TLDRs based on this request. So, a click history...
def history_embedding(feedback:Feedback):
    """Return the embedding tensor of the TLDR entry a Feedback node is about.

    The entry is looked up through ``feedback.about_entry.single()`` and the
    embedded text is its title and summary joined by a newline.
    """
    tldr_entry:TldrEntry = feedback.about_entry.single()
    return model.encode(tldr_entry.title+"\n"+tldr_entry.summary, convert_to_tensor=True)

## Similarity Score

Options supported here (by setting embedding_similarity_alg) are dot_product and cosine_sim.

In [None]:
def similarity(embedding_1, embedding_2) -> float:

        #compute the cosin similarity of the two embeddings
        match (embedding_similarity_alg.lower()):
            case ('dot_product'):
                return util.dot_score(embedding_1, embedding_2).cpu().numpy()[0][0]
            case ('cosine_sim'):
                return util.cos_sim(embedding_1, embedding_2).cpu().numpy()[0][0]
             
            case _:
                logging.warning(f"No embedding similarity function found for {embedding_similarity_alg}, returning 0.0.")
                return 0.0

## Candidate Filtering
- Using Similarity to determine if a combination is a candidate or not.
- In this case, we just compare it to the threshold

In [None]:
def is_candidate (user:User, request:Request, content:Content, similarity_score:float) -> bool:
    """Decide whether a scored Content item is a recommendation candidate.

    Only ``similarity_score`` is inspected: the item qualifies when the score is
    greater than or equal to ``recommendation_threshold``. ``user``, ``request``
    and ``content`` are accepted but not used by this simple filter.
    """
    return similarity_score >= recommendation_threshold

# Ranking
- It is not clear that any Top-K filter makes sense for the TLDR problem, so you can set it to -1 to allow everything.
- Ranking is simply a sorting by the scores previously computed, more complicated approaches might do better.


In [None]:
def rank (content_scores:dict, topk:int = -1) -> list:
    """Order scored items from best to worst and optionally keep only the best few.

    Args:
        content_scores: mapping of item key to its score.
        topk: maximum number of items to return. ``-1`` (the default), any other
            negative value, or ``None`` means "no limit". ``0`` returns an empty list.

    Returns:
        A list of ``(key, score)`` tuples sorted by descending score. Items with
        equal scores keep the order in which they appear in ``content_scores``.
    """
    ranked = sorted(content_scores.items(), key=lambda pair: pair[1], reverse=True)

    # Treat every negative value like -1: slicing with e.g. [:-2] would silently
    # drop the lowest-ranked items instead of disabling the limit.
    if topk is None or topk < 0:
        return ranked

    return ranked[:topk]

### Recalibrate Scores

- history_recalibrate_score: If there is feedback that adjusted a previous recommendation score, and
this node was a lot like that one, try to apply a similar adjustment to this score as well.

- simple_recalibrate_score: Give the score a small boost, since raw embedding similarity scores are usually low.

In [None]:
def history_recalibrate_score (feedback:Feedback, score:float) -> float:
    """Shift a score by the same amount a user shifted an earlier, similar one.

    The adjustment is ``feedback.score - entry.score``, where ``entry`` is the
    TLDR entry the feedback is about. The adjusted score is clamped to the
    range 0.0 to 1.0.

    Args:
        feedback: feedback node; a ``feedback.score`` of -1 leaves ``score``
            untouched (presumably meaning "no rating given" - confirm against
            the Feedback model).
        score: the score to adjust.

    Returns:
        The adjusted, clamped score, or ``score`` unchanged when the feedback
        score is -1.
    """
    if feedback.score == -1:
        return score

    # adjusts this score similar to how a user adjusted similar score.
    entry:TldrEntry = feedback.about_entry.single()
    diff = feedback.score - entry.score
    # Diagnostic only: goes to the notebook's logger instead of being printed on every call.
    logging.getLogger("OpenTLDR").debug("Feedback score adjustment: %s", diff)

    # ensure we stay in range
    return min(max(score + diff, 0.0), 1.0)

def simple_recalibrate_score (score:float) -> float:
    """Rescale a similarity score by adding 1.0 and halving the result.

    Embedding similarity tends to be low, so this lifts it a bit: a score of
    -1.0 becomes 0.0, 0.0 becomes 0.5 and 1.0 stays 1.0.
    """
    shifted = score + 1.0
    return shifted / 2.0


### Average the distance relevance scores for each Article based on its neighbors 

# Process Each Query in the System

In [None]:
# Items to process (all)
content_uids = kg.get_all_node_uids_by_tag('Content')

# Loop thru Requests
for request_uid in list_of_uids:
    request = kg.get_request_by_uid(request_uid)
    user = request.get_requested_by()

    print(f"\nRequest: {request.title} by {user.name}")
    request_embedding = user_embedding(user,request)

    # Get any user clicked or rated content for this request
    feedback_nodes = kg.get_feedback_by_request(request)

    # Click history embeddings depend only on the request's feedback, not on the
    # content being scored, so encode them once here instead of once per Content node.
    clicked_embeddings = []
    if use_click_prediction:
        clicked_embeddings = [
            history_embedding(feedback)
            for feedback in feedback_nodes
            # feedback.click_date is set if the user clicked thru to source content
            if feedback.click_date is not None
        ]

    # Perform Candidate Generation
    content_scores=dict()

    # Loop thru items and score them
    for content_uid in content_uids:
        content = kg.get_content_by_uid(content_uid)
        content_embedding = item_embedding(content)

        similarity_score = similarity(request_embedding, content_embedding)
        recommendation_score:float = simple_recalibrate_score(similarity_score)

        # Click Prediction - is this content more similar to other content
        # previously clicked on for this request? (empty unless use_click_prediction)
        for clicked_embedding in clicked_embeddings:
            feedback_similarity = simple_recalibrate_score(similarity(clicked_embedding, content_embedding))
            # take the best score and use that
            if feedback_similarity > recommendation_score:
                recommendation_score = feedback_similarity

        if is_candidate(user, request, content, recommendation_score):
            content_scores[content_uid]= recommendation_score
            if verbose:
                print("\tIS a candidate:\t {} ({})".format(content.title, str(recommendation_score)))
        elif verbose:
            print("\tNOT a candidate:\t {} ({})".format(content.title, str(recommendation_score)))

    if verbose:
        print("\nRanking:")

    # Rank the item candidates
    # Assumption is that there are too many Content nodes (with text) to cache the objects
    # in memory, so only uids were kept above and each recommended node is fetched again here.
    for content_uid, recommendation_score in rank(content_scores, recommendation_topk):
        content = kg.get_content_by_uid(content_uid)
        print("\tRecommending:\t {} ({})".format(content.title, str(recommendation_score)))

        # Ultimately THIS is the call any recommender needs to make for each content node (i.e., item) recommended.
        kg.add_recommendation(request=request,content=content,score=recommendation_score)


In [None]:
# Done with the knowledge graph: close the handle opened in the Setup section.
kg.close()