# Summarize Content Nodes WITHOUT focus
Uses LLM summarization on the content text only and saves the result as metadata on the content node.


## Setup

In [None]:
import os
import logging
import re
import json
import dotenv

# Load environment variables from the .env file. This must run before the
# parameters cell, which reads CHATGPT_API_KEY from the environment.
dotenv.load_dotenv()

## Parameters
OpenTLDR workflows use the notebook block tagged as "parameters" to inject variables (for example to change the LLM model).

> **Do Not Change Variable Names in the Parameters Block.** You are welcome to change the values of these parameter variables, but please do not change their names. They are used elsewhere in the notebook and in other workflow processes.

In [None]:
#Parameters

# If you set the llm_config to None, it will use the environment variable LLM_CONFIG
# Otherwise, here are some options (to run an LLM locally, you will need to download the model to your local machine)
# llm_config = {'type': 'GPT4ALL', 'device':'gpu', 'model':'../LLM_Models/mistral-7b-openorca.gguf2.Q4_0.gguf'}
# llm_config = {'type': 'Ollama', 'model':'mistral:latest'}
# llm_config = {'type': 'ChatGPT', 'model':'gpt-4'}
llm_config = None

# Prompt template sent to the LLM. The `{content}` placeholder is filled in
# with the content text via str.format(), and the formatted prompt is
# stripped of leading/trailing whitespace before it is used.
llm_prompt = '''
    Write a concise summary of this content:\n {content}
    '''
# For ChatGPT, you need to set the environment variable CHATGPT_API_KEY
# (it can be placed in the .env file loaded during setup).
chatgpt_api_key = os.getenv("CHATGPT_API_KEY")

# Logging level ranges are (from least to most verbose): ERROR, WARN, INFO, DEBUG
# This level is applied to the "OpenTLDR" logger and passed to the summarizer.
logging_level = logging.INFO

# List of the unique IDs (UIDs) of the Content nodes to summarize.
# When left as None, every node tagged "Content" in the knowledge graph is used.
list_of_uids = None

# Level of unnecessary output: when True, the notebook prints extra progress
# information (for example, how many Content nodes were found).
verbose = True

In [None]:
# Apply the verbosity chosen in the parameters cell to the "OpenTLDR" logger.
logging.getLogger("OpenTLDR").setLevel(logging_level)

import opentldr.Domain as domain
from opentldr import KnowledgeGraph

# Knowledge graph handle used by every later cell to look up, delete, and
# finally close; it is released by kg.close() in the last cell.
# NOTE(review): presumably connection settings come from the environment — confirm.
kg=KnowledgeGraph()

### Load Content Nodes

In [None]:
# When no explicit UID list was injected, fall back to every node in the
# knowledge graph that carries the "Content" tag.
list_of_uids = (
    kg.get_all_node_uids_by_tag("Content") if list_of_uids is None else list_of_uids
)

# Report how many nodes will be considered (only in verbose mode).
if verbose:
    node_count = len(list_of_uids)
    print("Found {} Content nodes to attempt pre-summarization (i.e., untailored).".format(node_count))

## Run an LLM Model
This notebook uses the `Summarizer` class to run an LLM model. 
You can set the LLM model by changing the `llm_config` variable in the parameters block above or setting LLM_CONFIG in the .env file or environment variable.

In [None]:
import Summarizer

# Build the summarizer from the configured LLM settings; `llm_config` may be
# None, in which case the LLM_CONFIG environment variable is used instead
# (see the parameters cell). The logging level is forwarded to the summarizer.
# NOTE(review): `Summarizer` here is the imported module, so the annotation
# below names a module rather than a class — confirm the intended type.
llm:Summarizer = Summarizer.getSummarizer(llm_config, logging_level=logging_level)

## Build the prompt and run the LLM
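Because this notebook summarizes WITHOUT focus, the prompt includes only the original content text; the request it would be tailored for and the explanation of the shortest path through the knowledge graph used to connect them are not part of the prompt here.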

In [None]:

# Matches every character that is NOT a plain ASCII letter, digit, space or
# newline. Used to strip "extra" characters that may have crept into content
# during collection before retrying a summarization.
_EXTRA_CHARS = re.compile(r'[^a-zA-Z0-9 \n]')

# Content text or LLM output shorter than this many characters is unusable.
_MIN_USABLE_LENGTH = 10

# A stored summary longer than this is accepted as already done; anything
# shorter is assumed to be a failed attempt and is regenerated.
_MIN_EXISTING_SUMMARY_LENGTH = 60

# For each Content node: skip it if it already has a usable summary, delete it
# if it has no usable text, otherwise summarize the text with the LLM and store
# the result under metadata["summary"].
for content_uid in list_of_uids:
    content = kg.get_content_by_uid(content_uid)
    original = content.text

    if content.metadata is None:
        content.metadata = dict()
    elif "summary" in content.metadata:
        if len(content.metadata["summary"]) > _MIN_EXISTING_SUMMARY_LENGTH:
            if verbose:
                print("... already summarized...")
            continue
        # The stored summary is suspiciously short: assume something was wrong
        # with the text and retry with the extra characters stripped out.
        # Only use the stripped text if enough of it survives; otherwise
        # (e.g. non-ASCII content) keep the raw text, so that the deletion
        # check below is never triggered by the stripping itself.
        stripped = _EXTRA_CHARS.sub('', original)
        if len(stripped) >= _MIN_USABLE_LENGTH:
            original = stripped

    # Content with (almost) no text cannot be summarized; remove it from the
    # knowledge graph. Always reported, since this is destructive.
    if len(original) < _MIN_USABLE_LENGTH:
        print("Deleting: {}".format(content.to_text()))
        kg.delete_content(content)
        continue

    if verbose:
        print("Summarizing... {t}\t{s}".format(t=content.title, s=len(content.text)))

    prompt_text = llm_prompt.format(content=original).strip()
    summary = llm.summarize(prompt_text)

    # if it fails try it again without extra characters that might creep into content during collection
    if len(summary) < _MIN_USABLE_LENGTH:
        logging.warning("Summarization failed for this content, trying without extra characters.")
        summary = llm.summarize(_EXTRA_CHARS.sub('', prompt_text))

    if len(summary) < _MIN_USABLE_LENGTH:
        logging.error("Pre-Summarization too short.")

    # Report the summary that is actually stored (i.e. after any retry), with
    # its length relative to the text that was summarized.
    if verbose:
        print("summary ({reduction}):\t{text}".format(reduction=round(len(summary)/len(original),3),text=summary))

    # The result is saved even when it is too short: the next run will see the
    # short summary and retry this content with stripped text.
    content.metadata["summary"] = summary
    content.save()

    if verbose:
        print("\n")

In [None]:
# Release the knowledge graph handle created in the setup cell now that all
# content nodes have been processed.
kg.close()