# Enrich Content


In [None]:
import logging
import os
from datetime import date

from dotenv import load_dotenv
load_dotenv()

## Parameters

In [None]:
#Parameters

# Where updated data exists. os.getenv returns None when a variable is unset,
# so nothing in this cell may assume these are strings.
live_data_repo_path = os.getenv('LIVE_DATA_REPO_PATH')
media_cache_path = os.getenv('MEDIA_CACHE_PATH')

live = False
today = str(date.today())

# Data repository to import Content from before enriching.
# None means "do not import; enrich the Content nodes already in the KG".
# To import from files instead, replace the value below with ONE of:
#   {'repo_type': 'files', 'path': os.path.join(live_data_repo_path, "arxiv", today)}
#   {'repo_type': 'files', 'path': live_data_repo_path}
# (Both require LIVE_DATA_REPO_PATH to be set; they are kept as comments so an
# unset variable cannot crash this cell while the effective value is None.)
data_repo_config = None

# Logging level ranges are (from least to most verbose): ERROR, WARN, INFO, DEBUG
logging_level = logging.INFO

# List of the UIDs to ingest (None = decide later from the repo import or the KG)
list_of_uids = None

# level of unnecessary output
verbose = True


## Setup

In [None]:
# Apply the configured verbosity to the "OpenTLDR" logger.
logging.getLogger("OpenTLDR").setLevel(logging_level)

from opentldr.Domain import Content
# NOTE(review): wildcard import — presumably the source of helpers used further
# down without an explicit import (e.g. process_text, fetch_html); verify.
from opentldr.ContentEnrichment import *
from opentldr import KnowledgeGraph, DataRepo

# Knowledge-graph handle used by the cells below.
kg=KnowledgeGraph()


In [None]:
# Optionally import Content from the configured data repository.
if data_repo_config is None:
    # No repository configured: nothing is imported in this cell.
    print("No Data Repo configured, will attempt to enrich all existing Content nodes in KG.")
else:
    repo = DataRepo(kg, data_repo_config)

    if verbose:
        repo_description = repo.describe()
        print("Loading Content from: {}".format(repo_description))

    # importData() supplies the UIDs that the following cells will process.
    list_of_uids = repo.importData()
    imported_count = len(list_of_uids)
    print("Loaded {count} articles from the repository.".format(count=imported_count))

In [None]:
# When no explicit UID list exists yet, process every node tagged 'Content'.
if list_of_uids is None:
    list_of_uids = kg.get_all_node_uids_by_tag('Content')

if verbose:
    # These are Content node UIDs (the previous message said "Request nodes").
    print("Found {} Content nodes to process.".format(len(list_of_uids)))

## Newer Version

In [None]:
from opentldr.Domain import Content
from EnrichedPdf import EnrichedPdf

use_html = False
cache_full_content = False

# NOTE(review): content_uids is not read anywhere in this cell (the loop below
# iterates list_of_uids); kept in case a later cell relies on it — confirm and
# drop the extra KG query if not.
content_uids = kg.get_all_node_uids_by_tag('Content')

# Enrich every 'technical paper' Content node; other content types are skipped.
for uid in list_of_uids:
    try:
        content_node = kg.get_content_by_uid(uid)

        # The type is set in the update when content is added to the repo.
        if content_node.type != 'technical paper':
            continue

        print ("Found Repo#{} entitled: {}".format(content_node.metadata["repo_uid"],content_node.title))

        enriched_pdf = EnrichedPdf(kg, content_node, media_cache_path)
        enriched_pdf.process()

        print ("Processed into KG off Content Node: {}".format(content_node.uid))

    except Exception as error:
        # Best-effort batch: report the failing UID and carry on with the rest.
        print ("Error processing Content {}: {}".format(uid, repr(error)))


In [None]:
# NOT USED RIGHT NOW

def get_html_content(content:Content) -> "str | None":
    """Return the processed HTML text for *content*, or None if unavailable.

    Two sources are tried in order:

    1. The cached file ``<data_repo_config["path"]>/<content.date>/media/<repo_uid>.html``.
    2. The URL stored in ``content.metadata["full_content_html"]``, used
       whenever the cache yields nothing (file absent, unreadable, blocked,
       or no data repo configured).

    Side effect: when either source contains a reCAPTCHA / "HTML is not
    available" page, ``content.metadata['full_content_html']`` is set to None
    and ``content.save()`` is called.

    Errors raised while fetching the URL are not caught here.
    """
    blocked_markers = ("reCAPTCHA", "HTML is not available for the source.")

    def is_blocked(text):
        # True when the text is a block/placeholder page rather than the paper.
        return any(marker in text for marker in blocked_markers)

    def mark_html_unavailable():
        # Record on the node that no usable HTML exists for it.
        content.metadata['full_content_html']=None
        content.save()

    html_text = None

    # --- 1) cached copy -------------------------------------------------
    # Deliberately broad: any problem with the cache (no repo configured,
    # missing metadata, unreadable file) just means "use the fallback".
    try:
        media_file = "{}.html".format(content.metadata["repo_uid"])
        media_path = os.path.join(data_repo_config["path"],content.date,"media",media_file)
        if os.path.exists(media_path):
            # NOTE(review): relies on the platform default text encoding —
            # confirm it matches whatever wrote the cache file.
            with open(media_path, 'r') as file:
                cached_text = process_text(file.read())
            if is_blocked(cached_text):
                mark_html_unavailable()
                raise Exception("Hit reCAPTCHA or missing file")
            html_text = cached_text
            print ("Loaded {n} lines from file: {file}".format(file=media_path,n=len(html_text)))
    except Exception as e:
        html_text = None
        print("Failed to load cached document: {}".format(repr(e)))

    if html_text is not None:
        return html_text

    # --- 2) fallback: fetch from the recorded URL -------------------------
    # Runs on every cache miss, not only when the cache attempt raised.
    if "full_content_html" not in content.metadata:
        return None

    html_url = content.metadata["full_content_html"]
    if html_url is None:
        # Already marked as unavailable (possibly just above); nothing to fetch.
        return None

    html_text = process_text(str(fetch_html(html_url)))
    if is_blocked(html_text):
        mark_html_unavailable()
        return None
    if verbose:
        print ("Loaded {n} lines from URL: {url}".format(url=content.url,n=len(html_text)))

    #if cache_full_content:
    #    file_name = os.path.join("./TEMP_HTML","{}.txt".format(content.metadata["repo_uid"]))
    #    with open(file_name, 'w') as f:
    #        f.write(html_text)

    return html_text
