In [None]:
import os
from datetime import date

from opentldr import Workflow

## Specify the Workflow in code
The workflow includes:
- **ONE TIME**: This block wipes the KG and resets it (only run if parameter "initialize" is set to True).
- **Daily**: this block simulates the daily live run.

### Parameters
- initialize (boolean) - determines if the KG gets wiped at start (true) or not (false).
- today (string) - whatever date represents today formatted as "YYYY-MM-DD".
- days_to_run (list of date strings) - will run each day in the order given; `today` is appended automatically if it is not already in the list.
- use_local_files (boolean) - sets the data repo to either local (true) or S3 (false).
- data_directory (path as string) - if using a local directory, which directory should be used.

Note: regardless of where the content is pulled from, the reference data .csv files need to be in the directory specified by the data_directory parameter.

In [None]:
# Parameters

# True wipes and re-initializes the KG before the daily runs; False leaves it alone
initialize = False

# Today's date as an ISO "YYYY-MM-DD" string, and the ordered list of days to run
today = date.today().isoformat()
days_to_run = list()

# Where data is loaded from: local files (True) or S3 (False),
# and the main data directory used by this notebook
use_local_files = True
data_directory = "../Data/TechInt"

In [None]:
# Build the final list of dates to run TLDRs for:
# today's date is added to the end unless it was already requested.

already_scheduled = today in days_to_run
if not already_scheduled:
    days_to_run.append(today)
print("Running: ", days_to_run)

In [None]:
def get_data_repo_config(folder:str, today:str=None):
    '''
    Build the data_repo_config dict for a subfolder of data_directory.

    Args:
        folder: name of the subfolder to point at (e.g. 'request').
        today: optional date string; when given, the subfolder is resolved
            under data_directory/<today>/ rather than directly under
            data_directory.

    Returns:
        A 'files' repo config with the resolved path when use_local_files is
        truthy; otherwise an 's3' repo config that uses the resolved path as
        the prefix and reads the bucket and credentials from the S3_BUCKET,
        S3_ACCESS_KEY_ID and S3_SECRET_KEY environment variables.
    '''
    # Path segments are data_directory[/today]/folder
    if today is None:
        segments = (data_directory, folder)
    else:
        segments = (data_directory, today, folder)
    path = os.path.join(*segments)

    # Guard clause for the remote case; local files are the fall-through
    if not use_local_files:
        return {
            'repo_type': 's3',
            'bucket': os.getenv("S3_BUCKET"),
            'aws_access_key_id': os.getenv("S3_ACCESS_KEY_ID"),
            'aws_secret_access_key': os.getenv("S3_SECRET_KEY"),
            'prefix': path
        }

    return {
        'repo_type': 'files',
        'path': path
    }

### One-Time Initialization


In [None]:
if initialize:

    # One-time setup workflow: wipe the KG, then load reference data and requests.
    # Notebook and output paths use the same "../" prefix as data_directory and
    # the Daily workflow so that everything resolves against the same parent
    # directory.
    # NOTE(review): assumes Workflow resolves these paths relative to this
    # notebook's working directory — confirm.
    workflow = {
        # Where a read-only version of each notebook AFTER execution is stored
        "Output": "../READ_ONLY_OUTPUT_TECHINT/SETUP",

        "Notebooks": [
            # Erases the entire KG - so only do this when you intend to
            ["../Stage_1_Initialize/Clear_All.ipynb",{}],

            # Load the reference data (keyword and concept CSV files) from the
            # "reference" subfolder of data_directory
            ["../Stage_1_Initialize/Load_CSV_Reference_Data.ipynb",{
                "keyword_to_concept_csv_file": os.path.join(data_directory,"reference","keywords.csv"),
                "concept_to_concept_csv_file": os.path.join(data_directory,"reference","concepts.csv"),
            }],

            # Load the set of example Requests from the 'request' subfolder
            # (local files or S3, depending on use_local_files)
            ["../Stage_2_Ingest/Load_Requests.ipynb",{
                "data_repo_config": get_data_repo_config('request'),
            }],
        ]}

    print ("Initializing the KG now...")
    wf = Workflow(workflow)
    wf.run()

### Daily

In [None]:
# Run the full daily pipeline once per scheduled day, in the order given.
# The loop variable is deliberately NOT named `today`, so the `today` parameter
# is not overwritten when this cell runs.
# NOTE(review): the data_repo_config paths below are hardcoded to ../Data/Sample
# and do not use data_directory / use_local_files / get_data_repo_config —
# confirm whether that is intended.
for run_day in days_to_run:
    print("Running for {}".format(run_day))

    workflow = {

        # Where a read-only version of the notebook AFTER execution is stored
        "Output": "../READ_ONLY_OUTPUT_TECHINT/{}".format(run_day),

        # Parameters passed into all notebooks in workflow
        "Common": {
            # presumably a Python logging level (10 == logging.DEBUG) — confirm
            "logging_level":10,
            "verbose": True,
        },

        # Order and parameters of notebooks to execute in workflow
        "Notebooks": [

            # Setup the KG
            [ "../Stage_1_Initialize/Clear_Live_Content.ipynb", {}],

            # Load Content and Requests
            [ "../Stage_2_Ingest/Load_Content.ipynb", {
                "data_repo_config": {'repo_type': 'files', 'path': '../Data/Sample/content'},
            }],

            [ "../Stage_2_Ingest/Load_Requests.ipynb", {
                "data_repo_config": {'repo_type': 'files', 'path': '../Data/Sample/request'},
            }],

            # Perform Analytics to link entities in Requests and Content nodes
            [ "../Stage_3_Connect/Entity_Cosin_Similarity.ipynb",{
                "sentence_embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
                "connect_threshold": 0.6,
                "hypothesize_threshold": 0.9
            }],

            # Compute recommendations based on relevance of content to request
            [ "../Stage_4_Recommend/Shortest_Path_Scoring.ipynb", {
                "recommendation_threshold": 0.6
            }],

            # Generate a summary of the content that is tailored with respect to the request and useful reference knowledge
            [ "../Stage_5_Summarize/Tailored_Abstractive_Summary.ipynb", {
                "llm_model_path": "../LLM_Models/mistral-7b-openorca.gguf2.Q4_0.gguf",
                "llm_prompt": "You are a helpful assistant responding to the request: {request} \n\n and were given these facts: {knowledge} \n\n Concisely summarize the following article: {content}"
            }],

            # Produce a TLDR Report for each request
            [ "../Stage_6_Produce/Build_TLDR.ipynb", {}],

            # Run the Evaluation
            [ "../Stage_7_Evaluate/Evaluate.ipynb", {
                "data_repo_config": {'repo_type': 'files', 'path': '../Data/Sample/evalkey'},
                "sentence_embedding_model": "sentence-transformers/all-MiniLM-L6-v2"
            }]
        ]}


    wf:Workflow = Workflow(workflow)
    wf.run()

In [None]:
# NOTE(review): this cell runs unconditionally after the daily loop and points at
# "Step_*" notebooks and an arxiv content path, unlike the "Stage_*" notebooks
# used by the cells above — it looks like a leftover from another notebook;
# confirm that it belongs here.

# First notebook: Step_1_Ingest, given a 'files' repo config
ingest_step = ["Step_1_Ingest.ipynb", {
    "active_data_repo_config": {'repo_type': 'files', 'path': "./arxiv/content/"},
}]

# Second notebook: Step_2_Connect, given the embedding model and two thresholds
connect_step = ["Step_2_Connect.ipynb", {
    "sentence_embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
    "threshold_similarity_connect": 0.6,
    "threshold_similarity_hypothesize": 0.9,
}]

# Assemble the workflow description and execute it
workflow = {
    "Output": "./READ_ONLY_OUTPUT/PREP",
    "Notebooks": [ingest_step, connect_step],
}

wf: Workflow = Workflow(workflow)
wf.run()