In [None]:
import os
import logging
from datetime import date

from dotenv import load_dotenv
load_dotenv()

from opentldr import Workflow

## Specify the Workflow in code
The workflow includes:
- **Output**: the directory where the workflow writes read-only copies of the notebooks as executed
- **Common**: parameters that are passed into every notebook in the workflow
- **Notebooks**: a list of notebooks (with their paths) in the order in which they should be executed
    - For each notebook, the set of parameters to be passed into it through the workflow process

### Parameters
- use_local_files (boolean) - sets the data repo to either local (true) or S3 (false)
- data_directory (path as string) - if using a local directory, which directory should be used.

Note: regardless of where the content is pulled from, the reference data .csv files need to be in the directory specified by `reference_data_path` (set via the `REFERENCE_DATA_PATH` environment variable).

In [None]:
# Parameters

# True: build data repo configs for the local file system; False: build them for S3
use_local_files = True

# When True, data is read from a per-day sub-folder: data_directory/<today>/<folder>
live = False
# Date string used as that per-day sub-folder name
today = str(date.today())

# The main directory for loading data for this notebook (override with LIVE_DATA_REPO_PATH)
data_directory = os.getenv("LIVE_DATA_REPO_PATH", "../Data/TechINT")
# Directory holding the reference data CSV files (override with REFERENCE_DATA_PATH)
reference_data_path = os.getenv("REFERENCE_DATA_PATH", "../Data/TechINT/reference")
# Directory handed to the enrichment notebook as its media cache (override with MEDIA_CACHE_PATH)
media_cache_path = os.getenv("MEDIA_CACHE_PATH","../Data/TechINT/MediaCache" )

# Device value placed into every llm_config in the workflow
llm_device = "cpu"


In [None]:
# Echo the resolved directories, one per line, so the executed notebook records them
print(data_directory, reference_data_path, media_cache_path, sep="\n")

### Generate a Data Repo Config
The `get_data_repo_config(folder)` function produces a data repo configuration for either a local path or an S3 bucket. This allows parameters to change the data source without needing to edit the actual workflow content.

In [None]:
def get_data_repo_config(folder:str, today:"str | None" = None) -> dict:
    '''
    Build the data_repo_config dict for one sub-folder of the data repository.

    The result is driven by three notebook-level parameters:
      - data_directory: root path (or S3 prefix root) of the data
      - live: when true, a per-day folder is inserted between root and folder
      - use_local_files: true -> local files config, false -> S3 config

    Args:
        folder: name of the sub-folder (e.g. 'request', 'content', 'evalkey').
        today: date string used as the per-day folder when live is true.
            When omitted, the current date is looked up at call time, so a
            long-running kernel does not keep using the date on which this
            cell was first executed.

    Returns:
        {'repo_type': 'files', 'path': ...} for local files, otherwise
        {'repo_type': 's3', 'bucket': ..., 'aws_access_key_id': ...,
         'aws_secret_access_key': ..., 'prefix': ...} with the bucket and
        credentials read from the S3_BUCKET, S3_ACCESS_KEY_ID and
        S3_SECRET_KEY environment variables (None when a variable is unset).
    '''
    if live:
        # Resolve the default here rather than in the signature: a default
        # expression in the signature is evaluated only once, at definition.
        if today is None:
            today = str(date.today())
        path = str(os.path.join(data_directory, today, folder))
    else:
        path = str(os.path.join(data_directory, folder))

    if use_local_files:
        return {
            'repo_type': 'files',
            'path': path
        }

    # NOTE(review): os.path.join uses the OS path separator, so on Windows the
    # S3 prefix would contain backslashes - confirm this is only run on POSIX.
    return {
        'repo_type': 's3',
        'bucket': os.getenv("S3_BUCKET"),
        'aws_access_key_id': os.getenv("S3_ACCESS_KEY_ID"),
        'aws_secret_access_key': os.getenv("S3_SECRET_KEY"),
        'prefix': path
    }

## Define the Workflow

In [None]:
# Workflow specification handed to opentldr.Workflow: an output directory,
# parameters common to every notebook, and an ordered list of
# [notebook path, notebook-specific parameters] pairs.
workflow = {

    # Where a read-only version of the notebook AFTER execution is stored
    "Output": "../READ_ONLY_OUTPUT_TECHNINT",
  
    # Parameters passed into all notebooks in workflow
    "Common": {
        "logging_level":logging.INFO,
        "verbose": True,
    },

    # Order and parameters of notebooks to execute in workflow
    "Notebooks": [

        # Erases the entire KG - so only do this when you intend to
        ["../Stage_1_Initialize/Clear_All.ipynb",{}],

        # Load up a Reference Data KG that is relevant to the arXiv dataset
        ["../Stage_1_Initialize/Load_CSV_Reference_Data.ipynb",{
            "keyword_to_concept_csv_file": os.path.join(reference_data_path,"keywords.csv"),
            "concept_to_concept_csv_file": os.path.join(reference_data_path,"concepts.csv"),
        }],

        # Load up the set of example Requests (active data)
        ["../Stage_2_Ingest/Load_Requests.ipynb",{
            "data_repo_config": get_data_repo_config('request', today),
        }],

        # Load up the set of example Content (also active data)
        ["../Stage_2_Ingest/Load_Content.ipynb",{
            "data_repo_config": get_data_repo_config('content', today),
        }],

        # Enrich the Content ingested by the previous step, using the media cache directory
        ["../Collectors/Enrich_Content.ipynb",{
            # The two uid parameters below are currently disabled
            #"content_uids": [],
            #"request_uids": [],
            "media_cache_path": media_cache_path,
            "data_repo_config": None,   # content was already ingested
        }], 

        # Generate stand-alone untailored summaries ONLY NEEDED for multi-doc comparisons in digger
        [ "../Stage_5_Summarize/Presummarize.ipynb",{
            "llm_config" : {'type': 'GPT4ALL', 'device':llm_device, 'model':'../LLM_Models/mistral-7b-openorca.gguf2.Q4_0.gguf'}
        }],

        # Make the connections between active data and reference data
        [ "../Stage_3_Connect/Entity_Cosin_Similarity.ipynb",{
            "sentence_embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
            "connect_threshold": 0.25,
            "hypothesize_threshold": 0.80
        }],

        # Find paths through the KG and determine what should be recommended
        [ "../Stage_4_Recommend/Shortest_Path_Scoring.ipynb", {
            "recommendation_threshold": 0.75
        }],

        # Create summaries for content that has been recommended 
        # (same llm_config as the Presummarize step above)
        [ "../Stage_5_Summarize/Tailored_Abstractive_Summary.ipynb",{
            "llm_config" : {'type': 'GPT4ALL', 'device':llm_device, 'model':'../LLM_Models/mistral-7b-openorca.gguf2.Q4_0.gguf'},
            "llm_prompt": "Respond to the request: {request} while concisely summarizing this technical paper: {content}. Use the following facts: {knowledge}",
        }],

        # Build this into a TLDR structure (basically Tldr and TldrEntries)
        ["../Stage_6_Produce/Build_TLDR.ipynb",{
        }],

        # If there are any evalkeys for this data, apply them to see how it scores
        ["../Stage_7_Evaluate/Evaluate.ipynb",{
            "data_repo_config": get_data_repo_config('evalkey', today),
            "sentence_embedding_model": "sentence-transformers/all-MiniLM-L6-v2"
        }],
    ]}

In [None]:
# Wrap the workflow specification in an opentldr Workflow object and run it.
# NOTE(review): presumably run() executes the listed notebooks in order - confirm against opentldr.
wf:Workflow = Workflow(workflow)
wf.run()