# Setup experiments 
This notebook helps users create a dedicated space in the bucket, populated with a selection of pdfs, so that they can run their own instance of the demo.

In [1]:
# Author: ALLIANZ NLP esg data pipeline
import os
import pathlib
from dotenv import load_dotenv
from src.data.s3_communication import S3Communication
import tempfile
import ipywidgets as widgets
from IPython.display import display

In [2]:
# Load credentials.
# The directory holding credentials.env comes from CREDENTIAL_DOTENV_DIR,
# falling back to PWD and finally to "/opt/app-root/src".
dotenv_dir = os.getenv("CREDENTIAL_DOTENV_DIR", os.getenv("PWD", "/opt/app-root/src"))
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")
if dotenv_path.exists():
    # Only load the file when it is present; otherwise the variables already
    # in the environment are used as they are.
    load_dotenv(dotenv_path=dotenv_path, override=True)

# S3 connector for the bucket with the source data; every setting is read
# from the environment (None is passed for any variable that is not set).
s3c = S3Communication(
    s3_endpoint_url=os.environ.get("S3_ENDPOINT"),
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    s3_bucket=os.environ.get("S3_BUCKET"),
)

### Select the source where all the pdfs are stored

In [3]:
# Common bucket prefix of the demo's pipeline data.
SOURCE_DATA_PREFIX = "aicoe-osc-demo/pipeline_run"
# Prefix under which the source pdfs are stored.
SOURCE_DATA = SOURCE_DATA_PREFIX + "/samples_145/pdfs"

In [4]:
# Get all pdf names.
# Object keys have the form "<SOURCE_DATA>/<file name>". The cut-off is derived
# from the prefix itself (instead of a hard-coded character count) so the cell
# keeps working when SOURCE_DATA is changed. The trailing "/" in the prefix
# also keeps sibling prefixes that merely start with SOURCE_DATA out of the
# listing.
pdf_key_prefix = f"{SOURCE_DATA}/"
all_pdf_names = [
    obj.key[len(pdf_key_prefix):]
    for obj in s3c.s3_resource.Bucket(s3c.bucket).objects.filter(Prefix=pdf_key_prefix)
    # Skip the "folder" placeholder object (key == prefix) explicitly rather
    # than assuming it is always the first entry of the listing.
    if obj.key != pdf_key_prefix
]

Select the pdfs you are interested in. Multiple values can be selected with shift and/or ctrl (or command) pressed and mouse clicks or arrow keys. Once selected, move to the next cell.

In [5]:
def set_pdf_names(select_pdfs):
    """Store the given selection in the module-level variable ``pdf_names``.

    Used as the callback of the pdf selection widget, so that later cells
    can read the chosen names from ``pdf_names``.

    Args:
        select_pdfs: the selection to store (passed by keyword by the widget).
    """
    globals()["pdf_names"] = select_pdfs


# Multi-select widget listing every available pdf. The initial selection is
# the second entry of the list (empty when fewer than two pdfs exist).
pdf_selector = widgets.SelectMultiple(
    options=all_pdf_names,
    value=all_pdf_names[1:2],
    rows=20,
    layout=widgets.Layout(width="100%"),
)

# interactive() connects the widget to set_pdf_names via the `select_pdfs`
# keyword, which stores the selection in `pdf_names`.
display(widgets.interactive(set_pdf_names, select_pdfs=pdf_selector))

interactive(children=(SelectMultiple(description='select_pdfs', index=(1,), layout=Layout(width='100%'), optio…

Next, you can set the experiment_name and sample_size as you prefer. They will be used as prefixes for storing your experiment's files. The same values will have to be updated in the config section of settings.yaml.

In [6]:
# Name of the experiment; used as the leading prefix for the experiment's files.
EXPERIMENT_NAME = "test_select_pdfs"
# Sample label built from the number of pdfs currently selected.
SAMPLE_SIZE = "samples_selected_" + str(len(pdf_names))

In [7]:
# Destination prefixes for this experiment, mirroring the source layout
# "<prefix>/pipeline_run/<sample>/pdfs".
DESTINATION_DATA_PREFIX = f"{EXPERIMENT_NAME}/pipeline_run"
DESTINATION_DATA = "/".join((DESTINATION_DATA_PREFIX, SAMPLE_SIZE, "pdfs"))

In [8]:
# Copy the selected pdfs to the experiment's prefix: download each one into a
# temporary directory, then upload the directory's contents in one go.
with tempfile.TemporaryDirectory() as staging_dir:
    for pdf_name in pdf_names:
        local_path = f"{staging_dir}/{pdf_name}"
        s3c.download_file_from_s3(local_path, SOURCE_DATA, pdf_name)
    s3c.upload_files_in_dir_to_prefix(staging_dir, DESTINATION_DATA)

## Training
Next, if you want to run training, you'll need an annotations file. You can provide your own or select one of the existing files from the demo.

In [9]:
# Prefix of the demo's annotation files and the matching destination prefix
# inside this experiment's space.
SOURCE_ANNOTATIONS = SOURCE_DATA_PREFIX + "/samples_145/annotations"
DESTINATION_ANNOTATIONS = "/".join((DESTINATION_DATA_PREFIX, SAMPLE_SIZE, "annotations"))

In [10]:
# Get all annotation file names.
# Object keys have the form "<SOURCE_ANNOTATIONS>/<file name>". The cut-off is
# derived from the prefix itself (instead of a hard-coded character count) so
# the cell keeps working when SOURCE_ANNOTATIONS is changed. The trailing "/"
# also keeps sibling prefixes that merely start with SOURCE_ANNOTATIONS out of
# the listing.
annotation_key_prefix = f"{SOURCE_ANNOTATIONS}/"
all_annotation_names = [
    obj.key[len(annotation_key_prefix):]
    for obj in s3c.s3_resource.Bucket(s3c.bucket).objects.filter(Prefix=annotation_key_prefix)
    # Skip the "folder" placeholder object (key == prefix) explicitly rather
    # than assuming it is always the first entry of the listing.
    if obj.key != annotation_key_prefix
]

In [11]:
def set_annotation_names(select_annotations):
    """Store the given selection in the module-level variable ``annotation_names``.

    Used as the callback of the annotation selection widget, so that later
    cells can read the chosen names from ``annotation_names``.

    Args:
        select_annotations: the selection to store (passed by keyword by the widget).
    """
    globals()["annotation_names"] = select_annotations


# Multi-select widget listing every available annotation file. The initial
# selection is the second entry of the list (empty when fewer than two exist).
annotation_selector = widgets.SelectMultiple(
    options=all_annotation_names,
    value=all_annotation_names[1:2],
    rows=2,
    layout=widgets.Layout(width="70%"),
)

# interactive() connects the widget to set_annotation_names via the
# `select_annotations` keyword, which stores the selection in `annotation_names`.
display(widgets.interactive(set_annotation_names, select_annotations=annotation_selector))

interactive(children=(SelectMultiple(description='select_annotations', layout=Layout(width='70%'), options=('2…

In [12]:
# Copy the selected annotation files to the experiment's prefix: download each
# one into a temporary directory, then upload the directory's contents.
with tempfile.TemporaryDirectory() as staging_dir:
    for annotation_name in annotation_names:
        local_path = f"{staging_dir}/{annotation_name}"
        s3c.download_file_from_s3(local_path, SOURCE_ANNOTATIONS, annotation_name)
    s3c.upload_files_in_dir_to_prefix(staging_dir, DESTINATION_ANNOTATIONS)

## KPI mapping
This mapping is required for defining the questions in the question answering task. It can be downloaded from the Red Hat physical landing bucket as shown below, or from here: https://github.com/os-climate/aicoe-osc-demo/tree/master/data/kpi_mapping

In [13]:
# Copy the KPI mapping files from the demo's prefix into this experiment's
# space, using a temporary directory as the intermediate stop.
with tempfile.TemporaryDirectory() as staging_dir:
    s3c.download_files_in_prefix_to_dir("aicoe-osc-demo/kpi_mapping", staging_dir)
    s3c.upload_files_in_dir_to_prefix(staging_dir, EXPERIMENT_NAME + "/kpi_mapping")