# Run Confluence on an HPC

# Requirements
* docker installed somewhere where you have sudo privileges, to the point where "docker --version" completes successfully
* singularity or apptainer installed on your HPC
* a dockerhub account (free)


# Overall Tasks
* Git clone all of the repos you want to run to a machine where you have sudo privileges and where "docker --version" works (locally)
* Run the "Prepare Images Locally" section of this notebook locally
* Run the "Confluence Module SLURM Script Generator" section of this notebook on your HPC to create SLURM submission scripts for each module
* Run the Confluence Driver Script Generator section of this notebook on your HPC to create a SLURM submission script that runs each of the modules one by one (the one click run)

---
## Functions (IGNORE)

In [None]:
# FUNCTIONS IGNORE
def build_and_push_images(repo_directory:str, target_repo_names:list, docker_username:str, push:bool = True, custom_tag_name:str = 'latest') -> None:
    """Build one Docker image per repo and optionally push each to a registry.

    For every name in ``target_repo_names`` the image is built from
    ``<repo_directory>/<name>/Dockerfile`` (with that repo directory as the
    build context) and tagged ``<docker_username>/<name>:<custom_tag_name>``.

    Args:
        repo_directory: Directory that contains the cloned repos.
        target_repo_names: Names of the repo sub-directories to build.
        docker_username: Registry namespace used as the image prefix.
        push: When true, ``docker push`` each image right after it is built.
        custom_tag_name: Tag applied to every image.

    Raises:
        RuntimeError: If docker cannot be launched, or if a build or push
            command exits with a non-zero status.
    """
    for a_repo_name in target_repo_names:
        repo_path = os.path.join(repo_directory, a_repo_name)
        docker_path = f'{docker_username}/{a_repo_name}:{custom_tag_name}'

        build_cmd = ['docker', 'build', '--quiet', '-f', os.path.join(repo_path, "Dockerfile"), '-t', docker_path, repo_path]
        try:
            # check=True is what turns a failed build into an exception;
            # without it a non-zero exit status is silently ignored.
            sp.run(build_cmd, check=True)
        except (OSError, ValueError, sp.SubprocessError) as e:
            raise RuntimeError(
                f"Docker build failed...\n"
                f"Build Command: {build_cmd}\n"
                f"Error: {e}"
            ) from e

        if not push:
            continue

        push_cmd = ['docker', 'push', '--quiet', docker_path]
        try:
            sp.run(push_cmd, check=True)
        except (OSError, ValueError, sp.SubprocessError) as e:
            raise RuntimeError(
                f"Docker push failed...\n"
                f"Push Command: {push_cmd}\n"
                f"Error: {e}"
            ) from e
    
def _confluence_commands(mnt_dir, sif_dir):
    """Return the ``singularity run`` command line for every known module, keyed by module name."""
    return {
        'expanded_setfinder': 'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'setfinder.simg') + ' -r reaches_of_interest.json -c continent.json -e -s 16 -o /data -n /data -a MetroMan HiVDI SIC NeoBAM -i ${SLURM_ARRAY_TASK_ID}',
        'expanded_combine_data': 'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'combine_data.simg') + ' -d /data  -e -s 16',
        'input': 'singularity run --bind ' + f'{mnt_dir}/input:/mnt/data ' + os.path.join(sif_dir, 'input.simg') + ' -r /mnt/data/expanded_reaches_of_interest.json -i ${SLURM_ARRAY_TASK_ID}',
        'non_expanded_setfinder': 'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'setfinder.simg') + ' -c continent.json -s 16 -o /data -n /data -a MetroMan HiVDI SIC NeoBAM -i ${SLURM_ARRAY_TASK_ID}',
        'non_expanded_combine_data': 'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'combine_data.simg') + ' -d /data -s 16',
        'prediagnostics': 'singularity run --bind ' + f'{mnt_dir}/input:/mnt/data/input,{mnt_dir}/diagnostics/prediagnostics:/mnt/data/output ' + os.path.join(sif_dir, 'prediagnostics.simg') + ' -i ${SLURM_ARRAY_TASK_ID} -r reaches.json',
        # Disabled module, kept for reference:
        #'unconstrained_priors': f'singularity run -c --writable-tmpfs --bind {mnt_dir}/input:/mnt/data {os.path.join(sif_dir, "priors.simg")} ' + ' -i ${SLURM_ARRAY_TASK_ID} -r unconstrained -p usgs riggs -g -s local',
        'hivdi': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/hivdi:/mnt/data/output ' + os.path.join(sif_dir, 'hivdi.simg') + ' /mnt/data/input/reaches.json --input-dir /mnt/data/input -i ${SLURM_ARRAY_TASK_ID}',
        'sic4dvar': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/sic4dvar:/mnt/data/output,{mnt_dir}/logs:/mnt/data/logs ' + os.path.join(sif_dir, 'sic4dvar.simg') + ' -r reaches.json --index ${SLURM_ARRAY_TASK_ID}',
        # Straight quotes here: typographic quotes would become part of the value.
        'metroman': f'singularity run --env AWS_BATCH_JOB_ID="foo" --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/metroman:/mnt/data/output ' + os.path.join(sif_dir, "metroman.simg") + ' -i ${SLURM_ARRAY_TASK_ID} -r metrosets.json -s local -v',
        'metroman_consolidation': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/metroman:/mnt/data/flpe ' + os.path.join(sif_dir, 'metroman_consolidation.simg') + ' -i ${SLURM_ARRAY_TASK_ID}',
        'unconstrained_momma': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/momma:/mnt/data/output ' + os.path.join(sif_dir, 'momma.simg') + ' -r reaches.json -m 3 -i ${SLURM_ARRAY_TASK_ID}',
        # NOTE(review): neobam writes into flpe/geobam; presumably intentional, confirm.
        'neobam': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/geobam:/mnt/data/output ' + os.path.join(sif_dir, 'neobam.simg') + ' -r reaches.json -i ${SLURM_ARRAY_TASK_ID}',
        'sad': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/sad:/mnt/data/output ' + os.path.join(sif_dir, 'sad.simg') + ' --reachfile reaches.json --index ${SLURM_ARRAY_TASK_ID}',
        'MOI': f'singularity run --env AWS_BATCH_JOB_ID="foo" --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/output ' + os.path.join(sif_dir, 'MOI.simg') + ' -j basin.json -v -b unconstrained -s local -i ${SLURM_ARRAY_TASK_ID}',
        'unconstrained_offline': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/moi,{mnt_dir}/offline:/mnt/data/output ' + os.path.join(sif_dir, 'offline.simg') + ' unconstrained timeseries integrator reaches.json ${SLURM_ARRAY_TASK_ID}',
        # The --bind value must be ONE comma-separated token: a space after a
        # comma makes the shell split it and the remainder is taken as the image.
        'Validation': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/moi,{mnt_dir}/offline:/mnt/data/offline,{mnt_dir}/validation:/mnt/data/output ' + os.path.join(sif_dir, 'Validation.simg') + ' reaches.json unconstrained ${SLURM_ARRAY_TASK_ID}',
        'output': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/diagnostics:/mnt/data/diagnostics,{mnt_dir}/moi:/mnt/data/moi,{mnt_dir}/offline:/mnt/data/offline,{mnt_dir}/validation:/mnt/data/validation,{mnt_dir}/output:/mnt/data/output ' + os.path.join(sif_dir, 'output.simg') + ' -s local -j /app/metadata/metadata.json -m input priors prediagnostics momma hivdi neobam metroman sic4dvar sad validation swot -i ${SLURM_ARRAY_TASK_ID}',
    }


def _sif_image_name(module_name):
    """Strip the run-variant prefixes from a module name to get its image name."""
    return module_name.replace('expanded_', '').replace('non_', '').replace('unconstrained_', '').replace('constrained_', '')


def _build_sif(sif_path, docker_uri):
    """Run ``singularity build -F`` for one image, raising RuntimeError on failure."""
    build_cmd = ['singularity', 'build', '-F', sif_path, docker_uri]
    try:
        sp.run(build_cmd, check=True)
    except (OSError, ValueError, sp.SubprocessError) as e:
        raise RuntimeError(
            f"Singularity build failed...\n"
            f"Build Command: {build_cmd}\n"
            f"Error: {e}"
        ) from e


def _write_slurm_script(script_path, log_pattern, directives, run_command, submission_prefix='#SBATCH'):
    """Write one submission script: shebang, scheduler directives, then the run command."""
    lines = ['#!/bin/bash \n', f'{submission_prefix} -o {log_pattern} \n']
    lines.extend(f'{submission_prefix} --{flag}={value} \n' for flag, value in directives.items())
    lines.append(run_command)
    with open(script_path, 'w', encoding='utf-8') as script_file:
        script_file.writelines(lines)


def build_sifs_and_create_slurm_scripts(run_list, included_modules, base_dir, docker_username, build, custom_tag_name='latest'):
    """Create per-module SLURM array scripts (and optionally SIF images) for each run.

    For every ``run`` in ``run_list`` the directories ``sh_scripts``, ``sif``
    and ``report`` are created under ``<base_dir>/confluence_<run>``. One
    ``<module>.sh`` script is then written into ``sh_scripts`` for each
    selected module. The generated commands bind-mount sub-directories of
    ``<base_dir>/confluence_<run>/<run>_mnt``; that tree is not created here.

    Args:
        run_list: Run names; each maps to a ``confluence_<run>`` directory.
        included_modules: Collection of module names to generate. An empty
            or ``None`` value selects every known module. Names that match
            no known module are reported with a warning and skipped.
        base_dir: Directory that holds the ``confluence_<run>`` directories.
        docker_username: Registry namespace the images are pulled from.
        build: When true, (re)build each required ``.simg`` with
            ``singularity build -F`` before writing its script.
        custom_tag_name: Image tag to pull when ``build`` is true.

    Raises:
        RuntimeError: If ``build`` is true and a singularity build fails.
    """
    submission_prefix = '#SBATCH'

    # (time, mem) per module; anything not listed uses the default pair.
    default_resources = ('00:10:00', '2G')
    resource_overrides = {
        'MOI': ('00:30:00', '2G'),
        'neobam': ('01:00:00', '6G'),
        'output': ('05:00:00', '2G'),
    }

    warned_unknown = False

    for run in run_list:
        run_dir = os.path.join(base_dir, f'confluence_{run}')

        # Expected to exist already with the 'mnt' layout the modules read.
        mnt_dir = os.path.join(run_dir, f'{run}_mnt')

        sh_dir = os.path.join(run_dir, 'sh_scripts')
        sif_dir = os.path.join(run_dir, 'sif')
        report_dir = os.path.join(run_dir, 'report')
        for out_dir in (sh_dir, sif_dir, report_dir):
            os.makedirs(out_dir, exist_ok=True)

        command_dict = _confluence_commands(mnt_dir, sif_dir)

        # A misspelled module name would otherwise just produce no script.
        if included_modules and not warned_unknown:
            unknown = sorted(str(name) for name in set(included_modules) - set(command_dict))
            if unknown:
                print('WARNING: ignoring unknown module name(s): ', ', '.join(unknown))
            warned_unknown = True

        # Several modules share one image (e.g. both setfinder variants);
        # build each image at most once per run.
        built_images = set()

        for module_to_run, run_command in command_dict.items():
            if included_modules and module_to_run not in included_modules:
                continue

            print('DIRECTORY NAME: ', run, '\nMODULE: ', module_to_run)

            if build:
                image_name = _sif_image_name(module_to_run)
                if image_name not in built_images:
                    _build_sif(
                        os.path.join(sif_dir, image_name + '.simg'),
                        f"docker://{docker_username}/{image_name}:{custom_tag_name}",
                    )
                    built_images.add(image_name)

            time_to_use, mem_to_use = resource_overrides.get(module_to_run, default_resources)

            # Insertion order is the order the directives appear in the script.
            directives = {
                'partition': 'cpu-preempt',
                'cpus-per-task': '1',
                'job-name': f'{module_to_run}_{run}_cfl',
                'mem': mem_to_use,
                'time': time_to_use,
            }
            _write_slurm_script(
                script_path=os.path.join(sh_dir, f'{module_to_run}.sh'),
                log_pattern=os.path.join(report_dir, f'{module_to_run}.%a.out'),
                directives=directives,
                run_command=run_command,
                submission_prefix=submission_prefix,
            )

                
def generate_slurm_driver(
    job_name: str,
    output_log_dir: str,
    partition: str,
    time_limit: str,
    ntasks: int,
    cpus_per_task: int,
    mem: str,
    total_jobs: int,
    array_step: int,
    batch_size: int,
    concurrent_jobs: int,
    directory: str,
    scripts: list[str]
) -> str:
    """Return the text of a bash driver script that submits module scripts in order.

    The generated script walks ``scripts`` one at a time. For each it calls
    ``sbatch --array=<start>-<end>%<concurrent_jobs>`` in batches of
    ``batch_size`` (inside chunks of ``array_step``) and polls ``squeue``
    until a batch has left the queue before submitting the next one.

    Array indices run from 0 up to and including the job total, because the
    bash loops compare with ``-le``. The total is ``total_jobs`` unless the
    script name contains ``setfinder``, ``priors`` or ``output`` (then 7) or
    ``combine_data`` (then 1).
    NOTE(review): the inclusive upper bound submits total + 1 tasks; confirm
    that is intended.

    Args:
        job_name: SLURM job name; also the stem of the driver's log files.
        output_log_dir: Directory used in the ``--output``/``--error`` paths.
        partition: Value for ``--partition``.
        time_limit: Value for ``--time``.
        ntasks: Value for ``--ntasks``.
        cpus_per_task: Value for ``--cpus-per-task``.
        mem: Value for ``--mem``.
        total_jobs: Default highest array index for each script.
        array_step: Size of each outer array chunk.
        batch_size: Number of array indices submitted per ``sbatch`` call.
        concurrent_jobs: Per-batch concurrency limit (the ``%N`` suffix).
        directory: Directory that contains the module scripts.
        scripts: File names of the module scripts, in execution order.

    Returns:
        The complete driver script as a single string.
    """
    # --- scheduler header -------------------------------------------------
    log_stem = f"{output_log_dir}/{job_name}_%j"
    directives = [
        f"--job-name={job_name}",
        f"--output={log_stem}.out",
        f"--error={log_stem}.err",
        f"--partition={partition}",
        f"--time={time_limit}",
        f"--ntasks={ntasks}",
        f"--cpus-per-task={cpus_per_task}",
        f"--mem={mem}",
    ]
    # The two trailing empty items yield the blank line after the header.
    preamble = "\n".join(["#!/bin/bash", *(f"#SBATCH {d}" for d in directives), "", ""])

    # --- environment and job-control variables ----------------------------
    settings = "\n".join([
        "# Load modules",
        "module load conda/latest",
        "",
        "# Job control variables",
        f"total_jobs={total_jobs}",
        f"array_step={array_step}",
        f"batch_size={batch_size}",
        f"concurrent_jobs={concurrent_jobs}",
        f'directory="{directory}"',
        "",
        "",
    ])

    # --- bash array of module scripts, one indented entry per line --------
    indent = " " * 4
    entries = indent + f"\n{indent}".join(scripts)
    bash_array = f"scripts=(\n{entries}\n)\n"

    # --- submission loop: pure bash, no Python interpolation needed -------
    submit_loop = """for slurm_script in "${scripts[@]}"; do
    echo "Starting submission for: $slurm_script"
    date

    # Override total_jobs based on script name
    if [[ "$slurm_script" == *setfinder* ]]; then
        current_total_jobs=7
    elif [[ "$slurm_script" == *combine_data* ]]; then
        current_total_jobs=1
    elif [[ "$slurm_script" == *priors* ]]; then
        current_total_jobs=7
    elif [[ "$slurm_script" == *output* ]]; then
        current_total_jobs=7
    else
        current_total_jobs=$total_jobs  # fallback to the default
    fi

    array_start=0
    while [ $array_start -le $current_total_jobs ]; do
        array_end=$((array_start + array_step - 1))
        if [ $array_end -gt $current_total_jobs ]; then
            array_end=$current_total_jobs
        fi

        echo "Processing array range: $array_start-$array_end"

        start=$array_start
        while [ $start -le $array_end ]; do
            end=$((start + batch_size - 1))
            if [ $end -gt $array_end ]; then
                end=$array_end
            fi

            echo "Submitting batch: $start-$end from $slurm_script"
            job_output=$(sbatch --array=${start}-${end}%${concurrent_jobs} "${directory}/${slurm_script}")
            job_id=$(echo $job_output | awk '{print $4}')

            echo "Submitted batch $start-$end (Job ID: $job_id), waiting..."
            while squeue -j "$job_id" 2>/dev/null | grep -q "$job_id"; do
                sleep 15
            done

            echo "Finished batch $start-$end (Job ID: $job_id)"
            sleep 5

            start=$((end + 1))
        done

        array_start=$((array_end + 1))
    done
done

echo "All submissions complete"
"""

    # Two blank lines separate the array definition from the loop.
    return preamble + settings + bash_array + "\n\n" + submit_loop


---
### Prepare Docker Images (RUN LOCALLY, NOT HPC)
* Builds docker images locally and stores them on your dockerhub

In [38]:
import os
import subprocess as sp

In [None]:
# ======================================================================
# SETUP (local machine) -- edit these values before running the next cell
# ======================================================================

# Parent directory of the cloned repos, and which of them to build.
# Each name must be a sub-directory of repo_directory containing a Dockerfile.
target_repo_names = ['setfinder', 'combine_data', 'input', 'momma']
repo_directory = '/storage/repos'

# Docker Hub account the images are tagged for.
docker_username = 'travissimmons'

# Set push to True to upload the images to Docker Hub so the HPC can pull
# them (you almost certainly want this).
push = True

# Tag applied to every image; naming it after the run keeps runs apart.
# NOTE(review): the HPC call cell further down does not pass a tag to
# build_sifs_and_create_slurm_scripts, so images pushed under a tag other
# than 'latest' are not the ones that get pulled unless that is wired up.
custom_tag_name = 'run1'

# ======================================================================

In [None]:
# Build every repo listed in the SETUP cell and, when `push` is set, upload
# the resulting images. (No backslashes needed: the call is parenthesised.)
build_and_push_images(
    repo_directory=repo_directory,
    target_repo_names=target_repo_names,
    docker_username=docker_username,
    push=push,
    custom_tag_name=custom_tag_name,
)

# Expected output per repo looks something like:
# sha256:6900c3d99325a4a7c8b282d4a7a62f2a0f3fc673f03f5ca3333c2746bf20d06a
# docker.io/travissimmons/setfinder:latest

---
### Confluence Module SLURM Script Generator (RUN ON HPC, NOT LOCALLY)
* Builds sif files from your dockerhub and generates scripts to submit to a SLURM job scheduler

In [39]:
import os
import subprocess as sp

In [None]:
#-------------------------------------------------

# SETUP

# Directory that holds (or will hold) the confluence_<run> directories
base_dir = '/nas/cee-water/cjgleason/travis/data/offline_consolidation/confluence_runs/'

# Modules to generate scripts for. Names must match the keys used by
# build_sifs_and_create_slurm_scripts exactly (e.g. 'unconstrained_momma',
# not 'momma'); a name that matches no key produces no script.
included_modules = {'expanded_combine_data', 'expanded_setfinder', 'input', 'unconstrained_momma'}
docker_username = 'travissimmons' # your dockerhub username here
custom_tag_name = 'latest' # leave this as latest unless you have a really good reason!

# Providing a run list will create slurm scripts to run, name your parent and _mnt folders with the same run* name
run_list = ['run_1']

# Rebuild the sif
build = False

# --------------------------------------------------------------------------------------

In [None]:
# Write one SLURM script per selected module for every run in run_list,
# rebuilding the SIF images first when `build` is True.
build_sifs_and_create_slurm_scripts(
    run_list=run_list,
    included_modules=included_modules,
    base_dir=base_dir,
    docker_username=docker_username,
    build=build,
)

DIRECTORY NAME:  run_1 
MODULE:  expanded_setfinder
DIRECTORY NAME:  run_1 
MODULE:  expanded_combine_data
DIRECTORY NAME:  run_1 
MODULE:  input
DIRECTORY NAME:  run_1 
MODULE:  neobam


---
### Confluence Driver Script Generator (RUN ON HPC, NOT LOCALLY)
* Creates a batch submission script that will run all of your sif files in serial
* use sbatch to submit the entire run
* low resources and a long time should be used here, as all this job will do is launch your SLURM scripts you created for each module, it is basically a job manager

In [None]:
run_dir = '/nas/cee-water/cjgleason/travis/data/offline_consolidation/confluence_runs/confluence_run_1'

# The driver's #SBATCH --output/--error paths point into this directory.
# It is not created anywhere else in this notebook, so create it up front.
driver_log_dir = os.path.join(run_dir, "driver_log")
os.makedirs(driver_log_dir, exist_ok=True)

driver_script = generate_slurm_driver(
    job_name="confluence_driver",  # Sets the SLURM job name shown in the queue
    output_log_dir=driver_log_dir,  # Directory to write SLURM output and error logs
    partition="ceewater_cjgleason-cpu",  # SLURM partition to submit the job to
    time_limit="30:00:00",  # Max time the whole workflow can run (HH:MM:SS)
    ntasks=1,  # Number of tasks (Don't change)
    cpus_per_task=1,  # Number of CPUs per task (Don't change)
    mem="10G",  # Memory required for driver (Don't change)
    total_jobs=20,  # Total number of jobs (e.g., reach IDs) to process
    array_step=10000,  # Size of each large array chunk (not used here since total_jobs < step)
    batch_size=1000,  # Number of array tasks to submit in each batch
    concurrent_jobs=400,  # Max number of jobs allowed to run at once within each batch
    directory=os.path.join(run_dir, 'sh_scripts'),  # Path to job scripts
    # File names must match what the module script generator writes, i.e.
    # "<module name>.sh" (so "unconstrained_momma.sh", not "momma.sh").
    scripts=[
        "expanded_setfinder.sh",
        "expanded_combine_data.sh",
        "input.sh",
        "unconstrained_momma.sh"
    ]
)

# Write the generated SLURM driver script to a file for inspection or submission
driver_path = os.path.join(run_dir, "driver_submit_for_confluence_run_1.sh")
with open(driver_path, "w") as f:
    f.write(driver_script)

# Optionally submit
# import subprocess
# subprocess.run(["sbatch", driver_path], check=True)


---
# Running Tests

#### In order to run on specific reaches
* modify the file at /mnt/input/reaches_of_interest.json
#### In order to change a module and test it
* change the module locally, build it and push to dockerhub using the first part of this notebook and then run as usual
* you can use the run_list variable to generate more submission scripts per module to test more than one change at a time. However, whenever you submit them, they will still run one at a time; it just submits the next run automatically.