# Run Confluence on an HPC

# Requirements
* docker installed somewhere where you have sudo priveledges to the point where "docker --version" completes successfully
* singularity or apptainer installed on your HPC
* a dockerhub account (free)


# Overall Tasks
* Pull all repos you want to run
* Run the "Prepare Images Locally" section of this notebook where you have sudo priveledges and "docker --version" working
* Run the "Confluence Module SLURM Script Generator" section of this notebook on your HPC

---

### Prepare Docker Images Locally
* Builds docker images locally and stores them on your dockerhub

In [18]:
import os
import subprocess as sp

In [19]:

#-------------------------------------------------

# SETUP

# Directory where you are storing repos
repo_directory = '/mnt/repos'
target_repo_names = ['setfinder', 'input']

# Only provide this if you want to store images on dockerhub to move to HPC (you probably do)
push = True
docker_username = 'tsimmons'
custom_tag_name = 'latest' # leave this as latest unless you have a really good reason!

# --------------------------------------------------------------------------------------

def build_and_push_images(repo_directory:str, target_repo_names:list, docker_username:str, push:bool = True, custom_tag_name:str = 'latest'):
    for a_repo_name in target_repo_names:
        repo_path = os.path.join(repo_directory, a_repo_name)
        docker_path = f'{docker_username}/{a_repo_name}:{custom_tag_name}'
        build_cmd = ['docker', 'build', '-f', os.path.join(repo_path, "Dockerfile"), '-t', docker_path]
        try:
            sp.run(build_cmd)
        except Exception as e:
            raise RuntimeError(
                f"Docker build failed...\n"
                f"Build Command: {build_cmd}\n"
                f"Error: {e}"
            )
        if push:
            try:
                push_cmd = ['docker', 'push', docker_path]
                sp.run(push_cmd)
            except Exception as e:
                raise RuntimeError(
                    f"Docker push failed...\n"
                    f"Push Command: {push_cmd}\n"
                    f"Error: {e}"
                )


In [9]:
build_and_push_images(\
                      repo_directory = repo_directory, \
                      target_repo_names = target_repo_names, \
                      docker_username = docker_username, \
                      push = push, \
                      custom_tag_name = custom_tag_name \
                     )
                      
                      

RuntimeError: ('Docker build failed...', ['docker', 'build', '-f', '/mnt/repos/setfinder/Dockerfile', '-t', 'tsimmons/setfinder:latest'], '\n', NameError("name 'sp' is not defined"))

In [20]:
build_and_push_images(\
                      repo_directory = repo_directory, \
                      target_repo_names = target_repo_names, \
                      docker_username = docker_username, \
                      push = push, \
                      custom_tag_name = custom_tag_name \
                     )
                      
                      

RuntimeError: Docker build failed...
Build Command: ['docker', 'build', '-f', '/mnt/repos/setfinder/Dockerfile', '-t', 'tsimmons/setfinder:latest']
Error: [Errno 2] No such file or directory: 'docker'

# Confluence Module SLURM Script Generator

### Confluence Module SLURM Script Generator
* Builds sif files from your dockerhub and generates scripts to submit to a SLURM job scheduler

In [None]:
def process_runs(run_list, included_modules, excluded_modules, base_dir, docker_username):
    import os
    import subprocess as sp

    for run in run_list:
        
        # Has to exist with 'mnt' structure (Doit exister avec la structure 'mnt')
        mnt_dir = os.path.join(base_dir, f'confluence_{run}', f'{run}_mnt')
        
        # Create the sh_scripts directory (Cree le repertoire sh_scripts)
        sh_dir = os.path.join(base_dir, f'confluence_{run}', 'sh_scripts')
        if not os.path.exists(sh_dir):
            os.mkdir(sh_dir)
        
        # Create the sif directory (Cree la repertoire sif)
        sif_dir = os.path.join(base_dir, f'confluence_{run}', 'sif')
        if not os.path.exists(sif_dir):
            os.mkdir(sif_dir)
        
        # Create the report directory (Cree la repertoire report)
        report_dir = os.path.join(base_dir, f'confluence_{run}', 'report')
        if not os.path.exists(report_dir):
            os.mkdir(report_dir)


        submission_prefix = '#SBATCH'

        job_details = {
            'partition': 'cpu-preempt',
            'cpus-per-task': '1',
            'mem': '2G',
            'time': '00:10:00',
            'job-name': f'{run}_cfl',
        }

        command_dict = {
            'expanded_setfinder': 'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'setfinder.simg') + ' -r reaches_of_interest.json -c continent.json -e -s 16 -o /data -n /data -a MetroMan HiVDI SIC NeoBAM -i ${SLURM_ARRAY_TASK_ID}',
            'expanded_combine_data': 'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'combine_data.simg') + ' -d /data  -e -s 16',
            'input_fs': 'singularity run --bind ' + f'{mnt_dir}/input:/mnt/data ' + os.path.join(sif_dir, 'input_fs.simg') + ' -r /mnt/data/expanded_reaches_of_interest.json -i ${SLURM_ARRAY_TASK_ID}',
            'non_expanded_setfinder': 'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'setfinder.simg') + ' -c continent.json -s 16 -o /data -n /data -a MetroMan HiVDI SIC NeoBAM -i ${SLURM_ARRAY_TASK_ID}',
            'non_expanded_combine_data': 'singularity run --bind ' + f'{mnt_dir}/input:/data ' + os.path.join(sif_dir, 'combine_data.simg') + ' -d /data -s 16',
            'prediagnostics_s_bb_bb': 'singularity run --bind ' + f'{mnt_dir}/input:/mnt/data/input,{mnt_dir}/diagnostics/prediagnostics:/mnt/data/output ' + os.path.join(sif_dir, f'prediagnostics_s_bb_bb.simg') + ' -i ${SLURM_ARRAY_TASK_ID} -r reaches.json',
            'prediagnostics_strict': 'singularity run --bind ' + f'{mnt_dir}/input:/mnt/data/input,{mnt_dir}/diagnostics/prediagnostics:/mnt/data/output ' + os.path.join(sif_dir, 'prediagnostics_strict.simg') + ' -i ${SLURM_ARRAY_TASK_ID} -r reaches.json',
            'prediagnostics_permissive': 'singularity run --bind ' + f'{mnt_dir}/input:/mnt/data/input,{mnt_dir}/diagnostics/prediagnostics:/mnt/data/output ' + os.path.join(sif_dir, 'prediagnostics_permissive.simg') + ' -i ${SLURM_ARRAY_TASK_ID} -r reaches.json',
            'prediagnostics_relaxed': 'singularity run --bind ' + f'{mnt_dir}/input:/mnt/data/input,{mnt_dir}/diagnostics/prediagnostics:/mnt/data/output ' + os.path.join(sif_dir, 'prediagnostics_relaxed.simg') + ' -i ${SLURM_ARRAY_TASK_ID} -r reaches.json',
            #'unconstrained_priors': f'singularity run -c --writable-tmpfs --bind {mnt_dir}/input:/mnt/data {os.path.join(sif_dir, "priors.simg")} ' + ' -i ${SLURM_ARRAY_TASK_ID} -r unconstrained -p usgs riggs -g -s local',
            'hivdi': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/hivdi:/mnt/data/output ' + os.path.join(sif_dir, 'hivdi.simg') + ' /mnt/data/input/reaches.json --input-dir /mnt/data/input -i ${SLURM_ARRAY_TASK_ID}',
            'sic4dvar': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/sic4dvar:/mnt/data/output,{mnt_dir}/logs:/mnt/data/logs '+ os.path.join(sif_dir, 'sic4dvar.simg') + ' -r reaches.json --index ${SLURM_ARRAY_TASK_ID}',
            'metroman': f'singularity run --env AWS_BATCH_JOB_ID=”foo” --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/metroman:/mnt/data/output ' + os.path.join(sif_dir, "metroman.simg") + ' -i ${SLURM_ARRAY_TASK_ID} -r metrosets.json -s local -v',
            'metroman_consolidation': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/metroman:/mnt/data/flpe ' + os.path.join(sif_dir, 'metroman_consolidation.simg') + ' -i ${SLURM_ARRAY_TASK_ID}',
            'unconstrained_momma': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/momma:/mnt/data/output ' + os.path.join(sif_dir, 'momma.simg') + ' -r reaches.json -m 3 -i ${SLURM_ARRAY_TASK_ID}',
            'neobam': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/geobam:/mnt/data/output ' + os.path.join(sif_dir, 'neobam.simg') + ' -r reaches.json -i ${SLURM_ARRAY_TASK_ID}',
            'sad': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe/sad:/mnt/data/output ' + os.path.join(sif_dir, 'sad.simg') + ' --reachfile reaches.json --index ${SLURM_ARRAY_TASK_ID}',
            'moi': f'singularity run --env AWS_BATCH_JOB_ID="foo" --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/output ' + os.path.join(sif_dir, 'moi.simg') + ' -j basin.json -v -b unconstrained -s local -i ${SLURM_ARRAY_TASK_ID}',
            'unconstrained_offline': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/moi,{mnt_dir}/offline:/mnt/data/output ' + os.path.join(sif_dir, 'offline.simg') + ' unconstrained timeseries integrator reaches.json ${SLURM_ARRAY_TASK_ID}',
            'validation': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/moi:/mnt/data/moi, {mnt_dir}/offline:/mnt/data/offline,{mnt_dir}/validation:/mnt/data/output ' + os.path.join(sif_dir, 'validation.simg') + ' reaches.json unconstrained ${SLURM_ARRAY_TASK_ID}',
            'output': f'singularity run --bind {mnt_dir}/input:/mnt/data/input,{mnt_dir}/flpe:/mnt/data/flpe,{mnt_dir}/diagnostics:/mnt/data/diagnostics,{mnt_dir}/moi:/mnt/data/moi, {mnt_dir}/offline:/mnt/data/offline,{mnt_dir}/validation:/mnt/data/validation,{mnt_dir}/output:/mnt/data/output ' + os.path.join(sif_dir, 'output.simg') + ' -s local -j /app/metadata/metadata.json -m input priors prediagnostics momma hivdi neobam metroman sic4dvar sad validation swot -i ${SLURM_ARRAY_TASK_ID}'
        }

        def create_slurm_script(job_details=job_details, build_image=False, sif_dir='foo'):
            submission_prefix = job_details['submission_prefix']
            if build_image:
                module_name = job_details['module_name']
                image_name = module_name.replace('expanded_', '').replace('non_', '').replace('unconstrained_', '').replace('constrained_', '')
                sp.run(['singularity', 'build', '-F', os.path.join(sif_dir, image_name + '.simg'), f"docker://{job_details['docker_username']}/{image_name}"])

            file = open(os.path.join(sh_dir, f'{module_to_run}.sh'), 'w')
            file.write('#!/bin/bash \n')
            file.write(f'{submission_prefix} -o {os.path.join(report_dir, f"{module_to_run}.%a.out")}' + ' \n')

            for item in job_details:
                if item not in ['run_command', 'module_name', 'docker_username', 'submission_prefix']:
                    file.write(f'{submission_prefix} --{item}={job_details[item]} \n')
            file.write(job_details["run_command"])
            file.close()

        excluded_modules = excluded_modules
        included_modules = included_modules

        for module_to_run, run_command in command_dict.items():
            if excluded_modules:
                if module_to_run in excluded_modules:
                    continue
            elif included_modules:
                if module_to_run not in included_modules:
                    continue

            print('DIRECTORY NAME: ', run, '\nMODULE: ', module_to_run)

            if module_to_run == 'hivdi':
                docker_user_to_use = 'travissimmons'
            else:
                docker_user_to_use = docker_username

            job_details.update({
                'run_command': run_command,
                'module_name': module_to_run,
                'docker_username': docker_user_to_use,
                'submission_prefix': submission_prefix,
                'job-name': f'{module_to_run}_{run}_cfl',

            })
            create_slurm_script(job_details=job_details, build_image=True, sif_dir=sif_dir)


In [None]:
process_runs(run_list=['fs_s'], \
             included_modules= {'expanded_setfinder','expanded_combine_data', \
                                'input_fs', 'non_expanded_setfinder', 'non_expanded_combine_data', \
                                'prediagnostics_s_bb_bb', 'hivdi', 'sic4dvar', 'metroman', 'metroman_consolidation', \
                                'unconstrained_momma', 'neobam'}, \
             excluded_modules={})

In [None]:
# After running this notebook, there will be a {module name}.sh file generated in the same directory.
# You can either add in an array and submit the job using sbatch or you can fill out the top of the cfl_wrapper.sh and have it submit jobs for you.
# using the cfl_wrapper.sh is highly recommended if you are submitting a number of jobs larger than your HPC allows. I use it in all cases though.

# Edit Reaches of Interest for Unit Testing

In [None]:
# # # EF
# # # Edit reaches of interest as needed

# import json

# json_filename = 'reaches_of_interest_fs.json'
# x_elements = 10000
# # Load the JSON file
# with open(f"/nas/cee-water/cjgleason/ellie/SWOT/confluence/confluence_fs_s/fs_s_mnt/input/{json_filename}", "r") as f:
#     data = json.load(f)

# # Ensure data is a list before slicing
# if isinstance(data, list):
#     data = data[x_elements:] # Keep last x elements OR data = data[:x_elements]  # Keep only the first x elements
#     data = [str(x) for x in data] # Make sure they are in the correct data type
# # Save the modified JSON
# with open("/nas/cee-water/cjgleason/ellie/SWOT/confluence/confluence_fs_s1/fs_s1_mnt/input/reaches_of_interest.json", "w") as f:
#     json.dump(data, f, indent=4)

# print(f"Saved {x_elements} of {json_filename}")
