From 208abea54cbb6e692d61212f4e594ae5e919bcc8 Mon Sep 17 00:00:00 2001 From: Yoshifumi Nakamura Date: Wed, 13 May 2026 18:10:43 +0900 Subject: [PATCH] Add site configs and harden portal handling Signed-off-by: Yoshifumi Nakamura --- .github/workflows/gitlab-manual-ci.yml | 15 +++- benchpark-bridge/scripts/result_converter.py | 3 +- config/queue.csv | 9 +++ config/system.csv | 11 +++ config/system_info.csv | 45 +++++++---- docs/ci.md | 2 +- programs/qws/build.sh | 36 +++++++++ programs/qws/list.csv | 7 +- programs/qws/run.sh | 60 +++++++++++++++ requirements-result-server.txt | 2 +- result_server/routes/api.py | 81 +++++++++++++++----- result_server/tests/test_api_routes.py | 29 ++++++- result_server/tests/test_pagination.py | 10 +++ result_server/tests/test_results_loader.py | 13 ++++ result_server/utils/result_compare_view.py | 4 + result_server/utils/result_file.py | 34 ++++++-- result_server/utils/result_records.py | 6 +- scripts/test_submit.sh | 71 +++++++++++++++-- 18 files changed, 378 insertions(+), 60 deletions(-) diff --git a/.github/workflows/gitlab-manual-ci.yml b/.github/workflows/gitlab-manual-ci.yml index e97610c..84ac91f 100644 --- a/.github/workflows/gitlab-manual-ci.yml +++ b/.github/workflows/gitlab-manual-ci.yml @@ -44,19 +44,27 @@ jobs: name: Run GitLab CI manually runs-on: ubuntu-latest steps: - - name: Check out target ref + - name: Check out trusted workflow ref uses: actions/checkout@v4 with: fetch-depth: 0 - ref: ${{ inputs.target_ref }} + path: trusted - name: Prepare GitLab repository settings id: gitlab-repo - uses: ./.github/actions/prepare-gitlab-repo + uses: ./trusted/.github/actions/prepare-gitlab-repo with: gitlab-repo: ${{ secrets.GITLAB_REPO }} + - name: Check out target ref + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ inputs.target_ref }} + path: target + - name: Push target ref to GitLab test branch + working-directory: target env: GITLAB_TOKEN: ${{ secrets.GITLAB_TOKEN }} GITLAB_REPO_HOST_PATH: ${{ steps.gitlab-repo.outputs.host-path }} @@ -184,6 +192,7 @@ jobs: - name: Delete GitLab test branch if: always() + working-directory: trusted env: GITLAB_TOKEN: ${{ secrets.GITLAB_TOKEN }} GITLAB_REPO_HOST_PATH: ${{ steps.gitlab-repo.outputs.host-path }} diff --git a/benchpark-bridge/scripts/result_converter.py b/benchpark-bridge/scripts/result_converter.py index 26096b3..26584b5 100644 --- a/benchpark-bridge/scripts/result_converter.py +++ b/benchpark-bridge/scripts/result_converter.py @@ -10,7 +10,6 @@ import os import sys import glob -import yaml from datetime import datetime from pathlib import Path @@ -535,4 +534,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/config/queue.csv b/config/queue.csv index 681bbd0..54a4d36 100644 --- a/config/queue.csv +++ b/config/queue.csv @@ -1,4 +1,5 @@ queue,submit_cmd,template +SLURM_AI4SS,sbatch,"-p ${queue_group} -t ${elapse} -N ${nodes} --ntasks-per-node=${numproc_node} --cpus-per-task=${nthreads} --gpus-per-node=${numproc_node}" FJ,pjsub,"-L rscunit=rscunit_ft01,rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi max-proc-per-node=${numproc_node} -x PJM_LLIO_GFSCACHE=/vol0002:/vol0003:/vol0004:/vol0005" PJM_GENKAI,pjsub,"-L rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi proc=${proc}" SLURM_RC,sbatch,"-p ${queue_group} -t ${elapse} -N ${nodes} --ntasks-per-node=${numproc_node} --cpus-per-task=${nthreads}" @@ -7,4 +8,12 @@ PBS_Grand_C,qsub,"-q ${queue_group} -l select=${nodes}:nsockets=${cpu_per_node}, PBS_Grand_G,qsub,"-q ${queue_group} -l select=${nodes}:ngpus=1,walltime=${elapse} -W group_list=d30992" NQSV_AOBA_VE,qsub,"-Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q ${queue_group} -T necmpi --venode ${proc} -l elapstim_req=${elapse}" NQSV_AOBA_B,qsub,"-Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q ${queue_group} -T intmpi -b ${nodes} -l elapstim_req=${elapse}" +PJM_WISTERIA_O,pjsub,"-g jh260034o -L rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi proc=${proc} --omp thread=${nthreads}" +PJM_WISTERIA_A,pjsub,"-g jh260034a -L rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi proc=${proc} --omp thread=${nthreads}" +PBS_TSUKUBA,qsub,"-q ${queue_group} -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=${nthreads} -l walltime=${elapse}" +AGE_TSUBAME4,qsub,"-l ${queue_group}=${nodes} -l h_rt=${elapse}" +SLURM_CAMPHOR3,sbatch,"-p ${queue_group} -t ${elapse} --rsc p=${proc}:t=${nthreads}:c=${nthreads}:m=1G" +NQSV_OSAKA_CPU,qsub,"-q ${queue_group} -b ${nodes} -l elapstim_req=${elapse},cpunum_job=${nthreads}" +NQSV_OSAKA_GPU,qsub,"-q ${queue_group} -b ${nodes} -l elapstim_req=${elapse},cpunum_job=${nthreads},gpunum_job=${gpu_per_node}" +NQSV_OSAKA_VE,qsub,"-q ${queue_group} --venode ${proc} -l elapstim_req=${elapse}" none,none,none diff --git a/config/system.csv b/config/system.csv index 9513980..5d3e0e0 100644 --- a/config/system.csv +++ b/config/system.csv @@ -1,4 +1,5 @@ system,mode,tag_build,tag_run,queue,queue_group +AI4SS,cross,ai4ss_login,ai4ss_jacamar,SLURM_AI4SS,1n1gpu Fugaku,cross,fugaku_login1,fugaku_jacamar,FJ,small FugakuLN,native,,fugaku_login1,none,small FugakuCN,native,,fugaku_jacamar,FJ,small @@ -16,4 +17,14 @@ Grand_G,cross,grand_login,grand_jacamar,PBS_Grand_G,eg AOBA_A,cross,aoba_ab_login,aoba_ab_jacamar,NQSV_AOBA_VE,sx AOBA_B,cross,aoba_ab_login,aoba_ab_jacamar,NQSV_AOBA_B,lx AOBA_S,cross,aoba_s_login,aoba_s_jacamar,NQSV_AOBA_VE,sxs +Odyssey,cross,wisteria_login,wisteria-o_jacamar,PJM_WISTERIA_O,short-o +Aquarius,cross,wisteria_login,wisteria-a_jacamar,PJM_WISTERIA_A,short-a +Pegasus,cross,pegasus_login,pegasus_jacamar,PBS_TSUKUBA,regular +Sirius,cross,sirius_login,sirius_jacamar,PBS_TSUKUBA,regular +TSUBAME4,cross,tsubame4_login,tsubame4_jacamar,AGE_TSUBAME4,node_f +Camphor3,cross,camphor3_login,camphor3_jacamar,SLURM_CAMPHOR3,jha +SQUID_CPU,cross,squid_login,squid_jacamar,NQSV_OSAKA_CPU,SQUID +SQUID_GPU,cross,squid_login,squid_jacamar,NQSV_OSAKA_GPU,SQUID +SQUID_VECTOR,cross,squid_login,squid_jacamar,NQSV_OSAKA_VE,SQUID +OCTOPUS,cross,octopus_login,octopus_jacamar,NQSV_OSAKA_CPU,OCT FNCX,native,,fncx-curl-jq,none,small diff --git a/config/system_info.csv b/config/system_info.csv index 0273370..e885eff 100644 --- a/config/system_info.csv +++ b/config/system_info.csv @@ -1,18 +1,29 @@ system,name,cpu_name,cpu_per_node,cpu_cores,gpu_name,gpu_per_node,memory,display_order -Fugaku,Fugaku,A64FX,1,48,-,-,32GB,1 -FugakuCN,FugakuCN,A64FX,1,48,-,-,32GB,2 -FugakuLN,FugakuLN,Intel(R) Xeon(R) Gold 6242 CPU @ 2.80GHz,2,16,-,-,96GB,3 -MiyabiG,MiyabiG,NVIDIA Grace CPU,1,72,NVIDIA Hopper H100 GPU,1,120GB,4 -MiyabiC,MiyabiC,Intel Xeon Max 9480,2,56,-,-,128GB,5 -RC_GH200,RC_GH200,NVIDIA Grace CPU,1,72,NVIDIA Hopper H100 GPU,1,120GB,6 -RC_DGXSP,RC_DGXSP,ARM Cortex-X925 / Cortex-A725,1,20,NVIDIA GB10,1,128GB,7 -RC_GENOA,RC_GENOA,AMD EPYC 9684X,2,96,-,-,768GB,8 -RC_FX700,RC_FX700,A64FX,1,48,-,-,32GB,9 -GenkaiA,GenkaiA,Intel Xeon Platinum 8490H (Sapphire Rapids),2,60,-,-,512GiB,10 -GenkaiB,GenkaiB,Intel Xeon Platinum 8490H (Sapphire Rapids),2,60,NVIDIA H100 (Hopper),4,1024GiB,11 -GenkaiC,GenkaiC,Intel Xeon Platinum 8480+ (Sapphire Rapids),2,56,NVIDIA H100 (Hopper),8,8TiB,12 -Grand_C,Grand_C,Intel Xeon Gold 6548Y+ (Emerald Rapids),2,32,-,-,512GiB,13 -Grand_G,Grand_G,Intel Xeon Gold 6548Y+ (Emerald Rapids),2,32,NVIDIA H100 (Hopper),4,512GiB,14 -AOBA_A,AOBA_A,SX-Aurora TSUBASA VH,1,24,NEC SX-Aurora TSUBASA Type 20B VE,8,640GB,15 -AOBA_B,AOBA_B,AMD EPYC 7702,2,64,-,-,256GB,16 -AOBA_S,AOBA_S,SX-Aurora TSUBASA VH,1,64,NEC SX-Aurora TSUBASA Type 30A VE,8,256GB + 768GB,17 +AI4SS,RIKEN AI4S Supercomputer,NVIDIA Grace CPU,2,72,NVIDIA B200,4,960GiB + 692.8GiB,1 +Fugaku,Fugaku,A64FX,1,48,-,-,32GB,2 +FugakuCN,FugakuCN,A64FX,1,48,-,-,32GB,3 +FugakuLN,FugakuLN,Intel(R) Xeon(R) Gold 6242 CPU @ 2.80GHz,2,16,-,-,96GB,4 +MiyabiG,MiyabiG,NVIDIA Grace CPU,1,72,NVIDIA Hopper H100 GPU,1,120GB,5 +MiyabiC,MiyabiC,Intel Xeon Max 9480,2,56,-,-,128GB,6 +RC_GH200,RC_GH200,NVIDIA Grace CPU,1,72,NVIDIA Hopper H100 GPU,1,120GB,7 +RC_DGXSP,RC_DGXSP,ARM Cortex-X925 / Cortex-A725,1,20,NVIDIA GB10,1,128GB,8 +RC_GENOA,RC_GENOA,AMD EPYC 9684X,2,96,-,-,768GB,9 +RC_FX700,RC_FX700,A64FX,1,48,-,-,32GB,10 +GenkaiA,GenkaiA,Intel Xeon Platinum 8490H (Sapphire Rapids),2,60,-,-,512GiB,11 +GenkaiB,GenkaiB,Intel Xeon Platinum 8490H (Sapphire Rapids),2,60,NVIDIA H100 (Hopper),4,1024GiB,12 +GenkaiC,GenkaiC,Intel Xeon Platinum 8480+ (Sapphire Rapids),2,56,NVIDIA H100 (Hopper),8,8TiB,13 +Grand_C,Grand_C,Intel Xeon Gold 6548Y+ (Emerald Rapids),2,32,-,-,512GiB,14 +Grand_G,Grand_G,Intel Xeon Gold 6548Y+ (Emerald Rapids),2,32,NVIDIA H100 (Hopper),4,512GiB,15 +AOBA_A,AOBA_A,SX-Aurora TSUBASA VH,1,24,NEC SX-Aurora TSUBASA Type 20B VE,8,640GB,16 +AOBA_B,AOBA_B,AMD EPYC 7702,2,64,-,-,256GB,17 +AOBA_S,AOBA_S,SX-Aurora TSUBASA VH,1,64,NEC SX-Aurora TSUBASA Type 30A VE,8,256GB + 768GB,18 +Odyssey,Odyssey,A64FX,1,48,-,-,32GiB,19 +Aquarius,Aquarius,Intel Xeon Platinum 8360Y,2,36,NVIDIA A100,8,512GiB,20 +TSUBAME4,TSUBAME4.0,AMD EPYC 9654,2,96,NVIDIA H100 SXM5 94GB HBM2e,4,768GiB,21 +Camphor3,Camphor3,Intel Xeon CPU Max 9480,2,56,-,-,128GiB,22 +Pegasus,Pegasus,Intel Xeon Platinum 8468,2,48,NVIDIA H100 PCIe,1,128GiB + 2TiB PMem,23 +Sirius,Sirius (PACS12.0),AMD EPYC Zen 4 (MI300A APU),4,24,AMD Instinct MI300A CDNA3,4,512GB HBM3,24 +SQUID_CPU,SQUID CPU,Intel Xeon Platinum 8368,2,38,-,-,256GB,25 +SQUID_GPU,SQUID GPU,Intel Xeon Platinum 8368,2,38,NVIDIA A100 SXM4 40GB,8,512GB,26 +SQUID_VECTOR,SQUID Vector,AMD EPYC 7402P,1,24,NEC SX-Aurora TSUBASA Type20A VE,8,128GB + 384GB,27 +OCTOPUS,OCTOPUS,Intel Xeon 6980P (Granite Rapids),2,128,-,-,768GB,28 diff --git a/docs/ci.md b/docs/ci.md index 1980da5..5e4e5e2 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -260,7 +260,7 @@ Use these examples when deciding whether to split a pull request or start GitLab | Example change set / 変更例 | Expected checks / 期待される確認 | GitLab benchmark expectation / GitLab benchmark期待値 | |---|---|---| | `docs/ci.md` only / `docs/ci.md`のみ | Review the documentation diff / docs差分をreview | No benchmark run. Direct/manual GitLab pipelines should skip by rules / benchmark不要。直接/手動GitLab pipelineではrulesでskipされる想定 | -| `result_server/routes/usage.py` and `result_server/templates/*.html` / `result_server/routes/usage.py`と`result_server/templates/*.html` | `Result Server Tests` should run / `Result Server Tests`が動く | No benchmark run unless a maintainer intentionally starts one / maintainerが意図して起動しない限りbenchmark不要 | +| `result_server/routes/results_usage_routes.py` and `result_server/templates/*.html` / `result_server/routes/results_usage_routes.py`と`result_server/templates/*.html` | `Result Server Tests` should run / `Result Server Tests`が動く | No benchmark run unless a maintainer intentionally starts one / maintainerが意図して起動しない限りbenchmark不要 | | `config/system_info.csv` only / `config/system_info.csv`のみ | `Result Server Tests` should verify public site config consistency / 公開site config整合性を`Result Server Tests`で確認 | No benchmark run because this file is portal display metadata / portal表示metadataなのでbenchmark不要 | | `config/system.csv` or `config/queue.csv` for a public system / 公開system向けの`config/system.csv`または`config/queue.csv` | `Result Server Tests` should run the site config preflight / `Result Server Tests`でsite config preflightを実行 | Start `GitLab Manual CI` too when benchmark execution behavior needs validation / benchmark実行挙動の検証が必要なら`GitLab Manual CI`も起動 | | `scripts/bk_functions.sh`, `scripts/result.sh`, or `scripts/result_server/**` only / `scripts/bk_functions.sh`、`scripts/result.sh`、または`scripts/result_server/**`のみ | `Result Server Tests` should run when the path filter matches / path filter対象なら`Result Server Tests`が動く | Manual GitLab CI is optional and only needed if upload behavior affects benchmark operation / upload挙動がbenchmark運用に影響する場合だけ手動GitLab CIを検討 | diff --git a/programs/qws/build.sh b/programs/qws/build.sh index 9750137..73181a7 100644 --- a/programs/qws/build.sh +++ b/programs/qws/build.sh @@ -36,6 +36,10 @@ case "$system" in echo "Dummy build for FNCX Docker runner test" echo aaaa > main ;; + AI4SS) + module load nvhpc-hpcx/26.3 + make -j 8 omp=1 compiler=nvhpc-hpcx arch=grace rdma= mpi=1 + ;; RC_GH200) module load system/qc-gh200 nvhpc-hpcx/25.9 ### QWSはNeoverse版やGPU版はないので汎用版としてとりあえずarch=skylakeを指定している @@ -75,6 +79,38 @@ case "$system" in AOBA_B) make -j 8 fugaku_benchmark= omp=1 compiler=openmpi-gnu arch=skylake rdma= mpi=1 powerapi= CC=mpicc CXX=mpic++ ;; + Odyssey) + module load odyssey + make compiler=fujitsu_cross arch=postk -j 8 + ;; + Aquarius) + module purge + module load intel + source /work/opt/local/x86_64/cores/intel/2023.0.0/mpi/latest/env/vars.sh + make compiler=intel arch=skylake rdma= -j8 + ;; + TSUBAME4) + make -j 8 fugaku_benchmark= omp=1 compiler=openmpi-gnu arch=skylake rdma= mpi=1 powerapi= CC=mpicc CXX=mpic++ + ;; + Camphor3) + camphor3_modulepath="${MODULEPATH:-}" + if [[ -r /etc/profile.d/modules.sh ]]; then + source /etc/profile.d/modules.sh + elif [[ -r /etc/profile.d/z00_lmod.sh ]]; then + source /etc/profile.d/z00_lmod.sh + else + echo "qws: no module init script found" >&2 + fi + if [[ -n "${MODULEPATH:-}" ]]; then + camphor3_modulepath="${MODULEPATH}" + fi + module purge + if [[ -n "${camphor3_modulepath:-}" ]]; then + export MODULEPATH="${camphor3_modulepath}" + fi + module load slurm/2022 SysA/2022 intel/2023.2 intelmpi/2023.2 PrgEnvIntel/2023 + make -j 8 fugaku_benchmark= omp=1 compiler=intel arch=skylake rdma= mpi=1 powerapi= + ;; *) echo "Unknown system: $system" exit 1 diff --git a/programs/qws/list.csv b/programs/qws/list.csv index 41d1e75..beaddfd 100644 --- a/programs/qws/list.csv +++ b/programs/qws/list.csv @@ -1,4 +1,5 @@ system,enable,nodes,numproc_node,nthreads,elapse +AI4SS,yes,1,1,72,0:10:00 Fugaku,yes,1,4,12,0:10:00 FugakuLN,yes,1,1,1,0:10:00 FugakuCN,no,1,4,12,0:10:00 @@ -6,7 +7,7 @@ FugakuCN,no,2,4,12,0:10:00 RC_GH200,yes,1,1,72,0:10:00 RC_DGXSP,yes,1,1,20,0:10:00 RC_GENOA,yes,1,1,96,0:10:00 -RC_FX700,yes,1,4,12,0:10:00 +RC_FX700,yes,1,1,12,0:10:00 MiyabiG,yes,1,1,72,0:10:00 MiyabiC,yes,1,1,112,0:10:00 GenkaiA,yes,1,1,120,0:10:00 @@ -17,4 +18,8 @@ Grand_G,yes,1,1,64,0:10:00 AOBA_A,yes,1,1,8,0:10:00 AOBA_S,yes,1,1,8,0:10:00 AOBA_B,yes,1,1,128,0:10:00 +Odyssey,yes,1,1,12,0:10:00 +Aquarius,yes,1,1,8,0:10:00 +TSUBAME4,yes,1,1,192,0:10:00 +Camphor3,yes,1,1,112,0:10:00 FNCX,yes,1,1,1,0:10:00 diff --git a/programs/qws/run.sh b/programs/qws/run.sh index c555c24..329436c 100644 --- a/programs/qws/run.sh +++ b/programs/qws/run.sh @@ -84,6 +84,14 @@ case "$system" in echo 'dummy call for FNCX Docker runner test' bk_emit_result --fom 99.99 --fom-version dummy --exp FNCXTest --nodes "$nodes" --numproc-node "$numproc_node" --nthreads "$nthreads" >> ../results/result ;; + AI4SS) + module load nvhpc-hpcx/26.3 + export OMP_NUM_THREADS=72 + export OMP_PLACES=cores + export OMP_PROC_BIND=close + mpirun --bind-to none -n 1 ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 1 >> ../results/result + ;; RC_GH200) module load system/qc-gh200 nvhpc-hpcx/25.9 mpirun -n 1 --bind-to core --map-by ppr:1:node:PE=72 ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 @@ -141,6 +149,58 @@ case "$system" in mpirun -np ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 print_results CASE0 CASE0 ${numproc_node} >> ../results/result ;; + Odyssey) + if [[ -r /etc/profile.d/modules.sh ]]; then + source /etc/profile.d/modules.sh + else + echo "qws: /etc/profile.d/modules.sh is not readable" >&2 + fi + module unload fjmpi fj odyssey 2>/dev/null || true + module load odyssey fj fjmpi + export OMP_NUM_THREADS=12 + export PLE_MPI_STD_EMPTYFILE=off + mpiexec -n 1 -ofout CASE0 ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 + print_results CASE0 CASE0 1 >> ../results/result + ;; + Aquarius) + module purge + module load intel + source /work/opt/local/x86_64/cores/intel/2023.0.0/mpi/latest/env/vars.sh + export OMP_NUM_THREADS=8 + export I_MPI_PIN=1 + mpiexec -n 1 ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 1 >> ../results/result + ;; + TSUBAME4) + qws_numproc=$((nodes * numproc_node)) + mpirun -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 ${numproc_node} >> ../results/result + ;; + Camphor3) + camphor3_modulepath="${MODULEPATH:-}" + if [[ -r /etc/profile.d/modules.sh ]]; then + source /etc/profile.d/modules.sh + elif [[ -r /etc/profile.d/z00_lmod.sh ]]; then + source /etc/profile.d/z00_lmod.sh + else + echo "qws: no module init script found" >&2 + fi + if [[ -n "${MODULEPATH:-}" ]]; then + camphor3_modulepath="${MODULEPATH}" + fi + module purge + if [[ -n "${camphor3_modulepath:-}" ]]; then + export MODULEPATH="${camphor3_modulepath}" + fi + module load intel/2023.2 intelmpi/2023.2 PrgEnvIntel/2023 + export OMP_NUM_THREADS="${nthreads}" + export I_MPI_PIN=1 + if [[ "${SLURM_CONF:-}" == /etc/slurm/sysA/* ]]; then + unset SLURM_CONF + fi + srun -n 1 -c "${nthreads}" ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 1 >> ../results/result + ;; *) echo "Unknown Running system: $system" exit 1 diff --git a/requirements-result-server.txt b/requirements-result-server.txt index fd274bc..03b6759 100644 --- a/requirements-result-server.txt +++ b/requirements-result-server.txt @@ -1,4 +1,4 @@ -# result_server requires Python 3.12+ for safe tar extraction via tarfile filter="data". +# result_server is tested and deployed on Python 3.12+; archive handling uses explicit path/type validation. Flask>=3.0,<4.0 Flask-Session>=0.8,<1.0 Flask-WTF>=1.2,<2.0 diff --git a/result_server/routes/api.py b/result_server/routes/api.py index d7cd8d9..61f7959 100644 --- a/result_server/routes/api.py +++ b/result_server/routes/api.py @@ -7,8 +7,8 @@ import uuid import shutil import io -import sys import tarfile +import tempfile from datetime import datetime from utils.auth import verify_ingest_key @@ -145,25 +145,61 @@ def _find_result_file_by_uuid(received_dir, uuid_value): def _safe_extract_tar_bytes(file_storage, target_dir): - """Extract uploaded tar bytes with path and member-type checks. - - The explicit path normalization catches traversal attempts before writing - anything, and Python 3.12's data filter rejects non-regular archive entries - such as unsafe links or device files. - """ - if sys.version_info < (3, 12): - raise RuntimeError("Python 3.12 or later is required for safe tar extraction.") - + """Extract uploaded tar bytes with path and member-type checks.""" os.makedirs(target_dir, exist_ok=True) with tarfile.open(fileobj=file_storage.stream, mode="r:*") as tar: for member in tar.getmembers(): normalized = os.path.normpath(member.name) - if os.path.isabs(normalized) or normalized.startswith(".."): + drive, _ = os.path.splitdrive(normalized) + if ( + drive + or os.path.isabs(normalized) + or normalized == ".." + or normalized.startswith(f"..{os.sep}") + or normalized.startswith("../") + or normalized.startswith("..\\") + ): abort(400, description="Unsafe archive entry") - try: - tar.extractall(target_dir, filter="data") - except tarfile.FilterError: - abort(400, description="Unsafe archive entry") + if member.issym() or member.islnk() or member.isdev(): + abort(400, description="Unsafe archive entry") + if not (member.isfile() or member.isdir()): + abort(400, description="Unsafe archive entry") + + destination = os.path.abspath(os.path.join(target_dir, normalized)) + abs_target_dir = os.path.abspath(target_dir) + try: + if os.path.commonpath([abs_target_dir, destination]) != abs_target_dir: + abort(400, description="Unsafe archive entry") + except ValueError: + abort(400, description="Unsafe archive entry") + + if member.isdir(): + os.makedirs(destination, exist_ok=True) + continue + + os.makedirs(os.path.dirname(destination), exist_ok=True) + source = tar.extractfile(member) + if source is None: + abort(400, description="Unsafe archive entry") + with source, open(destination, "wb") as output: + shutil.copyfileobj(source, output) + + +def _replace_directory_after_success(source_dir, target_dir): + """Replace target_dir only after source_dir is fully prepared.""" + if not os.path.isdir(target_dir): + os.rename(source_dir, target_dir) + return False + + backup_dir = f"{target_dir}.bak.{uuid.uuid4().hex}" + os.rename(target_dir, backup_dir) + try: + os.rename(source_dir, target_dir) + except Exception: + os.rename(backup_dir, target_dir) + raise + shutil.rmtree(backup_dir) + return True # ========================================== @@ -264,13 +300,16 @@ def ingest_estimation_inputs(): result_stem = os.path.splitext(result_filename)[0] inputs_root = current_app.config["RECEIVED_ESTIMATION_INPUTS_DIR"] + os.makedirs(inputs_root, exist_ok=True) target_dir = os.path.join(inputs_root, result_stem) - replaced = os.path.isdir(target_dir) - if replaced: - shutil.rmtree(target_dir) - os.makedirs(target_dir, exist_ok=True) - - _safe_extract_tar_bytes(uploaded_file, target_dir) + temp_dir = tempfile.mkdtemp(prefix=f".{result_stem}.", dir=inputs_root) + try: + _safe_extract_tar_bytes(uploaded_file, temp_dir) + replaced = _replace_directory_after_success(temp_dir, target_dir) + except Exception: + if os.path.isdir(temp_dir): + shutil.rmtree(temp_dir) + raise print(f"Saved estimation inputs: {target_dir}", flush=True) return { diff --git a/result_server/tests/test_api_routes.py b/result_server/tests/test_api_routes.py index c812c53..cb450c1 100644 --- a/result_server/tests/test_api_routes.py +++ b/result_server/tests/test_api_routes.py @@ -412,6 +412,34 @@ def test_ingest_estimation_inputs_rejects_parent_path_entry(self, client, tmp_di ) assert resp.status_code == 400 + def test_ingest_estimation_inputs_keeps_existing_data_on_bad_archive(self, client, tmp_dirs): + received = tmp_dirs[0] + estimation_inputs_dir = tmp_dirs[2] + uuid_value = "12345678-1234-1234-1234-123456789abc" + result_stem = self._seed_result(received, uuid_value) + target_dir = os.path.join(estimation_inputs_dir, result_stem) + os.makedirs(target_dir, exist_ok=True) + existing_path = os.path.join(target_dir, "existing.json") + with open(existing_path, "w", encoding="utf-8") as f: + json.dump({"keep": True}, f) + + archive_bytes = io.BytesIO() + with tarfile.open(fileobj=archive_bytes, mode="w:gz") as tar: + payload = b"bad" + info = tarfile.TarInfo(name="../outside.txt") + info.size = len(payload) + tar.addfile(info, io.BytesIO(payload)) + archive_bytes.seek(0) + + resp = client.post( + "/api/ingest/estimation-inputs", + data={"id": uuid_value, "file": (archive_bytes, "estimation_inputs.tgz")}, + headers={"X-API-Key": API_KEY}, + content_type="multipart/form-data", + ) + assert resp.status_code == 400 + assert os.path.exists(existing_path) + def test_ingest_estimation_inputs_rejects_absolute_path_entry(self, client, tmp_dirs): received = tmp_dirs[0] uuid_value = "12345678-1234-1234-1234-123456789abc" @@ -497,4 +525,3 @@ def test_query_estimation_inputs_returns_archive(self, client, tmp_dirs): names = tar.getnames() assert "compute_solver_papi.tgz" in names - diff --git a/result_server/tests/test_pagination.py b/result_server/tests/test_pagination.py index e749279..559e6f9 100644 --- a/result_server/tests/test_pagination.py +++ b/result_server/tests/test_pagination.py @@ -458,6 +458,16 @@ def test_compare_route_still_works(self, flask_app, tmp_dir): resp = client.get(f"/results/compare?files={f1},{f2}") assert resp.status_code == 200 + def test_compare_route_rejects_unsafe_filename(self, flask_app, tmp_dir): + """Compare should not accept path-like filenames from query parameters.""" + uid = str(uuid.uuid4()) + f1 = f"result_20250101_000000_{uid}.json" + _write_json(tmp_dir, f1, {"code": "a", "system": "s", "FOM": 1.0}) + + with flask_app.test_client() as client: + resp = client.get(f"/results/compare?files={f1},../outside.json") + assert resp.status_code == 404 + def test_no_filter_reads_only_page_files(self, flask_app, tmp_dir): """Test case.""" _make_result_files(tmp_dir, 150) diff --git a/result_server/tests/test_results_loader.py b/result_server/tests/test_results_loader.py index 2940628..4841d3b 100644 --- a/result_server/tests/test_results_loader.py +++ b/result_server/tests/test_results_loader.py @@ -115,6 +115,19 @@ def test_preserves_metrics_vector(self, tmp_dir): assert "vector" in result["metrics"] assert result["metrics"]["vector"]["x_axis"]["name"] == "message_size" + def test_rejects_parent_path_filename(self, tmp_dir): + """Result loaders should not read JSON outside the configured directory.""" + outside_dir = tempfile.mkdtemp() + try: + _write_json(outside_dir, "outside.json", {"code": "outside"}) + result = load_result_json( + os.path.join("..", os.path.basename(outside_dir), "outside.json"), + tmp_dir, + ) + assert result is None + finally: + shutil.rmtree(outside_dir) + # ============================================================ # load_result_json_batch behavior diff --git a/result_server/utils/result_compare_view.py b/result_server/utils/result_compare_view.py index 405748b..7ec7241 100644 --- a/result_server/utils/result_compare_view.py +++ b/result_server/utils/result_compare_view.py @@ -1,3 +1,5 @@ +from flask import abort + from utils.result_file import check_file_permission from utils.result_records import build_axis_label, build_compare_headline, load_result_json_batch @@ -43,4 +45,6 @@ def load_result_compare_context(filenames, directory): for filename in filenames: check_file_permission(filename, directory) results = load_result_json_batch(filenames, directory) + if len(results) != len(filenames): + abort(404, "Result file not found") return build_result_compare_context(results) diff --git a/result_server/utils/result_file.py b/result_server/utils/result_file.py index 95825ac..eeff711 100644 --- a/result_server/utils/result_file.py +++ b/result_server/utils/result_file.py @@ -11,8 +11,8 @@ def load_result_file(filename: str, save_dir: str): - filepath = os.path.join(save_dir, filename) - if not os.path.exists(filepath): + filepath = resolve_safe_child_path(filename, save_dir) + if filepath is None or not os.path.exists(filepath): abort(404) if filename.endswith(".json"): @@ -27,7 +27,31 @@ def load_result_file(filename: str, save_dir: str): abort(400, "Invalid JSON") abs_dir = os.path.abspath(save_dir) - return send_from_directory(abs_dir, filename, as_attachment=True) + return send_from_directory(abs_dir, os.path.basename(filepath), as_attachment=True) + + +def resolve_safe_child_path( + filename: str, + base_dir: str, + *, + required_suffix: Optional[str] = None, +): + """Resolve a basename-only child path under base_dir, or return None.""" + if not filename or os.path.isabs(filename): + return None + if "/" in filename or "\\" in filename or os.path.basename(filename) != filename: + return None + if required_suffix and not filename.endswith(required_suffix): + return None + + abs_base = os.path.abspath(base_dir) + candidate = os.path.abspath(os.path.join(abs_base, filename)) + try: + if os.path.commonpath([abs_base, candidate]) != abs_base: + return None + except ValueError: + return None + return candidate def get_file_confidential_tags(filename: str, save_dir: str): @@ -134,8 +158,8 @@ def _read_confidential_from_json(json_file: str, save_dir: str): def _read_json(json_file: str, save_dir: str): - filepath = os.path.join(save_dir, json_file) - if not os.path.exists(filepath): + filepath = resolve_safe_child_path(json_file, save_dir, required_suffix=".json") + if filepath is None or not os.path.exists(filepath): return None try: diff --git a/result_server/utils/result_records.py b/result_server/utils/result_records.py index 19d9bdc..047d4cb 100644 --- a/result_server/utils/result_records.py +++ b/result_server/utils/result_records.py @@ -3,7 +3,7 @@ import re from datetime import datetime -from utils.result_file import get_file_confidential_tags +from utils.result_file import get_file_confidential_tags, resolve_safe_child_path def load_visible_result_json( @@ -30,8 +30,8 @@ def load_visible_result_json( def load_result_json(filename, directory): """Load a single JSON file from a result directory.""" - filepath = os.path.join(directory, filename) - if not os.path.isfile(filepath): + filepath = resolve_safe_child_path(filename, directory, required_suffix=".json") + if filepath is None or not os.path.isfile(filepath): return None try: diff --git a/scripts/test_submit.sh b/scripts/test_submit.sh index ea12bb8..420982c 100644 --- a/scripts/test_submit.sh +++ b/scripts/test_submit.sh @@ -100,7 +100,7 @@ echo bash programs/$code/run.sh $system $nodes $numproc_node $nthreads >> script # --- システム別ジョブ投入 --- case "$system" in - FugakuLN) + FugakuLN|FNCX) echo "Notice: system=$system → submit test will NOT be performed." exit 1 ;; @@ -124,6 +124,41 @@ case "$system" in --mpi proc=$proc \ script.sh ;; + Odyssey) + proc=$((nodes * numproc_node)) + echo pjsub -g jh260034o -L rscgrp=$queue_group,node=$nodes,elapse=$elapse \ + --mpi proc=$proc --omp thread=$nthreads \ + script.sh + pjsub -g jh260034o -L rscgrp=$queue_group,node=$nodes,elapse=$elapse \ + --mpi proc=$proc --omp thread=$nthreads \ + script.sh + ;; + Aquarius) + proc=$((nodes * numproc_node)) + echo pjsub -g jh260034a -L rscgrp=$queue_group,node=$nodes,elapse=$elapse \ + --mpi proc=$proc --omp thread=$nthreads \ + script.sh + pjsub -g jh260034a -L rscgrp=$queue_group,node=$nodes,elapse=$elapse \ + --mpi proc=$proc --omp thread=$nthreads \ + script.sh + ;; + Pegasus|Sirius) + echo qsub -q $queue_group \ + -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=${nthreads} \ + -l walltime=${elapse} script.sh + qsub -q $queue_group \ + -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=${nthreads} \ + -l walltime=${elapse} script.sh + ;; + TSUBAME4) + echo qsub -l ${queue_group}=${nodes} -l h_rt=${elapse} script.sh + qsub -l ${queue_group}=${nodes} -l h_rt=${elapse} script.sh + ;; + Camphor3) + proc=$((nodes * numproc_node)) + echo sbatch -p $queue_group -t $elapse --rsc p=${proc}:t=${nthreads}:c=${nthreads}:m=1G script.sh + sbatch -p $queue_group -t $elapse --rsc p=${proc}:t=${nthreads}:c=${nthreads}:m=1G script.sh + ;; Grand_C) cpu_per_node=$(get_system_cpu_per_node "$system") echo qsub -q $queue_group \ @@ -154,10 +189,36 @@ case "$system" in qsub -Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q $queue_group -T intmpi -b $nodes \ -l elapstim_req=$elapse script.sh ;; - RC_GH200) - echo sbatch -p qc-gh200 -N $nodes -t $elapse --ntasks-per-node=${numproc_node} --cpus-per-task=$nthreads \ + SQUID_CPU|OCTOPUS) + echo qsub -q $queue_group -b $nodes \ + -l elapstim_req=${elapse},cpunum_job=${nthreads} script.sh + qsub -q $queue_group -b $nodes \ + -l elapstim_req=${elapse},cpunum_job=${nthreads} script.sh + ;; + SQUID_GPU) + gpu_per_node=$(get_system_gpu_per_node "$system") + echo qsub -q $queue_group -b $nodes \ + -l elapstim_req=${elapse},cpunum_job=${nthreads},gpunum_job=${gpu_per_node} script.sh + qsub -q $queue_group -b $nodes \ + -l elapstim_req=${elapse},cpunum_job=${nthreads},gpunum_job=${gpu_per_node} script.sh + ;; + SQUID_VECTOR) + proc=$((nodes * numproc_node)) + echo qsub -q $queue_group --venode $proc \ + -l elapstim_req=${elapse} script.sh + qsub -q $queue_group --venode $proc \ + -l elapstim_req=${elapse} script.sh + ;; + AI4SS) + echo sbatch -p $queue_group -N $nodes -t $elapse --ntasks-per-node=${numproc_node} --cpus-per-task=$nthreads --gpus-per-node=${numproc_node} \ + --wrap="bash programs/$code/run.sh $system $nodes $numproc_node $nthreads" + sbatch -p $queue_group -N $nodes -t $elapse --ntasks-per-node=${numproc_node} --cpus-per-task=$nthreads --gpus-per-node=${numproc_node} \ + --wrap="bash programs/${code}/run.sh $system $nodes $numproc_node $nthreads" + ;; + RC_GH200|RC_DGXSP|RC_GENOA|RC_FX700) + echo sbatch -p $queue_group -N $nodes -t $elapse --ntasks-per-node=${numproc_node} --cpus-per-task=$nthreads \ --wrap="bash programs/$code/run.sh $system $nodes $numproc_node $nthreads" - sbatch -p qc-gh200 -N $nodes -t $elapse --ntasks-per-node=${numproc_node} --cpus-per-task=$nthreads \ + sbatch -p $queue_group -N $nodes -t $elapse --ntasks-per-node=${numproc_node} --cpus-per-task=$nthreads \ --wrap="bash programs/${code}/run.sh $system $nodes $numproc_node $nthreads" ;; MiyabiC) @@ -174,7 +235,7 @@ case "$system" in ;; *) echo "Error: Unknown system '$system'" - echo "Supported systems: Fugaku, FugakuCN, FugakuLN, GenkaiA, GenkaiB, GenkaiC, Grand_C, Grand_G, AOBA_A, AOBA_B, AOBA_S, RC_GH200, MiyabiC, MiyabiG" + echo "Supported systems: AI4SS, Fugaku, FugakuCN, FugakuLN, FNCX, GenkaiA, GenkaiB, GenkaiC, Odyssey, Aquarius, Pegasus, Sirius, TSUBAME4, Camphor3, SQUID_CPU, SQUID_GPU, SQUID_VECTOR, OCTOPUS, Grand_C, Grand_G, AOBA_A, AOBA_B, AOBA_S, RC_GH200, RC_DGXSP, RC_GENOA, RC_FX700, MiyabiC, MiyabiG" exit 1 ;; esac