From f6de70673f8d2c64b64870dd8c44475239a94229 Mon Sep 17 00:00:00 2001 From: Yoshifumi Nakamura Date: Tue, 21 Apr 2026 16:29:20 +0900 Subject: [PATCH] Add NCU profiler support and AOBA runner notes Document AOBA Git wrapper, proxy, and NQSV runner details, and clarify profiler metadata/result-summary comments for the NCU path. Signed-off-by: Yoshifumi Nakamura --- .github/workflows/result-server-tests.yml | 29 +- .gitlab-ci.yml | 55 +- README.md | 4 + config/queue.csv | 7 +- config/system.csv | 8 + config/system_info.csv | 8 + docs/ci.md | 12 +- docs/cx/BENCHKIT_GAP_ANALYSIS.md | 4 +- docs/guides/add-app.md | 27 +- docs/guides/add-site.md | 275 ++++++++-- docs/guides/developer-reference.md | 11 +- docs/guides/profiler-level-reference.md | 27 +- docs/guides/profiler-support.md | 50 +- programs/genesis/build.sh | 275 +++++++++- programs/genesis/list.csv | 2 + programs/genesis/run.sh | 83 +++ programs/qws/build.sh | 14 + programs/qws/list.csv | 8 + programs/qws/run.sh | 32 ++ requirements-result-server.txt | 5 +- result_server/app.py | 17 +- result_server/app_dev.py | 63 ++- result_server/routes/api.py | 38 +- result_server/routes/auth.py | 39 +- result_server/templates/_results_base.html | 1 + .../_results_table_cell_profile.html | 1 + result_server/templates/admin_users.html | 6 +- result_server/templates/auth_login.html | 4 +- result_server/templates/auth_setup.html | 1 + result_server/tests/test_api_routes.py | 149 +++++- result_server/tests/test_app_dev_security.py | 44 ++ result_server/tests/test_auth_templates.py | 4 + result_server/tests/test_csrf.py | 140 +++++ .../tests/test_portal_list_templates.py | 59 ++ .../tests/test_result_detail_template.py | 24 +- .../tests/test_result_padata_route.py | 24 + result_server/tests/test_results_loader.py | 29 + result_server/tests/test_totp_security.py | 72 +++ result_server/utils/auth.py | 65 +++ result_server/utils/csrf.py | 13 + result_server/utils/result_detail_view.py | 26 +- result_server/utils/result_file.py | 71 
++- result_server/utils/result_table_rows.py | 4 + scripts/bk_functions.sh | 273 +++++++++- scripts/job_functions.sh | 18 + scripts/matrix_generate.sh | 13 +- scripts/result.sh | 62 ++- scripts/result_server/send_results.sh | 19 +- scripts/setup_site_runner.sh | 504 ++++++++++++++++++ scripts/test_submit.sh | 51 +- scripts/tests/test_bk_profiler.sh | 105 ++++ scripts/tests/test_result_profile_data.sh | 62 ++- .../tests/test_send_results_profile_data.sh | 37 +- 53 files changed, 2786 insertions(+), 188 deletions(-) create mode 100644 result_server/tests/test_app_dev_security.py create mode 100644 result_server/tests/test_csrf.py create mode 100644 result_server/utils/auth.py create mode 100644 result_server/utils/csrf.py create mode 100755 scripts/setup_site_runner.sh diff --git a/.github/workflows/result-server-tests.yml b/.github/workflows/result-server-tests.yml index 042c1d5..0bf433b 100644 --- a/.github/workflows/result-server-tests.yml +++ b/.github/workflows/result-server-tests.yml @@ -4,7 +4,12 @@ on: pull_request: paths: - "result_server/**" - - "scripts/result_server/send_results.sh" + - "scripts/bk_functions.sh" + - "scripts/result.sh" + - "scripts/result_server/**" + - "scripts/tests/test_bk_profiler.sh" + - "scripts/tests/test_result_profile_data.sh" + - "scripts/tests/test_send_results_profile_data.sh" - "config/system.csv" - "config/queue.csv" - "config/system_info.csv" @@ -15,7 +20,12 @@ on: - "**" paths: - "result_server/**" - - "scripts/result_server/send_results.sh" + - "scripts/bk_functions.sh" + - "scripts/result.sh" + - "scripts/result_server/**" + - "scripts/tests/test_bk_profiler.sh" + - "scripts/tests/test_result_profile_data.sh" + - "scripts/tests/test_send_results_profile_data.sh" - "config/system.csv" - "config/queue.csv" - "config/system_info.csv" @@ -30,8 +40,8 @@ jobs: fail-fast: false matrix: python-version: - - "3.9" - "3.12" + - "3.13" steps: - name: Check out repository @@ -54,3 +64,16 @@ jobs: - name: Run result_server pytest 
suite run: python result_server/tests/run_result_server_tests.py + + profile-data-shell: + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v4 + + - name: Run profiler and profile-data shell tests + run: | + bash scripts/tests/test_bk_profiler.sh + bash scripts/tests/test_result_profile_data.sh + bash scripts/tests/test_send_results_profile_data.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a280498..c4c5bf1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -8,6 +8,7 @@ # # Files that automatically skip CI: # - root Markdown files and docs/**/* (documentation) +# - .github/**/* (GitHub-only workflow/action files) # - result_server/**/* and config/system_info.csv (portal/server code, templates, and display metadata) # # Important Notes: @@ -85,17 +86,22 @@ generate_matrix: - if: '$CI_COMMIT_MESSAGE =~ /\[park-send\]/' when: never # [park-send]コミットメッセージでは無効 - changes: - - ".gitlab-ci.yml" - - "programs/**/*" - - "scripts/**/*" - - "config/system.csv" - - "config/queue.csv" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - ".gitlab-ci.yml" + - "programs/**/*" + - "scripts/**/*" + - "config/system.csv" + - "config/queue.csv" when: always - changes: - - "*.md" - - "docs/**/*" - - "result_server/**/*" - - "config/system_info.csv" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - ".github/**/*" + - "*.md" + - "docs/**/*" + - "result_server/**/*" + - "config/system_info.csv" when: never - when: always @@ -121,17 +127,22 @@ trigger_child_pipeline: - if: '$CI_COMMIT_MESSAGE =~ /\[park-send\]/' when: never # [park-send]コミットメッセージでは無効 - changes: - - ".gitlab-ci.yml" - - "programs/**/*" - - "scripts/**/*" - - "config/system.csv" - - "config/queue.csv" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - ".gitlab-ci.yml" + - "programs/**/*" + - "scripts/**/*" + - "config/system.csv" + - "config/queue.csv" when: always - changes: - - "*.md" - - "docs/**/*" - - "result_server/**/*" - - 
"config/system_info.csv" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - ".github/**/*" + - "*.md" + - "docs/**/*" + - "result_server/**/*" + - "config/system_info.csv" when: never - when: always # BenchPark Monitor Jobs @@ -163,7 +174,9 @@ generate_benchpark_matrix: - if: '$CI_COMMIT_MESSAGE =~ /\[benchpark\]/' when: always - changes: - - "benchpark-bridge/**/*" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - "benchpark-bridge/**/*" when: always - when: never @@ -197,7 +210,9 @@ trigger_benchpark_pipeline: - if: '$CI_COMMIT_MESSAGE =~ /\[benchpark\]/' when: always - changes: - - "benchpark-bridge/**/*" + compare_to: "refs/heads/$CI_DEFAULT_BRANCH" + paths: + - "benchpark-bridge/**/*" when: always - when: never diff --git a/README.md b/README.md index 48f12b7..ca3ab2e 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,10 @@ This includes: - system-specific execution environments - runtime requirements +## Runtime Requirements + +- `result_server` requires Python 3.12 or later. + ## License This project is licensed under the BSD 3-Clause License. See [LICENSE](LICENSE). 
diff --git a/config/queue.csv b/config/queue.csv index 1900325..681bbd0 100644 --- a/config/queue.csv +++ b/config/queue.csv @@ -1,5 +1,10 @@ queue,submit_cmd,template FJ,pjsub,"-L rscunit=rscunit_ft01,rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi max-proc-per-node=${numproc_node} -x PJM_LLIO_GFSCACHE=/vol0002:/vol0003:/vol0004:/vol0005" +PJM_GENKAI,pjsub,"-L rscgrp=${queue_group},elapse=${elapse},node=${nodes} --mpi proc=${proc}" SLURM_RC,sbatch,"-p ${queue_group} -t ${elapse} -N ${nodes} --ntasks-per-node=${numproc_node} --cpus-per-task=${nthreads}" -PBS_Miyabi,qsub,"-q ${queue_group} -l select=${nodes} -l walltime=${elapse} -W group_list=jh260034" +PBS_Miyabi,qsub,"-q ${queue_group} -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=${nthreads} -l walltime=${elapse} -W group_list=jh260034" +PBS_Grand_C,qsub,"-q ${queue_group} -l select=${nodes}:nsockets=${cpu_per_node},walltime=${elapse} -W group_list=d30992" +PBS_Grand_G,qsub,"-q ${queue_group} -l select=${nodes}:ngpus=1,walltime=${elapse} -W group_list=d30992" +NQSV_AOBA_VE,qsub,"-Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q ${queue_group} -T necmpi --venode ${proc} -l elapstim_req=${elapse}" +NQSV_AOBA_B,qsub,"-Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q ${queue_group} -T intmpi -b ${nodes} -l elapstim_req=${elapse}" none,none,none diff --git a/config/system.csv b/config/system.csv index 618f62b..9513980 100644 --- a/config/system.csv +++ b/config/system.csv @@ -8,4 +8,12 @@ RC_GENOA,native,,cloud_jacamar,SLURM_RC,genoa RC_FX700,native,,cloud_jacamar,SLURM_RC,fx700 MiyabiG,cross,miyabi_g_login,miyabi_g_jacamar,PBS_Miyabi,debug-g MiyabiC,cross,miyabi_c_login,miyabi_c_jacamar,PBS_Miyabi,debug-c +GenkaiA,cross,genkai_login,genkai_jacamar,PJM_GENKAI,a-batch +GenkaiB,cross,genkai_login,genkai_jacamar,PJM_GENKAI,b-batch +GenkaiC,cross,genkai_login,genkai_jacamar,PJM_GENKAI,c-batch +Grand_C,cross,grand_login,grand_jacamar,PBS_Grand_C,lc 
+Grand_G,cross,grand_login,grand_jacamar,PBS_Grand_G,eg +AOBA_A,cross,aoba_ab_login,aoba_ab_jacamar,NQSV_AOBA_VE,sx +AOBA_B,cross,aoba_ab_login,aoba_ab_jacamar,NQSV_AOBA_B,lx +AOBA_S,cross,aoba_s_login,aoba_s_jacamar,NQSV_AOBA_VE,sxs FNCX,native,,fncx-curl-jq,none,small diff --git a/config/system_info.csv b/config/system_info.csv index 4d35ae7..0273370 100644 --- a/config/system_info.csv +++ b/config/system_info.csv @@ -8,3 +8,11 @@ RC_GH200,RC_GH200,NVIDIA Grace CPU,1,72,NVIDIA Hopper H100 GPU,1,120GB,6 RC_DGXSP,RC_DGXSP,ARM Cortex-X925 / Cortex-A725,1,20,NVIDIA GB10,1,128GB,7 RC_GENOA,RC_GENOA,AMD EPYC 9684X,2,96,-,-,768GB,8 RC_FX700,RC_FX700,A64FX,1,48,-,-,32GB,9 +GenkaiA,GenkaiA,Intel Xeon Platinum 8490H (Sapphire Rapids),2,60,-,-,512GiB,10 +GenkaiB,GenkaiB,Intel Xeon Platinum 8490H (Sapphire Rapids),2,60,NVIDIA H100 (Hopper),4,1024GiB,11 +GenkaiC,GenkaiC,Intel Xeon Platinum 8480+ (Sapphire Rapids),2,56,NVIDIA H100 (Hopper),8,8TiB,12 +Grand_C,Grand_C,Intel Xeon Gold 6548Y+ (Emerald Rapids),2,32,-,-,512GiB,13 +Grand_G,Grand_G,Intel Xeon Gold 6548Y+ (Emerald Rapids),2,32,NVIDIA H100 (Hopper),4,512GiB,14 +AOBA_A,AOBA_A,SX-Aurora TSUBASA VH,1,24,NEC SX-Aurora TSUBASA Type 20B VE,8,640GB,15 +AOBA_B,AOBA_B,AMD EPYC 7702,2,64,-,-,256GB,16 +AOBA_S,AOBA_S,SX-Aurora TSUBASA VH,1,64,NEC SX-Aurora TSUBASA Type 30A VE,8,256GB + 768GB,17 diff --git a/docs/ci.md b/docs/ci.md index c331c6e..1980da5 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -245,10 +245,10 @@ app support matrix、partial support、app entrypoint不足、`list.csv` 内の |---|---|---|---| | Root Markdown or `docs/**/*` only / root Markdownまたは`docs/**/*`のみ | No benchmark-specific GitHub workflow / ベンチマーク用GitHub workflowなし | Skipped by `.gitlab-ci.yml` rules / `.gitlab-ci.yml` rulesでskip | Keep docs-only changes separate from benchmark logic changes / docsのみの変更はbenchmark logic変更と分ける | | `result_server/**/*` / `result_server/**/*` | `Result Server Tests` | Skipped by `.gitlab-ci.yml` rules / `.gitlab-ci.yml` rulesでskip | 
Portal regressions should be caught by lightweight Python tests / portal回帰はlightweight Python testで捕捉する | -| Portal metadata `config/system_info.csv` / portal表示メタデータ`config/system_info.csv` | `Result Server Tests`, including site config preflight / site config preflightを含む`Result Server Tests` | Skipped by `.gitlab-ci.yml` rules / `.gitlab-ci.yml` rulesでskip | Public systems listed in `system_info.csv` must also exist in `system.csv` and reference a queue defined in `queue.csv` / `system_info.csv`に載せる公開systemは`system.csv`にも存在し、`queue.csv`定義済みqueueを参照する必要がある | -| Portal upload helper / portal upload helper | `Result Server Tests` when covered by its path filter / path filter対象なら`Result Server Tests` | Not automatic for pull requests; runs only if a maintainer starts `GitLab Manual CI` / pull requestでは自動起動せず、maintainerが`GitLab Manual CI`を起動した場合のみ実行 | Upload helpers live under `scripts/**/*`, so manual benchmark validation may include them / upload helperは`scripts/**/*`配下なので手動benchmark検証の対象になりうる | -| Benchmark app, shared scripts, `config/system.csv`, or `config/queue.csv` / benchmark app、共通script、`config/system.csv`、`config/queue.csv` | Normal GitHub review checks only / 通常のGitHub review checkのみ | Run through `GitLab Manual CI` when maintainer starts it / maintainerが`GitLab Manual CI`を起動した場合に実行 | Use `code` and `system` filters when broad validation is unnecessary / 広範な検証が不要なら`code`と`system`を指定する | -| GitHub workflow/action / GitHub workflow/action | Workflow-specific checks when paths match / path一致時にworkflowごとのcheck | Not required by itself / それ単体では不要 | GitHub workflow/action changes affect API-calling or sync control logic. 
Validate them on the GitHub side; they are pushed to GitLab with `ci.skip` during protected-branch sync / GitHub workflow/action変更はAPI呼び出しやsync制御に影響する。GitHub側で確認する。protected-branch syncでは`ci.skip`付きでGitLabへpushされる | +| Public site config or portal metadata `config/system.csv`, `config/queue.csv`, `config/system_info.csv` / 公開site configまたはportal表示メタデータ`config/system.csv`、`config/queue.csv`、`config/system_info.csv` | `Result Server Tests`, including site config preflight / site config preflightを含む`Result Server Tests` | `config/system.csv` and `config/queue.csv` run by `.gitlab-ci.yml`; `config/system_info.csv` is skipped / `config/system.csv`と`config/queue.csv`は`.gitlab-ci.yml`で実行、`config/system_info.csv`はskip | Public systems listed in `system_info.csv` must also exist in `system.csv` and reference a queue defined in `queue.csv` / `system_info.csv`に載せる公開systemは`system.csv`にも存在し、`queue.csv`定義済みqueueを参照する必要がある | +| Portal upload or profile-data helper `scripts/bk_functions.sh`, `scripts/result.sh`, `scripts/result_server/**` / portal uploadまたはprofile-data helper `scripts/bk_functions.sh`、`scripts/result.sh`、`scripts/result_server/**` | `Result Server Tests` when covered by its path filter / path filter対象なら`Result Server Tests` | Not automatic for pull requests; runs only if a maintainer starts `GitLab Manual CI` / pull requestでは自動起動せず、maintainerが`GitLab Manual CI`を起動した場合のみ実行 | These helpers shape result JSON / upload behavior without requiring a full benchmark by default / これらのhelperはfull benchmarkを既定で要求せずにResult JSONやupload挙動へ影響する | +| Benchmark app code or other shared scripts / benchmark appコードまたはその他の共通script | Normal GitHub review checks only / 通常のGitHub review checkのみ | Run through `GitLab Manual CI` when maintainer starts it / maintainerが`GitLab Manual CI`を起動した場合に実行 | Use `code` and `system` filters when broad validation is unnecessary / 広範な検証が不要なら`code`と`system`を指定する | +| GitHub workflow/action `.github/**/*` / GitHub workflow/action `.github/**/*` | 
Workflow-specific checks when paths match / path一致時にworkflowごとのcheck | Skipped by `.gitlab-ci.yml` rules / `.gitlab-ci.yml` rulesでskip | GitHub workflow/action changes affect API-calling or sync control logic. Validate them on the GitHub side; they are pushed to GitLab with `ci.skip` during protected-branch sync / GitHub workflow/action変更はAPI呼び出しやsync制御に影響する。GitHub側で確認する。protected-branch syncでは`ci.skip`付きでGitLabへpushされる | | `.gitlab-ci.yml` / `.gitlab-ci.yml` | Normal GitHub review checks only / 通常のGitHub review checkのみ | Run through `GitLab Manual CI` when a maintainer needs to validate GitLab pipeline behavior / GitLab pipeline挙動の検証が必要な場合にmaintainerが`GitLab Manual CI`で実行 | This file defines GitLab benchmark pipeline behavior / このファイルはGitLab benchmark pipeline挙動を定義する | ## Representative Change Sets / 代表的な変更セット @@ -263,9 +263,9 @@ Use these examples when deciding whether to split a pull request or start GitLab | `result_server/routes/usage.py` and `result_server/templates/*.html` / `result_server/routes/usage.py`と`result_server/templates/*.html` | `Result Server Tests` should run / `Result Server Tests`が動く | No benchmark run unless a maintainer intentionally starts one / maintainerが意図して起動しない限りbenchmark不要 | | `config/system_info.csv` only / `config/system_info.csv`のみ | `Result Server Tests` should verify public site config consistency / 公開site config整合性を`Result Server Tests`で確認 | No benchmark run because this file is portal display metadata / portal表示metadataなのでbenchmark不要 | | `config/system.csv` or `config/queue.csv` for a public system / 公開system向けの`config/system.csv`または`config/queue.csv` | `Result Server Tests` should run the site config preflight / `Result Server Tests`でsite config preflightを実行 | Start `GitLab Manual CI` too when benchmark execution behavior needs validation / benchmark実行挙動の検証が必要なら`GitLab Manual CI`も起動 | -| `scripts/result_server/send_results.sh` only / `scripts/result_server/send_results.sh`のみ | `Result Server Tests` should run when the path 
filter matches / path filter対象なら`Result Server Tests`が動く | Manual GitLab CI is optional and only needed if upload behavior affects benchmark operation / upload挙動がbenchmark運用に影響する場合だけ手動GitLab CIを検討 | +| `scripts/bk_functions.sh`, `scripts/result.sh`, or `scripts/result_server/**` only / `scripts/bk_functions.sh`、`scripts/result.sh`、または`scripts/result_server/**`のみ | `Result Server Tests` should run when the path filter matches / path filter対象なら`Result Server Tests`が動く | Manual GitLab CI is optional and only needed if upload behavior affects benchmark operation / upload挙動がbenchmark運用に影響する場合だけ手動GitLab CIを検討 | | `programs/qws/**/*` or `scripts/job/**/*` / `programs/qws/**/*`または`scripts/job/**/*` | Normal GitHub review checks / 通常のGitHub review check | Start `GitLab Manual CI` when benchmark validation is needed, preferably with explicit `code` and `system` filters / benchmark検証が必要なら`code`と`system`を明示して`GitLab Manual CI`を起動 | -| `.github/workflows/sync-to-gitlab.yml` or `.github/actions/prepare-gitlab-repo/action.yml` / `.github/workflows/sync-to-gitlab.yml`または`.github/actions/prepare-gitlab-repo/action.yml` | Validate on the GitHub Actions side / GitHub Actions側で確認 | Not a GitLab benchmark trigger by itself. 
Protected-branch sync pushes it with `ci.skip` / それ単体ではGitLab benchmark起動対象ではない。protected-branch syncでは`ci.skip`付きでpushされる | +| `.github/workflows/sync-to-gitlab.yml` or `.github/actions/prepare-gitlab-repo/action.yml` / `.github/workflows/sync-to-gitlab.yml`または`.github/actions/prepare-gitlab-repo/action.yml` | Validate on the GitHub Actions side / GitHub Actions側で確認 | Skipped by `.gitlab-ci.yml` rules when changed alone; protected-branch sync pushes it with `ci.skip` / 単独変更なら`.gitlab-ci.yml` rulesでskip。protected-branch syncでは`ci.skip`付きでpushされる | | `.gitlab-ci.yml` / `.gitlab-ci.yml` | Review the GitLab rule diff carefully / GitLab rule差分を慎重にreview | Start `GitLab Manual CI` if rule behavior itself needs validation / rule挙動そのものの検証が必要なら`GitLab Manual CI`を起動 | ## Contributor Guidance / コントリビュータ向け注意 diff --git a/docs/cx/BENCHKIT_GAP_ANALYSIS.md b/docs/cx/BENCHKIT_GAP_ANALYSIS.md index 251024b..af7654e 100644 --- a/docs/cx/BENCHKIT_GAP_ANALYSIS.md +++ b/docs/cx/BENCHKIT_GAP_ANALYSIS.md @@ -314,8 +314,8 @@ Once the estimation specification is clarified, many other design decisions beco CI 関連の残 GAP は、「仕組みを新規に置く」段階から「対象範囲を運用に耐える形へ広げ、古くならないようにする」段階へ移っている。 短期的な実装・確認は次の状態まで進んでいる。 -1. `result-server-tests.yml` の path filter は、`result_server/**/*`、`scripts/result_server/send_results.sh`、`config/system.csv`、`config/queue.csv`、`config/system_info.csv`、`requirements-result-server.txt` を対象にする形へ更新済みである。 -2. `.gitlab-ci.yml` の heavy benchmark skip rules と `docs/ci.md` の説明は、root Markdown、`docs/**/*`、`result_server/**/*`、`config/system_info.csv` の扱いが一致するよう同期済みである。 +1. `result-server-tests.yml` の path filter は、`result_server/**/*`、`scripts/bk_functions.sh`、`scripts/result.sh`、`scripts/result_server/**`、profile-data shell tests、`config/system.csv`、`config/queue.csv`、`config/system_info.csv`、`requirements-result-server.txt` を対象にする形へ更新済みである。 +2. 
`.gitlab-ci.yml` の heavy benchmark skip rules と `docs/ci.md` の説明は、root Markdown、`docs/**/*`、`result_server/**/*`、public site config / profile-data helper 周辺の lightweight verification 経路の扱いが一致するよう同期済みである。 3. 手動 GitLab CI は、`qws` / `MiyabiG` の最小実行で GitLab pipeline 起動から推定まで確認済みである。Pipeline API variables は JSON payload で渡す。 4. protected branch sync は、`ci.skip` により GitLab mirror 更新時に GitLab CI が自動起動しないことを運用上確認済みである。 diff --git a/docs/guides/add-app.md b/docs/guides/add-app.md index df58446..402a2cb 100644 --- a/docs/guides/add-app.md +++ b/docs/guides/add-app.md @@ -328,7 +328,7 @@ tar -czf ../results/padata0.tgz ./pa ### Fugaku で `fapp` を使う場合 Fugaku 系アプリでは、アプリ側が profiler tool を内部で選び、BenchKit 共通の `bk_profiler` helper に渡す形が扱いやすいです。 -`bk_profiler` は profiler ごとの raw data / postprocess report をまとめて `results/padata*.tgz` に保存し、archive の root に `meta.json` を入れます。BenchKit や推定 package はこの `meta.json` を見て、tool、level、report kind を機械的に判断できます。 +`bk_profiler` は profiler ごとの raw data / postprocess report をまとめて `results/padata*.tgz` に保存し、archive 内の `bk_profiler_artifact/meta.json` に metadata を入れます。BenchKit や推定 package はこの `meta.json` を見て、tool、level、report kind を機械的に判断できます。 `fapp` では共通 level として次を扱います。 @@ -340,7 +340,7 @@ Fugaku 系アプリでは、アプリ側が profiler tool を内部で選び、B `single` は既定で text summary、`simple/standard/detailed` は既定で text + CSV report を保存します。CSV は `fapp` 固有の report として扱い、ほかの profiler が同じ形式を持つ必要はありません。 ```bash -# qws は Fugaku 系 build / run の内部で fapp + single を利用 +# qws は Fugaku 系 build / run の内部で fapp + detailed を利用 bash programs/qws/build.sh Fugaku bash programs/qws/run.sh Fugaku 1 4 12 ``` @@ -365,7 +365,8 @@ bk_profiler_artifact/ meta.json raw/ rep1/ - rep2/ + ... 
+ rep17/ reports/ fapp_A_rep1.txt cpu_pa_rep1.csv @@ -374,6 +375,26 @@ bk_profiler_artifact/ より一般的な profiler helper の設計方針は [Profiler Support Guide](profiler-support.md) を参照してください。 level の早見表と portal 上の見え方は [Profiler Level Reference](profiler-level-reference.md) にまとめています。 +### GPU アプリで `ncu` を使う場合 + +NVIDIA GPU 向けアプリでは、Nsight Compute CLI (`ncu`) を `bk_profiler` 経由で使えます。 +MPI launcher 経由のアプリでは、`bk_profiler ncu` が既定で `--target-processes all` を付け、child process の CUDA kernel も採取対象にします。 +MiyabiG と RC_GH200 のように計算ノード構成が同じ Grace-Hopper GPU 系の場合は、ジョブ投入方式だけを system 設定に任せ、アプリ側の build/run と profiler 採取は共通化するのが自然です。 + +```bash +BK_PROFILER_ARGS="--set full --kernel-name regex:your_kernel" \ +bk_profiler ncu --level single --archive ../results/padata0.tgz --raw-dir ncu -- \ + mpirun -np 1 ./your_gpu_app input.inp +``` + +`ncu` の既定 level は `single` です。最初は採取時間を抑えるため、`single` または `simple` から始めてください。 +raw report は `padata*.tgz` 内の `bk_profiler_artifact/raw/rep1/` に保存され、可能な場合は `bk_profiler_artifact/reports/ncu_import_rep1.txt` に text report が保存されます。 +site の既定 module に `ncu` が含まれない場合は、アプリ側で module を load するか、system 固有の module 変数を用意してください。 +Genesis GH200 参照実装では `GENESIS_MIYABIG_MODULE` / `GENESIS_GH200_MODULE` で module を上書きできます。 +既定の `ncu` が PATH にない場合は warning を出して profiler なしで benchmark 本体を実行しますが、`GENESIS_MIYABIG_PROFILER_TOOL=ncu`、`GENESIS_GH200_PROFILER_TOOL=ncu`、または `GENESIS_PROFILER_TOOL=ncu` を明示した場合は採取不能として失敗します。 +profiler なしを明示する場合は `GENESIS_MIYABIG_PROFILER_TOOL=none`、`GENESIS_GH200_PROFILER_TOOL=none`、または `GENESIS_PROFILER_TOOL=none` を使えます。 +level は `GENESIS_MIYABIG_PROFILER_LEVEL` / `GENESIS_GH200_PROFILER_LEVEL`、または共通の `GENESIS_PROFILER_LEVEL` で上書きできます。 + --- ## 6. ローカルテスト diff --git a/docs/guides/add-site.md b/docs/guides/add-site.md index fb33759..db185ad 100644 --- a/docs/guides/add-site.md +++ b/docs/guides/add-site.md @@ -6,16 +6,17 @@ GitLab Runner と Jacamar-CI をユーザ権限でセットアップし、CI/CD ## 目次 1. [前提条件](#1-前提条件) -2. [ディレクトリ構成](#2-ディレクトリ構成) -3. 
[GitLab Runner のインストール](#3-gitlab-runner-のインストール) -4. [Jacamar-CI のビルド・インストール](#4-jacamar-ci-のビルドインストール) -5. [カスタムランナースクリプトの作成](#5-カスタムランナースクリプトの作成) -6. [ランナーの登録](#6-ランナーの登録) -7. [Jacamar 用ランナーの設定](#7-jacamar-用ランナーの設定) -8. [config.toml の設定](#8-configtoml-の設定) -9. [BenchKit への拠点登録](#9-benchkit-への拠点登録) -10. [ランナーの常駐化(systemd user mode)](#10-ランナーの常駐化systemd-user-mode) -11. [トラブルシューティング](#11-トラブルシューティング) +2. [クイックセットアップ(推奨)](#クイックセットアップ推奨) +3. [ディレクトリ構成](#2-ディレクトリ構成) +4. [GitLab Runner のインストール](#3-gitlab-runner-のインストール) +5. [Jacamar-CI のビルド・インストール](#4-jacamar-ci-のビルドインストール) +6. [カスタムランナースクリプトの作成](#5-カスタムランナースクリプトの作成) +7. [ランナーの登録](#6-ランナーの登録) +8. [Jacamar 用ランナーの設定](#7-jacamar-用ランナーの設定) +9. [config.toml の設定](#8-configtoml-の設定) +10. [BenchKit への拠点登録](#9-benchkit-への拠点登録) +11. [ランナーの常駐化(systemd user mode)](#10-ランナーの常駐化systemd-user-mode) +12. [トラブルシューティング](#11-トラブルシューティング) --- @@ -30,6 +31,88 @@ GitLab Runner と Jacamar-CI をユーザ権限でセットアップし、CI/CD --- +## クイックセットアップ(推奨) + +通常は `scripts/setup_site_runner.sh` を使えば、GitLab Runner の取得、Jacamar-CI のビルド、frontend runner と Jacamar runner の登録、`custom-config.toml` / `config.toml` 相当の設定、systemd user service の作成までまとめて実行できます。 + +`--login-token` と `--jacamar-token` には、GitLab で作成した各 runner の authentication token を指定します。URL は両 runner で共通です。 + +### 実行前の疎通確認 + +セットアップ前に、対象ログインノードから GitLab サーバへ到達できるか確認します。GitLab Runner は GitLab 側から接続されるのではなく、ログインノード上の常駐プロセスが GitLab へ job を取りに行きます。 + +```bash +GITLAB_URL="https://YOUR_GITLAB_SERVER" + +hostname -s +getent hosts "$(printf '%s\n' "$GITLAB_URL" | sed -E 's#^https?://([^/]+).*#\1#')" + +env | grep -Ei '^(http_proxy|https_proxy|HTTP_PROXY|HTTPS_PROXY|no_proxy|NO_PROXY)=' || true +grep -Rihn -i proxy ~/.bashrc ~/.bash_profile ~/.profile /etc/profile /etc/profile.d 2>/dev/null || true + +env -u http_proxy -u https_proxy -u HTTP_PROXY -u HTTPS_PROXY \ + curl -I --connect-timeout 5 "$GITLAB_URL" +``` + +direct 接続が timeout し、サイト側で proxy が指定されている場合は、その proxy で疎通確認します。 + +```bash 
+RUNNER_PROXY="http://PROXY_HOST:PORT" + +curl -I --connect-timeout 5 -x "$RUNNER_PROXY" "$GITLAB_URL" +``` + +proxy 経由でだけ成功する場合は、`setup_site_runner.sh` 実行時に `--proxy "$RUNNER_PROXY"` を指定します。`systemd --user` のサービスはログインシェルの proxy 環境変数を継承しないことがあるため、proxy は runner の systemd unit に明示しておくのが安全です。 + +AMD64 ログインノードの例: + +```bash +curl -fsSL https://raw.githubusercontent.com/RIKEN-RCCS/benchkit/main/scripts/setup_site_runner.sh \ + | bash -s -- \ + --arch amd64 \ + --site your_site \ + --gitlab-url https://YOUR_GITLAB_SERVER \ + --login-token "$LOGIN_RUNNER_TOKEN" \ + --jacamar-token "$JACAMAR_RUNNER_TOKEN" \ + --scheduler pbs \ + --service-host "$(hostname -s)" +``` + +proxy が必要な拠点では、上のコマンドに `--proxy "$RUNNER_PROXY"` を追加します。 + +ARM64 ログインノードでは `--arch arm64` を指定します。 + +よく使う指定: + +- `--site your_site` + - Runner description と、期待する tag 表示に使います +- `--login-tag` / `--jacamar-tag` + - Runner authentication token workflow では tag は GitLab 側で設定します。このオプションはスクリプト末尾の確認表示用です +- `--scheduler pbs|slurm|pjm` + - Jacamar の executor を指定します +- `--jacamar-repo URL` + - Jacamar-CI の clone 元を明示します。省略時は `--scheduler pjm` の場合だけ PJM 対応 fork `https://gitlab.com/yoshifuminakamura/jacamar-ci.git` を使い、それ以外は upstream `https://gitlab.com/ecp-ci/jacamar-ci.git` を使います +- `--base-dir /path/to/gitlab-runner_jacamar-ci_amd` + - 既定は `$HOME/gitlab-runner_jacamar-ci_amd` または `$HOME/gitlab-runner_jacamar-ci_arm` +- `--libseccomp auto|system|local|none` + - 既定は `auto` です。利用可能な system libseccomp があれば使い、なければ gperf と libseccomp をローカルビルドします +- `--with-libseccomp` + - `--libseccomp local` の短縮形です。常に gperf と libseccomp をローカルビルドします +- `--jacamar-pbs-tools tools.go` + - PBS の完了判定にサイト固有パッチが必要な場合に使います +- `--unrestricted-cmd-line` + - Jacamar の `GIT_ASKPASS` credential helper が効かず、`get_sources` で `fatal: unable to get password from user` になる場合の回避策です。runner generated command line に job token が現れる可能性があるため、単一ユーザ運用や `/proc` の閲覧制限がある環境で使ってください +- `--proxy http://PROXY_HOST:PORT` + - runner の systemd user service に `http_proxy` / 
`https_proxy` / `HTTP_PROXY` / `HTTPS_PROXY` を設定します。`http://` または `https://` を省略した場合は `http://` を補います +- `--no-proxy LIST` + - runner の systemd user service に `no_proxy` / `NO_PROXY` を設定します +- `--no-systemd` / `--no-start` + - systemd user service を作らない、または作るだけで起動しない場合に使います + +このスクリプトは `config.toml` の `environment` に `PATH=$BASE_DIR/bin:...` を登録時点で入れるため、アーティファクト保存時に `gitlab-runner` が見つからない問題も避けられます。以下の手動手順は、スクリプトが失敗した場合の切り分けや、サイト固有に調整したい場合の参照として使ってください。 + +--- + ## 2. ディレクトリ構成 ARM系とx86系で共有ボリュームをマウントしている環境では、アーキテクチャ別にディレクトリを分けます。 @@ -140,6 +223,8 @@ rm -rf jacamar-ci go - `Exit_status` の取得: `-H -f` オプションで履歴から抽出(テキスト形式) - ジョブが履歴に残らない場合は正常終了と見なす +AOBA/NQSV のように `qstat -H` が使えない環境では、ジョブが `qstat` から消えた後に終了コードを後追い取得できない場合があります。この場合は `qsub` 直後から `qwait -w exited JOB_ID` で待ち、出力される `exited N` を parse して `N != 0` を GitLab job failure として扱うパッチが必要です。`qwait` 自体の戻り値が 0 でも、ジョブの終了コードは `exited N` 側に入る点に注意してください。 + パッチの適用方法: ```bash git clone https://gitlab.com/ecp-ci/jacamar-ci.git @@ -264,7 +349,7 @@ exit 0 ### `run.sh` - ジョブ実行 ```bash -#!/usr/bin/bash +#!/usr/bin/env bash source ~/.bashrc set -eo pipefail exec "$@" @@ -272,18 +357,23 @@ exec "$@" ### `cleanup.sh` - ジョブ後片付け ```bash -#!/bin/bash -set -e +#!/usr/bin/env bash +set -euo pipefail -LOGFILE="${CUSTOM_DIR}/custom_cleanup.log" +BASE_DIR="/path/to/gitlab-runner_jacamar-ci_amd" # ← 実際のパスに変更 +LOGFILE="${CUSTOM_DIR:-${BASE_DIR}}/custom_cleanup.log" echo "CLEANUP STARTED at $(date)" >> "$LOGFILE" -echo "CUSTOM_ENV_CI_JOB_ID=$CUSTOM_ENV_CI_JOB_ID" >> "$LOGFILE" -BUILD_DIR="${CUSTOM_UNIQUE_BUILD_DIR}" -CACHE_DIR="${CUSTOM_UNIQUE_CACHE_DIR}" +BUILD_DIR="${CUSTOM_UNIQUE_BUILD_DIR:-}" +CACHE_DIR="${CUSTOM_UNIQUE_CACHE_DIR:-}" -[ -n "$BUILD_DIR" ] && [ -d "$BUILD_DIR" ] && rm -rf "$BUILD_DIR" -[ -n "$CACHE_DIR" ] && [ -d "$CACHE_DIR" ] && rm -rf "$CACHE_DIR" +case "$BUILD_DIR" in + "${BASE_DIR}/builds/"*) [[ -d "$BUILD_DIR" ]] && rm -rf -- "$BUILD_DIR" ;; +esac + +case "$CACHE_DIR" in + "${BASE_DIR}/cache/"*) [[ -d "$CACHE_DIR" ]] && rm
-rf -- "$CACHE_DIR" ;; +esac echo "CLEANUP DONE at $(date)" >> "$LOGFILE" ``` @@ -309,12 +399,9 @@ chmod +x "$BASE_DIR"/{config,prepare,run,cleanup}.sh "$BASE_DIR/bin/gitlab-runner" register \ --non-interactive \ --url "https://YOUR_GITLAB_SERVER" \ - --registration-token "YOUR_TOKEN" \ + --token "YOUR_TOKEN" \ --executor "custom" \ --description "site-login" \ - --tag-list "your_site_login" \ - --run-untagged="false" \ - --locked="false" \ --builds-dir "$BASE_DIR/builds" \ --cache-dir "$BASE_DIR/cache" \ --config "$BASE_DIR/config.toml" \ @@ -329,12 +416,9 @@ chmod +x "$BASE_DIR"/{config,prepare,run,cleanup}.sh "$BASE_DIR/bin/gitlab-runner" register \ --non-interactive \ --url "https://YOUR_GITLAB_SERVER" \ - --registration-token "YOUR_TOKEN" \ + --token "YOUR_TOKEN" \ --executor "custom" \ --description "site-jacamar" \ - --tag-list "your_site_jacamar" \ - --run-untagged="false" \ - --locked="false" \ --builds-dir "$BASE_DIR/builds" \ --cache-dir "$BASE_DIR/cache" \ --config "$BASE_DIR/config.toml" \ @@ -345,6 +429,7 @@ chmod +x "$BASE_DIR"/{config,prepare,run,cleanup}.sh ``` > **Note**: Jacamar 用ランナーの `--custom-*-exec` は登録時のプレースホルダです。実際の引数は `config.toml` で設定します(次セクション参照)。 +> **Note**: Runner authentication token を使う GitLab Runner 18 系の workflow では、tag、locked、run-untagged などは GitLab server 側で設定します。register コマンドに `--tag-list` や `--locked` を渡すと失敗します。 --- @@ -439,7 +524,9 @@ queue,submit_cmd,template PBS_NewSystem,qsub,"-q ${queue_group} -l select=${nodes} -l walltime=${elapse} -W group_list=your_group" ``` -テンプレート内で使える変数:`${queue_group}`, `${nodes}`, `${numproc_node}`, `${nthreads}`, `${elapse}` +テンプレート内で使える変数:`${queue_group}`, `${nodes}`, `${numproc_node}`, `${nthreads}`, `${elapse}`, `${proc}`(`nodes * numproc_node`), `${cpu_per_node}`, `${gpu_per_node}`, `${cpu_sockets}`(`nodes * cpu_per_node`), `${gpu_cards}`(`nodes * gpu_per_node`) + +`${cpu_per_node}` と `${gpu_per_node}` は `config/system_info.csv` から取得します。CPU socket 数や GPU card 数を scheduler 
に明示するサイトでは、`system_info.csv` の値も投入条件に使われます。 ### `config/system_info.csv` に表示用メタデータを追加 @@ -547,7 +634,8 @@ mkdir -p ~/.config/systemd/user [Unit] Description=GitLab Runner service (user mode, amd64) After=network.target -ConditionHost=your-login-node # ← 実際のホスト名に変更 +# 実際のホスト名に変更 +ConditionHost=your-login-node [Service] ExecStart=%h/gitlab-runner_jacamar-ci_amd/bin/gitlab-runner run --config %h/gitlab-runner_jacamar-ci_amd/config.toml --working-directory %h @@ -565,7 +653,8 @@ WantedBy=default.target [Unit] Description=GitLab Runner service (user mode, arm64) After=network.target -ConditionHost=your-arm-login-node # ← 実際のホスト名に変更 +# 実際のホスト名に変更 +ConditionHost=your-arm-login-node [Service] ExecStart=%h/gitlab-runner_jacamar-ci_arm/bin/gitlab-runner run --config %h/gitlab-runner_jacamar-ci_arm/config.toml --working-directory %h @@ -578,7 +667,7 @@ StandardError=append:%h/gitlab-runner_jacamar-ci_arm/gitlab-runner.err WantedBy=default.target ``` -`ConditionHost=` を設定することで、同じホームディレクトリを複数ノードで共有していても、指定したホストでのみサービスが起動します。 +`ConditionHost=` を設定することで、同じホームディレクトリを複数ノードで共有していても、指定したホストでのみサービスが起動します。systemd の unit file では行末コメントを値として解釈するので、コメントは別行に置いてください。 ### サービスの有効化・起動 @@ -629,7 +718,125 @@ environment = ["PATH=/path/to/gitlab-runner_jacamar-ci_amd/bin:..."] ### ランナーが GitLab に接続できない - ログインノードから GitLab サーバへの HTTPS 通信が可能か確認 -- プロキシ設定が必要な場合は `config.toml` の `environment` に `https_proxy` を追加 +- プロキシ設定が必要な場合は `setup_site_runner.sh --proxy` で systemd user service に proxy を明示 + +ログインシェルでは `curl -I https://gitlab.swc.r-ccs.riken.jp` が成功するのに、常駐ランナーが +`Checking for jobs... 
failed` や `lookup gitlab.swc.r-ccs.riken.jp on [::1]:53` で失敗する場合は、 +`systemd --user` のサービスがログインシェルの proxy 環境変数を継承していない可能性があります。 + +```bash +env | grep -Ei 'proxy|http|https|no_proxy' +curl -I https://gitlab.swc.r-ccs.riken.jp +systemctl --user show gitlab-runner--amd.service -p Environment +``` + +`systemctl --user show` の `Environment=` が空なら、サービスに proxy を明示します。 + +```bash +systemctl --user edit gitlab-runner--amd.service +``` + +```ini +[Service] +Environment="http_proxy=http://PROXY_HOST:PORT" +Environment="https_proxy=http://PROXY_HOST:PORT" +Environment="HTTP_PROXY=http://PROXY_HOST:PORT" +Environment="HTTPS_PROXY=http://PROXY_HOST:PORT" +``` + +```bash +systemctl --user daemon-reload +systemctl --user restart gitlab-runner--amd.service +systemctl --user show gitlab-runner--amd.service -p Environment +``` + +### 計算ノードに `git` がない場合 + +一部の計算ノードでは、ログインノードやフロントエンドランナーでは `git` が使えても、バッチジョブ内では `git: コマンドが見つかりません` になることがあります。アプリの `run.sh` が実行時に外部ソースを clone する場合は、計算ノード側でも `git` 相当のコマンドが必要です。 + +Singularity/Apptainer が計算ノードで使える場合は、共有ファイルシステム上に `git` 入りコンテナと wrapper を置く方法が有効です。 + +```bash +BASE=/uhome//gitlab-runner_jacamar-ci_amd +SING=/path/to/singularity + +mkdir -p "$BASE/containers" "$BASE/bin" +"$SING" build --sandbox "$BASE/containers/git" docker://alpine/git:latest +``` + +`$BASE/bin/git`: + +```bash +#!/bin/bash +set -e + +# GitLab Runner の get_sources はログインノード上の認証 helper を使うため、 +# ホストの git がある場合はそちらへ委譲する。 +if [[ -x /usr/bin/git ]]; then + exec /usr/bin/git "$@" +fi + +SING=/path/to/singularity +IMG=/uhome//gitlab-runner_jacamar-ci_amd/containers/git + +exec "$SING" exec \ + --bind /mnt:/mnt \ + --bind /uhome:/uhome \ + --pwd "$PWD" \ + "$IMG" \ + git "$@" +``` + +```bash +chmod +x "$BASE/bin/git" +``` + +Jacamar 用ランナーの `config.toml` では wrapper のある `bin` を `PATH` に入れます。ただし `get_sources` までコンテナ内 `git` に置き換えると、GitLab Runner/Jacamar が生成する credential helper をコンテナ内から実行できず、`fatal: cannot exec .../pass` で失敗することがあります。上記のように、ログインノードでは `/usr/bin/git` に委譲し、計算ノードでだけコンテナ内 
`git` を使う wrapper にしてください。 + +確認: + +```bash +git --version +git ls-remote https://github.com/RIKEN-LQCD/qws.git HEAD +``` + +### 計算ノードから外部 proxy に届かない場合 + +`queue.csv` で `qsub -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY` を指定していても、計算ノードからその proxy に TCP 接続できるとは限りません。例えば `git` や `curl` が `Trying PROXY_HOST...` のまま進まず、ジョブが経過時間超過で kill される場合は、proxy 変数ではなくネットワーク到達性を疑います。 + +計算ノードで確認: + +```bash +hostname -I +ip route +cat /etc/resolv.conf 2>/dev/null || true +cat /etc/hosts 2>/dev/null || true +env | sort | egrep -i '^(http_proxy|https_proxy|HTTP_PROXY|HTTPS_PROXY|no_proxy|NO_PROXY)=' || true + +timeout 5 bash -c '_*_run) + cache=/uhome//gitlab-runner_jacamar-ci_amd/site-cache/qws + if [ ! -d qws ]; then + echo "[site pre_build] copying qws source from $cache" + cp -a "$cache" qws + fi + test -d qws || { echo "[site pre_build] qws source is missing" >&2; exit 1; } + ;; +esac +""" +``` ### ARM/x86 混在環境での注意 同じ共有ボリュームを異なるアーキテクチャのマシンがマウントしている場合、必ずアーキテクチャ別のディレクトリ(`_amd` / `_arm`)を使い分けてください。バイナリの混在はランタイムエラーの原因になります。 @@ -658,10 +865,14 @@ qstat -f # ジョブ履歴の確認 qstat -H -f + +# NQSV で qstat -H がない場合は、投入直後に qwait で終了コードを確認 +qwait -w exited ``` **解決方法:** JSON形式の `qstat` がサポートされていない場合は、セクション4「サイト固有パッチについて」に記載の `tools.go` パッチを適用してください。 +NQSV のように `qstat -H` がない場合は、`qwait -w exited ` の出力を使って終了コードを返す `tools.go` パッチを適用してください。 ### NFS同期でファイルが見えない diff --git a/docs/guides/developer-reference.md b/docs/guides/developer-reference.md index da08f97..1277f89 100644 --- a/docs/guides/developer-reference.md +++ b/docs/guides/developer-reference.md @@ -197,7 +197,7 @@ Typical requirements include: - Bash and standard shell tooling - GitLab CI runner support - site-specific scheduler/runtime support -- Python for result shaping, estimation support, and portal components +- Python 3.12 or later for result shaping, estimation support, and portal components - Flask-related Python packages for `result_server` - optional profiler tools depending on system support @@ -211,6 +211,15 @@ For the 
lightweight `result_server` verification path: - run the portal test suite with `python result_server/tests/run_result_server_tests.py` - CI coverage for portal-only changes is provided by `.github/workflows/result-server-tests.yml` +For production portal deployments: + +- Set `FLASK_SECRET_KEY` to a strong secret and run `result_server/app.py`, not `app_dev.py`. +- `app.py` binds to `127.0.0.1:8800` by default; set `RESULT_SERVER_HOST` and `RESULT_SERVER_PORT` explicitly when the deployment requires a different bind address. +- Set runner-scoped ingest keys with `RESULT_SERVER_KEYS=runner-a:key-a,runner-b:key-b`. +- The legacy `RESULT_SERVER_KEY` variable is still accepted as runner `default` for compatibility, but should be rotated to `RESULT_SERVER_KEYS`. +- `REDIS_URL` must point to a monitored Redis instance; production authentication refuses login when Redis is unavailable. +- `app_dev.py` is localhost-only, uses ephemeral development secrets when none are provided, and enables the Werkzeug debugger only with `RESULT_SERVER_DEV_DEBUG=1`. + ### Result Quality Visibility BenchKit keeps result-quality scoring inside the portal. Normal pull requests should not be blocked on quality scoring beyond producing valid result JSON with a FOM value. diff --git a/docs/guides/profiler-level-reference.md b/docs/guides/profiler-level-reference.md index 06e2abc..32add7d 100644 --- a/docs/guides/profiler-level-reference.md +++ b/docs/guides/profiler-level-reference.md @@ -5,15 +5,16 @@ This note complements `bk_profiler` and focuses on the shared level names used b ## Shared Levels - `single` - - one measurement run + - minimal profile coverage - `simple` - - five measurement runs + - lightweight profile coverage - `standard` - - eleven measurement runs + - standard profile coverage - `detailed` - - seventeen measurement runs + - detailed profile coverage These names are BenchKit-level presets. Each profiler adapter defines the concrete behavior behind them. 
+For example, `fapp` maps these levels to multiple event-set runs, while `ncu` maps them to Nsight Compute options such as section set, launch count, and NVTX filtering. ## Current `fapp` Mapping @@ -39,9 +40,23 @@ Default report behavior for `fapp` is: Here `both` means text summaries plus CSV reports. +## Current `ncu` Mapping + +- `single` + - `--set basic --launch-count 1` +- `simple` + - `--set basic --launch-count 5` +- `standard` + - `--set full --launch-count 1` +- `detailed` + - `--set full --nvtx` + +Default report behavior for `ncu` is `text`. +BenchKit stores the Nsight Compute raw report under `bk_profiler_artifact/raw/rep1/` and, when import succeeds, a text details page under `bk_profiler_artifact/reports/ncu_import_rep1.txt`. + ## Portal Summary -BenchKit stores profiler metadata in `meta.json` inside `padata.tgz`, and also copies a compact summary into `result.json` as `profile_data`. +BenchKit stores profiler metadata in `bk_profiler_artifact/meta.json` inside `padata.tgz`, and also copies a compact summary into `result.json` as `profile_data`. This makes it possible to inspect profiler coverage without downloading the archive first. 
@@ -49,7 +64,7 @@ This makes it possible to inspect profiler coverage without downloading the arch - `Profiler` shows `tool / level` - the secondary line shows `report_format` and run count - Result detail - - `PA Data Summary` shows tool-specific events, explicit events, and report kinds + - `PA Data Summary` shows tool-specific details, explicit events when applicable, NCU options when present, and report kinds ## Why This Helps diff --git a/docs/guides/profiler-support.md b/docs/guides/profiler-support.md index 220f81e..4c97f67 100644 --- a/docs/guides/profiler-support.md +++ b/docs/guides/profiler-support.md @@ -55,7 +55,7 @@ bk_profiler [options] -- `single/simple/standard/detailed` は BenchKit の共通語彙として扱う。 ただし、その具体的意味は profiler tool ごとに adapter が定義する。 -このため、ある tool では 4 段階すべてを持ってもよいし、別の tool では 1 段階だけでもよい。 +このため、ある tool では複数の測定 run に対応し、別の tool では単一 run の profiler option や採取範囲に対応してよい。 ## 4. `fapp` の level 定義 @@ -76,7 +76,22 @@ bk_profiler [options] -- ここでいう CSV は `fapp` 固有の CPU performance analysis report を指す。 BenchKit は「CSV があること」を共通必須にはしない。 -## 5. Archive の考え方 +## 5. `ncu` の level 定義 + +`ncu` では現在、次の対応を採る。 + +- `single` → `--set basic --launch-count 1` +- `simple` → `--set basic --launch-count 5` +- `standard` → `--set full --launch-count 1` +- `detailed` → `--set full --nvtx` + +既定の report format は `text` とする。 +raw report は archive 内の `bk_profiler_artifact/raw/rep1/profile*.ncu-rep` または Nsight Compute の出力形式に従う report file として保存し、可能な場合は `ncu --import ... --page details` の出力を `bk_profiler_artifact/reports/ncu_import_rep1.txt` に保存する。 + +MPI launcher 経由の GPU application では、既定で `--target-processes all` を付けて child process も採取対象にする。 +追加の kernel filter、section set、NVTX filter などは `BK_PROFILER_ARGS` で `ncu` に渡す。 + +## 6. Archive の考え方 `bk_profiler` は archive の中に少なくとも次を置く。 @@ -104,7 +119,7 @@ bk_profiler_artifact/ cpu_pa_rep2.csv ``` -## 6. `meta.json` の役割 +## 7. 
`meta.json` の役割 `meta.json` は、archive の内容を BenchKit や推定 package が機械的に判断するための最小 metadata とする。 @@ -138,7 +153,7 @@ bk_profiler_artifact/ を見て、その artifact が適用可能かどうかを判断できる。 -## 7. アプリ側の責務 +## 8. アプリ側の責務 アプリ側は profiler helper を直接一般化しすぎず、次だけを持てばよい。 @@ -150,11 +165,34 @@ bk_profiler_artifact/ 例として `qws` では、 - Fugaku 系 build で `profiler=fapp` を渡す -- Fugaku 系 run で `bk_profiler fapp --level single -- ...` を呼ぶ +- Fugaku 系 run で `bk_profiler fapp --level detailed -- ...` を呼ぶ だけを持つ。 -## 8. 今は固定しないこと +`genesis` では、MiyabiG と RC_GH200 を同じ Grace-Hopper GPU 系の計算ノードとして扱い、GPU build / run に対して、 + +- build で `--enable-gpu`、`--enable-openmp`、`--with-gpuarch=sm_90` を指定する +- MiyabiG の既定 build では外部 LAPACK を要求せず、必要な場合だけ `GENESIS_MIYABIG_LAPACK_LIBS` で有効化する +- `.fpp` 前処理では GENESIS の traditional cpp flags を保持しつつ、GPU/single/MPI/OpenMP/FFTE の define を `PPFLAGS` 経由で明示する +- CUDA 12.9 以降向けに `src/spdyn/gpu_sp_energy.cu` の `nvToolsExt.h` include を `nvtx3/nvToolsExt.h` に補正する +- `mpif90` の実体が `nvfortran` の環境向けに、GENESIS の compiler 判定を NVHPC/PGI 系として補正する +- GENESIS の古い PGI flag (`-Mcuda` など) は `configure.ac` 側で NVHPC 25.x/aarch64 向けの `-cuda -gpu=cc90` へ補正する +- NVHPC 25.x では古い PGI pinned-array 経路の `PGICUDA` define を外し、GPU kernel 用の domain fields を保持する +- run では、`ncu` が PATH にある場合に `bk_profiler ncu --level single -- ...` を呼ぶ + +形を参照実装とする。ジョブ投入方式は MiyabiG が PBS、RC_GH200 が SLURM で異なるが、アプリ側の実行方法と profiler 採取方法は共通化する。 + +CUDA prefix、compiler wrapper、module、profiler tool は site 側の module 構成に合わせて上書きできる。 + +- build/run 共通の module: `GENESIS_MIYABIG_MODULE`, `GENESIS_GH200_MODULE` +- build 時の CUDA/compiler/config: `GENESIS_MIYABIG_CUDA_PATH`, `GENESIS_MIYABIG_FC`, `GENESIS_MIYABIG_CC`, `GENESIS_MIYABIG_CONFIG_ARGS` +- run 時の profiler: `GENESIS_MIYABIG_PROFILER_TOOL`, `GENESIS_GH200_PROFILER_TOOL`, `GENESIS_MIYABIG_PROFILER_LEVEL`, `GENESIS_GH200_PROFILER_LEVEL`, または共通の `GENESIS_PROFILER_TOOL` / `GENESIS_PROFILER_LEVEL` + +Genesis GH200 run の profiler 既定値は `ncu` だが、これは暗黙の既定値としてだけ扱う。`ncu` が PATH にない環境では warning を出して 
profiler なしで benchmark 本体を実行する。 +一方、`GENESIS_PROFILER_TOOL=ncu` または system 固有の `GENESIS_*_PROFILER_TOOL=ncu` を明示した場合は、`ncu` が見つからなければ失敗させる。 +profiler なしを明示したい場合は、`GENESIS_PROFILER_TOOL=none` または system 固有の `GENESIS_*_PROFILER_TOOL=none` を指定する。 + +## 9. 今は固定しないこと 現時点では、次は固定しない。 diff --git a/programs/genesis/build.sh b/programs/genesis/build.sh index a87c4f1..b398880 100644 --- a/programs/genesis/build.sh +++ b/programs/genesis/build.sh @@ -4,8 +4,8 @@ set -x system="$1" REPO_DIR="genesis" -REPO_URL="https://github.com/genesis-release-r-ccs/${REPO_DIR}.git" -BRANCH="main" +REPO_URL="${GENESIS_REPO_URL:-https://github.com/genesis-release-r-ccs/${REPO_DIR}.git}" +BRANCH="${GENESIS_BRANCH:-main}" echo "[${REPO_DIR}] Building on system: $system" mkdir -p artifacts @@ -18,6 +18,242 @@ cd ${REPO_DIR} || { exit 1 } +# Append flags without losing values that a site module or CI variable already +# set before this build script runs. +append_env_flags() { + local var_name="$1" + local new_flags="$2" + local current_flags="${!var_name:-}" + + if [ -n "$new_flags" ]; then + if [ -n "$current_flags" ]; then + export "${var_name}=${current_flags} ${new_flags}" + else + export "${var_name}=${new_flags}" + fi + fi +} + +# Resolve the CUDA root for NVHPC/Grace-Hopper builds. NVHPC often puts nvcc in +# a compiler bin directory while libcudart lives in a sibling cuda directory, so +# command -v nvcc alone is not enough for GENESIS configure. +detect_cuda_path() { + local nvcc_path="" + local nvcc_prefix="" + local nvhpc_root="" + local cuda_candidate="" + + if [ -n "${CUDA_HOME:-}" ]; then + printf '%s\n' "$CUDA_HOME" + return 0 + fi + if [ -n "${CUDA_PATH:-}" ]; then + printf '%s\n' "$CUDA_PATH" + return 0 + fi + if ! 
nvcc_path=$(command -v nvcc 2>/dev/null); then + return 1 + fi + + nvcc_prefix=$(cd "$(dirname "$(dirname "$nvcc_path")")" && pwd) + nvhpc_root=$(cd "$(dirname "$nvcc_prefix")" && pwd) + for cuda_candidate in "${nvhpc_root}"/cuda/* "${nvhpc_root}"/cuda; do + if [ -f "${cuda_candidate}/lib64/libcudart.so" ] || [ -f "${cuda_candidate}/targets/sbsa-linux/lib/libcudart.so" ]; then + printf '%s\n' "$cuda_candidate" + return 0 + fi + done + + if [ -d "${nvcc_prefix}/include" ]; then + printf '%s\n' "$nvcc_prefix" + return 0 + fi + + return 1 +} + +# Export CUDA include/library flags in the forms GENESIS configure currently +# checks. The sbsa-linux paths are needed by GH200-style NVHPC installations. +configure_cuda_environment() { + local cuda_prefix="$1" + local cuda_arch="$2" + local incflags="" + local ldflags="" + local inc_dir="" + local lib_dir="" + local cudart_lib="" + + [ -n "$cuda_prefix" ] || return 0 + + export CUDA_HOME="$cuda_prefix" + export CUDA_PATH="$cuda_prefix" + + for inc_dir in \ + "${cuda_prefix}/include" \ + "${cuda_prefix}/targets/sbsa-linux/include" \ + "${cuda_prefix}/targets/sbsa-linux/include/nvtx3"; do + if [ -d "$inc_dir" ]; then + incflags="${incflags:+${incflags} }-I${inc_dir}" + fi + done + + for lib_dir in \ + "${cuda_prefix}/targets/sbsa-linux/lib" \ + "${cuda_prefix}/lib64"; do + if [ -d "$lib_dir" ]; then + ldflags="${ldflags:+${ldflags} }-L${lib_dir}" + fi + done + + for cudart_lib in \ + "${cuda_prefix}/targets/sbsa-linux/lib/libcudart.so" \ + "${cuda_prefix}/lib64/libcudart.so"; do + if [ -f "$cudart_lib" ]; then + export GENESIS_CUDART_LIB="$cudart_lib" + break + fi + done + + append_env_flags CPPFLAGS "$incflags" + append_env_flags NVCCFLAG "$incflags" + append_env_flags LDFLAGS "$ldflags" + + if [ "$cuda_arch" = "90" ] || [ "$cuda_arch" = "sm_90" ]; then + append_env_flags NVCCFLAG '--generate-code=arch=compute_90,code="sm_90,compute_90"' + fi +} + +# GENESIS still includes the legacy NVTX header. 
Newer CUDA/NVHPC stacks install +# the compatibility header below nvtx3, so patch the checked-out source locally. +apply_genesis_nvtx_include_patch() { + local target="src/spdyn/gpu_sp_energy.cu" + + if [ -f "$target" ] && grep -q 'nvToolsExt.h' "$target" && ! grep -q 'nvtx3/nvToolsExt.h' "$target"; then + sed -i -e 's|nvToolsExt.h|nvtx3/nvToolsExt.h|g' "$target" + fi +} + +# The upstream configure.ac recognizes pgfortran but not nvfortran. Treating +# nvfortran as the same compiler family keeps the rest of GENESIS' PGI/NVHPC +# configuration path active without carrying a fork of the source tree. +apply_genesis_nvfortran_configure_patch() { + if [ ! -f configure.ac ] || grep -q 'x"${vtok}" = x"nvfortran"' configure.ac; then + return 0 + fi + + perl -0pi -e 's/(elif test x"\$\{vtok\}" = x"pgfortran"; then\s+FC_ACT="pgf90"\s+break)/elif test x"\${vtok}" = x"nvfortran"; then\nFC_ACT="pgf90"\nbreak\n\1/' configure.ac + if ! grep -q 'x"${vtok}" = x"nvfortran"' configure.ac; then + echo "Failed to patch configure.ac for nvfortran detection" >&2 + exit 1 + fi +} + +# Replace obsolete PGI CUDA flags with NVHPC flags and remove options that fail +# on aarch64 Grace-Hopper nodes. This keeps the patch local to the CI checkout. +apply_genesis_nvhpc_configure_flags_patch() { + if [ ! 
-f configure.ac ]; then + return 0 + fi + + GENESIS_NVHPC_GPU_FLAGS="${GENESIS_NVHPC_GPU_FLAGS:--cuda -gpu=cc90}" \ + perl -0pi -e ' + my $cudart_lib = $ENV{"GENESIS_CUDART_LIB"}; + my $gpu_flags = $ENV{"GENESIS_NVHPC_GPU_FLAGS"}; + if ($cudart_lib) { + s/-L\$\{cuda_lib_path\} -lcudart/$cudart_lib/g; + } + s/-Mcuda/$gpu_flags/g; + s/[[:space:]]+-Msmartalloc=huge//g; + s/[[:space:]]+-Mipa=fast,inline//g; + s/[[:space:]]+-fastsse//g; + s/[[:space:]]+-pc 64//g; + s/[[:space:]]+-mcmodel=medium//g; + s/\n[[:space:]]*AC_DEFINE\(PGICUDA, 1, \[defined if pgi and cuda are used\.\]\)//g; + s/\n[[:space:]]*DEFINED_VARIABLES\+=" -DPGICUDA"//g; + ' configure.ac + if grep -q 'PGICUDA' configure.ac; then + echo "Failed to patch configure.ac for NVHPC PGICUDA handling" >&2 + exit 1 + fi +} + +# GENESIS releases vary between shipping bootstrap and relying on autoreconf. +bootstrap_genesis() { + if [ -x ./bootstrap ]; then + bash ./bootstrap + else + autoreconf -i + fi +} + +# Shared configuration for GH200-class systems. env_prefix lets each site +# override modules, compilers, CUDA path, GPU arch, and configure args without +# duplicating the whole build block for MiyabiG/RC_GH200. 
+configure_genesis_gh200_gpu() { + local system_name="$1" + local env_prefix="$2" + local default_module="$3" + local module_var="${env_prefix}_MODULE" + local fc_var="${env_prefix}_FC" + local cc_var="${env_prefix}_CC" + local cxx_var="${env_prefix}_CXX" + local f77_var="${env_prefix}_F77" + local config_args_var="${env_prefix}_CONFIG_ARGS" + local gpu_arch_var="${env_prefix}_GPU_ARCH" + local cuda_path_var="${env_prefix}_CUDA_PATH" + local lapack_libs_var="${env_prefix}_LAPACK_LIBS" + local ppflags_var="${env_prefix}_PPFLAGS" + local default_ppflags="-traditional-cpp -traditional -D_SINGLE -DHAVE_MPI_GENESIS -DOMP -DFFTE -DUSE_GPU" + local gpu_arch_value="${!gpu_arch_var:-sm_90}" + local cuda_arch_number="${gpu_arch_value#sm_}" + local gpu_arch="sm_${cuda_arch_number}" + local cuda_prefix="" + + local module_name="${!module_var:-$default_module}" + if [ "$module_name" != "none" ] && command -v module >/dev/null 2>&1; then + read -r -a module_names <<< "$module_name" + module load "${module_names[@]}" + fi + + # GENESIS configure probes compiler versions through this variable. + version="--version" + FC="${!fc_var:-mpif90}" + CC="${!cc_var:-mpicc}" + CXX="${!cxx_var:-mpicxx}" + F77="${!f77_var:-mpif77}" + + cuda_prefix="${!cuda_path_var:-}" + if [ -z "$cuda_prefix" ]; then + cuda_prefix=$(detect_cuda_path || true) + fi + configure_cuda_environment "$cuda_prefix" "$cuda_arch_number" + export GENESIS_NVHPC_GPU_FLAGS="${GENESIS_NVHPC_GPU_FLAGS:--cuda -gpu=cc${cuda_arch_number}}" + + # Site-specific CONFIG_ARGS is a full replacement. Otherwise use a portable + # single-precision MPI/OpenMP GPU configuration and add CUDA/LAPACK only when + # the corresponding site paths are available. 
+ if [ -n "${!config_args_var:-}" ]; then + read -r -a CONFIG_ARGS <<< "${!config_args_var}" + else + CONFIG_ARGS=(--enable-single --with-simd=auto --enable-mpi --without-lapack --enable-gpu --enable-openmp "--with-gpuarch=${gpu_arch}") + if [ -n "$cuda_prefix" ]; then + CONFIG_ARGS+=("--with-cuda=${cuda_prefix}") + fi + if [ -n "${!lapack_libs_var:-}" ]; then + export LAPACK_LIBS="${!lapack_libs_var}" + CONFIG_ARGS=("${CONFIG_ARGS[@]/--without-lapack/--with-lapack}") + CONFIG_ARGS+=("LAPACK_LIBS=${!lapack_libs_var}") + fi + fi + + append_env_flags PPFLAGS "${!ppflags_var:-$default_ppflags}" + + apply_genesis_nvtx_include_patch + apply_genesis_nvfortran_configure_patch + apply_genesis_nvhpc_configure_flags_patch + echo "Configured ${system_name} as Grace-Hopper GPU build" +} + case "$system" in Fugaku) comp=frtpx @@ -38,15 +274,44 @@ case "$system" in LAPACK_LIBS="-L/vol0004/apps/oss/spack-v0.21/opt/spack/linux-rhel8-cascadelake/gcc-13.2.0/openblas-0.3.24-on6q3arf3iucukiz4tfai26noq3kz4a7/lib/ -lopenblas" CONFIG_ARGS=(--enable-mixed "LAPACK_LIBS=$LAPACK_LIBS") ;; + + MiyabiG) + configure_genesis_gh200_gpu "$system" GENESIS_MIYABIG none + ;; + + RC_GH200) + configure_genesis_gh200_gpu "$system" GENESIS_GH200 "system/qc-gh200 nvhpc/25.9" + ;; + + *) + echo "Unknown system: $system" + exit 1 + ;; esac echo "FC=$FC" echo "CC=$CC" +echo "CXX=${CXX:-}" +echo "F77=${F77:-}" echo "configure args: ${CONFIG_ARGS[@]}" -autoreconf -i -./configure CC="$CC" FC="$FC" "${CONFIG_ARGS[@]}" -make -j > make.log 2>&1 +bootstrap_genesis +configure_env=(CC="$CC" FC="$FC") +if [ -n "${CXX:-}" ]; then + configure_env+=(CXX="$CXX") +fi +if [ -n "${F77:-}" ]; then + configure_env+=(F77="$F77") +fi +./configure "${configure_env[@]}" "${CONFIG_ARGS[@]}" +apply_genesis_nvtx_include_patch +if ! make -j > make.log 2>&1; then + echo "make failed. Error-like lines from make.log:" >&2 + grep -n -i -E 'error|fatal|undefined reference|no such file|cannot|failed|unknown switch|unsupported|stop\.' 
make.log | tail -n 200 >&2 || true + echo "make failed. Last 1000 lines of make.log:" >&2 + tail -n 1000 make.log >&2 || true + exit 1 +fi make install cp "bin/spdyn" "../artifacts/" echo "done." diff --git a/programs/genesis/list.csv b/programs/genesis/list.csv index e7f19bf..d25424d 100644 --- a/programs/genesis/list.csv +++ b/programs/genesis/list.csv @@ -2,3 +2,5 @@ system,enable,nodes,numproc_node,nthreads,elapse Fugaku,yes,2,4,12,0:10:00 FugakuLN,no,1,8,2,0:10:00 FugakuCN,no,1,8,2,0:10:00 +MiyabiG,yes,1,8,9,0:10:00 +RC_GH200,yes,1,8,9,0:10:00 diff --git a/programs/genesis/run.sh b/programs/genesis/run.sh index c63f98f..dc15d40 100644 --- a/programs/genesis/run.sh +++ b/programs/genesis/run.sh @@ -1,5 +1,6 @@ #!/bin/bash set -e +set -o pipefail system="$1" nodes="$2" numproc_node="$3" @@ -104,6 +105,81 @@ if [[ ! -f ${inputdir}/apoa1.rst ]]; then fi cp ${inputdir}/apoa1.rst . +# Shared GH200-class run path. The env_prefix pattern mirrors build.sh so each +# site can override modules, MPI launcher, GPU visibility, and profiler policy +# independently while keeping the benchmark invocation identical. 
+run_genesis_gh200_gpu() { + local system_name="$1" + local env_prefix="$2" + local default_module="$3" + local module_var="${env_prefix}_MODULE" + local mpi_cmd_var="${env_prefix}_MPI_CMD" + local mpi_args_var="${env_prefix}_MPI_ARGS" + local cuda_visible_devices_var="${env_prefix}_CUDA_VISIBLE_DEVICES" + local profiler_tool_var="${env_prefix}_PROFILER_TOOL" + local profiler_level_var="${env_prefix}_PROFILER_LEVEL" + + local module_name="${!module_var:-$default_module}" + if [ "$module_name" != "none" ] && command -v module >/dev/null 2>&1; then + read -r -a module_names <<< "$module_name" + module load "${module_names[@]}" + fi + + read -r -a mpi_cmd <<< "${!mpi_cmd_var:-mpirun -np ${numproc}}" + if [ -n "${!mpi_args_var:-}" ]; then + read -r -a gh200_mpi_args <<< "${!mpi_args_var}" + mpi_cmd+=("${gh200_mpi_args[@]}") + fi + + export OMP_NUM_THREADS=${nthreads} + if [ -n "${!cuda_visible_devices_var:-}" ]; then + export CUDA_VISIBLE_DEVICES="${!cuda_visible_devices_var}" + fi + + local genesis_profiler_requested="" + local genesis_profiler_explicit=0 + # GH200 systems default to Nsight Compute because the GPU path is the new + # behavior being validated. Explicit requests are strict; the default falls + # back to an unprofiled run when ncu is unavailable. + if [ -n "${!profiler_tool_var:-}" ]; then + genesis_profiler_requested="${!profiler_tool_var}" + genesis_profiler_explicit=1 + elif [ -n "${GENESIS_PROFILER_TOOL:-}" ]; then + genesis_profiler_requested="${GENESIS_PROFILER_TOOL}" + genesis_profiler_explicit=1 + else + genesis_profiler_requested="ncu" + fi + + genesis_profiler_tool=$(bk_get_profiler_tool "$genesis_profiler_requested") || return 1 + genesis_profiler_level="${!profiler_level_var:-${GENESIS_PROFILER_LEVEL:-single}}" + if [ -n "$genesis_profiler_tool" ]; then + if [ "$genesis_profiler_tool" = "ncu" ] && ! 
command -v ncu >/dev/null 2>&1; then + if [ "$genesis_profiler_explicit" -eq 1 ]; then + echo "Genesis ${system_name}: ncu profiler requested but ncu is not in PATH." >&2 + echo "Load Nsight Compute with ${module_var}, or set ${profiler_tool_var}=none / GENESIS_PROFILER_TOOL=none to run without profiling." >&2 + return 1 + fi + echo "Genesis ${system_name}: default ncu profiler is not in PATH; running without profiling." >&2 + echo "Set ${profiler_tool_var}=ncu or GENESIS_PROFILER_TOOL=ncu to require Nsight Compute profiling." >&2 + genesis_profiler_tool="" + genesis_profiler_requested="none" + fi + fi + + echo "Running ${system_name} as Grace-Hopper GPU run with profiler=${genesis_profiler_requested:-none} level=${genesis_profiler_level}" + if [ -n "$genesis_profiler_tool" ]; then + # set -o pipefail at script entry keeps profiler or MPI failures visible + # even though stdout/stderr are also streamed through tee for artifacts. + bk_profiler "$genesis_profiler_tool" \ + --level "$genesis_profiler_level" \ + --archive "${resultsdir}/padata0.tgz" \ + --raw-dir ncu \ + -- "${mpi_cmd[@]}" ./${binary} ${input}.sub 2>&1 | tee ${output} + else + "${mpi_cmd[@]}" ./${binary} ${input}.sub 2>&1 | tee ${output} + fi +} case "$system" in Fugaku) @@ -125,8 +201,15 @@ case "$system" in export OMP_NUM_THREADS=${nthreads} ${mpi_cmd} ./${binary} ${input}.sub 2>&1 | tee ${output} ;; + MiyabiG) + run_genesis_gh200_gpu "$system" GENESIS_MIYABIG none + ;; + RC_GH200) + run_genesis_gh200_gpu "$system" GENESIS_GH200 "system/qc-gh200 nvhpc/25.9" + ;; *) echo "Unknown Running system: $system" + exit 1 ;; esac diff --git a/programs/qws/build.sh b/programs/qws/build.sh index 477bbd5..9750137 100644 --- a/programs/qws/build.sh +++ b/programs/qws/build.sh @@ -61,6 +61,20 @@ case "$system" in MiyabiC) make -j 8 fugaku_benchmark= omp=1 compiler=intel arch=skylake rdma= mpi=1 powerapi= ;; + GenkaiA|GenkaiB|GenkaiC) + module load intel/2023.2 mvapich/3.0-intel2023.2 + make -j 8 fugaku_benchmark= 
omp=1 compiler=intel arch=skylake rdma= mpi=1 powerapi= CC=mpicc CXX=mpicxx + ;; + Grand_C|Grand_G) + module load intel impi + make -j 8 fugaku_benchmark= omp=1 compiler=intel arch=skylake rdma= mpi=1 powerapi= + ;; + AOBA_A|AOBA_S) + make -j 8 fugaku_benchmark= omp=1 compiler=nec arch=sx rdma= mpi=1 powerapi= + ;; + AOBA_B) + make -j 8 fugaku_benchmark= omp=1 compiler=openmpi-gnu arch=skylake rdma= mpi=1 powerapi= CC=mpicc CXX=mpic++ + ;; *) echo "Unknown system: $system" exit 1 diff --git a/programs/qws/list.csv b/programs/qws/list.csv index bc25f19..41d1e75 100644 --- a/programs/qws/list.csv +++ b/programs/qws/list.csv @@ -9,4 +9,12 @@ RC_GENOA,yes,1,1,96,0:10:00 RC_FX700,yes,1,4,12,0:10:00 MiyabiG,yes,1,1,72,0:10:00 MiyabiC,yes,1,1,112,0:10:00 +GenkaiA,yes,1,1,120,0:10:00 +GenkaiB,yes,1,1,120,0:10:00 +GenkaiC,yes,1,1,112,0:10:00 +Grand_C,yes,1,1,64,0:10:00 +Grand_G,yes,1,1,64,0:10:00 +AOBA_A,yes,1,1,8,0:10:00 +AOBA_S,yes,1,1,8,0:10:00 +AOBA_B,yes,1,1,128,0:10:00 FNCX,yes,1,1,1,0:10:00 diff --git a/programs/qws/run.sh b/programs/qws/run.sh index 396c46a..c555c24 100644 --- a/programs/qws/run.sh +++ b/programs/qws/run.sh @@ -109,6 +109,38 @@ case "$system" in mpirun -n 1 ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 print_results CASE0 CASE0 1 >> ../results/result ;; + GenkaiA|GenkaiB|GenkaiC) + qws_numproc=$((nodes * numproc_node)) + module load intel/2023.2 mvapich/3.0-intel2023.2 + mpirun -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 ${numproc_node} >> ../results/result + ;; + Grand_C|Grand_G) + qws_numproc=$((nodes * numproc_node)) + module load intel impi + if [[ -n "${I_MPI_ROOT:-}" && -d "${I_MPI_ROOT}/bin" ]]; then + export PATH="${I_MPI_ROOT}/bin:${PATH}" + fi + qws_mpi_launcher=$(command -v mpirun || command -v mpiexec || command -v mpiexec.hydra || true) + if [[ -z "$qws_mpi_launcher" ]]; then + echo "qws: mpirun/mpiexec/mpiexec.hydra not found after module load intel impi" >&2 + echo "qws: PATH=${PATH}" >&2 + 
echo "qws: MPI launcher candidates:" >&2 + type -a mpirun mpiexec mpiexec.hydra mpiicc mpiicpc mpiicpx >&2 || true + echo "qws: loaded modules:" >&2 + module list >&2 || true + echo "qws: environment:" >&2 + env | sort >&2 + exit 1 + fi + "$qws_mpi_launcher" -n ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 ${numproc_node} >> ../results/result + ;; + AOBA_A|AOBA_B|AOBA_S) + qws_numproc=$((nodes * numproc_node)) + mpirun -np ${qws_numproc} ./main 32 6 4 3 1 1 1 1 -1 -1 6 50 > CASE0 + print_results CASE0 CASE0 ${numproc_node} >> ../results/result + ;; *) echo "Unknown Running system: $system" exit 1 diff --git a/requirements-result-server.txt b/requirements-result-server.txt index cd48dc7..fd274bc 100644 --- a/requirements-result-server.txt +++ b/requirements-result-server.txt @@ -1,8 +1,11 @@ +# result_server requires Python 3.12+ for safe tar extraction via tarfile filter="data". Flask>=3.0,<4.0 Flask-Session>=0.8,<1.0 +Flask-WTF>=1.2,<2.0 +gunicorn>=23.0,<24.0 redis>=5.0,<6.0 pyotp>=2.9,<3.0 qrcode[pil]>=8.0,<9.0 -pytest>=8.0,<9.0 +pytest>=9.0.3,<10.0 hypothesis>=6.0,<7.0 fakeredis>=2.23,<3.0 diff --git a/result_server/app.py b/result_server/app.py index d6766fd..c7ec38e 100644 --- a/result_server/app.py +++ b/result_server/app.py @@ -2,19 +2,21 @@ import sys from datetime import timedelta -from flask import Flask, current_app, render_template +from flask import Flask, render_template from flask_session import Session from routes.api import api_bp from routes.estimated import estimated_bp from routes.home import register_home_routes from routes.results import results_bp +from utils.auth import parse_ingest_keys +from utils.csrf import init_csrf -EXPECTED_API_KEY = os.environ.get("RESULT_SERVER_KEY") +INGEST_KEYS = parse_ingest_keys() -if not EXPECTED_API_KEY: - print("ERROR: RESULT_SERVER_KEY is not set.", file=sys.stderr) +if not INGEST_KEYS: + print("ERROR: RESULT_SERVER_KEYS or RESULT_SERVER_KEY is not set.", 
file=sys.stderr) sys.exit(1) @@ -41,6 +43,7 @@ def _configure_redis(app, prefix): app.config["REDIS_CONN"] = redis.from_url(redis_url, decode_responses=True) app.config["REDIS_PREFIX"] = "dev:" if prefix == "/dev" else "main:" app.config["SESSION_COOKIE_NAME"] = "session_dev" if prefix == "/dev" else "session_main" + app.config["AUTH_REQUIRES_REDIS"] = True def _configure_user_store(app): @@ -92,12 +95,14 @@ def create_app(prefix="", base_dir=None): if not secret_key: raise RuntimeError("FLASK_SECRET_KEY must be set in production") app.secret_key = secret_key + app.config["INGEST_KEYS"] = INGEST_KEYS.copy() _configure_session(app, base_dir) _configure_redis(app, prefix) _configure_user_store(app) _configure_totp_issuer(app, prefix) _configure_result_directories(app, base_dir) + init_csrf(app, exempt_blueprints=(api_bp,)) register_home_routes(app, prefix=prefix) _register_portal_blueprints(app, prefix) @@ -127,4 +132,6 @@ def systemlist(): if __name__ == "__main__": - app.run(host="0.0.0.0", port=8800) + host = os.environ.get("RESULT_SERVER_HOST", "127.0.0.1") + port = int(os.environ.get("RESULT_SERVER_PORT", "8800")) + app.run(host=host, port=port) diff --git a/result_server/app_dev.py b/result_server/app_dev.py index cf011a5..46e8538 100644 --- a/result_server/app_dev.py +++ b/result_server/app_dev.py @@ -15,16 +15,32 @@ import argparse import json import os +import secrets import sys import types import uuid +import warnings from datetime import datetime, timedelta +LOOPBACK_HOSTS = {"127.0.0.1", "localhost", "::1"} + def setup_dev_environment(base_dir): """Configure development environment variables and runtime directories.""" - os.environ.setdefault("RESULT_SERVER_KEY", "dev-api-key") - os.environ.setdefault("FLASK_SECRET_KEY", "dev-secret-key") + if not os.environ.get("RESULT_SERVER_KEYS") and not os.environ.get("RESULT_SERVER_KEY"): + os.environ["RESULT_SERVER_KEYS"] = f"local-dev:{secrets.token_urlsafe(32)}" + warnings.warn( + "RESULT_SERVER_KEYS not set; 
using an ephemeral dev API key.", + RuntimeWarning, + stacklevel=2, + ) + if not os.environ.get("FLASK_SECRET_KEY"): + os.environ["FLASK_SECRET_KEY"] = secrets.token_hex(32) + warnings.warn( + "FLASK_SECRET_KEY not set; using an ephemeral dev secret key.", + RuntimeWarning, + stacklevel=2, + ) os.environ.setdefault("BASE_PATH", base_dir) os.environ["DEV_MODE"] = "1" @@ -44,6 +60,19 @@ def setup_dev_environment(base_dir): os.makedirs(os.path.join(base_dir, sub), exist_ok=True) +def validate_dev_runtime(host): + """Abort when the development launcher is used outside local-only mode.""" + if os.environ.get("FLASK_ENV") == "production": + sys.exit("app_dev.py must not be used in production. Use app.py.") + if host not in LOOPBACK_HOSTS: + sys.exit(f"app_dev.py refuses to bind to {host}. Use app.py for production.") + + +def dev_debug_enabled(): + """Return whether the Werkzeug debugger was explicitly enabled.""" + return os.environ.get("RESULT_SERVER_DEV_DEBUG") == "1" + + def _create_stub_totp_manager(): """Return a stub TOTP module that always validates setup and login.""" mod = types.ModuleType("utils.totp_manager") @@ -130,15 +159,19 @@ def create_dev_app(base_dir): from flask_session import Session from routes.home import register_home_routes + from utils.auth import parse_ingest_keys + from utils.csrf import init_csrf from utils.system_info import get_all_systems_info, summarize_systems_info app = Flask(__name__, template_folder="templates") - app.secret_key = "dev-secret-key" + app.secret_key = os.environ["FLASK_SECRET_KEY"] app.config.update( SESSION_TYPE="filesystem", SESSION_FILE_DIR=os.path.join(base_dir, "main", "flask_session"), SESSION_PERMANENT=False, + AUTH_REQUIRES_REDIS=False, + INGEST_KEYS=parse_ingest_keys(), ) Session(app) @@ -166,6 +199,10 @@ def create_dev_app(base_dir): register_home_routes(app) # Register all portal blueprints. 
+ from routes.api import api_bp + + app.register_blueprint(api_bp) + from routes.results import results_bp app.register_blueprint(results_bp, url_prefix="/results") @@ -180,6 +217,8 @@ def create_dev_app(base_dir): from routes.admin import admin_bp + init_csrf(app, exempt_blueprints=(api_bp,)) + app.register_blueprint(admin_bp, url_prefix="/admin") @app.route("/systemlist") @@ -472,10 +511,17 @@ def generate_sample_data(received_dir): def main(): parser = argparse.ArgumentParser(description="BenchKit Result Server - Dev Mode") + parser.add_argument( + "--host", + default="127.0.0.1", + help="Loopback host to bind (default: 127.0.0.1)", + ) parser.add_argument("--port", type=int, default=8800, help="Port number (default: 8800)") parser.add_argument("--generate-sample", action="store_true", help="Generate sample data") args = parser.parse_args() + validate_dev_runtime(args.host) + # Development base directory. script_dir = os.path.dirname(os.path.abspath(__file__)) base_dir = os.path.join(script_dir, "_dev_data") @@ -492,15 +538,18 @@ def main(): print("Generating sample data...") generate_sample_data(received_dir) - print(f"\nStarting dev server on http://localhost:{args.port}") - print(f" Results: http://localhost:{args.port}/results") - print(f" Systems: http://localhost:{args.port}/systemlist") + print(f"\nStarting dev server on http://{args.host}:{args.port}") + print(f" Results: http://{args.host}:{args.port}/results") + print(f" Systems: http://{args.host}:{args.port}/systemlist") print(f" Data dir: {base_dir}") print() # Create and launch the Flask app directly. 
app = create_dev_app(base_dir) - app.run(host="127.0.0.1", port=args.port, debug=True) + debug = dev_debug_enabled() + if debug: + app.logger.warning("Werkzeug debugger enabled for local development.") + app.run(host=args.host, port=args.port, debug=debug) if __name__ == "__main__": diff --git a/result_server/routes/api.py b/result_server/routes/api.py index 5ab734d..d7cd8d9 100644 --- a/result_server/routes/api.py +++ b/result_server/routes/api.py @@ -7,12 +7,13 @@ import uuid import shutil import io +import sys import tarfile from datetime import datetime -api_bp = Blueprint("api", __name__) +from utils.auth import verify_ingest_key -EXPECTED_API_KEY = os.environ.get("RESULT_SERVER_KEY") +api_bp = Blueprint("api", __name__) # ========================================== @@ -20,10 +21,19 @@ # ========================================== def require_api_key(): - """Validate the request API key.""" - api_key = request.headers.get("X-API-Key") - if api_key != EXPECTED_API_KEY: + """Validate the request API key and return the authenticated runner id.""" + runner_id = verify_ingest_key(request.headers.get("X-API-Key", "")) + if not runner_id: abort(401, description="Invalid API Key") + current_app.logger.info( + "api key accepted", + extra={ + "runner_id": runner_id, + "endpoint": request.path, + "ip": request.remote_addr, + }, + ) + return runner_id def save_json_file(data, prefix, out_dir, given_uuid=None): @@ -135,6 +145,16 @@ def _find_result_file_by_uuid(received_dir, uuid_value): def _safe_extract_tar_bytes(file_storage, target_dir): + """Extract uploaded tar bytes with path and member-type checks. + + The explicit path normalization catches traversal attempts before writing + anything, and Python 3.12's data filter rejects non-regular archive entries + such as unsafe links or device files. 
+ """ + if sys.version_info < (3, 12): + raise RuntimeError("Python 3.12 or later is required for safe tar extraction.") + + os.makedirs(target_dir, exist_ok=True) with tarfile.open(fileobj=file_storage.stream, mode="r:*") as tar: for member in tar.getmembers(): normalized = os.path.normpath(member.name) @@ -142,8 +162,8 @@ def _safe_extract_tar_bytes(file_storage, target_dir): abort(400, description="Unsafe archive entry") try: tar.extractall(target_dir, filter="data") - except TypeError: - tar.extractall(target_dir) + except tarfile.FilterError: + abort(400, description="Unsafe archive entry") # ========================================== @@ -178,9 +198,7 @@ def ingest_estimate(): @api_bp.route("/api/ingest/padata", methods=["POST"]) def ingest_padata(): """Receive and store a PA Data archive.""" - api_key = request.headers.get("X-API-Key") - if api_key != EXPECTED_API_KEY: - abort(401, description="Invalid API Key") + require_api_key() uuid_str = request.form.get("id") if not uuid_str or not is_valid_uuid(uuid_str): diff --git a/result_server/routes/auth.py b/result_server/routes/auth.py index 5258a94..216cf86 100644 --- a/result_server/routes/auth.py +++ b/result_server/routes/auth.py @@ -2,6 +2,7 @@ from flask import ( Blueprint, + abort, current_app, flash, redirect, @@ -25,6 +26,40 @@ auth_bp = Blueprint("auth", __name__, url_prefix="/auth") +def _redis_ping_ok(redis_conn): + """Return whether the configured Redis connection is currently usable.""" + if not redis_conn: + return False + try: + redis_conn.ping() + return True + except Exception: + current_app.logger.exception("Redis ping failed during authentication") + return False + + +def _get_redis_or_fail(): + """Return Redis for auth tracking, failing closed when configured to require it.""" + redis_conn = current_app.config.get("REDIS_CONN") + requires_redis = current_app.config.get("AUTH_REQUIRES_REDIS", False) + + if not redis_conn: + if requires_redis: + current_app.logger.error("Redis unavailable; 
refusing login") + abort(503, description="Authentication service temporarily unavailable.") + return None + + if _redis_ping_ok(redis_conn): + return redis_conn + + if requires_redis: + current_app.logger.error("Redis unavailable; refusing login") + abort(503, description="Authentication service temporarily unavailable.") + + current_app.logger.warning("Redis unavailable; continuing without auth throttling") + return None + + def _render_login_totp_step(email): return render_template("auth_login.html", step="totp", email=email) @@ -50,6 +85,8 @@ def login(): email = request.form.get("email", "").strip() totp_code = request.form.get("totp_code", "").strip() + redis_conn = _get_redis_or_fail() + prefix = current_app.config.get("REDIS_PREFIX", "") # Step 1: email submitted -> show the TOTP entry form. if email and not totp_code: @@ -64,8 +101,6 @@ def login(): return redirect(url_for("auth.login")) # Enforce rate limiting when Redis-backed tracking is available. - redis_conn = current_app.config.get("REDIS_CONN") - prefix = current_app.config.get("REDIS_PREFIX", "") if redis_conn: is_locked, remaining = check_rate_limit(redis_conn, prefix, email) if is_locked: diff --git a/result_server/templates/_results_base.html b/result_server/templates/_results_base.html index 21e6ec6..16852c4 100644 --- a/result_server/templates/_results_base.html +++ b/result_server/templates/_results_base.html @@ -2,6 +2,7 @@ + {% block title %}Results{% endblock %} {% include "_table_base.html" %} diff --git a/result_server/templates/_results_table_cell_profile.html b/result_server/templates/_results_table_cell_profile.html index 619a031..e70944a 100644 --- a/result_server/templates/_results_table_cell_profile.html +++ b/result_server/templates/_results_table_cell_profile.html @@ -10,6 +10,7 @@ {% if row.profile_summary_meta.subline %}
{{ row.profile_summary_meta.subline }}{% endif %} {% if row.data_link %}
archive: available{% endif %} {% if row.profile_summary_meta.events %}
events: {{ row.profile_summary_meta.events | join(', ') }}{% endif %} + {% if row.profile_summary_meta.ncu_options %}
ncu options: {{ row.profile_summary_meta.ncu_options | join(' ') }}{% endif %} {% if row.profile_summary_meta.report_kinds %}
reports: {{ row.profile_summary_meta.report_kinds | join(', ') }}{% endif %} diff --git a/result_server/templates/admin_users.html b/result_server/templates/admin_users.html index c9d50b0..3e22a09 100644 --- a/result_server/templates/admin_users.html +++ b/result_server/templates/admin_users.html @@ -88,7 +88,8 @@

Add User

Create a user record and generate a new invitation link for TOTP setup.

- + {% if csrf_token is defined %}{% endif %} +
@@ -122,16 +123,19 @@

Registered Users

+ {% if csrf_token is defined %}{% endif %}
{% if u.email != session.get('user_email') %}
+ {% if csrf_token is defined %}{% endif %}
+ {% if csrf_token is defined %}{% endif %}
{% endif %} diff --git a/result_server/templates/auth_login.html b/result_server/templates/auth_login.html index 6d78a49..33c5850 100644 --- a/result_server/templates/auth_login.html +++ b/result_server/templates/auth_login.html @@ -87,9 +87,10 @@ {% if step == "email" %}

Start with the email address associated with your portal account. You will enter your authenticator code on the next step.

+ {% if csrf_token is defined %}{% endif %}
- +
@@ -99,6 +100,7 @@

Enter the 6-digit code from your authenticator app to complete sign-in.

Step 2 of 2 + {% if csrf_token is defined %}{% endif %}
diff --git a/result_server/templates/auth_setup.html b/result_server/templates/auth_setup.html index b83ab19..5d71919 100644 --- a/result_server/templates/auth_setup.html +++ b/result_server/templates/auth_setup.html @@ -121,6 +121,7 @@
{{ secret }}
+ {% if csrf_token is defined %}{% endif %}
' in html + assert 'autocomplete="username"' in html def test_auth_setup_template_renders_portal_shell(): @@ -46,6 +48,7 @@ def test_admin_users_template_renders_portal_table(): app = build_portal_shell_app( templates_dir=os.path.join(os.path.dirname(__file__), "..", "templates"), ) + app.jinja_env.globals["csrf_token"] = lambda: "test-csrf-token" with app.test_request_context("/admin/users"): from flask import render_template, session @@ -71,3 +74,4 @@ def test_admin_users_template_renders_portal_table(): assert "Review current user access" in html assert "Registered" in html assert "Pending" in html + assert 'name="csrf_token" value="test-csrf-token"' in html diff --git a/result_server/tests/test_csrf.py b/result_server/tests/test_csrf.py new file mode 100644 index 0000000..55520e0 --- /dev/null +++ b/result_server/tests/test_csrf.py @@ -0,0 +1,140 @@ +"""Tests for CSRF enforcement on browser POST routes.""" + +import json +import os +import shutil +import sys +import tempfile + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from test_support import build_api_route_app, build_portal_route_app, install_portal_test_stubs +from utils.csrf import init_csrf + +install_portal_test_stubs() + +API_KEY = "test-api-key-12345" + + +class _Store: + def __init__(self): + self._users = { + "admin@test.com": { + "email": "admin@test.com", + "totp_secret": "SECRET", + "affiliations": ["admin"], + }, + "user@test.com": { + "email": "user@test.com", + "totp_secret": "SECRET2", + "affiliations": ["dev"], + }, + } + + def get_affiliations(self, email): + user = self._users.get(email) + return user["affiliations"] if user else [] + + def list_users(self): + return list(self._users.values()) + + def has_totp_secret(self, email): + user = self._users.get(email) + return bool(user and user.get("totp_secret")) + + def delete_user(self, email): + return self._users.pop(email, None) is not None + + def user_exists(self, email): + return email in self._users + + def 
update_affiliations(self, email, affiliations): + self._users[email]["affiliations"] = affiliations + return True + + def clear_totp_secret(self, email): + self._users[email]["totp_secret"] = "" + return True + + def create_invitation(self, email, affiliations): + return "token-1" + + +def _portal_app(): + received = tempfile.mkdtemp() + estimated = tempfile.mkdtemp() + app = build_portal_route_app( + templates_dir=os.path.join(os.path.dirname(__file__), "..", "templates"), + received_dir=received, + estimated_dir=estimated, + user_store=_Store(), + ) + init_csrf(app) + return app, (received, estimated) + + +def test_admin_post_without_csrf_token_is_rejected(): + app, temp_dirs = _portal_app() + try: + with app.test_client() as client: + with client.session_transaction() as sess: + sess["authenticated"] = True + sess["user_email"] = "admin@test.com" + sess["user_affiliations"] = ["admin"] + resp = client.post("/admin/users/user@test.com/delete") + + assert resp.status_code == 400 + finally: + for path in temp_dirs: + shutil.rmtree(path) + + +def test_admin_post_with_invalid_csrf_token_is_rejected(): + app, temp_dirs = _portal_app() + try: + with app.test_client() as client: + with client.session_transaction() as sess: + sess["authenticated"] = True + sess["user_email"] = "admin@test.com" + sess["user_affiliations"] = ["admin"] + resp = client.post( + "/admin/users/user@test.com/delete", + data={"csrf_token": "not-a-valid-token"}, + ) + + assert resp.status_code == 400 + finally: + for path in temp_dirs: + shutil.rmtree(path) + + +def test_api_ingest_is_exempt_from_csrf(): + received = tempfile.mkdtemp() + received_padata = tempfile.mkdtemp() + received_estimation_inputs = tempfile.mkdtemp() + estimated = tempfile.mkdtemp() + try: + app = build_api_route_app( + received_dir=received, + received_padata_dir=received_padata, + received_estimation_inputs_dir=received_estimation_inputs, + estimated_dir=estimated, + ) + app.secret_key = "test-secret" + 
app.config["INGEST_KEYS"] = {API_KEY: "test-runner"} + + from routes.api import api_bp + + init_csrf(app, exempt_blueprints=(api_bp,)) + + with app.test_client() as client: + resp = client.post( + "/api/ingest/result", + data=json.dumps({"code": "test"}), + headers={"X-API-Key": API_KEY, "Content-Type": "application/json"}, + ) + + assert resp.status_code == 200 + finally: + for path in (received, received_padata, received_estimation_inputs, estimated): + shutil.rmtree(path) diff --git a/result_server/tests/test_portal_list_templates.py b/result_server/tests/test_portal_list_templates.py index 62c6701..bb4a878 100644 --- a/result_server/tests/test_portal_list_templates.py +++ b/result_server/tests/test_portal_list_templates.py @@ -89,6 +89,65 @@ def test_results_template_renders_table_note(): assert "#10" in html +def test_results_template_renders_ncu_options_tooltip(): + app = build_portal_shell_app( + templates_dir=os.path.join(os.path.dirname(__file__), "..", "templates"), + ) + with app.test_request_context("/results"): + from flask import render_template + + html = render_template( + "results.html", + columns=[ + {"label": "Timestamp", "key": "timestamp"}, + {"label": "Profiler / PA", "key": "profile_summary"}, + {"label": "JSON", "key": "json_link"}, + ], + rows=[ + { + "timestamp": "2026-04-13 12:00:00", + "profile_summary": "ncu / single", + "profile_summary_meta": { + "has_profile_data": True, + "headline": "ncu / single", + "subline": "text, 1 run", + "events": [], + "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"], + "report_kinds": ["ncu_report", "summary_text"], + }, + "data_link": "/results/padata0.tgz", + "json_link": "/results/result0.json", + "detail_link": "/results/detail/result0.json", + "filename": "result0.json", + "source_info": None, + "quality": {"level": "ready", "label": "Ready", "summary": "Breakdown is present."}, + "system": "RC_GH200", + "code": "genesis", + "fom": 1.0, + "exp": "CASE0", + 
"fom_version": "test", + "nodes": "1", + "numproc_node": "8", + "nthreads": "9", + "ci_trigger": "push", + "pipeline_id": "10", + "source_hash": "-", + } + ], + pagination={"total": 1, "page": 1, "total_pages": 1}, + current_per_page=50, + current_system="", + current_code="", + current_exp="", + filter_options={"systems": ["RC_GH200"], "codes": ["genesis"], "exps": ["CASE0"]}, + systems_info={}, + ) + + assert "ncu / single" in html + assert "ncu options: --target-processes all --set basic --launch-count 1" in html + assert "ncu_report" in html + + def test_estimated_results_template_renders_table_note(): app = build_portal_shell_app( templates_dir=os.path.join(os.path.dirname(__file__), "..", "templates"), diff --git a/result_server/tests/test_result_detail_template.py b/result_server/tests/test_result_detail_template.py index 64b9ad0..f66b276 100644 --- a/result_server/tests/test_result_detail_template.py +++ b/result_server/tests/test_result_detail_template.py @@ -120,11 +120,33 @@ def test_pa_data_summary_section(self, app): assert "PA Data Summary" in html assert "fapp" in html assert "single" in html - assert "Tool-Specific Events" in html + assert "Tool-Specific Detail" in html assert "fapp event set: pa1" in html assert "summary_text" in html assert "pa1" in html + def test_ncu_pa_data_summary_shows_ncu_options_without_generic_events(self, app): + result = { + **FULL_RESULT, + "profile_data": { + "tool": "ncu", + "level": "single", + "report_format": "text", + "run_count": 1, + "events": [], + "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"], + "report_kinds": ["ncu_report", "summary_text"], + }, + } + with app.test_request_context(): + html = _render_result_detail(result, FULL_QUALITY) + + assert "Tool-Specific Detail" in html + assert "ncu options: --target-processes all --set basic --launch-count 1" in html + assert "NCU Options" in html + assert "ncu_report" in html + assert ">Events<" not in html + def 
test_vector_data_table(self, app): with app.test_request_context(): html = _render_result_detail(FULL_RESULT, FULL_QUALITY) diff --git a/result_server/tests/test_result_padata_route.py b/result_server/tests/test_result_padata_route.py index 1b0d6cd..1c3a950 100644 --- a/result_server/tests/test_result_padata_route.py +++ b/result_server/tests/test_result_padata_route.py @@ -50,3 +50,27 @@ def test_results_route_serves_padata_from_received_padata_dir(client, tmp_dirs): resp = client.get(f"/results/{tgz_name}") assert resp.status_code == 200 assert resp.data == b"fake tgz content" + + +def test_results_route_blocks_confidential_padata_matched_by_server_uuid(client, tmp_dirs): + received, received_padata = tmp_dirs + uid = "12345678-1234-1234-1234-123456789abc" + tgz_name = f"padata_20250101_120000_{uid}.tgz" + + with open(os.path.join(received, "result0.json"), "w", encoding="utf-8") as f: + json.dump( + { + "code": "qws", + "system": "Fugaku", + "FOM": 1.0, + "_server_uuid": uid, + "confidential": ["dev"], + }, + f, + ) + + with open(os.path.join(received_padata, tgz_name), "wb") as f: + f.write(b"fake tgz content") + + resp = client.get(f"/results/{tgz_name}") + assert resp.status_code == 403 diff --git a/result_server/tests/test_results_loader.py b/result_server/tests/test_results_loader.py index ea961d4..2940628 100644 --- a/result_server/tests/test_results_loader.py +++ b/result_server/tests/test_results_loader.py @@ -320,8 +320,37 @@ def test_profile_summary_is_built_from_profile_data(self, flask_app, tmp_dir): assert row["profile_summary_meta"]["headline"] == "fapp / detailed" assert row["profile_summary_meta"]["subline"] == "both, 17 runs" assert row["profile_summary_meta"]["events"][0] == "pa1" + assert row["profile_summary_meta"]["ncu_options"] == [] assert "cpu_pa_csv" in row["profile_summary_meta"]["report_kinds"] + def test_profile_summary_keeps_ncu_options_separate_from_events(self, flask_app, tmp_dir): + uid = str(uuid.uuid4()) + _write_json(tmp_dir, 
f"result_20250101_120000_{uid}.json", { + "code": "genesis", + "system": "RC_GH200", + "Exp": "CASE0", + "FOM": 1.0, + "profile_data": { + "tool": "ncu", + "level": "single", + "report_format": "text", + "run_count": 1, + "events": [], + "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"], + "report_kinds": ["ncu_report", "summary_text"], + }, + }) + + with flask_app.test_request_context(): + rows, _, _ = load_results_table(tmp_dir, public_only=True) + + assert len(rows) == 1 + row = rows[0] + assert row["profile_summary"] == "ncu / single" + assert row["profile_summary_meta"]["events"] == [] + assert row["profile_summary_meta"]["ncu_options"][:2] == ["--target-processes", "all"] + assert "ncu_report" in row["profile_summary_meta"]["report_kinds"] + class TestSummarizeResultQuality: def test_basic_quality_without_breakdown(self): diff --git a/result_server/tests/test_totp_security.py b/result_server/tests/test_totp_security.py index 7f4079c..d3f2616 100644 --- a/result_server/tests/test_totp_security.py +++ b/result_server/tests/test_totp_security.py @@ -168,6 +168,11 @@ def has_totp_secret(self, email): return bool(user and user.get("totp_secret")) +class _BrokenRedis: + def ping(self): + raise ConnectionError("redis down") + + @pytest.fixture def admin_app(): """Create a Flask app for admin self-delete protection tests.""" @@ -207,6 +212,73 @@ def systemlist(): shutil.rmtree(temp_dir) +@pytest.fixture +def auth_app(): + """Create a Flask app for focused auth Redis availability tests.""" + app = Flask( + __name__, + template_folder=os.path.join(os.path.dirname(__file__), "..", "templates"), + ) + app.secret_key = "test-secret" + app.config["TESTING"] = True + app.config["USER_STORE"] = _StubUserStore() + + from routes.admin import admin_bp + from routes.auth import auth_bp + from routes.estimated import estimated_bp + from routes.home import register_home_routes + from routes.results import results_bp + + register_home_routes(app) 
+ app.register_blueprint(admin_bp, url_prefix="/admin") + app.register_blueprint(auth_bp, url_prefix="/auth") + app.register_blueprint(results_bp, url_prefix="/results") + app.register_blueprint(estimated_bp, url_prefix="/estimated") + + temp_dir = tempfile.mkdtemp() + app.config["RECEIVED_DIR"] = temp_dir + app.config["ESTIMATED_DIR"] = temp_dir + + @app.route("/systemlist") + def systemlist(): + return "systems" + + yield app + shutil.rmtree(temp_dir) + + +class TestAuthRedisFailClosed: + """Tests production login behavior when Redis is unavailable.""" + + def test_requires_redis_without_connection_returns_503(self, auth_app): + auth_app.config["AUTH_REQUIRES_REDIS"] = True + auth_app.config["REDIS_CONN"] = None + + with auth_app.test_client() as client: + resp = client.post("/auth/login", data={"email": "user@test.com"}) + + assert resp.status_code == 503 + + def test_requires_redis_with_failed_ping_returns_503(self, auth_app): + auth_app.config["AUTH_REQUIRES_REDIS"] = True + auth_app.config["REDIS_CONN"] = _BrokenRedis() + + with auth_app.test_client() as client: + resp = client.post("/auth/login", data={"email": "user@test.com"}) + + assert resp.status_code == 503 + + def test_dev_mode_without_redis_continues_login_flow(self, auth_app): + auth_app.config["AUTH_REQUIRES_REDIS"] = False + auth_app.config["REDIS_CONN"] = None + + with auth_app.test_client() as client: + resp = client.post("/auth/login", data={"email": "user@test.com"}) + + assert resp.status_code == 200 + assert b"Step 2 of 2" in resp.data + + class TestAdminSelfDeletePrevention: """Tests that admins cannot delete their own account.""" diff --git a/result_server/utils/auth.py b/result_server/utils/auth.py new file mode 100644 index 0000000..8b16ad1 --- /dev/null +++ b/result_server/utils/auth.py @@ -0,0 +1,65 @@ +"""Authentication helpers shared by result_server API routes.""" + +from __future__ import annotations + +import hmac +import os +import warnings +from collections.abc import Mapping 
+from typing import Optional + +from flask import current_app + + +def parse_ingest_keys(env: Mapping[str, str] | None = None) -> dict[str, str]: + """Parse RESULT_SERVER_KEYS/RESULT_SERVER_KEY into {api_key: runner_id}.""" + env = env or os.environ + keys: dict[str, str] = {} + + multi_key_spec = env.get("RESULT_SERVER_KEYS", "").strip() + if multi_key_spec: + for entry in multi_key_spec.split(","): + if not entry.strip(): + continue + if ":" not in entry: + warnings.warn( + "Ignoring RESULT_SERVER_KEYS entry without runner_id:key format.", + RuntimeWarning, + stacklevel=2, + ) + continue + runner_id, key = (part.strip() for part in entry.split(":", 1)) + if not runner_id or not key: + warnings.warn( + "Ignoring RESULT_SERVER_KEYS entry with empty runner_id or key.", + RuntimeWarning, + stacklevel=2, + ) + continue + keys[key] = runner_id + + legacy_key = env.get("RESULT_SERVER_KEY", "").strip() + if legacy_key: + warnings.warn( + "RESULT_SERVER_KEY is deprecated; use RESULT_SERVER_KEYS=runner-id:key.", + DeprecationWarning, + stacklevel=2, + ) + keys.setdefault(legacy_key, "default") + + return keys + + +def verify_ingest_key(presented: str | None) -> Optional[str]: + """Return the runner_id for a valid ingest key, otherwise None.""" + if not presented: + return None + + keys = current_app.config.get("INGEST_KEYS") + if keys is None: + keys = parse_ingest_keys() + + for configured_key, runner_id in keys.items(): + if hmac.compare_digest(presented, configured_key): + return runner_id + return None diff --git a/result_server/utils/csrf.py b/result_server/utils/csrf.py new file mode 100644 index 0000000..6b70072 --- /dev/null +++ b/result_server/utils/csrf.py @@ -0,0 +1,13 @@ +"""CSRF extension setup for the result server.""" + +from flask_wtf.csrf import CSRFProtect + +csrf = CSRFProtect() + + +def init_csrf(app, *, exempt_blueprints=()): + """Initialize CSRF protection and exempt non-browser API blueprints.""" + for blueprint in exempt_blueprints: + 
csrf.exempt(blueprint) + csrf.init_app(app) + return csrf diff --git a/result_server/utils/result_detail_view.py b/result_server/utils/result_detail_view.py index b95fc90..c4cb2c7 100644 --- a/result_server/utils/result_detail_view.py +++ b/result_server/utils/result_detail_view.py @@ -43,21 +43,35 @@ def _build_profile_rows(profile_data): return [] events = profile_data.get("events") or [] + ncu_options = profile_data.get("ncu_options") or [] report_kinds = profile_data.get("report_kinds") or [] - return build_labeled_value_rows([ + rows = build_labeled_value_rows([ ("Tool", profile_data.get("tool", "N/A")), ("Level", profile_data.get("level", "N/A")), ("Report Format", profile_data.get("report_format", "N/A")), ("Run Count", profile_data.get("run_count", "N/A")), - ("Tool-Specific Events", _build_tool_specific_events_description(profile_data)), - ("Events", ", ".join(events) if events else "none"), - ("Report Kinds", ", ".join(report_kinds) if report_kinds else "none"), ]) + tool_specific_detail = _build_tool_specific_detail(profile_data) + if tool_specific_detail: + rows.append({"label": "Tool-Specific Detail", "value": tool_specific_detail}) + if events: + rows.append({"label": "Events", "value": ", ".join(events)}) + if ncu_options: + rows.append({"label": "NCU Options", "value": " ".join(ncu_options)}) + if report_kinds: + rows.append({"label": "Report Kinds", "value": ", ".join(report_kinds)}) + return rows + +def _build_tool_specific_detail(profile_data): + if profile_data.get("tool") == "ncu": + ncu_options = profile_data.get("ncu_options") or [] + if ncu_options: + return f"ncu options: {' '.join(ncu_options)}" + return "ncu options recorded in archive metadata when available" -def _build_tool_specific_events_description(profile_data): if profile_data.get("tool") != "fapp": - return "tool-specific event set" + return "tool-specific metadata" level = profile_data.get("level") mapping = { diff --git a/result_server/utils/result_file.py 
b/result_server/utils/result_file.py index 3f8dfc7..95825ac 100644 --- a/result_server/utils/result_file.py +++ b/result_server/utils/result_file.py @@ -44,11 +44,22 @@ def get_file_confidential_tags(filename: str, save_dir: str): if not uuid_match: return [] - uuid = uuid_match.group(0) + uuid = uuid_match.group(0).lower() + tags = [] for json_filename in os.listdir(save_dir): - if json_filename.endswith(".json") and uuid in json_filename: - return _read_confidential_from_json(json_filename, save_dir) - return [] + if not json_filename.endswith(".json"): + continue + if uuid in json_filename.lower(): + tags.extend(_read_confidential_from_json(json_filename, save_dir)) + continue + + data = _read_json(json_filename, save_dir) + if not isinstance(data, dict): + continue + server_uuid = data.get("_server_uuid") + if server_uuid is not None and str(server_uuid).lower() == uuid: + tags.extend(_extract_confidential_tags(data)) + return _unique_tags(tags) def check_file_permission(filename: str, dir_path: str) -> None: @@ -116,28 +127,48 @@ def load_authenticated_result_json( def _read_confidential_from_json(json_file: str, save_dir: str): + data = _read_json(json_file, save_dir) + if not isinstance(data, dict): + return [] + return _extract_confidential_tags(data) + + +def _read_json(json_file: str, save_dir: str): filepath = os.path.join(save_dir, json_file) if not os.path.exists(filepath): - return [] + return None try: with open(filepath, "r", encoding="utf-8") as f: - data = json.load(f) + return json.load(f) + except Exception: + return None - confidential_value = data.get("confidential", None) - if isinstance(confidential_value, list): - return [ - str(item).strip() - for item in confidential_value - if item and str(item).lower() != "null" - ] +def _extract_confidential_tags(data): + confidential_value = data.get("confidential", None) - if isinstance(confidential_value, str): - confidential_value = confidential_value.strip() - if confidential_value.lower() != 
"null" and confidential_value != "": - return [confidential_value] + if isinstance(confidential_value, list): + return [ + str(item).strip() + for item in confidential_value + if item and str(item).lower() != "null" + ] - return [] - except Exception: - return [] + if isinstance(confidential_value, str): + confidential_value = confidential_value.strip() + if confidential_value.lower() != "null" and confidential_value != "": + return [confidential_value] + + return [] + + +def _unique_tags(tags): + unique = [] + seen = set() + for tag in tags: + if tag in seen: + continue + seen.add(tag) + unique.append(tag) + return unique diff --git a/result_server/utils/result_table_rows.py b/result_server/utils/result_table_rows.py index 6cb09dd..db025a5 100644 --- a/result_server/utils/result_table_rows.py +++ b/result_server/utils/result_table_rows.py @@ -105,6 +105,8 @@ def _format_profile_summary(profile_data): return " / ".join(headline_parts) if headline_parts else "profile data" +# The template expects a stable, display-ready shape even when older result JSON +# files do not have profile_data or when only one profiler family is present. 
def _build_profile_summary_meta(profile_data): if not isinstance(profile_data, dict) or not profile_data: return { @@ -112,6 +114,7 @@ def _build_profile_summary_meta(profile_data): "headline": "", "subline": "", "events": [], + "ncu_options": [], "report_kinds": [], } @@ -128,5 +131,6 @@ def _build_profile_summary_meta(profile_data): "headline": _format_profile_summary(profile_data), "subline": ", ".join(subline_parts), "events": profile_data.get("events") if isinstance(profile_data.get("events"), list) else [], + "ncu_options": profile_data.get("ncu_options") if isinstance(profile_data.get("ncu_options"), list) else [], "report_kinds": profile_data.get("report_kinds") if isinstance(profile_data.get("report_kinds"), list) else [], } diff --git a/scripts/bk_functions.sh b/scripts/bk_functions.sh index bc8ea6f..c73a69a 100644 --- a/scripts/bk_functions.sh +++ b/scripts/bk_functions.sh @@ -1,8 +1,8 @@ -#!/bin/sh +#!/bin/bash # bk_functions.sh - Common functions for standardized benchmark result output. -# Source this file from Run_Scripts: source scripts/bk_functions.sh +# Source this file from BenchKit bash run/build/estimate scripts. # -# POSIX compatible (no jq dependency). +# Bash is required for the estimation and profiler helpers below. # bk_emit_result - Output a standardized FOM result line. # @@ -621,9 +621,14 @@ bk_run_estimation_data_collection() { # # BenchKit keeps the common wrapper in bk_functions.sh, while each application # decides whether to use a profiler and which profiler tool / level to request. +# A level is translated per tool: fapp levels expand to one or more counter +# event runs, while ncu levels expand to a single Nsight Compute invocation with +# preset command-line options. Both tools stage raw data and meta.json in the +# same archive shape so result generation and the portal do not need to infer +# tool-specific filenames. 
# # Positional arguments: -# $1 - profiler tool (empty|none|off|fapp) +# $1 - profiler tool (empty|none|off|fapp|ncu) # # Supported variables: # BK_PROFILER_LEVEL optional profiler level override @@ -632,6 +637,8 @@ bk_run_estimation_data_collection() { # BK_PROFILER_REPORT_ARGS optional extra postprocess flags # BK_PROFILER_DIR raw profile output dir (default: pa) # BK_PROFILER_STAGE_DIR temporary staging dir for archive creation +# BK_PROFILER_ARGS and BK_PROFILER_REPORT_ARGS are expanded as shell fragments +# because sites often need to pass multiple profiler flags from CI variables. bk_get_profiler_tool() { _bk_profiler_tool="${1:-}" case "$_bk_profiler_tool" in @@ -639,7 +646,7 @@ bk_get_profiler_tool() { printf '%s\n' "" return 0 ;; - fapp) + fapp|ncu) printf '%s\n' "$_bk_profiler_tool" return 0 ;; @@ -669,6 +676,9 @@ bk_get_profiler_level() { fapp) _bk_profiler_level="single" ;; + ncu) + _bk_profiler_level="single" + ;; esac fi @@ -677,6 +687,10 @@ bk_get_profiler_level() { printf '%s\n' "$_bk_profiler_level" return 0 ;; + ncu:single|ncu:simple|ncu:standard|ncu:detailed) + printf '%s\n' "$_bk_profiler_level" + return 0 + ;; *) echo "bk_get_profiler_level: unsupported level '${_bk_profiler_level}' for tool '${_bk_profiler_tool}'" >&2 return 1 @@ -702,6 +716,9 @@ bk_get_profiler_report_format() { fapp:simple|fapp:standard|fapp:detailed) _bk_profiler_report_format="both" ;; + ncu:single|ncu:simple|ncu:standard|ncu:detailed) + _bk_profiler_report_format="text" + ;; esac fi @@ -717,6 +734,8 @@ bk_get_profiler_report_format() { esac } +# fapp requires a separate profiler run for each counter group. The public +# level names stay stable even if the underlying pa* event set changes. bk_profiler_fapp_level_events() { case "$1" in single) @@ -752,6 +771,73 @@ bk_profiler_fapp_postprocess_command() { return 1 } +# Nsight Compute levels are intentionally short presets. Callers can append +# site- or application-specific flags with BK_PROFILER_ARGS. 
+bk_profiler_ncu_level_args() { + case "$1" in + single) + printf '%s\n' "--set basic --launch-count 1" + ;; + simple) + printf '%s\n' "--set basic --launch-count 5" + ;; + standard) + printf '%s\n' "--set full --launch-count 1" + ;; + detailed) + printf '%s\n' "--set full --nvtx" + ;; + *) + echo "bk_profiler_ncu_level_args: unsupported level '$1'" >&2 + return 1 + ;; + esac +} + +# Nsight Compute changed report suffixes across releases, so look for all names +# we have seen instead of hard-coding only .ncu-rep. +bk_profiler_find_ncu_report() { + _bk_ncu_report_dir="$1" + find "$_bk_ncu_report_dir" -maxdepth 1 -type f \( \ + -name '*.ncu-rep' -o \ + -name '*.nsight-cuprof' -o \ + -name 'profile*' \ + \) | head -n 1 +} + +bk_json_escape() { + _bk_json_value="$1" + _bk_json_value=${_bk_json_value//\\/\\\\} + _bk_json_value=${_bk_json_value//\"/\\\"} + _bk_json_value=${_bk_json_value//$'\t'/\\t} + _bk_json_value=${_bk_json_value//$'\r'/\\r} + _bk_json_value=${_bk_json_value//$'\n'/\\n} + printf '%s' "$_bk_json_value" +} + +bk_json_string() { + printf '"' + bk_json_escape "$1" + printf '"' +} + +bk_json_string_array() { + _bk_json_first=1 + printf '[' + for _bk_json_item in "$@"; do + if [ "$_bk_json_first" -eq 0 ]; then + printf ', ' + fi + bk_json_string "$_bk_json_item" + _bk_json_first=0 + done + printf ']' +} + +# Write a compact, tool-neutral manifest for the profiler archive. Result JSON +# generation reads this manifest to expose summary fields without opening every +# raw profiler artifact. For fapp, run_events contains counter names; for ncu it +# carries the selected level so optional hooks still receive useful context. 
bk_profiler_write_meta() { _bk_meta_stage_dir="$1" _bk_meta_tool="$2" @@ -759,6 +845,8 @@ bk_profiler_write_meta() { _bk_meta_report_format="$4" _bk_meta_run_names="$5" _bk_meta_run_events="$6" + _bk_meta_profiler_args="$7" + _bk_meta_report_args="$8" _bk_meta_file="${_bk_meta_stage_dir}/meta.json" IFS=',' read -r -a _bk_meta_names <<< "$_bk_meta_run_names" IFS=',' read -r -a _bk_meta_events <<< "$_bk_meta_run_events" @@ -769,19 +857,94 @@ bk_profiler_write_meta() { printf ' "level": "%s",\n' "$_bk_meta_level" printf ' "report_format": "%s",\n' "$_bk_meta_report_format" printf ' "raw_dir": "raw",\n' + printf ' "measurement": {\n' + printf ' "run_count": %s,\n' "${#_bk_meta_names[@]}" + printf ' "profiler_args": ' + bk_json_string "$_bk_meta_profiler_args" + printf ',\n' + printf ' "report_args": ' + bk_json_string "$_bk_meta_report_args" + case "$_bk_meta_tool" in + fapp) + printf ',\n' + printf ' "fapp_events": ' + bk_json_string_array "${_bk_meta_events[@]}" + printf '\n' + ;; + ncu) + _bk_meta_ncu_level_args=$(bk_profiler_ncu_level_args "$_bk_meta_level") + read -r -a _bk_meta_ncu_level_arg_array <<< "$_bk_meta_ncu_level_args" + printf ',\n' + printf ' "ncu_options": ' + bk_json_string_array "--target-processes" "all" "${_bk_meta_ncu_level_arg_array[@]}" + printf '\n' + ;; + *) + printf '\n' + ;; + esac + printf ' },\n' printf ' "runs": [\n' for _bk_meta_idx in "${!_bk_meta_names[@]}"; do _bk_meta_name="${_bk_meta_names[$_bk_meta_idx]}" _bk_meta_event="${_bk_meta_events[$_bk_meta_idx]:-}" - _bk_meta_text_path="reports/fapp_A_${_bk_meta_name}.txt" - _bk_meta_csv_path="reports/cpu_pa_${_bk_meta_name}.csv" - _bk_meta_text_abs="${_bk_meta_stage_dir}/${_bk_meta_text_path}" - _bk_meta_csv_abs="${_bk_meta_stage_dir}/${_bk_meta_csv_path}" + case "$_bk_meta_tool" in + fapp) + _bk_meta_text_path="reports/fapp_A_${_bk_meta_name}.txt" + _bk_meta_csv_path="reports/cpu_pa_${_bk_meta_name}.csv" + _bk_meta_text_abs="${_bk_meta_stage_dir}/${_bk_meta_text_path}" + 
_bk_meta_csv_abs="${_bk_meta_stage_dir}/${_bk_meta_csv_path}" + _bk_meta_ncu_report_path="" + _bk_meta_ncu_report_abs="" + ;; + ncu) + _bk_meta_text_path="reports/ncu_import_${_bk_meta_name}.txt" + _bk_meta_csv_path="" + _bk_meta_text_abs="${_bk_meta_stage_dir}/${_bk_meta_text_path}" + _bk_meta_csv_abs="" + _bk_meta_ncu_report_abs=$(bk_profiler_find_ncu_report "${_bk_meta_stage_dir}/raw/${_bk_meta_name}" || true) + if [ -n "$_bk_meta_ncu_report_abs" ]; then + _bk_meta_ncu_report_path="${_bk_meta_ncu_report_abs#${_bk_meta_stage_dir}/}" + else + _bk_meta_ncu_report_path="" + fi + ;; + *) + _bk_meta_text_path="" + _bk_meta_csv_path="" + _bk_meta_text_abs="" + _bk_meta_csv_abs="" + _bk_meta_ncu_report_path="" + _bk_meta_ncu_report_abs="" + ;; + esac printf ' {\n' printf ' "name": "%s",\n' "$_bk_meta_name" printf ' "event": "%s",\n' "$_bk_meta_event" printf ' "raw_path": "raw/%s",\n' "$_bk_meta_name" + printf ' "measurement": {\n' + case "$_bk_meta_tool" in + fapp) + printf ' "counter": ' + bk_json_string "$_bk_meta_event" + printf ',\n' + printf ' "options": ' + bk_json_string_array "-C" "-d" "raw/${_bk_meta_name}" "-Hevent=${_bk_meta_event}" + printf '\n' + ;; + ncu) + _bk_meta_ncu_level_args=$(bk_profiler_ncu_level_args "$_bk_meta_level") + read -r -a _bk_meta_ncu_level_arg_array <<< "$_bk_meta_ncu_level_args" + printf ' "options": ' + bk_json_string_array "-o" "raw/${_bk_meta_name}/profile" "--target-processes" "all" "${_bk_meta_ncu_level_arg_array[@]}" + printf '\n' + ;; + *) + printf ' "options": []\n' + ;; + esac + printf ' },\n' printf ' "reports": [\n' _bk_meta_has_report=0 if [ -f "$_bk_meta_text_abs" ]; then @@ -795,6 +958,13 @@ bk_profiler_write_meta() { printf ' {"kind": "cpu_pa_csv", "path": "%s"}' "$_bk_meta_csv_path" _bk_meta_has_report=1 fi + if [ -n "$_bk_meta_ncu_report_path" ] && [ -f "$_bk_meta_ncu_report_abs" ]; then + if [ "$_bk_meta_has_report" -eq 1 ]; then + printf ',\n' + fi + printf ' {"kind": "ncu_report", "path": "%s"}' 
"$_bk_meta_ncu_report_path" + _bk_meta_has_report=1 + fi if [ "$_bk_meta_has_report" -eq 1 ]; then printf '\n' fi @@ -810,6 +980,9 @@ bk_profiler_write_meta() { } > "$_bk_meta_file" } +# Optional hooks let site scripts wrap individual profiler runs, for example to +# load per-run modules or print scheduler diagnostics. Hook arguments are: +# tool, level, run name, fapp event or ncu level, then the profiled command. bk_profiler_call_optional_hook() { _bk_hook_name="$1" shift || true @@ -900,6 +1073,9 @@ bk_profiler() { mkdir -p "$_bk_stage_dir/reports" _bk_profiler_run_names="" _bk_profiler_run_events="" + _bk_profiler_status=0 + _bk_profiler_extra_args="${BK_PROFILER_ARGS:-}" + _bk_profiler_report_extra_args="${BK_PROFILER_REPORT_ARGS:-}" case "$_bk_profiler_tool" in fapp) @@ -912,10 +1088,20 @@ bk_profiler() { mkdir -p "$_bk_fapp_rep_dir" echo "bk_profiler[fapp]: starting ${_bk_fapp_rep_name} event=${_bk_fapp_event}" >&2 bk_profiler_call_optional_hook bk_profiler_before_run "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_fapp_rep_name" "$_bk_fapp_event" "$@" || return 1 + # BK_PROFILER_ARGS is intentionally word-split into fapp options. # shellcheck disable=SC2086 - fapp -C -d "$_bk_fapp_rep_dir" ${BK_PROFILER_ARGS:-} -Hevent="${_bk_fapp_event}" "$@" + if fapp -C -d "$_bk_fapp_rep_dir" ${_bk_profiler_extra_args} -Hevent="${_bk_fapp_event}" "$@"; then + _bk_fapp_status=0 + else + _bk_fapp_status=$? 
+ fi bk_profiler_call_optional_hook bk_profiler_after_run "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_fapp_rep_name" "$_bk_fapp_event" "$@" || return 1 - echo "bk_profiler[fapp]: completed ${_bk_fapp_rep_name} event=${_bk_fapp_event}" >&2 + if [ "$_bk_fapp_status" -eq 0 ]; then + echo "bk_profiler[fapp]: completed ${_bk_fapp_rep_name} event=${_bk_fapp_event}" >&2 + else + echo "bk_profiler[fapp]: failed ${_bk_fapp_rep_name} event=${_bk_fapp_event} status=${_bk_fapp_status}" >&2 + _bk_profiler_status="$_bk_fapp_status" + fi cp -R "$_bk_fapp_rep_dir" "$_bk_stage_dir/raw/${_bk_fapp_rep_name}" if [ -n "$_bk_profiler_run_names" ]; then _bk_profiler_run_names="${_bk_profiler_run_names},${_bk_fapp_rep_name}" @@ -925,10 +1111,44 @@ bk_profiler() { _bk_profiler_run_events="${_bk_fapp_event}" fi _bk_fapp_run_index=$((_bk_fapp_run_index + 1)) + if [ "$_bk_fapp_status" -ne 0 ]; then + break + fi done ;; + ncu) + if ! command -v ncu >/dev/null 2>&1; then + echo "bk_profiler[ncu]: ncu not found in PATH" >&2 + return 1 + fi + _bk_ncu_rep_name="rep1" + _bk_ncu_rep_dir="${_bk_profiler_dir}/${_bk_ncu_rep_name}" + _bk_ncu_profile_base="${_bk_ncu_rep_dir}/profile" + mkdir -p "$_bk_ncu_rep_dir" + _bk_ncu_level_args=$(bk_profiler_ncu_level_args "$_bk_profiler_level") || return 1 + echo "bk_profiler[ncu]: starting ${_bk_ncu_rep_name} level=${_bk_profiler_level}" >&2 + bk_profiler_call_optional_hook bk_profiler_before_run "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_ncu_rep_name" "$_bk_profiler_level" "$@" || return 1 + # BK_PROFILER_ARGS is intentionally word-split into ncu options. + # shellcheck disable=SC2086 + if ncu -o "$_bk_ncu_profile_base" --target-processes all ${_bk_ncu_level_args} ${_bk_profiler_extra_args} "$@"; then + _bk_profiler_status=0 + else + _bk_profiler_status=$? 
+ fi + bk_profiler_call_optional_hook bk_profiler_after_run "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_ncu_rep_name" "$_bk_profiler_level" "$@" || return 1 + if [ "$_bk_profiler_status" -eq 0 ]; then + echo "bk_profiler[ncu]: completed ${_bk_ncu_rep_name} level=${_bk_profiler_level}" >&2 + else + echo "bk_profiler[ncu]: failed ${_bk_ncu_rep_name} level=${_bk_profiler_level} status=${_bk_profiler_status}" >&2 + fi + cp -R "$_bk_ncu_rep_dir" "$_bk_stage_dir/raw/${_bk_ncu_rep_name}" + _bk_profiler_run_names="${_bk_ncu_rep_name}" + _bk_profiler_run_events="${_bk_profiler_level}" + ;; esac + # Report import/postprocess is best-effort: keep the raw archive even when a + # text/CSV summary cannot be produced on the run node. case "$_bk_profiler_tool" in fapp) if _bk_fapp_post_cmd=$(bk_profiler_fapp_postprocess_command); then @@ -937,23 +1157,46 @@ bk_profiler() { for _bk_fapp_rep_name in "${_bk_fapp_run_name_list[@]}"; do _bk_fapp_rep_dir="${_bk_profiler_dir}/${_bk_fapp_rep_name}" if [ "$_bk_profiler_report_format" = "text" ] || [ "$_bk_profiler_report_format" = "both" ]; then + # BK_PROFILER_REPORT_ARGS is intentionally word-split into fapp/fapppx options. # shellcheck disable=SC2086 - "$_bk_fapp_post_cmd" -A -d "$_bk_fapp_rep_dir" ${BK_PROFILER_REPORT_ARGS:-} > "$_bk_stage_dir/reports/fapp_A_${_bk_fapp_rep_name}.txt" 2>&1 || true + "$_bk_fapp_post_cmd" -A -d "$_bk_fapp_rep_dir" ${_bk_profiler_report_extra_args} > "$_bk_stage_dir/reports/fapp_A_${_bk_fapp_rep_name}.txt" 2>&1 || true fi if [ "$_bk_profiler_report_format" = "csv" ] || [ "$_bk_profiler_report_format" = "both" ]; then + # BK_PROFILER_REPORT_ARGS is intentionally word-split into fapp/fapppx options. 
# shellcheck disable=SC2086 - "$_bk_fapp_post_cmd" -A -d "$_bk_fapp_rep_dir" ${BK_PROFILER_REPORT_ARGS:-} -Icpupa -tcsv -o "$_bk_stage_dir/reports/cpu_pa_${_bk_fapp_rep_name}.csv" >/dev/null 2>&1 || true + "$_bk_fapp_post_cmd" -A -d "$_bk_fapp_rep_dir" ${_bk_profiler_report_extra_args} -Icpupa -tcsv -o "$_bk_stage_dir/reports/cpu_pa_${_bk_fapp_rep_name}.csv" >/dev/null 2>&1 || true fi done else echo "fapp/fapppx not found in PATH" > "$_bk_stage_dir/reports/fapp_A_missing.txt" fi ;; + ncu) + IFS=',' read -r -a _bk_ncu_run_name_list <<< "$_bk_profiler_run_names" + for _bk_ncu_rep_name in "${_bk_ncu_run_name_list[@]}"; do + _bk_ncu_report_file=$(bk_profiler_find_ncu_report "$_bk_profiler_dir/${_bk_ncu_rep_name}" || true) + if [ -n "$_bk_ncu_report_file" ] && { [ "$_bk_profiler_report_format" = "text" ] || [ "$_bk_profiler_report_format" = "both" ]; }; then + # BK_PROFILER_REPORT_ARGS is intentionally word-split into ncu --import options. + # shellcheck disable=SC2086 + ncu --import "$_bk_ncu_report_file" --page details ${_bk_profiler_report_extra_args} > "$_bk_stage_dir/reports/ncu_import_${_bk_ncu_rep_name}.txt" 2>&1 || true + fi + done + ;; esac - bk_profiler_write_meta "$_bk_stage_dir" "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_profiler_report_format" "$_bk_profiler_run_names" "$_bk_profiler_run_events" - tar -czf "$_bk_profiler_archive" "$_bk_stage_dir" + # Preserve the profiler command status after metadata/archive creation. If the + # archive itself cannot be written, that failure is more actionable to CI. + bk_profiler_write_meta "$_bk_stage_dir" "$_bk_profiler_tool" "$_bk_profiler_level" "$_bk_profiler_report_format" "$_bk_profiler_run_names" "$_bk_profiler_run_events" "$_bk_profiler_extra_args" "$_bk_profiler_report_extra_args" + if tar -czf "$_bk_profiler_archive" "$_bk_stage_dir"; then + _bk_profiler_archive_status=0 + else + _bk_profiler_archive_status=$? 
+ fi rm -rf "$_bk_stage_dir" + if [ "$_bk_profiler_archive_status" -ne 0 ]; then + return "$_bk_profiler_archive_status" + fi + return "$_bk_profiler_status" } # bk_emit_overlap - Backward-compatible wrapper for overlap-like section timing. diff --git a/scripts/job_functions.sh b/scripts/job_functions.sh index 7da16a6..51cad84 100644 --- a/scripts/job_functions.sh +++ b/scripts/job_functions.sh @@ -64,6 +64,24 @@ get_system_queue_group() { return 0 } +# Queue templates can request aggregate CPU sockets or GPU cards. Pull the +# per-node values from system_info.csv instead of duplicating them in queue.csv. +get_system_cpu_per_node() { + local system="$1" + local info_file="${SYSTEM_INFO_FILE:-config/system_info.csv}" + awk -F, -v s="$system" '$1==s {print $4}' "$info_file" + return 0 +} + +# A dash in system_info.csv means "no GPU"; matrix_generate.sh normalizes +# non-numeric values to zero before doing scheduler arithmetic. +get_system_gpu_per_node() { + local system="$1" + local info_file="${SYSTEM_INFO_FILE:-config/system_info.csv}" + awk -F, -v s="$system" '$1==s {print $7}' "$info_file" + return 0 +} + # System_CSVからtag_buildを取得する # $1: システム名 # mode=nativeの場合は空文字を返す(tag_buildカラム自体が空) diff --git a/scripts/matrix_generate.sh b/scripts/matrix_generate.sh index 396aa66..3f922b7 100644 --- a/scripts/matrix_generate.sh +++ b/scripts/matrix_generate.sh @@ -10,6 +10,7 @@ set -euo pipefail SYSTEM_FILE="config/system.csv" QUEUE_FILE="config/queue.csv" +SYSTEM_INFO_FILE="config/system_info.csv" OUTPUT_FILE=".gitlab-ci.generated.yml" source ./scripts/job_functions.sh @@ -74,7 +75,16 @@ for listfile in programs/*/list.csv; do job_prefix="${program}_${system}_N${nodes}_P${numproc_node}_T${nthreads}" program_path="$program_dir" - export elapse nodes queue_group numproc_node nthreads + # queue.csv templates can use both direct list.csv values and derived + # scheduler quantities such as total ranks, CPU sockets, and GPU cards. 
+ proc=$((nodes * numproc_node)) + cpu_per_node=$(get_system_cpu_per_node "$system") + gpu_per_node=$(get_system_gpu_per_node "$system") + [[ "$cpu_per_node" =~ ^[0-9]+$ ]] || cpu_per_node=0 + [[ "$gpu_per_node" =~ ^[0-9]+$ ]] || gpu_per_node=0 + cpu_sockets=$((nodes * cpu_per_node)) + gpu_cards=$((nodes * gpu_per_node)) + export elapse nodes queue_group numproc_node nthreads proc cpu_per_node gpu_per_node cpu_sockets gpu_cards read -r submit_cmd template <<< "$(get_queue_template "$system")" if [[ -z "$submit_cmd" || -z "$template" ]]; then @@ -211,4 +221,3 @@ ${job_prefix}_build_run: done < "$listfile" done - diff --git a/scripts/result.sh b/scripts/result.sh index 3f11e3e..b82a274 100644 --- a/scripts/result.sh +++ b/scripts/result.sh @@ -24,6 +24,56 @@ node_count='how_many' numproc_node="" nthreads="" +# Read the lightweight profiler manifest from a padata archive and turn it into +# the small profile_data block stored in result*.json. Missing or unreadable +# archives are ignored so FOM result generation is not blocked by profiler +# postprocessing problems. +build_profile_data_summary() { + local tgz_file="$1" + + if [[ ! -f "$tgz_file" ]]; then + printf '%s' "" + return 0 + fi + + local meta_member + meta_member=$(tar -tzf "$tgz_file" 2>/dev/null | grep 'meta\.json$' | head -n 1 || true) + if [[ -z "$meta_member" ]]; then + printf '%s' "" + return 0 + fi + + local meta_json + meta_json=$(tar -xOf "$tgz_file" "$meta_member" 2>/dev/null || true) + if [[ -z "$meta_json" ]]; then + printf '%s' "" + return 0 + fi + + echo "$meta_json" | jq -c ' + { + tool: .tool, + level: .level, + report_format: .report_format, + raw_dir: .raw_dir, + run_count: ((.runs // []) | length), + events: ( + if .tool == "fapp" + then ((.runs // []) | map(.event) | map(select(. != null and . 
!= ""))) + else [] + end + ), + ncu_options: ( + if .tool == "ncu" and ((.measurement.ncu_options // null) | type) == "array" + then .measurement.ncu_options + else [] + end + ), + report_kinds: ((.runs // []) | map(.reports // []) | add | map(.kind) | unique) + } + ' 2>/dev/null || true +} + # Read source_info.env if it exists (written by bk_fetch_source in build stage) source_info_block="null" if [ -f results/source_info.env ]; then @@ -99,6 +149,16 @@ write_result_json() { \"pipeline_id\": $pipeline_id" fi + # Attach the profiler summary that matches this FOM index. fapp exposes + # counter events, while ncu exposes the Nsight Compute option preset. + local profile_data_block="" + local profile_data_summary="" + profile_data_summary=$(build_profile_data_summary "results/padata${idx}.tgz") + if [ -n "$profile_data_summary" ]; then + profile_data_block=", + \"profile_data\": ${profile_data_summary}" + fi + # Build fom_breakdown if sections exist if [ -n "$sections_json" ]; then # Validate overlap section names @@ -139,7 +199,7 @@ write_result_json() { "nthreads": "$nthreads", "description": "$description", "confidential": "$confidential", - "source_info": $source_info_block${fom_breakdown_block}${timing_block}${mode_block}${trigger_block}${build_job_block}${run_job_block}${pipeline_id_block} + "source_info": $source_info_block${profile_data_block}${fom_breakdown_block}${timing_block}${mode_block}${trigger_block}${build_job_block}${run_job_block}${pipeline_id_block} } EOF diff --git a/scripts/result_server/send_results.sh b/scripts/result_server/send_results.sh index 809ca19..dc7d597 100644 --- a/scripts/result_server/send_results.sh +++ b/scripts/result_server/send_results.sh @@ -8,6 +8,10 @@ ls results/ meta_file="results/server_result_meta.json" echo "{}" > "$meta_file" +# Backfill profile_data for older result JSONs that were produced before +# result.sh learned to embed profiler summaries. 
The summary comes from +# bk_profiler_artifact/meta.json inside the matching padata archive; raw +# profiler files stay in the archive and are uploaded separately below. build_profile_data_summary() { local tgz_file="$1" @@ -37,7 +41,18 @@ build_profile_data_summary() { report_format: .report_format, raw_dir: .raw_dir, run_count: ((.runs // []) | length), - events: ((.runs // []) | map(.event) | map(select(. != null and . != ""))), + events: ( + if .tool == "fapp" + then ((.runs // []) | map(.event) | map(select(. != null and . != ""))) + else [] + end + ), + ncu_options: ( + if .tool == "ncu" and ((.measurement.ncu_options // null) | type) == "array" + then .measurement.ncu_options + else [] + end + ), report_kinds: ((.runs // []) | map(.reports // []) | add | map(.kind) | unique) } ' 2>/dev/null || true @@ -47,7 +62,7 @@ build_profile_data_summary() { for json_file in results/result*.json; do [[ ! -f "$json_file" ]] && continue - # Determine corresponding TGZ name + # Match result12.json with padata12.tgz, and result.json with padata.tgz. tgz_base="padata" if [[ "$json_file" =~ result([0-9]+)\.json$ ]]; then diff --git a/scripts/setup_site_runner.sh b/scripts/setup_site_runner.sh new file mode 100755 index 0000000..1f9c99e --- /dev/null +++ b/scripts/setup_site_runner.sh @@ -0,0 +1,504 @@ +#!/usr/bin/env bash +set -euo pipefail + +runner_version="v18.5.0" +go_version="1.25.0" +arch="" +site="" +gitlab_url="" +login_token="" +jacamar_token="" +login_tag="" +jacamar_tag="" +scheduler="pbs" +jacamar_repo="" +base_dir="" +service_host="" +allow_user="${USER:-}" +command_delay="30s" +install_systemd=1 +start_service=1 +libseccomp_mode="auto" +jacamar_pbs_tools="" +unrestricted_cmd_line=false +runner_proxy="" +runner_no_proxy="" + +usage() { + cat <<'EOF' +Usage: + setup_site_runner.sh --site SITE --gitlab-url URL --login-token TOKEN --jacamar-token TOKEN [options] + +Required: + --site SITE Site prefix used for tags if tags are omitted. 
+ --gitlab-url URL GitLab URL shared by both runners. + --login-token TOKEN Runner token for the login/frontend runner. + --jacamar-token TOKEN Runner token for the Jacamar/batch runner. + +Options: + --arch amd64|arm64 Target architecture. Default: auto-detect. + --login-tag TAG Expected login runner tag for display only. + With runner authentication tokens, tags are set on GitLab. + --jacamar-tag TAG Expected Jacamar runner tag for display only. + With runner authentication tokens, tags are set on GitLab. + --scheduler pbs|slurm|pjm + --jacamar-repo URL Jacamar-CI repository. Default: PJM fork for + --scheduler pjm, upstream otherwise. + --base-dir DIR Default: $HOME/gitlab-runner_jacamar-ci_{amd,arm} + --service-host HOST Default: hostname -s. + --allow-user USER Jacamar user_allowlist entry. Default: $USER. + --runner-version VER Default: v18.5.0. + --go-version VER Default: 1.25.0. + --command-delay VALUE Jacamar batch command_delay. Default: 30s. + --jacamar-pbs-tools PATH Copy PATH to jacamar-ci/internal/executors/pbs/tools.go before build. + --unrestricted-cmd-line Allow Jacamar to keep runner generated Git/token commands + on the command line. Useful when GIT_ASKPASS fails. + --proxy URL Set http_proxy/https_proxy for the runner systemd service. + If URL has no scheme, http:// is prepended. + --no-proxy LIST Set no_proxy/NO_PROXY for the runner systemd service. + --libseccomp auto|system|local|none + Default: auto. Use system libseccomp if available, + build local gperf/libseccomp if missing. + --with-libseccomp Alias for --libseccomp local. + --without-libseccomp Alias for --libseccomp none. + --no-systemd Do not create a systemd user service. + --no-start Create and enable service, but do not start it. + -h, --help Show this help. 
+ +Example: + curl -fsSL https://raw.githubusercontent.com/RIKEN-RCCS/benchkit/main/scripts/setup_site_runner.sh \ + | bash -s -- --arch amd64 --site genkai \ + --gitlab-url https://gitlab.example.jp \ + --login-token "$LOGIN_TOKEN" --jacamar-token "$JACAMAR_TOKEN" \ + --scheduler pjm --service-host genkai0001 +EOF +} + +die() { + echo "ERROR: $*" >&2 + exit 1 +} + +info() { + echo "[setup-site-runner] $*" +} + +systemd_env_escape() { + local value="$1" + value="${value//\\/\\\\}" + value="${value//\"/\\\"}" + printf '%s' "$value" +} + +write_systemd_env() { + local unit_path="$1" + local name="$2" + local value="$3" + [[ -n "$value" ]] || return 0 + printf 'Environment="%s=%s"\n' "$name" "$(systemd_env_escape "$value")" >> "$unit_path" +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --arch) arch="${2:-}"; shift 2 ;; + --site) site="${2:-}"; shift 2 ;; + --gitlab-url) gitlab_url="${2:-}"; shift 2 ;; + --login-token) login_token="${2:-}"; shift 2 ;; + --jacamar-token) jacamar_token="${2:-}"; shift 2 ;; + --login-tag) login_tag="${2:-}"; shift 2 ;; + --jacamar-tag) jacamar_tag="${2:-}"; shift 2 ;; + --scheduler) scheduler="${2:-}"; shift 2 ;; + --jacamar-repo) jacamar_repo="${2:-}"; shift 2 ;; + --base-dir) base_dir="${2:-}"; shift 2 ;; + --service-host) service_host="${2:-}"; shift 2 ;; + --allow-user) allow_user="${2:-}"; shift 2 ;; + --runner-version) runner_version="${2:-}"; shift 2 ;; + --go-version) go_version="${2:-}"; shift 2 ;; + --command-delay) command_delay="${2:-}"; shift 2 ;; + --jacamar-pbs-tools) jacamar_pbs_tools="${2:-}"; shift 2 ;; + --unrestricted-cmd-line) unrestricted_cmd_line=true; shift ;; + --proxy) runner_proxy="${2:-}"; shift 2 ;; + --no-proxy) runner_no_proxy="${2:-}"; shift 2 ;; + --libseccomp) libseccomp_mode="${2:-}"; shift 2 ;; + --with-libseccomp) libseccomp_mode="local"; shift ;; + --without-libseccomp) libseccomp_mode="none"; shift ;; + --no-systemd) install_systemd=0; shift ;; + --no-start) start_service=0; shift ;; + -h|--help) 
usage; exit 0 ;; + *) die "Unknown option: $1" ;; + esac +done + +[[ -n "$site" ]] || die "--site is required" +[[ -n "$gitlab_url" ]] || die "--gitlab-url is required" +[[ -n "$login_token" ]] || die "--login-token is required" +[[ -n "$jacamar_token" ]] || die "--jacamar-token is required" +[[ -n "$allow_user" ]] || die "--allow-user is required when USER is empty" + +if [[ -z "$arch" ]]; then + case "$(uname -m)" in + x86_64|amd64) arch="amd64" ;; + aarch64|arm64) arch="arm64" ;; + *) die "Cannot auto-detect arch from uname -m=$(uname -m); pass --arch" ;; + esac +fi + +case "$arch" in + amd64) arch_suffix="amd"; runner_arch="amd64"; go_arch="amd64" ;; + arm64) arch_suffix="arm"; runner_arch="arm64"; go_arch="arm64" ;; + *) die "--arch must be amd64 or arm64" ;; +esac + +case "$scheduler" in + pbs|slurm|pjm) ;; + *) die "--scheduler must be pbs, slurm, or pjm" ;; +esac + +case "$libseccomp_mode" in + auto|system|local|none) ;; + *) die "--libseccomp must be auto, system, local, or none" ;; +esac + +if [[ -n "$runner_proxy" ]]; then + case "$runner_proxy" in + http://*|https://*) ;; + *) runner_proxy="http://${runner_proxy}" ;; + esac +fi + +if [[ -z "$jacamar_repo" ]]; then + if [[ "$scheduler" == "pjm" ]]; then + jacamar_repo="https://gitlab.com/yoshifuminakamura/jacamar-ci.git" + else + jacamar_repo="https://gitlab.com/ecp-ci/jacamar-ci.git" + fi +fi + +if [[ -z "$base_dir" ]]; then + base_dir="${HOME}/gitlab-runner_jacamar-ci_${arch_suffix}" +fi +base_dir="$(mkdir -p "$base_dir" && cd "$base_dir" && pwd)" + +if [[ -z "$service_host" ]]; then + service_host="$(hostname -s)" +fi + +login_tag="${login_tag:-${site}_login}" +jacamar_tag="${jacamar_tag:-${site}_jacamar}" +login_desc="${site}-login" +jacamar_desc="${site}-jacamar" + +for cmd in curl git tar make gcc g++; do + command -v "$cmd" >/dev/null 2>&1 || die "Required command not found: $cmd" +done + +mkdir -p "$base_dir/bin" "$base_dir/builds" "$base_dir/cache" + +runner_bin="${base_dir}/bin/gitlab-runner" 
+jacamar_bin="${base_dir}/bin/jacamar" +runner_url="https://gitlab-runner-downloads.s3.amazonaws.com/${runner_version}/binaries/gitlab-runner-linux-${runner_arch}" + +if [[ ! -x "$runner_bin" ]]; then + info "Downloading GitLab Runner ${runner_version} (${runner_arch})" + curl -fsSL "$runner_url" -o "$runner_bin" + chmod +x "$runner_bin" +else + info "GitLab Runner already exists: $runner_bin" +fi + +work_dir="${base_dir}/_bootstrap" +rm -rf "$work_dir" +mkdir -p "$work_dir" + +install_go() { + local go_pkg="go${go_version}.linux-${go_arch}.tar.gz" + info "Installing Go ${go_version} (${go_arch})" + curl -fsSL "https://go.dev/dl/${go_pkg}" -o "${work_dir}/${go_pkg}" + tar -C "$work_dir" -xzf "${work_dir}/${go_pkg}" + export GOROOT="${work_dir}/go" + export GOBIN="${GOROOT}/bin" + export PATH="${GOBIN}:${PATH}" +} + +build_local_libseccomp() { + local gperf_ver="3.1" + local sec_ver="2.5.5" + local local_prefix="${work_dir}/local" + local gperf_prefix="${local_prefix}/gperf" + local sec_prefix="${local_prefix}/libseccomp" + + info "Building local gperf/libseccomp" + curl -fsSL "https://ftp.gnu.org/gnu/gperf/gperf-${gperf_ver}.tar.gz" -o "${work_dir}/gperf.tar.gz" + tar -C "$work_dir" -xzf "${work_dir}/gperf.tar.gz" + (cd "${work_dir}/gperf-${gperf_ver}" && ./configure --prefix="$gperf_prefix" && make -j"$(nproc)" && make install) + export PATH="${gperf_prefix}/bin:${PATH}" + + curl -fsSL "https://github.com/seccomp/libseccomp/releases/download/v${sec_ver}/libseccomp-${sec_ver}.tar.gz" -o "${work_dir}/libseccomp.tar.gz" + tar -C "$work_dir" -xzf "${work_dir}/libseccomp.tar.gz" + (cd "${work_dir}/libseccomp-${sec_ver}" && ./configure --prefix="$sec_prefix" --disable-shared && make -j"$(nproc)" && make install) + export PKG_CONFIG_PATH="${sec_prefix}/lib/pkgconfig:${PKG_CONFIG_PATH:-}" + export LD_LIBRARY_PATH="${sec_prefix}/lib:${LD_LIBRARY_PATH:-}" + export LIBRARY_PATH="${sec_prefix}/lib:${LIBRARY_PATH:-}" + export CPATH="${sec_prefix}/include:${CPATH:-}" +} + 
+have_system_libseccomp() { + if command -v pkg-config >/dev/null 2>&1 && pkg-config --exists libseccomp; then + return 0 + fi + + local test_c="${work_dir}/check-libseccomp.c" + local test_bin="${work_dir}/check-libseccomp" + cat > "$test_c" <<'EOF' +#include +int main(void) { + return seccomp_api_get() < 0; +} +EOF + gcc "$test_c" -lseccomp -o "$test_bin" >/dev/null 2>&1 +} + +configure_libseccomp() { + case "$libseccomp_mode" in + none) + info "Skipping libseccomp detection/build (--libseccomp none)" + ;; + system) + if have_system_libseccomp; then + info "Using system libseccomp" + else + die "System libseccomp was requested but not found" + fi + ;; + local) + build_local_libseccomp + ;; + auto) + if have_system_libseccomp; then + info "Using system libseccomp" + else + info "System libseccomp not found; building local copy" + build_local_libseccomp + fi + ;; + esac +} + +if [[ ! -x "$jacamar_bin" ]]; then + install_go + configure_libseccomp + + info "Building Jacamar-CI from ${jacamar_repo}" + git clone "$jacamar_repo" "${work_dir}/jacamar-ci" + if [[ -n "$jacamar_pbs_tools" ]]; then + [[ -f "$jacamar_pbs_tools" ]] || die "--jacamar-pbs-tools file not found: $jacamar_pbs_tools" + cp "$jacamar_pbs_tools" "${work_dir}/jacamar-ci/internal/executors/pbs/tools.go" + fi + ( + cd "${work_dir}/jacamar-ci" + export CC=gcc CXX=g++ CGO_ENABLED=1 + make build + make install PREFIX="$base_dir" + ) +else + info "Jacamar already exists: $jacamar_bin" +fi + +rm -rf "$work_dir" + +info "Writing custom executor helper scripts" +cat > "${base_dir}/config.sh" < "${base_dir}/prepare.sh" <<'EOF' +#!/usr/bin/env bash +set -euo pipefail +exit 0 +EOF + +cat > "${base_dir}/run.sh" <<'EOF' +#!/usr/bin/env bash +source ~/.bashrc +set -eo pipefail +exec "$@" +EOF + +cat > "${base_dir}/cleanup.sh" <> "\$LOGFILE" + +BUILD_DIR="\${CUSTOM_UNIQUE_BUILD_DIR:-}" +CACHE_DIR="\${CUSTOM_UNIQUE_CACHE_DIR:-}" + +case "\$BUILD_DIR" in + "\${BASE_DIR}/builds/"*) [[ -d "\$BUILD_DIR" ]] && rm -rf -- 
"\$BUILD_DIR" ;; +esac + +case "\$CACHE_DIR" in + "\${BASE_DIR}/cache/"*) [[ -d "\$CACHE_DIR" ]] && rm -rf -- "\$CACHE_DIR" ;; +esac + +echo "CLEANUP DONE at \$(date)" >> "\$LOGFILE" +EOF + +chmod +x "${base_dir}/config.sh" "${base_dir}/prepare.sh" "${base_dir}/run.sh" "${base_dir}/cleanup.sh" + +info "Writing Jacamar config" +cat > "${base_dir}/custom-config.toml" < "$login_template" < "$jacamar_template" < "$unit_path" <> "$unit_path" </dev/null 2>&1; then + loginctl enable-linger "$allow_user" || true + fi + if [[ "$start_service" -eq 1 ]]; then + systemctl --user restart "$service_name" + systemctl --user --no-pager status "$service_name" || true + fi +fi + +info "Done" +info "Base dir: ${base_dir}" +info "Login tag: ${login_tag}" +info "Jacamar tag: ${jacamar_tag}" +info "Jacamar unrestricted_cmd_line: ${unrestricted_cmd_line}" +if [[ -n "$runner_proxy" ]]; then + info "Runner proxy: ${runner_proxy}" +fi diff --git a/scripts/test_submit.sh b/scripts/test_submit.sh index 7b206c0..ea12bb8 100644 --- a/scripts/test_submit.sh +++ b/scripts/test_submit.sh @@ -28,6 +28,7 @@ fi source ./scripts/job_functions.sh SYSTEM_FILE="config/system.csv" +SYSTEM_INFO_FILE="config/system_info.csv" # --- checking dir and list --- if [ ! 
-d "programs/$code" ]; then @@ -114,6 +115,45 @@ case "$system" in -S -x PJM_LLIO_GFSCACHE=/vol0002:/vol0003:/vol0004:/vol0005 \ script.sh ;; + GenkaiA|GenkaiB|GenkaiC) + proc=$((nodes * numproc_node)) + echo pjsub -L rscgrp=$queue_group,node=$nodes,elapse=$elapse \ + --mpi proc=$proc \ + script.sh + pjsub -L rscgrp=$queue_group,node=$nodes,elapse=$elapse \ + --mpi proc=$proc \ + script.sh + ;; + Grand_C) + cpu_per_node=$(get_system_cpu_per_node "$system") + echo qsub -q $queue_group \ + -l select=${nodes}:nsockets=${cpu_per_node},walltime=${elapse} \ + -W group_list=d30992 script.sh + qsub -q $queue_group \ + -l select=${nodes}:nsockets=${cpu_per_node},walltime=${elapse} \ + -W group_list=d30992 script.sh + ;; + Grand_G) + echo qsub -q $queue_group \ + -l select=${nodes}:ngpus=1,walltime=${elapse} \ + -W group_list=d30992 script.sh + qsub -q $queue_group \ + -l select=${nodes}:ngpus=1,walltime=${elapse} \ + -W group_list=d30992 script.sh + ;; + AOBA_A|AOBA_S) + proc=$((nodes * numproc_node)) + echo qsub -Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q $queue_group -T necmpi --venode $proc \ + -l elapstim_req=$elapse script.sh + qsub -Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q $queue_group -T necmpi --venode $proc \ + -l elapstim_req=$elapse script.sh + ;; + AOBA_B) + echo qsub -Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q $queue_group -T intmpi -b $nodes \ + -l elapstim_req=$elapse script.sh + qsub -Z -v http_proxy,https_proxy,HTTP_PROXY,HTTPS_PROXY -q $queue_group -T intmpi -b $nodes \ + -l elapstim_req=$elapse script.sh + ;; RC_GH200) echo sbatch -p qc-gh200 -N $nodes -t $elapse --ntasks-per-node=${numproc_node} --cpus-per-task=$nthreads \ --wrap="bash programs/$code/run.sh $system $nodes $numproc_node $nthreads" @@ -121,21 +161,20 @@ case "$system" in --wrap="bash programs/${code}/run.sh $system $nodes $numproc_node $nthreads" ;; MiyabiC) - echo qsub -q debug-c -l select=${nodes}:ompthreads=$nthreads -l walltime=${elapse} -W 
group_list=$(groups |awk '{print $2}') \ + echo qsub -q debug-c -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ script.sh - qsub -q debug-c -l select=${nodes}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ + qsub -q debug-c -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ script.sh ;; MiyabiG) - echo qsub -q debug-g -l select=${nodes}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ + echo qsub -q debug-g -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ script.sh - qsub -q debug-g -l select=${nodes}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ + qsub -q debug-g -l select=${nodes}:mpiprocs=${numproc_node}:ompthreads=$nthreads -l walltime=${elapse} -W group_list=$(groups |awk '{print $2}') \ script.sh ;; *) echo "Error: Unknown system '$system'" - echo "Supported systems: Fugaku, FugakuCN, FugakuLN, RC_GH200, MiyabiC, MiyabiG" + echo "Supported systems: Fugaku, FugakuCN, FugakuLN, GenkaiA, GenkaiB, GenkaiC, Grand_C, Grand_G, AOBA_A, AOBA_B, AOBA_S, RC_GH200, MiyabiC, MiyabiG" exit 1 ;; esac - diff --git a/scripts/tests/test_bk_profiler.sh b/scripts/tests/test_bk_profiler.sh index c0835d7..85150b7 100644 --- a/scripts/tests/test_bk_profiler.sh +++ b/scripts/tests/test_bk_profiler.sh @@ -37,6 +37,9 @@ done if [ "$mode" = "-C" ]; then mkdir -p "$dir" printf '%s\n' "$event" > "${dir}/event.txt" + if [ "${FAKE_FAPP_FAIL:-0}" = "1" ]; then + exit 23 + fi exit 0 fi @@ -77,6 +80,54 @@ EOF chmod +x "${FAKE_BIN}/fapp" "${FAKE_BIN}/fapppx" export PATH="${FAKE_BIN}:${PATH}" +cat > "${FAKE_BIN}/ncu" <<'EOF' +#!/bin/bash +set -euo pipefail +outfile="" +import_file="" +import_mode=0 +while [ $# -gt 0 ]; do + case "$1" in 
+ -o|--output) + shift + outfile="$1" + ;; + --import) + shift + import_file="$1" + import_mode=1 + ;; + --page|--target-processes|--launch-count|--set) + shift + ;; + --nvtx) + ;; + --*) + ;; + *) + if [ "$import_mode" -eq 0 ]; then + break + fi + ;; + esac + shift || true +done + +if [ "$import_mode" -eq 1 ]; then + printf 'ncu import:%s\n' "$import_file" + exit 0 +fi + +if [ -n "$outfile" ]; then + mkdir -p "$(dirname "$outfile")" + printf 'ncu report\n' > "${outfile}.ncu-rep" +fi + +"$@" +EOF + +chmod +x "${FAKE_BIN}/ncu" + run_and_check_level() { local level="$1" local expected_last_rep="$2" @@ -112,4 +163,58 @@ run_and_check_level simple 5 pa5 both yes run_and_check_level standard 11 pa11 both yes run_and_check_level detailed 17 pa17 both yes +ncu_archive="${TMP_DIR}/ncu.tgz" +ncu_extract="${TMP_DIR}/ncu_extract" +ncu_raw="${TMP_DIR}/ncu_pa" +bk_profiler ncu --level single --archive "$ncu_archive" --raw-dir "$ncu_raw" -- bash -c 'printf "ncu target\n"' +mkdir -p "$ncu_extract" +tar -xzf "$ncu_archive" -C "$ncu_extract" +test -f "${ncu_extract}/bk_profiler_artifact/meta.json" +test -f "${ncu_extract}/bk_profiler_artifact/raw/rep1/profile.ncu-rep" +test -f "${ncu_extract}/bk_profiler_artifact/reports/ncu_import_rep1.txt" +grep -q '"tool": "ncu"' "${ncu_extract}/bk_profiler_artifact/meta.json" +grep -q '"kind": "ncu_report"' "${ncu_extract}/bk_profiler_artifact/meta.json" +grep -q '"ncu_options": \["--target-processes", "all", "--set", "basic", "--launch-count", "1"\]' "${ncu_extract}/bk_profiler_artifact/meta.json" + +ncu_detailed_archive="${TMP_DIR}/ncu_detailed.tgz" +ncu_detailed_extract="${TMP_DIR}/ncu_detailed_extract" +ncu_detailed_raw="${TMP_DIR}/ncu_detailed_pa" +bk_profiler ncu --level detailed --archive "$ncu_detailed_archive" --raw-dir "$ncu_detailed_raw" -- bash -c 'printf "ncu detailed target\n"' +mkdir -p "$ncu_detailed_extract" +tar -xzf "$ncu_detailed_archive" -C "$ncu_detailed_extract" +grep -q '"ncu_options": \["--target-processes", "all", 
"--set", "full", "--nvtx"\]' "${ncu_detailed_extract}/bk_profiler_artifact/meta.json" + +fapp_fail_archive="${TMP_DIR}/fapp_fail.tgz" +fapp_fail_extract="${TMP_DIR}/fapp_fail_extract" +fapp_fail_raw="${TMP_DIR}/fapp_fail_pa" +export FAKE_FAPP_FAIL=1 +if bk_profiler fapp --level single --archive "$fapp_fail_archive" --raw-dir "$fapp_fail_raw" -- true; then + echo "expected failing fapp target to propagate non-zero status" >&2 + exit 1 +else + fapp_fail_status=$? +fi +unset FAKE_FAPP_FAIL +test "$fapp_fail_status" -eq 23 +mkdir -p "$fapp_fail_extract" +tar -xzf "$fapp_fail_archive" -C "$fapp_fail_extract" +test -f "${fapp_fail_extract}/bk_profiler_artifact/meta.json" +test -f "${fapp_fail_extract}/bk_profiler_artifact/raw/rep1/event.txt" +grep -q '"fapp_events": \["pa1"\]' "${fapp_fail_extract}/bk_profiler_artifact/meta.json" + +ncu_fail_archive="${TMP_DIR}/ncu_fail.tgz" +ncu_fail_extract="${TMP_DIR}/ncu_fail_extract" +ncu_fail_raw="${TMP_DIR}/ncu_fail_pa" +if bk_profiler ncu --level single --archive "$ncu_fail_archive" --raw-dir "$ncu_fail_raw" -- bash -c 'exit 42'; then + echo "expected failing ncu target to propagate non-zero status" >&2 + exit 1 +else + ncu_fail_status=$? +fi +test "$ncu_fail_status" -eq 42 +mkdir -p "$ncu_fail_extract" +tar -xzf "$ncu_fail_archive" -C "$ncu_fail_extract" +test -f "${ncu_fail_extract}/bk_profiler_artifact/meta.json" +test -f "${ncu_fail_extract}/bk_profiler_artifact/raw/rep1/profile.ncu-rep" + echo "bk_profiler tests passed" diff --git a/scripts/tests/test_result_profile_data.sh b/scripts/tests/test_result_profile_data.sh index 1bcd4df..d491c48 100644 --- a/scripts/tests/test_result_profile_data.sh +++ b/scripts/tests/test_result_profile_data.sh @@ -7,7 +7,7 @@ REPO_DIR=$(cd "${SCRIPT_DIR}/../.." 
&& pwd) TMP_DIR=$(mktemp -d) trap 'rm -rf "${TMP_DIR}"' EXIT -mkdir -p "${TMP_DIR}/results" "${TMP_DIR}/bk_profiler_artifact" +mkdir -p "${TMP_DIR}/results" "${TMP_DIR}/bk_profiler_artifact" "${TMP_DIR}/ncu/results" "${TMP_DIR}/ncu/bk_profiler_artifact" if ! command -v jq >/dev/null 2>&1; then echo "jq not found; skipping result profile_data test" @@ -39,18 +39,64 @@ EOF tar -czf "${TMP_DIR}/results/padata0.tgz" -C "${TMP_DIR}" bk_profiler_artifact +cat > "${TMP_DIR}/ncu/results/result" <<'EOF' +FOM:2.345 FOM_version:test Exp:CASE0 node_count:1 numproc_node:8 nthreads:9 +EOF + +cat > "${TMP_DIR}/ncu/bk_profiler_artifact/meta.json" <<'EOF' +{ + "tool": "ncu", + "level": "single", + "report_format": "text", + "raw_dir": "raw", + "measurement": { + "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"] + }, + "runs": [ + { + "name": "rep1", + "event": "single", + "raw_path": "raw/rep1", + "reports": [ + {"kind": "ncu_report", "path": "raw/rep1/profile.ncu-rep"}, + {"kind": "summary_text", "path": "reports/ncu_import_rep1.txt"} + ] + } + ] +} +EOF + +tar -czf "${TMP_DIR}/ncu/results/padata0.tgz" -C "${TMP_DIR}/ncu" bk_profiler_artifact + pushd "${TMP_DIR}" >/dev/null bash "${REPO_DIR}/scripts/result.sh" qws Fugaku cross build run 999 >/dev/null popd >/dev/null +pushd "${TMP_DIR}/ncu" >/dev/null +bash "${REPO_DIR}/scripts/result.sh" genesis RC_GH200 cross build run 999 >/dev/null +popd >/dev/null + RESULT_JSON="${TMP_DIR}/results/result0.json" test -f "${RESULT_JSON}" -grep -q '"profile_data"' "${RESULT_JSON}" -grep -q '"tool": "fapp"' "${RESULT_JSON}" -grep -q '"level": "single"' "${RESULT_JSON}" -grep -q '"report_format": "text"' "${RESULT_JSON}" -grep -q '"run_count": 1' "${RESULT_JSON}" -grep -q '"pa1"' "${RESULT_JSON}" -grep -q '"summary_text"' "${RESULT_JSON}" +jq -e ' + .profile_data.tool == "fapp" and + .profile_data.level == "single" and + .profile_data.report_format == "text" and + .profile_data.run_count == 1 and + 
(.profile_data.events | index("pa1") != null) and + (.profile_data.report_kinds | index("summary_text") != null) +' "${RESULT_JSON}" >/dev/null + +NCU_RESULT_JSON="${TMP_DIR}/ncu/results/result0.json" +test -f "${NCU_RESULT_JSON}" +jq -e ' + .profile_data.tool == "ncu" and + .profile_data.level == "single" and + .profile_data.report_format == "text" and + .profile_data.run_count == 1 and + .profile_data.events == [] and + (.profile_data.ncu_options | index("--target-processes") != null) and + (.profile_data.report_kinds | index("ncu_report") != null) +' "${NCU_RESULT_JSON}" >/dev/null echo "result profile_data test passed" diff --git a/scripts/tests/test_send_results_profile_data.sh b/scripts/tests/test_send_results_profile_data.sh index 7153937..58c157d 100644 --- a/scripts/tests/test_send_results_profile_data.sh +++ b/scripts/tests/test_send_results_profile_data.sh @@ -28,17 +28,21 @@ EOF cat > "${TMP_DIR}/bk_profiler_artifact/meta.json" <<'EOF' { - "tool": "fapp", + "tool": "ncu", "level": "single", "report_format": "text", "raw_dir": "raw", + "measurement": { + "ncu_options": ["--target-processes", "all", "--set", "basic", "--launch-count", "1"] + }, "runs": [ { "name": "rep1", - "event": "pa1", + "event": "single", "raw_path": "raw/rep1", "reports": [ - {"kind": "summary_text", "path": "reports/fapp_A_rep1.txt"} + {"kind": "ncu_report", "path": "raw/rep1/profile.ncu-rep"}, + {"kind": "summary_text", "path": "reports/ncu_import_rep1.txt"} ] } ] @@ -74,10 +78,23 @@ set -euo pipefail exec "${TMP_DIR}/bin/python" "$@" EOF +PYTHON_FOR_FAKE_JQ="${PYTHON_FOR_FAKE_JQ:-}" +if [ -z "$PYTHON_FOR_FAKE_JQ" ]; then + if command -v python3 >/dev/null 2>&1; then + PYTHON_FOR_FAKE_JQ="$(command -v python3)" + elif command -v python >/dev/null 2>&1; then + PYTHON_FOR_FAKE_JQ="$(command -v python)" + else + echo "python3/python not found; skipping send_results profile_data test" + exit 0 + fi +fi +export PYTHON_FOR_FAKE_JQ + cat > "${TMP_DIR}/bin/jq" <<'EOF' #!/bin/bash set -euo 
pipefail -python_exe="/c/Users/yoshi/AppData/Local/Programs/Python/Python312/python.exe" +python_exe="${PYTHON_FOR_FAKE_JQ:?PYTHON_FOR_FAKE_JQ is required}" if [ "$1" = "-c" ]; then shift @@ -97,7 +114,8 @@ if "tool: .tool" in expr and "report_kinds" in expr: "report_format": data.get("report_format"), "raw_dir": data.get("raw_dir"), "run_count": len(data.get("runs", [])), - "events": [run.get("event") for run in data.get("runs", []) if run.get("event")], + "events": [run.get("event") for run in data.get("runs", []) if data.get("tool") == "fapp" and run.get("event")], + "ncu_options": data.get("measurement", {}).get("ncu_options", []) if data.get("tool") == "ncu" else [], "report_kinds": sorted({rep.get("kind") for run in data.get("runs", []) for rep in run.get("reports", []) if rep.get("kind")}), } print(json.dumps(summary)) @@ -186,9 +204,12 @@ bash "${REPO_DIR}/scripts/result_server/send_results.sh" >/dev/null popd >/dev/null grep -q '"profile_data"' "${TMP_DIR}/results/result0.json" -grep -q '"tool": "fapp"' "${TMP_DIR}/results/result0.json" -grep -q '"level": "single"' "${TMP_DIR}/results/result0.json" -grep -q '"run_count": 1' "${TMP_DIR}/results/result0.json" +grep -Eq '"tool":[[:space:]]*"ncu"' "${TMP_DIR}/results/result0.json" +grep -Eq '"level":[[:space:]]*"single"' "${TMP_DIR}/results/result0.json" +grep -Eq '"run_count":[[:space:]]*1' "${TMP_DIR}/results/result0.json" +grep -Eq '"events":[[:space:]]*\[[[:space:]]*\]' "${TMP_DIR}/results/result0.json" +grep -Eq '"ncu_options":[[:space:]]*\[' "${TMP_DIR}/results/result0.json" +grep -Eq '"ncu_report"' "${TMP_DIR}/results/result0.json" grep -q '"_server_uuid": "11111111-2222-3333-4444-555555555555"' "${TMP_DIR}/results/result0.json" grep -q '"result0.json"' "${TMP_DIR}/results/server_result_meta.json"