Skip to content

Commit

Permalink
Upload log of distributed CI (#4028)
Browse files (browse the repository at this point in the history)
* Upload log of distributed CI

* if always

* run_id

* fix indent

* refine

* refine

* fix

* +r

* a'd

Co-authored-by: Tsai <caishenghang@oneflow.org>
Co-authored-by: oneflow-ci-bot <69100618+oneflow-ci-bot@users.noreply.github.com>
  • Loading branch information
3 people committed Dec 23, 2020
1 parent e099978 commit 7e9731f
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 8 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,14 @@ jobs:
--build_docker_img \
--oneflow_wheel_path=${wheelhouse_dir} \
--oneflow_worker_bin=${bin_dir}/oneflow_worker
- name: Upload log (distributed test)
if: always()
uses: ./.github/actions/upload_oss
with:
src_path: oneflow_temp
oss_dst_path: oss://oneflow-log/pr/${{ github.event.pull_request.number }}/$(date '+%Y.%m.%d-%H.%M.%S')-${{github.run_id}}/oneflow_temp
oss_access_key_id: ${{ secrets.OSS_ACCESS_KEY_ID }}
oss_access_key_secret: ${{ secrets.OSS_ACCESS_KEY_SECRET }}
- name: (CUDA) Op test
run: |
docker run --shm-size=8g --rm -w $PWD -v $PWD:$PWD -v /dataset:/dataset -v /model_zoo:/model_zoo -v ${wheelhouse_dir}:${wheelhouse_dir} --env ONEFLOW_WHEEL_PATH=${wheelhouse_dir} \
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ compile_commands.json
/oneflow/python/test/ops/localhost_script_*.sh
.cache
/oneflow-src.zip
/oneflow_temp
41 changes: 33 additions & 8 deletions ci/test/distributed_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def launch_remote_container(
docker_cmd = f"""docker run --privileged --cidfile {workspace_dir}/worker.cid --network host --shm-size=8g --rm -v {workspace_dir}/dotssh:/root/.ssh -v {workspace_dir}:{workspace_dir} -w {workspace_dir} -v /dataset:/dataset -v /model_zoo:/model_zoo oneflow-test:$USER bash launch_ssh_server.sh
"""
ssh_cmd = f"ssh {hostname} {docker_cmd}"
print(ssh_cmd)
print(ssh_cmd, flush=True)
proc = subprocess.Popen(ssh_cmd, shell=True,)
try:
proc.wait(timeout=10)
Expand All @@ -135,7 +135,8 @@ def launch_remote_container(
survival_time_min = survival_time / 60
survival_time_min = int(survival_time_min)
print(
f"remote container launched, host: {hostname}, ssh port: {docker_ssh_port}, .ssh dir: {dotssh_dir}, survival: {survival_time_min} mins"
f"remote container launched, host: {hostname}, ssh port: {docker_ssh_port}, .ssh dir: {dotssh_dir}, survival: {survival_time_min} mins",
flush=True,
)


Expand Down Expand Up @@ -173,13 +174,37 @@ def run_bash_script(
{FIX_SSH_PERMISSION}
bash {bash_script}
"""
with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f:
artifact_cmd = f"""set -ex
{exports}
rm -rf ~/.ssh
cp -r /dotssh ~/.ssh
{FIX_SSH_PERMISSION}
mkdir -p oneflow_temp
rm -rf oneflow_temp/{remote_host}
scp -P {ssh_port} -r {remote_host}:~/oneflow_temp oneflow_temp/{remote_host}
rm -f oneflow_temp/{remote_host}/*/oneflow_worker
chmod -R o+w oneflow_temp
chmod -R o+r oneflow_temp
"""
returncode = None

def get_docker_cmd(f, cmd):
f_name = f.name
f.write(bash_cmd)
print(cmd, flush=True)
f.write(cmd)
f.flush()
docker_cmd = f"docker run --privileged --network host --shm-size=8g --rm -v /tmp:/host/tmp -v $PWD:$PWD -v $HOME:$HOME -w $PWD -v {dotssh_dir}:/dotssh -v /dataset:/dataset -v /model_zoo:/model_zoo oneflow-test:$USER bash /host{f_name}"
print(docker_cmd)
subprocess.check_call(docker_cmd, shell=True, timeout=timeout)
return f"docker run --privileged --network host --shm-size=8g --rm -v /tmp:/host/tmp -v $PWD:$PWD -v $HOME:$HOME -w $PWD -v {dotssh_dir}:/dotssh -v /dataset:/dataset -v /model_zoo:/model_zoo oneflow-test:$USER bash /host{f_name}"

with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f:
run_docker_cmd = get_docker_cmd(f, bash_cmd)
returncode = subprocess.call(run_docker_cmd, shell=True, timeout=timeout)

with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f:
artifact_docker_cmd = get_docker_cmd(f, artifact_cmd)
subprocess.check_call(artifact_docker_cmd, shell=True, timeout=timeout)

if returncode != 0:
raise ValueError(run_docker_cmd)


if __name__ == "__main__":
Expand Down Expand Up @@ -234,7 +259,7 @@ def run_bash_script(
remote_host = affiliations[0]
remote_host = socket.gethostbyname(remote_host)

print(f"this_host: {this_host}, remote_host: {remote_host}")
print(f"this_host: {this_host}, remote_host: {remote_host}", flush=True)
workspace_dir = os.path.join(
os.path.expanduser("~"), "distributed_run_workspace", str(uuid.uuid4())
)
Expand Down

0 comments on commit 7e9731f

Please sign in to comment.