Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Upload log of distributed CI #4028

Merged
merged 11 commits into from
Dec 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,14 @@ jobs:
--build_docker_img \
--oneflow_wheel_path=${wheelhouse_dir} \
--oneflow_worker_bin=${bin_dir}/oneflow_worker
+ - name: Upload log (distributed test)
+ if: always()
+ uses: ./.github/actions/upload_oss
+ with:
+ src_path: oneflow_temp
+ oss_dst_path: oss://oneflow-log/pr/${{ github.event.pull_request.number }}/$(date '+%Y.%m.%d-%H.%M.%S')-${{github.run_id}}/oneflow_temp
+ oss_access_key_id: ${{ secrets.OSS_ACCESS_KEY_ID }}
+ oss_access_key_secret: ${{ secrets.OSS_ACCESS_KEY_SECRET }}
- name: (CUDA) Op test
run: |
docker run --shm-size=8g --rm -w $PWD -v $PWD:$PWD -v /dataset:/dataset -v /model_zoo:/model_zoo -v ${wheelhouse_dir}:${wheelhouse_dir} --env ONEFLOW_WHEEL_PATH=${wheelhouse_dir} \
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ compile_commands.json
/oneflow/python/test/ops/localhost_script_*.sh
.cache
/oneflow-src.zip
+ /oneflow_temp
41 changes: 33 additions & 8 deletions ci/test/distributed_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def launch_remote_container(
docker_cmd = f"""docker run --privileged --cidfile {workspace_dir}/worker.cid --network host --shm-size=8g --rm -v {workspace_dir}/dotssh:/root/.ssh -v {workspace_dir}:{workspace_dir} -w {workspace_dir} -v /dataset:/dataset -v /model_zoo:/model_zoo oneflow-test:$USER bash launch_ssh_server.sh
"""
ssh_cmd = f"ssh {hostname} {docker_cmd}"
- print(ssh_cmd)
+ print(ssh_cmd, flush=True)
proc = subprocess.Popen(ssh_cmd, shell=True,)
try:
proc.wait(timeout=10)
Expand All @@ -135,7 +135,8 @@ def launch_remote_container(
survival_time_min = survival_time / 60
survival_time_min = int(survival_time_min)
print(
- f"remote container launched, host: {hostname}, ssh port: {docker_ssh_port}, .ssh dir: {dotssh_dir}, survival: {survival_time_min} mins"
+ f"remote container launched, host: {hostname}, ssh port: {docker_ssh_port}, .ssh dir: {dotssh_dir}, survival: {survival_time_min} mins",
+ flush=True,
)


Expand Down Expand Up @@ -173,13 +174,37 @@ def run_bash_script(
{FIX_SSH_PERMISSION}
bash {bash_script}
"""
- with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f:
+ artifact_cmd = f"""set -ex
+ {exports}
+ rm -rf ~/.ssh
+ cp -r /dotssh ~/.ssh
+ {FIX_SSH_PERMISSION}
+ mkdir -p oneflow_temp
+ rm -rf oneflow_temp/{remote_host}
+ scp -P {ssh_port} -r {remote_host}:~/oneflow_temp oneflow_temp/{remote_host}
+ rm -f oneflow_temp/{remote_host}/*/oneflow_worker
+ chmod -R o+w oneflow_temp
+ chmod -R o+r oneflow_temp
+ """
+ returncode = None
+
+ def get_docker_cmd(f, cmd):
f_name = f.name
- f.write(bash_cmd)
+ print(cmd, flush=True)
+ f.write(cmd)
f.flush()
- docker_cmd = f"docker run --privileged --network host --shm-size=8g --rm -v /tmp:/host/tmp -v $PWD:$PWD -v $HOME:$HOME -w $PWD -v {dotssh_dir}:/dotssh -v /dataset:/dataset -v /model_zoo:/model_zoo oneflow-test:$USER bash /host{f_name}"
- print(docker_cmd)
- subprocess.check_call(docker_cmd, shell=True, timeout=timeout)
+ return f"docker run --privileged --network host --shm-size=8g --rm -v /tmp:/host/tmp -v $PWD:$PWD -v $HOME:$HOME -w $PWD -v {dotssh_dir}:/dotssh -v /dataset:/dataset -v /model_zoo:/model_zoo oneflow-test:$USER bash /host{f_name}"
+
+ with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f:
+ run_docker_cmd = get_docker_cmd(f, bash_cmd)
+ returncode = subprocess.call(run_docker_cmd, shell=True, timeout=timeout)
+
+ with tempfile.NamedTemporaryFile(mode="w+", encoding="utf-8") as f:
+ artifact_docker_cmd = get_docker_cmd(f, artifact_cmd)
+ subprocess.check_call(artifact_docker_cmd, shell=True, timeout=timeout)
+
+ if returncode != 0:
+ raise ValueError(run_docker_cmd)


if __name__ == "__main__":
Expand Down Expand Up @@ -234,7 +259,7 @@ def run_bash_script(
remote_host = affiliations[0]
remote_host = socket.gethostbyname(remote_host)

- print(f"this_host: {this_host}, remote_host: {remote_host}")
+ print(f"this_host: {this_host}, remote_host: {remote_host}", flush=True)
workspace_dir = os.path.join(
os.path.expanduser("~"), "distributed_run_workspace", str(uuid.uuid4())
)
Expand Down