Skip to content

Commit

Permalink
Clean up YAML after job completion, add example configs for dry_run a…
Browse files Browse the repository at this point in the history
…nd cifar benchmarks
  • Loading branch information
IKACE committed Oct 7, 2022
1 parent 9833fb4 commit f0927d8
Show file tree
Hide file tree
Showing 7 changed files with 118 additions and 15 deletions.
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Configuration file of FAR training experiment using Aggregator & Executor containers
# Configuration file of FAR training experiment using Aggregator & Executor containers and docker for container deployment

# ========== Container configuration ==========
# whether to use container deployment
use_container: True
use_container: docker

# containers need port-mapping to communicate with host machine
# E.g., 1 aggregator and 2 executor, ports: [Aggr, Exec1, Exec2]
Expand Down Expand Up @@ -47,15 +47,15 @@ setup_commands:

# We use fixed paths in job_conf as they will be accessed inside containers
job_conf:
- job_name: cifar_ctnr # Generate logs under this folder: log_path/job_name/time_stamp
- job_name: cifar_docker # Generate logs under this folder: log_path/job_name/time_stamp
- log_path: /FedScale/benchmark # Path of log files
- num_participants: 4 # Number of participants per round, we use K=100 in our paper, large K will be much slower
- data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow
- data_dir: /FedScale/benchmark/dataset/data/ # Path of the dataset
- model: shufflenet_v2_x2_0 # NOTE: Please refer to our model zoo README and use models suited to small image inputs (e.g., 32x32x3)
# - model_zoo: fedscale-zoo # The default zoo (torchcv) uses the pytorchvision zoo, which cannot handle small images well
- eval_interval: 10 # How many rounds to run a testing on the testing set
- rounds: 20 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
- rounds: 21 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
- filter_less: 0 # Remove clients with fewer samples than this threshold (0 disables filtering)
- num_loaders: 2
- local_steps: 20
Expand Down
49 changes: 49 additions & 0 deletions benchmark/configs/cifar_cpu/cifar_cpu_k8s.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Configuration file of FAR training experiment using Aggregator & Executor containers and k8s for container deployment

# ========== Container configuration ==========
# whether to use container deployment
use_container: k8s

# containers need a data-path mount to facilitate dataset reuse
# We assume the same data-path is used on all host machines
data_path: $FEDSCALE_HOME/benchmark

# ========== Cluster configuration ==========
# k8s-specific
# number of aggregators, right now we only support a single aggregator
# placeholder for supporting hierarchical aggregator in the future
num_aggregators: 1

# k8s-specific
# number of executors
num_executors: 2

auth:
ssh_user: ""
ssh_private_key: ~/.ssh/id_rsa

# Commands to run (in order) before launching FAR
setup_commands:


# ========== Additional job configuration ==========
# Default parameters are specified in config_parser.py, where more detailed descriptions of each parameter can be found

# We use fixed paths in job_conf as they will be accessed inside containers
job_conf:
- job_name: cifar_k8s # Generate logs under this folder: log_path/job_name/time_stamp
- log_path: /FedScale/benchmark # Path of log files
- num_participants: 4 # Number of participants per round, we use K=100 in our paper, large K will be much slower
- data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow
- data_dir: /FedScale/benchmark/dataset/data/ # Path of the dataset
- model: shufflenet_v2_x2_0 # NOTE: Please refer to our model zoo README and use models suited to small image inputs (e.g., 32x32x3)
# - model_zoo: fedscale-zoo # The default zoo (torchcv) uses the pytorchvision zoo, which cannot handle small images well
- eval_interval: 10 # How many rounds to run a testing on the testing set
- rounds: 21 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
- filter_less: 0 # Remove clients with fewer samples than this threshold (0 disables filtering)
- num_loaders: 2
- local_steps: 20
- learning_rate: 0.05
- batch_size: 32
- test_bsz: 32
- use_cuda: False
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Configuration file of dry run experiment using Aggregator & Executor containers
# Configuration file of dry run experiment using Aggregator & Executor containers and docker for container deployment

# ========== Container configuration ==========
# whether to use container deployment
use_container: True
use_container: docker

# containers need port-mapping to communicate with host machine
# E.g., 1 aggregator and 2 executor, ports: [Aggr, Exec1, Exec2]
Expand Down Expand Up @@ -48,7 +48,7 @@ setup_commands:

# We use fixed paths in job_conf as they will be accessed inside containers
job_conf:
- job_name: dryrun_ctnr # Generate logs under this folder: log_path/job_name/time_stamp
- job_name: dryrun_docker # Generate logs under this folder: log_path/job_name/time_stamp
- log_path: /FedScale/benchmark # Path of log files
- num_participants: 4 # Number of participants per round, we use K=100 in our paper, large K will be much slower
- data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow
Expand Down
48 changes: 48 additions & 0 deletions benchmark/configs/dry_run/dry_run_k8s.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Configuration file of dry run experiment using Aggregator & Executor containers and k8s for container deployment

# ========== Container configuration ==========
# whether to use container deployment
use_container: k8s

# containers need a data-path mount to facilitate dataset reuse
# We assume the same data-path is used on all host machines
data_path: $FEDSCALE_HOME/benchmark

# ========== Cluster configuration ==========
# k8s-specific
# number of aggregators, right now we only support a single aggregator
# placeholder for supporting hierarchical aggregator in the future
num_aggregators: 1

# k8s-specific
# number of executors
num_executors: 2

auth:
ssh_user: ""
ssh_private_key: ~/.ssh/id_rsa

# Commands to run (in order) before launching FAR
setup_commands:


# ========== Additional job configuration ==========
# Default parameters are specified in config_parser.py, where more detailed descriptions of each parameter can be found

# We use fixed paths in job_conf as they will be accessed inside containers
job_conf:
- job_name: dryrun_k8s # Generate logs under this folder: log_path/job_name/time_stamp
- log_path: /FedScale/benchmark # Path of log files
- num_participants: 4 # Number of participants per round, we use K=100 in our paper, large K will be much slower
- data_set: cifar10 # Dataset: openImg, google_speech, stackoverflow
- data_dir: /FedScale/benchmark/dataset/data/ # Path of the dataset
- model: resnet18 # Models: e.g., shufflenet_v2_x2_0, mobilenet_v2, resnet34, albert-base-v2
# - gradient_policy: yogi # {"fed-yogi", "fed-prox", "fed-avg"}, "fed-avg" by default
- eval_interval: 10 # How many rounds to run a testing on the testing set
- rounds: 21 # Number of rounds to run this training. We use 1000 in our paper, while it may converge w/ ~400 rounds
- filter_less: 0 # Remove clients with fewer samples than this threshold (0 disables filtering)
- num_loaders: 2
- local_steps: 20
- learning_rate: 0.001
- batch_size: 32
- test_bsz: 32
- use_cuda: False
2 changes: 1 addition & 1 deletion benchmark/configs/femnist/conf_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ setup_commands:

# We use fixed paths in job_conf as they will be accessed inside containers
job_conf:
- job_name: femnist_ctnr # Generate logs under this folder: log_path/job_name/time_stamp
- job_name: femnist_docker # Generate logs under this folder: log_path/job_name/time_stamp
- log_path: /FedScale/benchmark # Path of log files
- num_participants: 50 # Number of participants per round, we use K=100 in our paper, large K will be much slower
- data_set: femnist # Dataset: openImg, google_speech, stackoverflow
Expand Down
6 changes: 3 additions & 3 deletions benchmark/configs/femnist/conf_k8s.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use_container: k8s

# containers need a data-path mount to facilitate dataset reuse
# We assume the same data-path is used on all host machines
data_path: /users/yilegu/benchmark
data_path: $FEDSCALE_HOME/benchmark

# ========== Cluster configuration ==========
# k8s-specific
Expand All @@ -20,7 +20,7 @@ num_executors: 2


auth:
ssh_user: "yilegu"
ssh_user: ""
ssh_private_key: ~/.ssh/id_rsa

# Commands to run (in order) before launching FAR
Expand All @@ -32,7 +32,7 @@ setup_commands:

# We use fixed paths in job_conf as they will be accessed inside containers
job_conf:
- job_name: femnist_ctnr # Generate logs under this folder: log_path/job_name/time_stamp
- job_name: femnist_k8s # Generate logs under this folder: log_path/job_name/time_stamp
- log_path: /FedScale/benchmark # Path of log files
- num_participants: 5 # Number of participants per round, we use K=100 in our paper, large K will be much slower
- data_set: femnist # Dataset: openImg, google_speech, stackoverflow
Expand Down
14 changes: 10 additions & 4 deletions docker/driver.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,9 @@ def terminate(job_name):
config.load_kube_config()
core_api = client.CoreV1Api()
for name, meta_dict in job_meta['k8s_dict'].items():
if os.path.exists(meta_dict["yaml_path"]):
os.remove(meta_dict["yaml_path"])

print(f"Shutting down container {name}...")
core_api.delete_namespaced_pod(name, namespace="default")

Expand Down Expand Up @@ -328,12 +331,14 @@ def submit_to_k8s(yaml_conf):
"data_path": yaml_conf["data_path"],
"pod_name": exec_name
}

exec_yaml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{exec_name}.yaml')
generate_exec_template(exec_config, exec_yaml_path)
k8s_dict[exec_name] = {
"type": "executor",
"rank_id": rank_id
"rank_id": rank_id,
"yaml_path": exec_yaml_path
}
exec_yaml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), f'{exec_name}.yaml')
generate_exec_template(exec_config, exec_yaml_path)
print(f'Submitting executor container {exec_name} to k8s...')
# TODO: logging?
utils.create_from_yaml(k8s_client, exec_yaml_path, namespace="default")
Expand All @@ -355,7 +360,8 @@ def submit_to_k8s(yaml_conf):
k8s_dict[aggr_name] = {
"type": "aggregator",
"ip": aggr_ip,
"rank_id": 0
"rank_id": 0,
"yaml_path": aggr_yaml_path
}

# TODO: refactor the code so that docker/k8s version invoke the same init function
Expand Down

0 comments on commit f0927d8

Please sign in to comment.