From 58f1d5bcf0d1145e4af61262e190239c5a2ac7f0 Mon Sep 17 00:00:00 2001
From: Ryan Zotti
Date: Sun, 13 Aug 2017 19:10:01 -0700
Subject: [PATCH] Write speed data to log

---
 AWS_P_Series_GPU_Setup.md | 3 ++-
 README.md                 | 3 ++-
 Trainer.py                | 5 ++++-
 resume_training.py        | 4 +++-
 4 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/AWS_P_Series_GPU_Setup.md b/AWS_P_Series_GPU_Setup.md
index cf5587e8..4aa9bef3 100644
--- a/AWS_P_Series_GPU_Setup.md
+++ b/AWS_P_Series_GPU_Setup.md
@@ -77,4 +77,5 @@ How to run the training code:
     SCRIPT=train_conv_net.py
     nohup python3 ${SCRIPT} --datapath /root/data \
         --epochs 100 \
-        --s3_bucket self-driving-car &
\ No newline at end of file
+        --s3_bucket self-driving-car \
+        --show_speed True &
\ No newline at end of file
diff --git a/README.md b/README.md
index aa9e0be7..f8a18cff 100644
--- a/README.md
+++ b/README.md
@@ -178,7 +178,8 @@ I created a script called `resume_training.py` that is agnostic to the model who
         --datapath $DATA_PATH \
         --epochs $EPOCHS \
         --model_dir $MODEL_DIR \
-        --s3_bucket ${S3_BUCKET} &
+        --s3_bucket ${S3_BUCKET} \
+        --show_speed True &
 
 ## FAQ
 
diff --git a/Trainer.py b/Trainer.py
index 3c2b6361..5a9f8062 100644
--- a/Trainer.py
+++ b/Trainer.py
@@ -41,6 +41,7 @@ def __init__(self,
 
         self.model_dir = mkdir_tfboard_run_dir(self.tfboard_basedir)
         self.results_file = os.path.join(self.model_dir, 'results.txt')
+        self.speed_file = os.path.join(self.model_dir, 'speed.txt')
         self.model_checkpoint_dir = os.path.join(self.model_dir,'checkpoints')
         self.saver = tf.train.Saver()
         self.start_epoch = start_epoch
@@ -137,13 +138,15 @@ def train(self, sess, x, y_, accuracy, train_step, train_feed_dict, test_feed_di
                 # Track speed to better compare GPUs and CPUs
                 now = datetime.now()
                 diff_seconds = (now - prev_time).total_seconds()
-                prev_time = datetime.now()
                 if self.show_speed:
                     message = 'batch {batch_id} of {total_batches}, {seconds} seconds'
                     message = message.format(batch_id=batch_id,
                                              seconds=diff_seconds,
                                              total_batches=dataset.batches_per_epoch)
+                    with open(self.speed_file, 'a') as f:
+                        f.write(message + '\n')
                     print(message)
+                prev_time = datetime.now()
 
                 # TODO: Document and understand what RunOptions does
                 run_opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
diff --git a/resume_training.py b/resume_training.py
index 972510a7..232bba5f 100644
--- a/resume_training.py
+++ b/resume_training.py
@@ -8,6 +8,7 @@
 data_path = args["datapath"]
 epochs = args["epochs"]
 model_dir = args["model_dir"]
+show_speed = args['show_speed']
 s3_bucket = format_s3_bucket(args['s3_bucket'])
 s3_data_dir = format_s3_data_dir(s3_bucket)
 
@@ -50,7 +51,8 @@
                   max_sample_records=1000, # TODO: Get max_sample_records from collections file
                   start_epoch = start_epoch,
                   restored_model=True,
-                  restored_model_dir=model_dir)
+                  restored_model_dir=model_dir,
+                  show_speed=show_speed)
 
 trainer.train(sess=sess, x=x, y_=y_, accuracy=accuracy,
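
Note (not part of the patch): the hunks above only append one timing line per batch to speed.txt in the run's model directory. Below is a minimal sketch of how those per-batch timings could be averaged when comparing GPU and CPU runs; it assumes the 'batch {batch_id} of {total_batches}, {seconds} seconds' line format written by Trainer.py, and the script name and invocation are hypothetical.

    # Hypothetical helper, not part of the patch: summarizes speed.txt,
    # assuming lines like 'batch 12 of 250, 0.84 seconds' as written by Trainer.py
    import re
    import sys

    def summarize_speed_log(path):
        # Pull the per-batch duration (third capture group) out of each log line
        pattern = re.compile(r'batch (\d+) of (\d+), ([\d.]+) seconds')
        timings = []
        with open(path) as f:
            for line in f:
                match = pattern.search(line)
                if match:
                    timings.append(float(match.group(3)))
        if not timings:
            return None
        return sum(timings) / len(timings)

    if __name__ == '__main__':
        # Example (hypothetical): python3 summarize_speed.py /path/to/model_dir/speed.txt
        average = summarize_speed_log(sys.argv[1])
        if average is None:
            print('No batch timings found in ' + sys.argv[1])
        else:
            print('Average seconds per batch: {:.3f}'.format(average))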