Skip to content

Commit

Permalink
Write speed data to log
Browse files Browse the repository at this point in the history
  • Loading branch information
Ryan Zotti authored and Ryan Zotti committed Aug 14, 2017
1 parent a30b1ac commit 58f1d5b
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 4 deletions.
3 changes: 2 additions & 1 deletion AWS_P_Series_GPU_Setup.md
Expand Up @@ -77,4 +77,5 @@ How to run the training code:
SCRIPT=train_conv_net.py
nohup python3 ${SCRIPT} --datapath /root/data \
--epochs 100 \
- --s3_bucket self-driving-car &
+ --s3_bucket self-driving-car \
+ --show_speed True &
3 changes: 2 additions & 1 deletion README.md
Expand Up @@ -178,7 +178,8 @@ I created a script called `resume_training.py` that is agnostic to the model who
--datapath $DATA_PATH \
--epochs $EPOCHS \
--model_dir $MODEL_DIR \
- --s3_bucket ${S3_BUCKET} &
+ --s3_bucket ${S3_BUCKET} \
+ --show_speed True &


## FAQ
Expand Down
5 changes: 4 additions & 1 deletion Trainer.py
Expand Up @@ -41,6 +41,7 @@ def __init__(self,
self.model_dir = mkdir_tfboard_run_dir(self.tfboard_basedir)

self.results_file = os.path.join(self.model_dir, 'results.txt')
+ self.speed_file = os.path.join(self.model_dir, 'speed.txt')
self.model_checkpoint_dir = os.path.join(self.model_dir,'checkpoints')
self.saver = tf.train.Saver()
self.start_epoch = start_epoch
Expand Down Expand Up @@ -137,13 +138,15 @@ def train(self, sess, x, y_, accuracy, train_step, train_feed_dict, test_feed_di
# Track speed to better compare GPUs and CPUs
now = datetime.now()
diff_seconds = (now - prev_time).total_seconds()
prev_time = datetime.now()
if self.show_speed:
message = 'batch {batch_id} of {total_batches}, {seconds} seconds'
message = message.format(batch_id=batch_id,
seconds=diff_seconds,
total_batches=dataset.batches_per_epoch)
+ with open(self.speed_file, 'a') as f:
+ f.write(message + '\n')
print(message)
prev_time = datetime.now()

# TODO: Document and understand what RunOptions does
run_opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
Expand Down
4 changes: 3 additions & 1 deletion resume_training.py
Expand Up @@ -8,6 +8,7 @@
data_path = args["datapath"]
epochs = args["epochs"]
model_dir = args["model_dir"]
+ show_speed = args['show_speed']
s3_bucket = format_s3_bucket(args['s3_bucket'])

s3_data_dir = format_s3_data_dir(s3_bucket)
Expand Down Expand Up @@ -50,7 +51,8 @@
max_sample_records=1000, # TODO: Get max_sample_records from collections file
start_epoch = start_epoch,
restored_model=True,
- restored_model_dir=model_dir)
+ restored_model_dir=model_dir,
+ show_speed=show_speed)

trainer.train(sess=sess, x=x, y_=y_,
accuracy=accuracy,
Expand Down

0 comments on commit 58f1d5b

Please sign in to comment.