In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

class Net(tf.keras.Model):
  """A minimal Keras model consisting of a single 5-unit Dense layer."""

  def __init__(self):
    super(Net, self).__init__()
    # The attribute name `l1` shows up in the checkpoint keys
    # (e.g. 'net/l1/bias'), so it must stay the same across save and restore.
    self.l1 = tf.keras.layers.Dense(units=5)

  def call(self, x):
    """Runs the forward pass by applying the Dense layer to `x`."""
    dense_out = self.l1(x)
    return dense_out

net = Net()  # model instance used by the following cells


In [0]:

# Save a checkpoint
net.save_weights('easy_checkpoint')

# The model state held in tf.Variable objects is what gets saved as a checkpoint;
# the variables are attached to Python objects.


In [0]:

# Define a toy dataset and an optimization step
def toy_dataset():
  """Builds a small batched tf.data.Dataset of 'x'/'y' examples.

  'x' is the column 0..9 and 'y' is `5 * x + [0, 1, 2, 3, 4]`. The ten
  examples are repeated 10 times and grouped into batches of 2.
  """
  features = tf.range(10.)[:, None]
  scaled = features * 5.
  offsets = tf.range(5.)[None, :]
  targets = scaled + offsets
  examples = dict(x=features, y=targets)
  dataset = tf.data.Dataset.from_tensor_slices(examples)
  return dataset.repeat(10).batch(2)

def train_step(net, example, optimizer):
  """Runs one optimization step of `net` on a single batch.

  Args:
    net: callable model exposing `trainable_variables`.
    example: mapping with inputs under 'x' and targets under 'y'.
    optimizer: object whose `apply_gradients` takes (gradient, variable) pairs.

  Returns:
    The mean absolute error of the batch, computed before the update.
  """
  with tf.GradientTape() as tape:
    predictions = net(example['x'])
    abs_error = tf.abs(predictions - example['y'])
    loss = tf.reduce_mean(abs_error)
  # Keep this lookup after the forward pass: presumably the layer only creates
  # its variables on the first call, so reading earlier could miss them.
  trainable_vars = net.trainable_variables
  grads = tape.gradient(loss, trainable_vars)
  grads_and_vars = zip(grads, trainable_vars)
  optimizer.apply_gradients(grads_and_vars)
  return loss


In [0]:
# Create the tf.train.Checkpoint object
opt = tf.keras.optimizers.Adam(0.1)
# One checkpoint tracks a step counter, the optimizer and the model together.
ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=net)
# Checkpoints go under ./tf_ckpts; max_to_keep=3 limits how many are retained.
manager = tf.train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=3)

# The model and optimizer instances are stored in the checkpoint object,
# and a checkpoint is written periodically during training.
def train_and_checkpoint(net, manager, checkpoint=None, optimizer=None):
  """Restores the latest checkpoint (if any), then trains and saves periodically.

  Args:
    net: model to train; passed to `train_step`.
    manager: tf.train.CheckpointManager used to locate the latest checkpoint
      and to write new ones.
    checkpoint: tf.train.Checkpoint to restore into; its `step` variable is
      incremented after every batch. Defaults to the notebook-level `ckpt`,
      which is what this function used before the argument existed.
    optimizer: optimizer passed to `train_step`. Defaults to the
      notebook-level `opt`.
  """
  # Fall back to the notebook globals so existing two-argument calls keep
  # working; pass the objects explicitly to avoid depending on hidden state.
  if checkpoint is None:
    checkpoint = ckpt
  if optimizer is None:
    optimizer = opt

  # restore() loads saved state; latest_checkpoint is falsy when nothing
  # has been saved yet, in which case training starts from scratch.
  latest = manager.latest_checkpoint
  checkpoint.restore(latest)
  if latest:
    print("Restored from {}".format(latest))
  else:
    print("Initializing from scratch.")

  for example in toy_dataset():
    loss = train_step(net, example, optimizer)
    checkpoint.step.assign_add(1)
    step = int(checkpoint.step)
    # Save (and report the loss) every 10th step.
    if step % 10 == 0:
      save_path = manager.save()
      print("Saved checkpoint for step {}: {}".format(step, save_path))
      print("loss {:1.2f}".format(loss.numpy()))

In [6]:
# In the recorded run no checkpoint existed yet, so training starts from scratch.
train_and_checkpoint(net, manager)

Initializing from scratch.
Saved checkpoint for step 10: ./tf_ckpts/ckpt-1
loss 28.54
Saved checkpoint for step 20: ./tf_ckpts/ckpt-2
loss 21.96
Saved checkpoint for step 30: ./tf_ckpts/ckpt-3
loss 15.40
Saved checkpoint for step 40: ./tf_ckpts/ckpt-4
loss 8.96
Saved checkpoint for step 50: ./tf_ckpts/ckpt-5
loss 3.11


In [7]:
# Resume training from where the previous run left off
opt = tf.keras.optimizers.Adam(0.1)
net = Net()  # new model
ckpt = tf.train.Checkpoint(step=tf.Variable(1), optimizer=opt, net=net)
manager = tf.train.CheckpointManager(ckpt, './tf_ckpts', max_to_keep=3) # new manager

# The model and the checkpoint manager are brand new, yet training continues from the next step
train_and_checkpoint(net, manager)

Restored from ./tf_ckpts/ckpt-5
Saved checkpoint for step 60: ./tf_ckpts/ckpt-6
loss 1.37
Saved checkpoint for step 70: ./tf_ckpts/ckpt-7
loss 1.52
Saved checkpoint for step 80: ./tf_ckpts/ckpt-8
loss 0.87
Saved checkpoint for step 90: ./tf_ckpts/ckpt-9
loss 0.86
Saved checkpoint for step 100: ./tf_ckpts/ckpt-10
loss 0.54


In [8]:
print(manager.checkpoints)  # list the checkpoints that remain
# Only three remain because max_to_keep=3

['./tf_ckpts/ckpt-8', './tf_ckpts/ckpt-9', './tf_ckpts/ckpt-10']


In [0]:
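# Names such as ckpt-8 are not files themselves; they are prefixes of the .index / .data-* files.
# They are grouped together by the single `checkpoint` file that CheckpointManager writes.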

In [10]:
ls ./tf_ckpts

checkpoint                   ckpt-8.data-00000-of-00001  ckpt-9.index
ckpt-10.data-00000-of-00001  ckpt-8.index
ckpt-10.index                ckpt-9.data-00000-of-00001


In [11]:
to_restore = tf.Variable(tf.zeros([5]))
print(to_restore.numpy())  # all zeros
fake_layer = tf.train.Checkpoint(bias=to_restore)
fake_net = tf.train.Checkpoint(l1=fake_layer)
new_root = tf.train.Checkpoint(net=fake_net)
status = new_root.restore(tf.train.latest_checkpoint('./tf_ckpts/'))
print(to_restore.numpy())  # restored variable
# Why does this get restored? The chain new_root -> net -> l1 -> bias mirrors the
# saved key 'net/l1/bias', so restore() presumably matches the variable by that path.
# restore() returns a status object whose checks are optional.

[0. 0. 0. 0. 0.]
[2.5321743 2.0682318 3.1031408 3.4294453 5.010107 ]


In [12]:
status.assert_existing_objects_matched()
# Passes only when the objects created so far (layers, variables, ...) match the checkpoint

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc4891cb358>

In [13]:
delayed_restore = tf.Variable(tf.zeros([1, 5]))
print(delayed_restore.numpy())  # not restored yet, so zeros
# Attaching the variable as `kernel` is what triggers the restore (compare the two outputs).
fake_layer.kernel = delayed_restore
print(delayed_restore.numpy())  # restored

[[0. 0. 0. 0. 0.]]
[[4.6286473 4.8262315 4.8013506 4.9458823 4.9298534]]


In [14]:
# Checkpoint keys and variable shapes
tf.train.list_variables(tf.train.latest_checkpoint('./tf_ckpts/'))

[('_CHECKPOINTABLE_OBJECT_GRAPH', []),
 ('net/l1/bias/.ATTRIBUTES/VARIABLE_VALUE', [5]),
 ('net/l1/bias/.OPTIMIZER_SLOT/optimizer/m/.ATTRIBUTES/VARIABLE_VALUE', [5]),
 ('net/l1/bias/.OPTIMIZER_SLOT/optimizer/v/.ATTRIBUTES/VARIABLE_VALUE', [5]),
 ('net/l1/kernel/.ATTRIBUTES/VARIABLE_VALUE', [1, 5]),
 ('net/l1/kernel/.OPTIMIZER_SLOT/optimizer/m/.ATTRIBUTES/VARIABLE_VALUE',
  [1, 5]),
 ('net/l1/kernel/.OPTIMIZER_SLOT/optimizer/v/.ATTRIBUTES/VARIABLE_VALUE',
  [1, 5]),
 ('optimizer/beta_1/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/beta_2/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/decay/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/iter/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('optimizer/learning_rate/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('save_counter/.ATTRIBUTES/VARIABLE_VALUE', []),
 ('step/.ATTRIBUTES/VARIABLE_VALUE', [])]

In [0]:
# Variables held in a list or dict attribute of a Checkpoint are saved as well.
save = tf.train.Checkpoint()
save.listed = [tf.Variable(1.)]
save.listed.append(tf.Variable(2.))
save.mapped = {'one': save.listed[0]}
save.mapped['two'] = save.listed[1]
save_path = save.save('./tf_list_example')

# Restore only the 'two' entry of the dict into a fresh variable.
restore = tf.train.Checkpoint()
v2 = tf.Variable(0.)
assert 0. == v2.numpy()  # not restored yet
restore.mapped = {'two': v2}
restore.restore(save_path)
assert 2. == v2.numpy()

In [16]:
restore.listed = []
print(restore.listed)  # ListWrapper([])
v1 = tf.Variable(0.)
restore.listed.append(v1)  # v1 is restored from the restore() call in the previous cell
assert 1. == v1.numpy()

ListWrapper([])


In [0]:
import tensorflow.compat.v1 as tf_compat
# 이름 기반 체크포인트

In [18]:
def model_fn(features, labels, mode):
  """Estimator model function that trains a `Net` with Adam.

  Args:
    features: mapping with inputs under 'x' and targets under 'y'.
    labels: not used; the targets are read from `features['y']`.
    mode: forwarded unchanged to the returned EstimatorSpec.

  Returns:
    A tf.estimator.EstimatorSpec with the mean-absolute-error loss, a train
    op that applies the gradients and increments the global step, and a
    Scaffold whose saver is the object-based checkpoint.
  """
  model = Net()
  adam = tf.keras.optimizers.Adam(0.1)
  checkpoint = tf.train.Checkpoint(
      step=tf_compat.train.get_global_step(), optimizer=adam, net=model)
  with tf.GradientTape() as tape:
    predictions = model(features['x'])
    loss = tf.reduce_mean(tf.abs(predictions - features['y']))
  trainable_vars = model.trainable_variables
  grads = tape.gradient(loss, trainable_vars)
  apply_op = adam.apply_gradients(zip(grads, trainable_vars))
  increment_op = checkpoint.step.assign_add(1)
  train_op = tf.group(apply_op, increment_op)
  # Tell the Estimator to save the checkpoint in the object-based format.
  scaffold = tf_compat.train.Scaffold(saver=checkpoint)
  return tf.estimator.EstimatorSpec(
      mode, loss=loss, train_op=train_op, scaffold=scaffold)

tf.keras.backend.clear_session()
# Checkpoints written by the Estimator go under ./tf_estimator_example/.
est = tf.estimator.Estimator(model_fn, './tf_estimator_example/')
# toy_dataset is passed as the input function; train for 10 steps.
est.train(toy_dataset, steps=10)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './tf_estimator_example/', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Us

<tensorflow_estimator.python.estimator.estimator.EstimatorV2 at 0x7fc4891c6390>

In [19]:
# Load the checkpoint written by the Estimator into a fresh object-based Checkpoint.
opt = tf.keras.optimizers.Adam(0.1)
net = Net()
ckpt = tf.train.Checkpoint(
  step=tf.Variable(1, dtype=tf.int64), optimizer=opt, net=net)
ckpt.restore(tf.train.latest_checkpoint('./tf_estimator_example/'))
ckpt.step.numpy()  # comes from est.train(..., steps=10)

10