In [2]:
# imports
import tensorflow as tf
from dataLoader import DataLoader
from modelLoader import ModelLoader
from time import time
from utils import buildRunName
import os

In [3]:
# Debugging info
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Checkpoint experiment
- Train model over 50 epochs
- Save weights every epoch

In [1]:
# Experiment:
#   Train model over 50 epochs
#   Save weights every epoch

#Params
dataDir = "../data/"
imgHeight = 224
imgWidth = 224
batchSizes = [32]

shuffleSeed = 123

transferLearning = False

epochCounts = [50]

currentBatchSize = batchSizes[0]
currentEpochCount = epochCounts[0]


In [8]:
#Constants & Utils
modelName = "MobileNetV1_CheckpointTest"
train_ds, val_ds, _ = DataLoader().loadDatasets(dataDir, currentBatchSize)
model = ModelLoader().loadMobileNetV1(train_ds, True, True)

Found 15561 files belonging to 2 classes.


In [10]:
def trainModel():
    print(f'-------- Now Training: {buildRunName(modelName, transferLearning, currentEpochCount, currentBatchSize)} --------')
    #checkpoint_baseDir = "../models/checkpoints/" + buildRunName(modelName, transferLearning, currentEpochCount, currentBatchSize)
    #os.makedirs(checkpoint_baseDir)
    #checkpoint_filepath = checkpoint_baseDir + "/{epoch:02d}.hdf5"
    train_ds, val_ds, _ = DataLoader().loadDatasets(dataDir, currentBatchSize)
    start_time = time()
    model = ModelLoader().loadMobileNetV1(train_ds, transferLearning, not transferLearning)
    
    log_dir = "../logs/fit/" + buildRunName(modelName, transferLearning, currentEpochCount, currentBatchSize)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    
    
    #model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    #    filepath=checkpoint_filepath,
    #    save_weights_only=True)
    
    model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy', tf.keras.metrics.FalseNegatives(), tf.keras.metrics.FalsePositives()])
    model.fit(train_ds,
                    epochs=currentEpochCount,
                    validation_data=val_ds,
                    callbacks=[tensorboard_callback])
    model.save("../models/" + buildRunName(modelName, transferLearning, currentEpochCount, currentBatchSize))
    end_time = time()
    
    f = open("../logs/RunTimer_CheckpointTest.txt", "a")
    f.write(f'{end_time - start_time};{buildRunName(modelName, transferLearning, currentEpochCount, currentBatchSize)}\n')
    f.close()

In [10]:
for batchSize in batchSizes:
    currentBatchSize = batchSize
    for epochCount in epochCounts:
        currentEpochCount = epochCount
        trainModel()

-------- Now Training: MobileNetV1_CheckpointTest_scratch_epochs-50_batch-32 --------
Found 15561 files belonging to 2 classes.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50




INFO:tensorflow:Assets written to: ../models/MobileNetV1_CheckpointTest_scratch_epochs-50_batch-32\assets


INFO:tensorflow:Assets written to: ../models/MobileNetV1_CheckpointTest_scratch_epochs-50_batch-32\assets


## Experiment different batch sizes
- Train the model with different batch sizes
- batch sizes: `[8,16,32,64,128,256]`

In [4]:
#Params
dataDir = "../data/"
imgHeight = 224
imgWidth = 224
batchSizes = [8,16,32,64,128,256]

shuffleSeed = 123

transferLearning = False

epochCounts = [10]

currentBatchSize = batchSizes[0]
currentEpochCount = epochCounts[0]

modelName = "MobileNetV1_BatchTest"

In [11]:
for batchSize in batchSizes:
    currentBatchSize = batchSize
    for epochCount in epochCounts:
        currentEpochCount = epochCount
        trainModel()

-------- Now Training: MobileNetV1_BatchTest_scratch_epochs-10_batch-8 --------
Found 15561 files belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-8\assets


INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-8\assets


-------- Now Training: MobileNetV1_BatchTest_scratch_epochs-10_batch-16 --------
Found 15561 files belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-16\assets


INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-16\assets


-------- Now Training: MobileNetV1_BatchTest_scratch_epochs-10_batch-32 --------
Found 15561 files belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-32\assets


INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-32\assets


-------- Now Training: MobileNetV1_BatchTest_scratch_epochs-10_batch-64 --------
Found 15561 files belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-64\assets


INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-64\assets


-------- Now Training: MobileNetV1_BatchTest_scratch_epochs-10_batch-128 --------
Found 15561 files belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-128\assets


INFO:tensorflow:Assets written to: ../models/MobileNetV1_BatchTest_scratch_epochs-10_batch-128\assets


-------- Now Training: MobileNetV1_BatchTest_scratch_epochs-10_batch-256 --------
Found 15561 files belonging to 2 classes.
Epoch 1/10


ResourceExhaustedError: Graph execution error:

Detected at node 'model_5/mobilenet_1.00_224/conv_pw_3/Conv2D' defined at (most recent call last):
    File "C:\Users\nikla\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Users\nikla\AppData\Local\Programs\Python\Python39\lib\runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\traitlets\config\application.py", line 982, in launch_instance
      app.start()
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\ipykernel\kernelapp.py", line 712, in start
      self.io_loop.start()
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\tornado\platform\asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "C:\Users\nikla\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 596, in run_forever
      self._run_once()
    File "C:\Users\nikla\AppData\Local\Programs\Python\Python39\lib\asyncio\base_events.py", line 1890, in _run_once
      handle._run()
    File "C:\Users\nikla\AppData\Local\Programs\Python\Python39\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\ipykernel\kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\ipykernel\kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\ipykernel\kernelbase.py", line 406, in dispatch_shell
      await result
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\ipykernel\kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\ipykernel\ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\ipykernel\zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\IPython\core\interactiveshell.py", line 2940, in run_cell
      result = self._run_cell(
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\IPython\core\interactiveshell.py", line 2995, in _run_cell
      return runner(coro)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\IPython\core\interactiveshell.py", line 3194, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\IPython\core\interactiveshell.py", line 3373, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\IPython\core\interactiveshell.py", line 3433, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\nikla\AppData\Local\Temp\ipykernel_12744\3376906410.py", line 5, in <module>
      trainModel()
    File "C:\Users\nikla\AppData\Local\Temp\ipykernel_12744\3526589008.py", line 21, in trainModel
      model.fit(train_ds,
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\training.py", line 1564, in fit
      tmp_logs = self.train_function(iterator)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\training.py", line 1160, in train_function
      return step_function(self, iterator)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\training.py", line 1146, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\training.py", line 1135, in run_step
      outputs = model.train_step(data)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\training.py", line 993, in train_step
      y_pred = self(x, training=True)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\training.py", line 557, in __call__
      return super().__call__(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\functional.py", line 510, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\functional.py", line 667, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\utils\traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\engine\base_layer.py", line 1097, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\utils\traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\layers\convolutional\base_conv.py", line 283, in call
      outputs = self.convolution_op(inputs, self.kernel)
    File "l:\Pogrammier Projekte\FHBielefeld\Master\ComputerVision\.env\lib\site-packages\keras\layers\convolutional\base_conv.py", line 255, in convolution_op
      return tf.nn.convolution(
Node: 'model_5/mobilenet_1.00_224/conv_pw_3/Conv2D'
OOM when allocating tensor with shape[256,128,56,56] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model_5/mobilenet_1.00_224/conv_pw_3/Conv2D}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_1142052]

## Transfer Learning
- Model loaded with imageNet weights
- base model frozen
- only GlobalAveragePooling2D and Dense trainable

In [13]:
#Params
dataDir = "../data/"
imgHeight = 224
imgWidth = 224
batchSizes = [32]

shuffleSeed = 123

transferLearning = True

epochCounts = [5,10]

currentBatchSize = batchSizes[0]
currentEpochCount = epochCounts[0]

modelName = "MobileNetV1_TL"

In [14]:
for batchSize in batchSizes:
    currentBatchSize = batchSize
    for epochCount in epochCounts:
        currentEpochCount = epochCount
        trainModel()

-------- Now Training: MobileNetV1_TL_transfer_epochs-5_batch-32 --------
Found 15561 files belonging to 2 classes.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet/mobilenet_1_0_224_tf_no_top.h5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




INFO:tensorflow:Assets written to: ../models/MobileNetV1_TL_transfer_epochs-5_batch-32\assets


INFO:tensorflow:Assets written to: ../models/MobileNetV1_TL_transfer_epochs-5_batch-32\assets


-------- Now Training: MobileNetV1_TL_transfer_epochs-10_batch-32 --------
Found 15561 files belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: ../models/MobileNetV1_TL_transfer_epochs-10_batch-32\assets


INFO:tensorflow:Assets written to: ../models/MobileNetV1_TL_transfer_epochs-10_batch-32\assets


-------- Now Training: MobileNetV1_TL_transfer_epochs-15_batch-32 --------
Found 15561 files belonging to 2 classes.
Epoch 1/15
