# Extract and save bottleneck feature vectors with an example model
This notebook is Step 2 of the following pipeline:  
1. Generate Nosaic MNIST (`python make_nmnist.py`).
2. Extract and save features in TFRecords format (save_featureTFR_nmnist.ipynb).
3. Plot SAT curve (plot_SAT_curve.ipynb)

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os, time
import PIL
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from datasets.data_processing import decode_nosaic_mnist, reshape_for_featext,\
    normalize_images_nosaic_mnist, binarize_labels_nosaic_mnist,\
    read_tfrecords_nosaic_mnist, decode_feat
from models.backbones_fe import ResNetModel, get_ressize_dependent_params
from models.losses import get_loss_fe

In [2]:
# Utility functions
def set_gpu_devices(gpu):
    """Make a single GPU visible to TensorFlow and enable memory growth on it.

    Args:
        gpu: An int. Index into the list of physical GPU devices.
    """
    gpus = tf.config.experimental.list_physical_devices('GPU')
    assert len(gpus) > 0, "Not enough GPU hardware devices available"

    # Pick the device once and configure it.
    target = gpus[gpu]
    tf.config.experimental.set_visible_devices(target, 'GPU')
    tf.config.experimental.set_memory_growth(target, True)

    
def np_to_tfr_fe(x, y, writer):
    """Serialize one (data, label) pair and append it to a TFRecord file.

    The writer is not closed here. DO NOT FORGET writer.close() (or use the
    writer as a context manager).

    Args:
        x: data: np.ndarray, dtype=float32. Stored as its raw bytes under the
            key 'video'; shape and dtype are not recorded, so the reader has
            to know them.
        y: label: int, dtype=int64. Stored under the key 'label'.
        writer: tf.io.TFRecordWriter object. Don't forget writer.close()
    """
    def _bytes_feature(value):
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

    def _int64_feature(value):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

    # Make an Example object that has one record data.
    # ndarray.tobytes() yields the same bytes as the deprecated tostring()
    # alias, so files written here stay readable by existing decoders.
    example = tf.train.Example(features=tf.train.Features(feature={
        'video': _bytes_feature(x.tobytes()),
        'label': _int64_feature(y)
        }))

    # Serialize the example object and make a TFRecord file
    writer.write(example.SerializeToString())

    
def _read_tfrecords_nmnist(record_file_train, record_file_test, batch_size):
    """Build batched, parsed datasets from the train and test TFRecord files.

    Each record is parsed into a dict with a scalar string feature 'video'
    and a scalar int64 feature 'label'. The last batch is kept even if it is
    smaller than batch_size.

    Args:
        record_file_train: Path to the training TFRecord file.
        record_file_test: Path to the test TFRecord file.
        batch_size: An int. Batch size used for both datasets.
    Returns:
        A tuple (train dataset, test dataset).
    """
    feature_spec = {
        'video': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }

    def _parse(serialized_example):
        return tf.io.parse_single_example(serialized_example, feature_spec)

    def _load(record_file):
        # read -> parse each record -> batch (remainder kept)
        records = tf.data.TFRecordDataset(record_file)
        parsed = records.map(_parse)
        return parsed.batch(batch_size, drop_remainder=False)

    return _load(record_file_train), _load(record_file_test)


def _checkpoint_logger(model, flag_resume, root_ckptlogs,
    subproject_name, exp_phase, comment, time_stamp, path_resume=None, 
    max_to_keep=3, config_path=None):
    """Make ckpt and manager objects, and restore the latest checkpoint if necessary.
    Args:
        model: A tf.keras.Model object.
        flag_resume: A boolean. Whether to resume training from the latest ckpt.
        root_ckptlogs: A string. Used for path to ckpts.
        subproject_name: A string. Used for path to ckpts.
        exp_phase: A string. Used for path to ckpts.
        comment: A string. Used for path to ckpts.
        time_stamp: A string. Used for path to ckpts.
        path_resume: A string or None. The path to ckpt logs to be resumed. 
            path_resume is ignored if flag_resume=False.
        max_to_keep: An int. Set max_to_keep=0 or None to keep all the ckpts.
        config_path: A string. Currently unused by this function.
    Returns:
        ckpt: tf.train.Checkpoint object.
        ckpt_manager: tf.train.CheckpointManager object.
    Raises:
        AssertionError: If flag_resume is True and path_resume is None or
            does not exist.
    Remark:
        Path to checkpoint files is 
            'root_ckptlogs'/'subproject_name'_'exp_phase'/'comment'_'time_stamp'/ckptXXX
    """
    # Naming rule
    dir_ckptlogs = "{}/{}_{}/{}_{}".format(
        root_ckptlogs, subproject_name, exp_phase, comment, time_stamp)

    # Create the directory that new checkpoints are written to. The resume
    # directory is deliberately NOT created here: creating it would hide a
    # mistyped path_resume, and path_resume may legitimately be None.
    if not os.path.exists(dir_ckptlogs):
        os.makedirs(dir_ckptlogs)

    # Create ckpt
    ckpt = tf.train.Checkpoint(net=model)

    # If resume
    if flag_resume:
        assert path_resume is not None and os.path.exists(path_resume), \
            "Not exist: path_ckpt = {}".format(path_resume)

        # Create ckpt and manager for restore
        ckpt_manager_restore = tf.train.CheckpointManager(
            ckpt, path_resume, max_to_keep=max_to_keep)

        # Restore the latest ckpt log.
        ckpt.restore(ckpt_manager_restore.latest_checkpoint)
        print("Restored from {}".format(ckpt_manager_restore.latest_checkpoint))        
    
    # Create manager
    ckpt_manager = tf.train.CheckpointManager(
        ckpt, dir_ckptlogs, max_to_keep=max_to_keep)

    return ckpt, ckpt_manager


def ext_and_save(parsed_image_datasets, record_files, list_numdata):
    """Extract features from each dataset and write them to TFRecord files.

    The three arguments are zipped, so the i-th dataset is written to the
    i-th record file. Reads the module-level names `model`, `duration`,
    `final_size` and `batch_size`, which must be defined before calling.

    Args:
        parsed_image_datasets: Iterable of batched datasets whose elements
            are passed to decode_nosaic_mnist.
        record_files: Iterable of output TFRecord paths, one per dataset.
        list_numdata: Iterable of ints, one per dataset. Only used to print
            the expected number of iterations.
    """
    global_iter = 0
    jobs = zip(parsed_image_datasets, record_files, list_numdata)
    for dataset, out_path, num_data in jobs:
        with tf.io.TFRecordWriter(out_path) as writer:
            for iter_b, feats in enumerate(dataset):
                # 1. Decode features, binarize labels, normalize images.
                x_batch, y_batch = decode_nosaic_mnist(feats)
                y_batch = binarize_labels_nosaic_mnist(y_batch)
                iter_bs = y_batch.shape[0]
                # Per-sequence labels, captured before the reshape below.
                labels_batch = np.int64(y_batch.numpy())

                # (bs*duration, 28,28,1), (bs*duration,)
                x_batch, y_batch = reshape_for_featext(
                    x_batch, y_batch, (28, 28, 1))
                x_batch = normalize_images_nosaic_mnist(x_batch)

                # 2. Extract features (inference only, no weight decay,
                # no gradients).
                _, losses, _, feats_batch = get_loss_fe(
                    model, x_batch, y_batch,
                    training=False,
                    param_wd=None,
                    flag_wd=False,
                    calc_grad=False)

                # Back to (batch, duration, final size) as a float32 array.
                feats_batch = np.float32(
                    tf.reshape(
                        feats_batch, (iter_bs, duration, final_size)).numpy())

                # 3. Save one record per sequence.
                assert len(feats_batch) == len(labels_batch), "{}, {}".format(
                    feats_batch.shape, labels_batch.shape)
                for feat, label in zip(feats_batch, labels_batch):
                    assert (label == 0) or (label == 1)
                    np_to_tfr_fe(x=feat, y=label, writer=writer)

                global_iter += 1

                # 4. Verbose: report every 10th batch only.
                if (iter_b + 1) % 10 != 0:
                    continue

                # Number of batches, rounded up when the last one is partial.
                total_iters = num_data // batch_size
                if num_data % batch_size != 0:
                    total_iters += 1

                print("")
                print("Global Iter={:7d} Iter={:5d}/{:5d} xent loss={:7.5f}: writing {}"
                    .format(
                        global_iter,
                        iter_b + 1,
                        total_iters,
                        losses[1],
                        out_path))
                print(feat.shape)
                print(labels_batch)
    print("Done")

# User Defined Parameters

In [3]:
# User defined
tfr_train = './data-directory/nosaic_mnist_train.tfrecords' # NMNIST data
tfr_test = './data-directory/nosaic_mnist_test.tfrecords' # NMNIST data
tfr_feat_train = './data-directory/nosaic_mnist_feat_train.tfrecords' # extracted features to be saved here
tfr_feat_test = './data-directory/nosaic_mnist_feat_test.tfrecords' # extracted features to be saved here
batch_size = 50 # 64
gpu = 0 # GPU number
# Refuse to run if an output file is already there, so that previously
# extracted features are never silently overwritten.
assert not os.path.exists(tfr_feat_train), tfr_feat_train + " exists. Remove or rename."
assert not os.path.exists(tfr_feat_test), tfr_feat_test + " exists. Remove or rename."

# Start Extraction

In [4]:
# Fixed parameters
# -- Feature extractor: values passed to get_ressize_dependent_params /
#    ResNetModel, and the checkpoint directory to restore from.
resnet_version = 1
resnet_size = 110
nb_cls = 2
final_size = 128
path_resume = "./example_ckpts/feature_extractor/ResNetv1"

# -- Second axis of the (batch, duration, final_size) reshape in ext_and_save.
duration = 20

# -- Pieces of the checkpoint-log directory name built by _checkpoint_logger.
root_ckptlogs = "./tmp"
subproject_name = exp_phase = comment = "_"

In [5]:
# Make only the GPU with index `gpu` visible to TensorFlow and enable
# memory growth on it (see set_gpu_devices).
set_gpu_devices(gpu) # GPU number

In [6]:
# Read Nosaic MNIST
# Make sure to run ./make_nmnist.py in advance
parsed_image_dataset_train, parsed_image_dataset_test = _read_tfrecords_nmnist(
    tfr_train, tfr_test, batch_size)

# Show both dataset signatures: train first, then test, one per line.
print(parsed_image_dataset_train, parsed_image_dataset_test, sep="\n")

<BatchDataset shapes: {label: (None,), video: (None,)}, types: {label: tf.int64, video: tf.string}>
<BatchDataset shapes: {label: (None,), video: (None,)}, types: {label: tf.int64, video: tf.string}>


In [7]:
# Load model
dict_resparams = get_ressize_dependent_params(resnet_version, resnet_size)

model = ResNetModel(
    # Set directly in this notebook
    resnet_size=resnet_size,
    resnet_version=resnet_version,
    num_classes=nb_cls,
    final_size=final_size,
    data_format='channels_last',
    dtype=tf.float32,
    # Looked up from the ResNet size/version
    bottleneck=dict_resparams["bottleneck"],
    kernel_size=dict_resparams["kernel_size"],
    conv_stride=dict_resparams["conv_stride"],
    first_pool_size=dict_resparams["first_pool_size"],
    first_pool_stride=dict_resparams["first_pool_stride"],
    block_sizes=dict_resparams["block_sizes"],
    block_strides=dict_resparams["block_strides"]
)

# Checkpoint: restore the latest checkpoint found under path_resume.
now = "0"
_, ckpt_manager = _checkpoint_logger(
    model=model,
    flag_resume=True,
    root_ckptlogs=root_ckptlogs,
    subproject_name=subproject_name,
    exp_phase=exp_phase,
    comment=comment,
    time_stamp=now,
    path_resume=path_resume)

Restored from ./example_ckpts/feature_extractor/ResNetv1/ckpt_step6400_mbac0.98195-17


test data

In [8]:
# Feature Extraction and Save TFR
# ext_and_save zips its three arguments, so each is a one-element list here:
# the test dataset, the output path, and the expected number of records.
parsed_image_datasets = [parsed_image_dataset_test]
record_files = [tfr_feat_test]
list_numdata = [10000] # only used for the progress printout in ext_and_save

ext_and_save(parsed_image_datasets, record_files, list_numdata)


Global Iter=     10 Iter=   10/  200 xent=0.14915: writing ./data-directory/nosaic_mnist_feat_test.tfrecords
(20, 128)
[0 0 1 1 0 0 1 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 1
 0 0 1 0 0 0 0 1 1 0 1 0 0]

Global Iter=     20 Iter=   20/  200 xent=0.08344: writing ./data-directory/nosaic_mnist_feat_test.tfrecords
(20, 128)
[1 0 0 1 1 1 1 1 0 1 1 0 0 0 0 1 0 0 1 0 0 1 1 1 1 1 1 0 1 0 0 0 1 1 1 0 0
 0 1 0 0 1 1 1 1 0 0 0 1 0]

Global Iter=     30 Iter=   30/  200 xent=0.09182: writing ./data-directory/nosaic_mnist_feat_test.tfrecords
(20, 128)
[0 1 0 0 0 1 1 1 0 0 1 1 1 1 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1 0 0 0 1 1 0
 1 0 0 1 1 0 0 1 1 1 1 1 1]

Global Iter=     40 Iter=   40/  200 xent=0.05922: writing ./data-directory/nosaic_mnist_feat_test.tfrecords
(20, 128)
[1 1 1 1 1 1 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0 0
 1 0 0 1 1 0 1 0 0 1 0 1 0]

Global Iter=     50 Iter=   50/  200 xent=0.07530: writing ./data-directory/nosaic_mnist_feat_test.tfrecord

Training data (this takes a while)

In [9]:
# Feature Extraction and Save TFR
# ext_and_save zips its three arguments, so each is a one-element list here:
# the training dataset, the output path, and the expected number of records.
parsed_image_datasets = [parsed_image_dataset_train]
record_files = [tfr_feat_train]
list_numdata = [60000] # = train 50000 + valid 10000

ext_and_save(parsed_image_datasets, record_files, list_numdata)


Global Iter=     10 Iter=   10/ 1200 xent=0.05374: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1
 0 1 0 0 0 0 1 0 0 1 0 0 0]

Global Iter=     20 Iter=   20/ 1200 xent=0.10946: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[0 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 1 0 1 0 1 0 1 1 1 0 1 0 0 1 1 0 1 0 1 0
 1 1 1 1 1 0 0 1 0 0 0 1 0]

Global Iter=     30 Iter=   30/ 1200 xent=0.07291: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[1 1 1 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0
 0 1 0 1 1 1 0 0 1 1 0 0 0]

Global Iter=     40 Iter=   40/ 1200 xent=0.08148: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[1 0 1 0 0 1 1 0 0 1 0 0 1 1 1 1 0 0 0 1 0 1 0 1 0 1 1 1 0 0 1 0 0 0 1 1 1
 1 0 1 1 1 1 1 0 0 0 0 0 0]

Global Iter=     50 Iter=   50/ 1200 xent=0.07375: writing ./data-directory/nosaic_mnist_feat_train.tfr


Global Iter=    380 Iter=  380/ 1200 xent=0.13454: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[0 0 0 1 1 1 0 1 1 1 0 0 1 0 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 1 1 1 0 1 0 0 1
 1 1 0 0 1 1 1 0 0 0 0 1 1]

Global Iter=    390 Iter=  390/ 1200 xent=0.08360: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[0 0 0 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 1 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0
 0 0 0 1 0 1 0 1 1 1 1 1 1]

Global Iter=    400 Iter=  400/ 1200 xent=0.10064: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[0 0 1 1 0 1 1 1 1 0 0 0 1 1 0 1 1 1 0 0 0 1 0 0 1 0 0 1 1 0 1 1 1 0 0 1 0
 1 0 1 0 1 1 1 0 1 1 1 1 1]

Global Iter=    410 Iter=  410/ 1200 xent=0.12797: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[1 1 1 0 0 1 0 0 1 1 1 0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 0
 1 1 0 1 0 1 1 0 1 1 1 1 0]

Global Iter=    420 Iter=  420/ 1200 xent=0.11315: writing ./data-directory/nosaic_mnist_feat_train.tfr


Global Iter=    750 Iter=  750/ 1200 xent=0.07894: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[1 1 0 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 0 1 1
 0 0 0 1 0 1 0 0 1 0 0 0 1]

Global Iter=    760 Iter=  760/ 1200 xent=0.08136: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[0 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 1 1 1 0 1 1 0 0 0 1 0
 1 0 1 0 0 0 0 0 1 0 1 0 0]

Global Iter=    770 Iter=  770/ 1200 xent=0.14751: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[0 1 0 0 1 1 0 0 1 0 1 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 0 1 0 1 1 1 1
 1 1 0 0 1 1 1 0 0 0 1 1 1]

Global Iter=    780 Iter=  780/ 1200 xent=0.07671: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 0 1
 0 1 0 1 1 0 0 1 1 1 1 0 1]

Global Iter=    790 Iter=  790/ 1200 xent=0.07241: writing ./data-directory/nosaic_mnist_feat_train.tfr


Global Iter=   1120 Iter= 1120/ 1200 xent=0.17395: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[1 1 1 0 1 1 0 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 1 1 1 0 1 0 1 0 0 1 0 1 0 1
 0 1 0 0 1 0 1 1 0 0 1 0 0]

Global Iter=   1130 Iter= 1130/ 1200 xent=0.14167: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[0 1 0 1 0 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 0 0 1 0 0 0 1 0 1 0 0 0 0 1 1 0
 0 0 1 0 1 0 0 0 0 1 1 1 0]

Global Iter=   1140 Iter= 1140/ 1200 xent=0.09108: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[0 0 1 0 0 0 0 1 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 1 1
 0 0 1 0 0 0 0 0 0 0 1 0 1]

Global Iter=   1150 Iter= 1150/ 1200 xent=0.12183: writing ./data-directory/nosaic_mnist_feat_train.tfrecords
(20, 128)
[1 1 0 0 1 0 1 0 0 1 0 1 1 0 1 0 1 1 0 1 1 0 0 0 1 1 1 0 1 0 1 1 0 1 1 1 0
 1 1 0 1 0 0 1 0 1 1 1 1 0]

Global Iter=   1160 Iter= 1160/ 1200 xent=0.11861: writing ./data-directory/nosaic_mnist_feat_train.tfr

### <font color=red>Extraction done. See ./data-directory</font>

# Appendix: Read Feature TFR

Runs without error only after saving `tfr_feat_train` and `tfr_feat_test`.

In [10]:
import time

In [11]:
# Read Feature TFR
feat_dim = final_size
bs_tmp = 1
dtype_feat = tf.float32
dtype_label = tf.int32
pid_tr, pid_val, pid_test =\
    read_tfrecords_nosaic_mnist(tfr_feat_train, tfr_feat_test, bs_tmp)

# Show
# Walk through every split once, decoding each batch as a read-back check,
# and report how many batches were seen (bs_tmp = 1, so one batch is one
# datapoint). Order: validation, test, training.
for split_name, show_pid in (
        ("validation", pid_val), ("test", pid_test), ("training", pid_tr)):
    # Reset per split so that an empty split reports 0 instead of reusing
    # the loop index left over from the previous split.
    num_seen = 0
    for i, features in enumerate(show_pid):
        vb, lb = decode_feat(
            features, duration, feat_dim,
            dtype_feat=dtype_feat, dtype_label=dtype_label)
        num_seen = i + 1
        if num_seen % 100 == 0:
            print("iter {}".format(num_seen))
    print("=================================\ntotal num of {} datapoints: ".format(split_name), num_seen)

iter 100
iter 200
iter 300
iter 400
iter 500
iter 600
iter 700
iter 800
iter 900
iter 1000
iter 1100
iter 1200
iter 1300
iter 1400
iter 1500
iter 1600
iter 1700
iter 1800
iter 1900
iter 2000
iter 2100
iter 2200
iter 2300
iter 2400
iter 2500
iter 2600
iter 2700
iter 2800
iter 2900
iter 3000
iter 3100
iter 3200
iter 3300
iter 3400
iter 3500
iter 3600
iter 3700
iter 3800
iter 3900
iter 4000
iter 4100
iter 4200
iter 4300
iter 4400
iter 4500
iter 4600
iter 4700
iter 4800
iter 4900
iter 5000
iter 5100
iter 5200
iter 5300
iter 5400
iter 5500
iter 5600
iter 5700
iter 5800
iter 5900
iter 6000
iter 6100
iter 6200
iter 6300
iter 6400
iter 6500
iter 6600
iter 6700
iter 6800
iter 6900
iter 7000
iter 7100
iter 7200
iter 7300
iter 7400
iter 7500
iter 7600
iter 7700
iter 7800
iter 7900
iter 8000
iter 8100
iter 8200
iter 8300
iter 8400
iter 8500
iter 8600
iter 8700
iter 8800
iter 8900
iter 9000
iter 9100
iter 9200
iter 9300
iter 9400
iter 9500
iter 9600
iter 9700
iter 9800
iter 9900
iter 10000
total nu