In [1]:
import os
import importlib # since we are actively modifying the underlying modules it is very helpful 
                 # to be able to include changes without restarting the kernel with 
                 # importlib.reload( the_mod )

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import autoencoder as ae
import c10_reader as rdr

In [2]:
# Set random seed (for reproducibility)
# The same value is handed to NumPy's global generator and to TensorFlow's
# graph-level seed, so it must be set before any random op is added to the graph.
seed = 1000
np.random.seed( seed )
tf.set_random_seed( seed )

TODO:
- Save the full model
- Load the model and pass in the placeholder?
- Clean up some of the visualization methods and make it easier to push the data into an eigenspace
- Better autoencoder architecture?
- Better loss functions? (not just L2 for recon...)
- Create new training loop based on the fixed codes
   - Make sure to save the ckpt in a recognizable way

In [3]:
# Mirror the reader module's constants under local names
# (presumably the image dimensions -- confirm against c10_reader).
HEIGHT = rdr.HEIGHT
WIDTH  = rdr.WIDTH
DEPTH  = rdr.DEPTH

# Dataset-level counts taken from the reader module; note the local names add a
# TOT_ prefix to the reader's TRAIN_EX / TEST_EX.
TOT_CLASS_NUM = rdr.TOT_CLASS_NUM
TOT_TRAIN_EX  = rdr.TRAIN_EX
TOT_TEST_EX   = rdr.TEST_EX

In [4]:
# ---- Notebook configuration: everything a reader might want to tune ----

# Width of the encoder output; also part of the checkpoint folder and code-file names.
# code_dim =  32 # fixed for now
code_dim = 128 # fixed for now

# Epoch counts for the two training phases (autoencoder, then losses on the codes).
joint_epochs = 20
code_epochs  = 1

# Concrete batch sizes; the one in use is fed through the `batch_size` placeholder
# whenever a dataset initializer is run.
train_batch_size = 64
valid_batch_size = 64
test_batch_size  = 64
batch_size = tf.placeholder( tf.int64 )
is_train   = tf.placeholder( tf.bool )

train_on_full = False # True # train on our prebuilt, subset or the full set
train_classes = [ ii for ii in range( 8 ) ] # list of classes to train on
test_classes  = None # [ 3 ] # list of (additional) classes to test on (the out-of-distribution)
                      # None uses all classes

# `not True` is just False, written so the flag can be flipped by deleting `not`.
# These flags decide whether the classifier / one-class losses back-propagate
# into the encoder, and they are baked into the output folder name.
is_joint_class = not True
is_joint_ocnn  = not True
squish_classes = True

nu = 1.0 # WAG (untuned guess); divides the hinge term of the one-class loss
H  = 16 # OC hidden layer size
# oc_nonlin = tf.nn.relu  # OC hidden activation
oc_nonlin = tf.identity # OC hidden activation

# Provisional value; reassigned by the class-selection cell further down.
class_num = TOT_CLASS_NUM

# TFRecord files that the data-split cell (re)writes and the iterators read.
train_tfr_name = "./data/tfr/cifar-010/train.tfrecords"
valid_tfr_name = "./data/tfr/cifar-010/validate.tfrecords"
test_tfr_name  = "./data/tfr/cifar-010/test.tfrecords"

# if ( train_on_full ):
#     num_classes = 10
# else:
#     num_classes = 2
# print( "Building one-hot vectors with {} classes".format( num_classes ) )

In [5]:
# Per-configuration folder name: one tag for the classifier loss and one for the
# one-class loss, e.g. 'noClass_noOc' when neither is trained jointly.
class_tag = 'yesClass_' if is_joint_class else 'noClass_'
ocnn_tag  = 'yesOc' if is_joint_ocnn else 'noOc'
fol_name  = class_tag + ocnn_tag

In [6]:
# Only the training file's directory is kept; for the validation and test files
# just the base name is used.
data_path, train_name = os.path.split( train_tfr_name )
valid_name = os.path.basename( valid_tfr_name )
test_name  = os.path.basename(  test_tfr_name )

# Encoded ("code") datasets live in a configuration-specific sub-folder next to
# the raw TFRecords and carry the code width in their file name.
code_dir = os.path.join( data_path, fol_name )
os.makedirs( code_dir, exist_ok=True )

code_prefix = "code_{}_".format( code_dim )
train_code_tfr_name = os.path.join( code_dir, code_prefix + train_name )
valid_code_tfr_name = os.path.join( code_dir, code_prefix + valid_name )
test_code_tfr_name  = os.path.join( code_dir, code_prefix +  test_name )

In [7]:
# Every class id known to the reader.
all_classes = list( range( TOT_CLASS_NUM ) )

# Derive, from the two switches `squish_classes` and `train_on_full`:
#   load_classes  - class ids written into the TFRecords
#   sep_classes   - passed to the data iterator as `sep_classes`
#   merge_classes - classes not in `train_classes` (only filled in 'squished' mode)
#   cl_name       - tag used in checkpoint / image file names
#   class_num     - width of the label vectors used by the classifier head
# In the 'squished' and 'expanded' modes class_num has one slot more than
# len( train_classes ); the plotting cells treat index class_num-1 as "out".
# NOTE(review): sep_classes = 0..len(train_classes)-1 only matches train_classes
# when those are the first N class ids -- confirm against c10_reader.
if ( squish_classes and train_on_full ):
    load_classes = all_classes
    sep_classes = list( range( len( train_classes ) ) )
    merge_classes = list( set( all_classes ) - set( train_classes ) )
    cl_name = 'squished'
    class_num = len( train_classes ) + 1
elif ( train_on_full ):
    load_classes = all_classes
    sep_classes = all_classes
    merge_classes = []
    cl_name = 'full'
    # Was a literal 10; derive it from the class list so it cannot drift from
    # TOT_CLASS_NUM.
    class_num = len( all_classes )
else:
    load_classes = train_classes
    sep_classes = list( range( len( train_classes ) ) )
    merge_classes = []
    cl_name = 'expanded'
    class_num = len( train_classes ) + 1

Warning: the next cell overwrites any existing C10 TFRecord (TFR) files with the new data split.

In [8]:
# Rebuild the three TFRecord files for the selected classes. Each call writes to
# its `output_file`, so previously generated files at those paths are replaced.
rdr.subset2tfr( [ 1, 2, 3, 4 ], class_nums=load_classes, output_file=train_tfr_name )            # first 4 binary files
rdr.subset2tfr( [ 5 ],          class_nums=load_classes, output_file=valid_tfr_name )            # last binary file
rdr.test2tfr( sep_classes=load_classes, retain_classes=test_classes, output_file=test_tfr_name ) # testing binary file

In [9]:
# Sub-folder shared by checkpoints and images for this loss configuration.
out_folder = "cifar-010/" + fol_name

# Run identifier: class-handling mode, the loaded class ids glued together, and
# the number of joint epochs.
loaded_ids = ''.join( map( str, load_classes ) )
out_file   = "{}_in{}_{}".format( cl_name, loaded_ids, joint_epochs )

if is_joint_ocnn:
    # nu is encoded as an integer percentage, which is only meaningful in
    # roughly [0.01, 1.0]; outside that range just warn.
    if ( nu > 1.0 ) or ( nu < 0.01 ):
        print( "File name expects nu to be O(0.01)-O(1.0)")
    out_file = "p{}_".format( int( 100 * nu ) ) + out_file

ckpt_folder  = os.path.join( './ckpt_{}'.format( code_dim ), out_folder )
image_folder = os.path.join( './images', out_folder )

for needed_folder in ( ckpt_folder, image_folder ):
    os.makedirs( needed_folder, exist_ok=True )

ckpt_fname = os.path.join( ckpt_folder, out_file + ".ckpt" )

print( "Checkpoint file set to: {}.".format( ckpt_fname ) )
print( "Image folder set to {}.".format( image_folder ) )

Checkpoint file set to: ./ckpt_128/cifar-010/noClass_noOc/expanded_in01234567_20.ckpt.
Image folder set to ./images/cifar-010/noClass_noOc.


In [10]:
# Build one iterator plus the three datasets it can be pointed at.
# `batch_size` is the int64 placeholder, so the actual batch size is supplied
# through feed_dict when an initializer op is run.
# drop_remainder=False: the last batch of a pass may hold fewer examples than
# the requested batch size.
iterator, training_dataset, valid_dataset, testing_dataset = rdr.build_data_iterator( 
    train_tfr_name, 
    valid_tfr_name, 
    test_tfr_name, 
    sep_classes=sep_classes, 
    batch_size=batch_size, 
    train_batch_size=train_batch_size, 
    valid_batch_size=valid_batch_size, 
    test_batch_size=test_batch_size, 
    drop_remainder=False
)

In [11]:
def get_1class_dist( data, name='one_class_dist' ):
    """Project `data` onto a single random direction.

    Args:
        data: rank-2 tensor; its second dimension must be statically known.
        name: variable scope under which the direction is created.

    Returns:
        A pair ( tf.matmul( data, w ), w ) where w has shape
        [ data.shape[1], 1 ] and is drawn from a standard normal.

    NOTE(review): w is a tf.random_normal tensor, not a tf.Variable, so it is
    not trainable and is not stored in checkpoints -- confirm this is intended.
    """
    in_dim = data.get_shape().as_list()[1]

    with tf.variable_scope( name ):
        w = tf.random_normal( [in_dim, 1], mean=0, stddev=1, name='hyper_normal' )

    projection = tf.matmul( data, w )
    return projection, w

def get_1class_loss( dist, r, nu, name='one_class' ):
    """One-class hinge objective: mean( relu( r - dist ) ) / nu - r.

    Args:
        dist: tensor of per-example projections.
        r: offset; the hinge is active for examples with dist < r.
        nu: divisor of the averaged hinge term (must be non-zero).
        name: variable scope wrapping the ops.

    Returns:
        Scalar loss tensor.
    """
    with tf.variable_scope( name ):
        hinge = tf.nn.relu( r - dist )
        main_loss = tf.reduce_mean( hinge ) / nu - r

    return main_loss

In [12]:
# Offset of the one-class objective, a variable initialised to 1.0.
r = tf.Variable( 1.0, name='hyper_distance' )
# r = tf.constant( 10.0, dtype=tf.float32, name='hyper_distance' )

# Linear map from the code ( code_dim ) to the one-class hidden layer ( H ).
V = tf.get_variable( 
    'V', 
    [code_dim,H], 
    tf.float32, 
    tf.random_normal_initializer( stddev=0.01 )
)

# Regulariser on V: half of tf.norm( V, ord=2 ).
# NOTE(review): this is the norm itself, not its square -- confirm which one the
# objective is meant to use.
V_norm = tf.norm( V, ord=2 ) / 2

In [13]:
images_iter, labels_iter = iterator.get_next()

# Default to the iterator output but allow a caller to feed 'images'/'labels'
# directly through feed_dict.
images = tf.placeholder_with_default( images_iter, images_iter.get_shape(), name='images' )
labels = tf.placeholder_with_default( labels_iter, labels_iter.get_shape(), name='labels' )

# We want to train the encoder so we can get access to the codes for likelihood estimation
# The simplest way to do this is to train the encoder-decoder pair with some reconstruction loss
code  = ae.encoder( images, is_train=is_train, code_dim=code_dim )
recon = ae.decoder( code, is_train=is_train )

# Mean squared error between reconstruction and input.
recon_loss = tf.losses.mean_squared_error( recon, images )

# Running total; the next cell may add classifier / one-class terms to it.
loss = recon_loss

Add additional losses (based on classification or 1-class SVM ideas)

In [14]:
# Classifier head on the codes. Unless trained jointly, gradients are blocked so
# the classifier cannot influence the encoder.
if ( is_joint_class ):
    cl_code = code
else:
    cl_code = tf.stop_gradient( code )

est_labels = ae.code2labels( cl_code, is_train=is_train, class_num=class_num )
class_loss = tf.losses.softmax_cross_entropy( labels, est_labels )

if ( is_joint_class ):
    # The classification term is down-weighted by 100 relative to reconstruction.
    loss += class_loss / 100
    print( "Jointly training a classifier on the codes." )

    
# One-class head, with the same gradient-blocking rule as the classifier.
if ( is_joint_ocnn ):
    oc_code = code
else:
    oc_code = tf.stop_gradient( code )
# pre_one_class = ae.linear( code, 8 )

with tf.variable_scope( 'one_class' ):
    # code -> hidden layer ( V, oc_nonlin ) -> scalar projection -> hinge loss
    pre_one_class = oc_nonlin( tf.matmul( oc_code, V ) )
    # oc_norm is the projection vector w returned by get_1class_dist.
    one_class_dist, oc_norm = get_1class_dist( pre_one_class )
    one_class_loss = get_1class_loss( one_class_dist, r, nu )

if ( is_joint_ocnn ):
    loss += one_class_loss + V_norm + tf.norm( oc_norm, ord=2 ) / 2
    print( "Imposing a one-class SVM-like loss on the codes." )

In [15]:
# Step counter driving the learning-rate schedule. It is deliberately left
# unnamed so its checkpoint key stays the same as in existing checkpoints.
global_step = tf.Variable(0, trainable=False)

# Staircase decay: multiply the rate by 0.95 every decay_steps optimizer steps.
learning_rate = tf.train.exponential_decay( 
    learning_rate=0.0005, 
    global_step=global_step,
    decay_steps=int( ( 50000 / ( 2 * train_batch_size ) ) ), 
    decay_rate=0.95, 
    staircase=True
)

# global_step must be handed to minimize(): that is what increments it once per
# update. Without it the counter stays at 0 and the schedule above never decays.
# train_op = tf.train.MomentumOptimizer( learning_rate=learning_rate, momentum=0.9 ).minimize( loss, global_step=global_step )
# train_op = tf.train.AdamOptimizer( learning_rate=learning_rate ).minimize( loss, global_step=global_step )
train_op = tf.train.RMSPropOptimizer( learning_rate ).minimize( loss, global_step=global_step )

# Ops that point the shared iterator at one of the three datasets.
training_init_op = iterator.make_initializer( training_dataset )
valid_init_op    = iterator.make_initializer( valid_dataset )
testing_init_op  = iterator.make_initializer( testing_dataset  )

# Covers every variable that exists at this point (model, optimizer slots, step).
saver = tf.train.Saver()

In [16]:
# Regular files currently sitting in the checkpoint folder (sub-folders are skipped).
cand_ckpt_files = []
for entry in os.listdir( ckpt_folder ):
    if os.path.isfile( os.path.join( ckpt_folder, entry ) ):
        cand_ckpt_files.append( entry )

# for ff in cand_ckpt_files:
#     if ( out_file in ff ):    

I want to be able to save checkpoints as I go.

Plan: create a temporary folder named after the final ckpt file.

Intermediate ckpts are saved to that folder.
Once the final checkpoint has been saved, the temporary folder is deleted.

In [17]:
# Train the autoencoder (plus any joint losses) unless a checkpoint for this
# configuration already exists, then grab one batch of train/test
# reconstructions for the plotting cells.
with tf.Session() as sess:
    if not tf.train.checkpoint_exists( ckpt_fname ):
        # Train the model if it doesn't already exist
        print( 'Checkpoint not found.')
        
        print( 'Initializing global variables.' )
        sess.run(tf.global_variables_initializer())
        
        for ee in range( joint_epochs ):
            print( 'Beginning epoch {} of {}.'.format( ee+1, joint_epochs ) )
            
            cnt = 0
            train_loss = 0.0
            # One full pass over the training set; the iterator signals the end
            # of the pass with OutOfRangeError.
            sess.run( training_init_op, feed_dict={batch_size: train_batch_size} )
            while True:
                try:
                    _, loss_, cl_, ocl_ = sess.run( [train_op, loss, class_loss, one_class_loss], feed_dict={is_train: True} )
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    break
                train_loss += loss_
            
            # max( cnt, 1 ): an empty dataset reports 0.0 instead of raising
            # ZeroDivisionError.
            train_loss = train_loss / max( cnt, 1 )
            print( "  The average training loss at epoch {} was {} with {} steps.".format( ee+1, train_loss, cnt ) )
            
            cnt = 0
            test_loss = 0.0
            # One full pass over the test set, evaluation only (no train_op).
            sess.run( testing_init_op, feed_dict={batch_size: test_batch_size} )
            while True:
                try:
                    loss_ = sess.run( [loss], feed_dict={is_train: False} )
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    break
                test_loss += loss_[0]
            
            test_loss = test_loss / max( cnt, 1 )
            print( "  The average testing loss at epoch {} was {} with {} steps".format( ee+1, test_loss, cnt ) )
        
        save_path = saver.save( sess, ckpt_fname )
        
    else:
        # Just load the model if we already have it
        print( "Loading parameter." )
        saver.restore( sess, ckpt_fname )
    
    # First training batch: inputs, reconstructions, their loss and labels.
    sess.run( training_init_op , feed_dict={batch_size: train_batch_size})
    train_im, train_re, train_rl, train_lb0 = sess.run( 
        [images, recon, recon_loss, labels], feed_dict={is_train: False} )
    
    # First test batch.
    sess.run( testing_init_op, feed_dict={batch_size: test_batch_size} )
    test_im, test_re, test_lb0 = sess.run( [images, recon, labels], feed_dict={is_train: False})

Loading parameter.
INFO:tensorflow:Restoring parameters from ./ckpt_128/cifar-010/noClass_noOc/expanded_in01234567_20.ckpt


TODO:
- Do I need to check that the model has been restored or is that guaranteed above?
- i.e. do I need to do the next cell in a session that has reloaded the checkpoint?

In [18]:
# Encode the whole training and test sets with the saved model and dump the
# codes / integer labels to disk for use outside TensorFlow.
with tf.Session() as sess:
    saver.restore( sess, ckpt_fname )
    
    sess.run( training_init_op, feed_dict={batch_size: train_batch_size} )
    train_codes = []
    train_lbls  = []
    while True: 
        try:
            # im_ / re_ keep the last training batch; a later plotting cell uses them.
            train_codes_, train_lbls_, im_, re_ = sess.run( [code,labels,images,recon], feed_dict={is_train: False} )
        except tf.errors.OutOfRangeError:
            break
        # Flatten to ( batch, code_dim ) explicitly. np.squeeze would also drop the
        # batch axis when the final, partial batch holds a single example, and the
        # concatenate below would then fail.
        train_codes.append( train_codes_.reshape( -1, train_codes_.shape[-1] ) )
        train_lbls.append(  np.argmax(  train_lbls_, axis=1 ) )
    
    
    # The test pass uses the test batch size (previously train_batch_size).
    sess.run( testing_init_op, feed_dict={batch_size: test_batch_size} )
    test_codes = []
    test_lbls  = []
    while True: 
        try:
            test_codes_, test_lbls_ = sess.run( [code,labels], feed_dict={is_train: False} )
        except tf.errors.OutOfRangeError:
            break
        test_codes.append( test_codes_.reshape( -1, test_codes_.shape[-1] ) )
        test_lbls.append(  np.argmax(  test_lbls_, axis=1 ) )
    
train_codes = np.concatenate( train_codes, axis=0 )
train_lbls  = np.concatenate( train_lbls,  axis=0 )

test_codes  = np.concatenate( test_codes, axis=0 )
test_lbls   = np.concatenate( test_lbls,  axis=0 )

# Codes as .npy, labels as one integer per line.
np.save(    './scikit_train_codes.npy', train_codes )
np.savetxt( './scikit_train_lbls.csv',  train_lbls.astype( np.int32 ), delimiter=',', fmt='%d' )

np.save(    './scikit_test_codes.npy',  test_codes  )
np.savetxt( './scikit_test_lbls.csv',   test_lbls.astype(  np.int32 ), delimiter=',', fmt='%d' )

INFO:tensorflow:Restoring parameters from ./ckpt_128/cifar-010/noClass_noOc/expanded_in01234567_20.ckpt


In [None]:
# For each split, hand the reader the code/label tensors, the initializer that
# selects the split, and the destination file. rdr.save_codes presumably
# restores ckpt_fname and writes the encoded split there -- see c10_reader.
for split_init_op, split_code_file in (
    ( training_init_op, train_code_tfr_name ),
    (    valid_init_op, valid_code_tfr_name ),
    (  testing_init_op,  test_code_tfr_name ),
):
    rdr.save_codes( code, labels, split_init_op, batch_size, is_train, ckpt_fname, split_code_file )

INFO:tensorflow:Restoring parameters from ./ckpt_128/cifar-010/noClass_noOc/expanded_in01234567_20.ckpt
INFO:tensorflow:Restoring parameters from ./ckpt_128/cifar-010/noClass_noOc/expanded_in01234567_20.ckpt


In [None]:
def plot_recon( num, truth, recon ):
    """Show the first `num` originals above their reconstructions.

    Args:
        num: number of image columns to draw; `truth` and `recon` must each hold
            at least this many images.
        truth: indexable collection of images for the top row.
        recon: indexable collection of images for the bottom row.

    Returns:
        ( fig, ax ) where ax is always a 2 x num array of Axes.
    """
    # squeeze=False keeps ax two-dimensional even for num == 1; with the default,
    # a single column comes back as a 1-D array and ax[0,nn] raises.
    fig, ax = plt.subplots( 2 , num, figsize=(18,3), squeeze=False )
    
    for nn in range( num ):
        for row, batch in enumerate( ( truth, recon ) ):
            # Pure image grid: hide both axes of every panel.
            ax[row,nn].get_xaxis().set_visible( False )
            ax[row,nn].get_yaxis().set_visible( False )
            ax[row,nn].imshow( batch[nn] )
        
    return fig, ax

In [None]:
# Ten training images over their reconstructions, saved alongside the other
# images of this run. plt_train is the ( fig, ax ) pair; [0] is the figure.
plt_train = plot_recon( 10, train_im, train_re )
plt_train[0].savefig( 
    os.path.join( image_folder, out_file + "_train_recon.png" ), 
    dpi=216
)

In [None]:
# Same figure for the first test batch.
plt_test = plot_recon( 10, test_im, test_re )
plt_test[0].savefig( 
    os.path.join( image_folder, out_file + "_test_recon.png" ), 
    dpi=216
)

In [None]:
# im_ / re_ hold the last training batch fetched while dumping the codes.
# NOTE(review): that iterator uses drop_remainder=False, so this last batch may
# contain fewer than 10 images, in which case plot_recon will index out of range.
plt_save = plot_recon( 10, im_, re_ )

In [None]:
# NOTE(review): `err` is not defined in any visible cell, so this line raises
# NameError. It appears to be a deliberate stop for "Run All"; confirm before
# removing.
err

In [None]:
# Sanity check of the saved codes: parse only the first serialized record of the
# training code file into `example`, then stop.
for ser_example in tf.python_io.tf_record_iterator( train_code_tfr_name ):
    example = tf.train.Example()
    example.ParseFromString( ser_example )
#     print( example )
    break

In [None]:
# Second input pipeline that reads the saved codes instead of images.
# The code width is taken from the static shape of the `code` tensor.
code_iterator, training_codeset, valid_codeset, testing_codeset = rdr.build_code_iterator( 
    train_code_tfr_name, 
    valid_code_tfr_name, 
    test_code_tfr_name, 
    code.get_shape().as_list()[-1], 
    class_num, 
    batch_size=batch_size, 
    train_batch_size=train_batch_size, 
    valid_batch_size=valid_batch_size, 
    test_batch_size=test_batch_size, 
    drop_remainder=True # this may not really be desirable...
)

# Initializers for `code_iterator` only; they do not affect the image `iterator`.
training_init_code = code_iterator.make_initializer( training_codeset )
valid_init_code    = code_iterator.make_initializer( valid_codeset )
testing_init_code  = code_iterator.make_initializer( testing_codeset )

# training_init_code = training_init_op
# valid_init_code    = valid_init_op
# testing_init_code  = testing_init_op

# Batch of stored codes and labels; nothing built earlier in the graph consumes
# these two tensors.
fcode, flabel = code_iterator.get_next()

In [None]:
# Read the stored training codes back in full, as a check on the code TFRecords.
fixed_codes = []
fixed_lbls  = []

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run( training_init_code, feed_dict={batch_size: train_batch_size} )
    
    exhausted = False
    while not exhausted:
        try:
            batch_codes, batch_lbls = sess.run( [fcode,flabel], feed_dict={is_train: False} )
        except tf.errors.OutOfRangeError:
            # End of the pass over the stored codes.
            exhausted = True
        else:
            fixed_codes.append( np.squeeze( batch_codes ) )
            fixed_lbls.append(  np.argmax(  batch_lbls, axis=1 ) )

# Stack the per-batch pieces into one array each.
fixed_codes = np.concatenate( fixed_codes, axis=0 )
fixed_lbls  = np.concatenate( fixed_lbls,  axis=0 )

In [None]:
# NOTE(review): `err` is not defined in any visible cell, so this line raises
# NameError. It appears to be a deliberate stop for "Run All"; confirm before
# removing.
err

In [None]:
# Loss for the second phase: whichever heads were NOT trained jointly with the
# autoencoder get their loss here.
# NOTE(review): if both joint flags are True, code_loss stays the Python int 0
# and cannot be minimised by the optimizer cell below.
code_loss = 0
if ( not is_joint_class ):
    # NOTE(review): the classifier term is commented out, so the message printed
    # below does not match what is actually added to code_loss.
#     code_loss += class_loss / 100
    
    print( "Jointly training a classifier on the fixed codes." )

if ( not is_joint_ocnn ):
    # The regularisers on V and w are left out in this phase.
    code_loss += one_class_loss # + V_norm + tf.norm( oc_norm, ord=2 ) / 2
    
    print( "Imposing a one-class SVM-like loss on the fixed codes." )

In [None]:
# fixed_recon = ae.decoder( fixed_code, name='fixed' )

In [None]:
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     sess.run( training_init_code, feed_dict={batch_size: train_batch_size} )
    
#     saver.restore( sess, ckpt_fname )
#     fixed_recon_, fixed_code_ = sess.run( [fixed_recon, fixed_code], feed_dict={is_train: False} )

# # Check that we have saved and loaded the codes correctly
# plot_recon( 10, fixed_recon_, fixed_recon_ )

In [None]:
# Learning-rate schedule for the code phase: ten times smaller start and slower
# decay than the joint phase. It reuses the existing `global_step` variable, so
# no new variable is introduced into the checkpoints.
learning_rate_code = tf.train.exponential_decay( 
    learning_rate=0.00005, 
    global_step=global_step,
    decay_steps=int( ( 50000 / ( 2 * train_batch_size ) ) ), 
    decay_rate=0.99, 
    staircase=True
)

# Passing global_step makes each update increment the counter; without it the
# schedule above would stay at its initial value for the whole phase.
# train_op = tf.train.MomentumOptimizer( learning_rate=learning_rate, momentum=0.9 ).minimize( loss )
# train_op = tf.train.AdamOptimizer( learning_rate=learning_rate ).minimize( loss )
train_code_op = tf.train.RMSPropOptimizer( learning_rate_code ).minimize( code_loss, global_step=global_step )

# Created after train_code_op so that it also covers that optimizer's slot
# variables, which the earlier `saver` does not know about.
saver_code = tf.train.Saver()

os.makedirs( os.path.join( ckpt_folder, 'remainder' ), exist_ok=True )
ckpt_code_fname = os.path.join( ckpt_folder, 'remainder', out_file+".ckpt" )

In [None]:
# Second phase: starting from the trained autoencoder, minimise code_loss, or
# load the result if it was already trained.
with tf.Session() as sess:
    # Initialise everything first, then overwrite with the joint-phase
    # checkpoint. The slot variables created by train_code_op are not part of
    # ckpt_fname, so without this step the first training step fails on an
    # uninitialised variable.
    print( 'Initializing global variables.' )
    sess.run( tf.global_variables_initializer() )
    saver.restore( sess, ckpt_fname )
    
    if not tf.train.checkpoint_exists( ckpt_code_fname ):
        # Train the model if it doesn't already exist
        print( 'Checkpoint not found.')
        
        for ee in range( code_epochs ):
            print( 'Beginning epoch {} of {}.'.format( ee+1, code_epochs ) )
            
            cnt = 0
            train_loss = 0.0
            # One pass over the training images; images_ / recon_ keep the last
            # batch for the plotting cell that follows.
            sess.run( training_init_op, feed_dict={batch_size: train_batch_size} )
            while True:
                try:
                    _, loss_, images_, recon_ = sess.run( [train_code_op, code_loss, images, recon], feed_dict={is_train: True} )
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    break
                train_loss += loss_
            
            # max( cnt, 1 ): an empty dataset reports 0.0 instead of raising
            # ZeroDivisionError.
            train_loss = train_loss / max( cnt, 1 )
            print( "  The average training loss at epoch {} was {} with {} steps.".format( ee+1, train_loss, cnt ) )
            
            cnt = 0
            test_loss = 0.0
            # One evaluation pass over the test images.
            sess.run( testing_init_op, feed_dict={batch_size: test_batch_size} )
            while True:
                try:
                    loss_ = sess.run( [code_loss], feed_dict={is_train: False} )
                    cnt += 1
                except tf.errors.OutOfRangeError:
                    break
                test_loss += loss_[0]
            
            test_loss = test_loss / max( cnt, 1 )
            print( "  The average testing loss at epoch {} was {} with {} steps".format( ee+1, test_loss, cnt ) )
        
        save_path = saver_code.save( sess, ckpt_code_fname )
        
    else:
        # Just load the model if we already have it
        print( "Loading parameter." )
        saver_code.restore( sess, ckpt_code_fname )
    
    # Kept from the original cell: point the code iterator at each split. Nothing
    # is fetched from it in this session.
    sess.run( training_init_code , feed_dict={batch_size: train_batch_size})
#     train_im, train_re, train_rl, train_lb0 = sess.run( 
#         [images, recon, recon_loss, labels], feed_dict={is_train: False} )
    
    sess.run( testing_init_code, feed_dict={batch_size: test_batch_size} )
#     test_im, test_re, test_lb0 = sess.run( [images, recon, labels], feed_dict={is_train: False})

In [None]:
# Last training batch of the code phase and its reconstruction.
# NOTE(review): images_ / recon_ are only assigned inside the training branch of
# the previous cell; when that cell loads an existing checkpoint instead, they
# are undefined here.
_ = plot_recon( 10, images_, recon_ )

In [None]:
# Collect the one-class projection and the one-hot label of every training and
# test example, using the weights from both checkpoints.
if ( not is_joint_ocnn ):
    with tf.Session() as sess:
        saver.restore( sess, ckpt_fname )
        saver_code.restore( sess, ckpt_code_fname )
        
        # one_class_dist and labels are computed from `iterator` (the image
        # pipeline), so that is the iterator that has to be initialised here.
        # Initialising only the code iterator leaves `iterator` uninitialised in
        # this fresh session and the first fetch fails.
        sess.run( training_init_op, feed_dict={batch_size: train_batch_size} )
        
        r_all = []
        # Seed each list with an empty, correctly shaped array so the final
        # concatenate also works if the dataset yields no batch. Concatenating
        # once at the end avoids re-copying the growing array every step.
        dist_chunks  = [ np.zeros( (0,1) ) ]
        label_chunks = [ np.zeros( (0,class_num) ) ]
        while True:
            try:
                oc_dist_, lbls_, r_ = sess.run( [one_class_dist, labels, r], feed_dict={is_train: False} )
            except tf.errors.OutOfRangeError:
                break
            
            r_all.append( r_ )
            dist_chunks.append( oc_dist_ )
            label_chunks.append( lbls_ )
        
        oc_dist_train = np.concatenate( dist_chunks,  axis=0 )
        labels_train  = np.concatenate( label_chunks, axis=0 )
        
        # The test pass uses the test batch size (previously train_batch_size).
        sess.run( testing_init_op, feed_dict={batch_size: test_batch_size} )
        
        dist_chunks  = [ np.zeros( (0,1) ) ]
        label_chunks = [ np.zeros( (0,class_num) ) ]
        while True:
            try:
                oc_dist_, lbls_ = sess.run( [one_class_dist, labels], feed_dict={is_train: False} )
            except tf.errors.OutOfRangeError:
                break
            
            dist_chunks.append( oc_dist_ )
            label_chunks.append( lbls_ )
        
        oc_dist_test = np.concatenate( dist_chunks,  axis=0 )
        labels_test  = np.concatenate( label_chunks, axis=0 )
    
    # Post-processing stays inside the `if`: the arrays exist only when the
    # block above ran. reshape( -1 ) gives a 1-D vector even for one example.
    oc_dist_train = oc_dist_train.reshape( -1 )
    oc_dist_test  = oc_dist_test.reshape( -1 )
    
    # One-hot labels -> integer class ids.
    labels_train_num = np.argmax( labels_train, axis=1 )
    labels_test_num  = np.argmax( labels_test,  axis=1 )

In [None]:
# Histograms of the one-class projection: all training examples on the left;
# on the right, test examples split into "in" and "out", where "out" is the
# last label index ( class_num-1 ).
fig, ax = plt.subplots( 1 , 2, figsize=(12,4) )
train_ax, test_ax = ax[0], ax[1]

train_ax.hist( oc_dist_train, bins=100 )
train_ax.set_title( 'Train ({} In)'.format( len( oc_dist_train ) ) )

is_out   = ( labels_test_num == class_num-1 )
in_test  = oc_dist_test[ ~is_out ]
out_test = oc_dist_test[  is_out ]

test_ax.hist(  in_test, bins=100 )
test_ax.hist( out_test, bins=100 )
_ = test_ax.set_title( 'Test ({} In/{} Out)'.format( len( in_test ), len( out_test ) ) )

In [None]:
# One row per label index: train histogram on the left, test on the right.
# The last row ( index class_num-1, the "out" class ) only has a test panel.
figc, axc = plt.subplots( class_num , 2, figsize=(12,4*class_num) )
x_range = [-1.5,1.5]

# axc[cc,0].set_title( 'Train ({} In)'.format( len( oc_dist_train ) ) )
for cc in range( class_num-1 ):
    train_ax, test_ax = axc[cc,0], axc[cc,1]
    
    train_ax.hist( oc_dist_train[ labels_train_num == cc ], bins=100 )
    test_ax.hist(  oc_dist_test[  labels_test_num  == cc ], bins=100 )
    
    # Same x-range everywhere so the rows can be compared by eye.
    train_ax.set_xlim( x_range )
    test_ax.set_xlim( x_range )

out_test = oc_dist_test[ labels_test_num == class_num-1 ]
axc[-1,1].hist( out_test, bins=100 )
_ = axc[-1,1].set_xlim( x_range )