# Assignment 7: CNNs & RNNs

In [1]:
# enter your name and UFL email address
name = 'Yang Bai'
email = 'baiyang94@ufl.edu'

In [2]:
# Refuse to run the notebook until the placeholder name/email have been replaced;
# otherwise import everything the rest of the notebook relies on.
if name == 'enter your name' or email == 'enter your email':
    assert False, 'Enter your name & email first!'
else:
    print('Assignment 7 -- name: {}, email: {}\n'.format(name, email))
    
    # Load packages we need
    import sys
    import os
    import time

    import numpy as np
    import sklearn
    
    # we'll use tensorflow and keras for neural networks
    import tensorflow as tf
    import tensorflow.keras as keras
    
    # import layers we may use
    from tensorflow.keras.layers import Input, Flatten, Dense, Conv2D, MaxPooling2D, Dropout

    # import callbacks we may use
    from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
    
    # Load the TensorBoard notebook extension
    #%load_ext tensorboard

    from matplotlib import pyplot as plt
    plt.rcParams.update({'font.size': 16})

    # Let's check our software versions
    # (`sys` is already imported above, so use it directly)
    print('### Python version: ' + sys.version)
    print('### NumPy version: ' + np.__version__)
    print('### Scikit-learn version: ' + sklearn.__version__)
    print('### Tensorflow version: ' + tf.__version__)
    print('### TF Keras version: ' + keras.__version__)
    print('------------')


    # load our packages / code
    # make the course's shared helper directory importable before importing from it
    sys.path.insert(1, '../common/')
    import utils
    import plots

Assignment 7 -- name: Yang Bai, email: baiyang94@ufl.edu

### Python version: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
### NumPy version: 1.19.5
### Scikit-learn version: 0.24.1
### Tensorflow version: 2.4.1
### TF Keras version: 2.4.0
------------


In [3]:
# global parameters to control behavior of the pre-processing, ML, analysis, etc.
# `seed` is also read by the data loaders (shuffling) and by the CNN weight initializer.
seed = 42

# deterministic seed for reproducibility
# (seeds NumPy's global RNG and TensorFlow's global RNG)
np.random.seed(seed)
tf.random.set_seed(seed)

# Default split proportions used by load_preprocess_imdb(), which forwards them to
# utils.train_test_val_split(). Presumably relative train/test/val weights -- confirm against utils.
prop_vec = [24, 2, 2]

## [Task 1] (20 points) Loading and Processing CIFAR-10

### [Task 1a] (20 points) Complete the implementation of load_preprocess_cifar10(). Make sure you correctly implement all of the cases.

In [4]:
from tensorflow.keras.datasets import cifar10

# refer to: https://www.tensorflow.org/api_docs/python/tf/keras/datasets/cifar10/load_data
# and to https://www.cs.toronto.edu/~kriz/cifar.html
def load_preprocess_cifar10(onehot=True, minmax_normalize=True):
    """Load CIFAR-10, optionally encode/normalize it, and re-split it into train/test/val.

    The train and test partitions returned by Keras are merged and then split again
    with utils.train_test_val_split() using proportions [10, 1, 1], shuffled with the
    module-level `seed`. For the 60,000 CIFAR-10 images the expected shapes of
    train_x, test_x, val_x are (50000, 32, 32, 3), (5000, 32, 32, 3), (5000, 32, 32, 3).

    Args:
        onehot: if True, labels are one-hot encoded into 10 columns with
            keras.utils.to_categorical(); otherwise they are returned as loaded.
        minmax_normalize: if True, pixel values are divided by 255 so they lie in
            [0, 1] (returned as float32); otherwise pixels are left untouched.

    Returns:
        Tuple (train_x, train_y, test_x, test_y, val_x, val_y, labels), where
        `labels` is an array with the 10 class names.
    """
    labels = np.array(['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck'])

    # The CIFAR10 dataset contains 60,000 color images in 10 classes, with 6,000 images in each class.
    # The dataset is divided into 50,000 training images and 10,000 testing images.
    # The classes are mutually exclusive and there is no overlap between them.
    (x_train, y_train), (x_test, y_test) = cifar10.load_data()

    if onehot:
        # Put the labels in "one-hot" encoding using keras' to_categorical()
        num_classes = 10
        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)

    # let's aggregate all the data then split
    all_x = np.concatenate([x_train, x_test], axis=0)
    all_y = np.concatenate([y_train, y_test], axis=0)

    if minmax_normalize:
        # Normalize pixel values to be between 0 and 1.
        # Cast to float32 first: a plain `all_x / 255.0` would promote the whole
        # dataset to float64 and double its memory footprint for no benefit.
        all_x = all_x.astype(np.float32) / 255.0

    # split the data into train, test, val
    prop_vec = [10, 1, 1]
    train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(all_x, all_y, prop_vec, shuffle=True, seed=seed)

    return train_x, train_y, test_x, test_y, val_x, val_y, labels

In [5]:
# sanity checks on the un-normalized data: first with integer labels ...
train_x, train_y, test_x, test_y, val_x, val_y, labels = load_preprocess_cifar10(onehot=False, minmax_normalize=False)
# every split must have as many labels as images
assert all(x.shape[0] == y.shape[0] for x, y in ((train_x, train_y), (test_x, test_y), (val_x, val_y)))
# raw pixels were not rescaled, so each split should still contain a 255
assert all(np.amax(x) >= 255 for x in (train_x, test_x, val_x))
# labels are either a flat vector or a single column
assert train_y.shape in [(train_y.shape[0],), (train_y.shape[0], 1)]

# ... then with one-hot labels
train_x, train_y, test_x, test_y, val_x, val_y, labels = load_preprocess_cifar10(onehot=True, minmax_normalize=False)
assert all(np.amax(x) >= 255 for x in (train_x, test_x, val_x))
assert train_y.shape == (train_y.shape[0], 10)
assert train_y.shape[1] == test_y.shape[1]

In [6]:
# actually load the data (one-hot labels, pixels min-max normalized)
train_x, train_y, test_x, test_y, val_x, val_y, labels = load_preprocess_cifar10()
# normalized pixels must lie in [0, 1]: check the upper bound with the maximum ...
assert np.amax(train_x) <= 1 and np.amax(test_x) <= 1 and np.amax(val_x) <= 1
# ... and the lower bound with the minimum (checking the maximum against 0 could never fail)
assert np.amin(train_x) >= 0 and np.amin(test_x) >= 0 and np.amin(val_x) >= 0

# one class name per one-hot column
assert labels.shape[0] == 10 and labels.shape[0] == train_y.shape[1]

## [Task 2] (30 points) Training a CNN for Cifar-10

#### We will use the following architecture
- Conv layer with 32 filters, (3,3) filter size, stride of 1, padding 'same'
- Conv layer with 32 filters, (3,3) filter size, stride of 1, padding 'same'
- Max pooling layer (2,2)
- Dropout with rate 25%
- Conv layer with 64 filters, (3,3) filter size, stride of 1, padding 'same'
- Conv layer with 64 filters, (3,3) filter size, stride of 1, padding 'same'
- Max pooling layer (2,2)
- Dropout with rate 25%
- Conv layer with 128 filters, (3,3) filter size, stride of 1, padding 'same'
- Conv layer with 128 filters, (3,3) filter size, stride of 1, padding 'same'
- Max pooling layer (2,2)
- Dropout with rate 25%
- Flatten
- FC with 128 units
- Dropout with rate 25%
- FC with 64 units
- Dropout with rate 25%
- (Output layer) FC with 10 units

#### For all layers (if applicable) except the output layer you should use:
- ReLU as activation function
- He uniform weight initialization strategy
- L2 regularization with regularization constant set to 0.001

#### For the output layer you should select a suitable activation function that is consistent with the task and loss function you use. Use Adam for the optimizer with learning rate 0.002.

### [Task 2a] (20 points) Implement create_compile_cnn() according to the architecture specified above.

In [7]:
def create_compile_cnn(input_shape=[32, 32, 3], num_outputs=10, verbose=False):
    """Build and compile the CIFAR-10 CNN.

    Architecture: three blocks of [Conv-Conv-MaxPool(2)-Dropout(0.25)] with
    32, 64 and 128 filters, followed by Flatten, Dense(128), Dropout(0.25),
    Dense(64), Dropout(0.25) and a softmax output layer. All hidden layers use
    ReLU, He-uniform initialization (seeded with the module-level `seed`) and
    L2 regularization with constant 0.001.

    Args:
        input_shape: shape of one input image (height, width, channels).
        num_outputs: number of classes, i.e. units in the softmax output layer.
        verbose: if True, print the model summary.

    Returns:
        The model, compiled with categorical cross-entropy (so labels must be
        one-hot encoded), Adam with learning rate 0.002, and the accuracy metric.
    """
    model = keras.models.Sequential(name='CIFAR-10--CNN')

    # settings shared by every hidden (conv and dense) layer
    hidden_kwargs = dict(activation='relu',
                         kernel_regularizer=tf.keras.regularizers.l2(0.001),
                         kernel_initializer=tf.keras.initializers.HeUniform(seed=seed))
    conv_kwargs = dict(kernel_size=(3, 3), strides=1, padding='same', **hidden_kwargs)

    # three conv blocks; layers are named conv1..conv6, maxpool1..3, dropout1..3
    for block_idx, filters in enumerate((32, 64, 128), start=1):
        for conv_idx in (2 * block_idx - 1, 2 * block_idx):
            # only the very first layer declares the input shape
            first_layer_kwargs = {'input_shape': input_shape} if conv_idx == 1 else {}
            model.add(Conv2D(filters, name='conv{}'.format(conv_idx), **conv_kwargs, **first_layer_kwargs))
        model.add(MaxPooling2D(2, name='maxpool{}'.format(block_idx)))
        model.add(Dropout(0.25, name='dropout{}'.format(block_idx)))

    model.add(Flatten(name='flatten'))

    # fully connected head
    model.add(Dense(128, name='fc1', **hidden_kwargs))
    model.add(Dropout(0.25, name='dropout4'))
    model.add(Dense(64, name='fc2', **hidden_kwargs))
    model.add(Dropout(0.25, name='dropout5'))

    # softmax pairs with the categorical cross-entropy loss used below
    model.add(Dense(num_outputs, activation="softmax", name='output'))

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    opt = keras.optimizers.Adam(learning_rate=0.002)

    if verbose:
        model.summary()

    model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model

In [8]:
_ = create_compile_cnn(verbose=True)

Model: "CIFAR-10--CNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1 (Conv2D)               (None, 32, 32, 32)        896       
_________________________________________________________________
conv2 (Conv2D)               (None, 32, 32, 32)        9248      
_________________________________________________________________
maxpool1 (MaxPooling2D)      (None, 16, 16, 32)        0         
_________________________________________________________________
dropout1 (Dropout)           (None, 16, 16, 32)        0         
_________________________________________________________________
conv3 (Conv2D)               (None, 16, 16, 64)        18496     
_________________________________________________________________
conv4 (Conv2D)               (None, 16, 16, 64)        36928     
_________________________________________________________________
maxpool2 (MaxPooling2D)      (None, 8, 8, 64)        

### [Task 2b] (10 points) Train the model. Fill in the implementation below.

In [9]:
cnn_model_fp = './cifar10-cnn.h5'
fp = "./mymodel-bestweights.h5"

# stop once validation accuracy has not improved for 3 epochs and roll back to the best weights
early_stop_cb = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
# set up a model checkpointing callback (keeps only the weights with the best validation accuracy)
checkpoint_cb = ModelCheckpoint(fp, monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)

max_epochs = 15
batch_size = 64

# The CNN expects 32x32 RGB images. Fail early with a clear message if the data
# globals hold something else (e.g. after running a later cell that rebinds them).
assert train_x.shape[1:] == (32, 32, 3), 'train_x does not hold CIFAR-10 images; re-run the data loading cell'

# If the model file exists, load it and continue training from there.
# Otherwise create a fresh model. Either way the model is trained and then saved.
# Note: if you need to retrain the model from scratch, simply delete the h5 file.
if os.path.exists(cnn_model_fp):
    print("Loading model from %s" % cnn_model_fp)
    model = keras.models.load_model(cnn_model_fp)
    print("Training based on the previous trained model...")
else:
    print("Training from scratch...")
    model = create_compile_cnn(verbose=False)

# train the model using model.fit() with early stopping and checkpointing
history = model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=max_epochs, batch_size=batch_size,
                    shuffle=True, callbacks=[early_stop_cb, checkpoint_cb])
# save the model
model.save(cnn_model_fp)

Loading model from ./cifar10-cnn.h5
Training based on the previous trained model...
Epoch 1/15

Epoch 00001: val_accuracy improved from -inf to 0.71040, saving model to ./mymodel-bestweights.h5
Epoch 2/15

Epoch 00002: val_accuracy improved from 0.71040 to 0.72080, saving model to ./mymodel-bestweights.h5
Epoch 3/15

Epoch 00003: val_accuracy did not improve from 0.72080
Epoch 4/15

Epoch 00004: val_accuracy did not improve from 0.72080
Epoch 5/15

Epoch 00005: val_accuracy did not improve from 0.72080


In [10]:
# let's evaluate the model on the test data
# (`model`, `test_x` and `test_y` are notebook globals set by earlier cells;
#  the two returned values are the loss and the accuracy metric the model was compiled with)
loss, acc = model.evaluate(test_x, test_y, verbose=0)
print('[Model] Test accuracy: {:.2f}%'.format(100*acc))

[Model] Test accuracy: 72.88%


## [Task 3] (15 points) Processing Sequence Data

### [Task 3a] (15 points) Fill in the implementation of load_preprocess_imdb()

In [11]:
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

GPU is NOT AVAILABLE


In [12]:
from tensorflow.keras.datasets import imdb

# the size of the vocabulary we'll use
vocab_size = 12000
maxlen = 150

# refer to: https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb
def load_preprocess_imdb(num_words=vocab_size, prop_vec=prop_vec, maxlen=maxlen, vectorize=False):
    """Load the IMDB review dataset, encode the reviews, and split into train/test/val.

    The train and test partitions returned by Keras are merged and then split again
    with utils.train_test_val_split() according to `prop_vec`, shuffled with the
    module-level `seed`.

    Args:
        num_words: vocabulary size passed to imdb.load_data(); also the width of
            the count vectors when vectorize=True.
        prop_vec: split proportions forwarded to utils.train_test_val_split().
        maxlen: maximum review length passed to imdb.load_data(); also the width
            of the padded sequences when vectorize=False.
        vectorize: if True, each review becomes a vector of `num_words` entries
            where entry i counts how often word index i occurs. If False, each
            review becomes a row of `maxlen` word indices, right-padded with 0.

    Returns:
        Tuple (train_x, train_y, test_x, test_y, val_x, val_y).
    """
    import warnings  # stdlib; imported locally so this cell does not depend on another import cell

    # IMDB is a dataset of movie reviews labeled by sentiment (positive/negative).
    # Each review is encoded as a list of word indexes (integers), with words indexed
    # by overall frequency in the dataset. Index 0 is used here for unknown words (oov_char=0).
    # NOTE(review): the sanity checks below this function expect 16281 training rows,
    # which suggests load_data() drops reviews longer than `maxlen` -- confirm.
    #
    # Silence NumPy's VisibleDeprecationWarning only for the duration of the load;
    # catch_warnings() restores the previous warning filters on exit.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=np.VisibleDeprecationWarning)
        train, testval = imdb.load_data(num_words=num_words, maxlen=maxlen, oov_char=0)

    # merge the two partitions so we can apply our own split afterwards
    (x_train, y_train), (x_test, y_test) = train, testval
    all_x = np.r_[x_train, x_test]
    all_y = np.r_[y_train, y_test]

    if vectorize:
        # bag-of-words: entry i holds the number of times word index i appears in the review
        encoded_x = np.zeros((all_x.shape[0], num_words))
        for i, sequence in enumerate(all_x):
            encoded_x[i] = np.bincount(np.asarray(sequence, dtype=np.intp), minlength=num_words)
    else:
        # fixed-length sequences: copy each review and leave the tail as 0 padding
        encoded_x = np.zeros((all_x.shape[0], maxlen))
        for i, sequence in enumerate(all_x):
            encoded_x[i, :len(sequence)] = sequence
    all_x = encoded_x

    train_x, train_y, test_x, test_y, val_x, val_y = utils.train_test_val_split(all_x, all_y,
                                                                                prop_vec, shuffle=True,
                                                                                seed=seed)

    return train_x, train_y, test_x, test_y, val_x, val_y

In [13]:
# sanity checks
train_x, train_y, test_x, test_y, val_x, val_y = load_preprocess_imdb(vectorize=False)
assert train_x.shape == (16281, maxlen) and train_y.shape == (train_x.shape[0],)

train_x, train_y, test_x, test_y, val_x, val_y = load_preprocess_imdb(vectorize=True)
assert train_x.shape == (16281, vocab_size) and train_y.shape == (train_x.shape[0],)

In [14]:
# word_index = tf.keras.datasets.imdb.get_word_index()
# sorted_word_index_dict = dict(sorted(word_index.items(), key=lambda item: item[1]))

## [Task 4] (35 points) RNN for Sentiment Analysis

### [Task 4a] (35 points) Complete the code below to define an RNN architecture for sentiment analysis. The goal is to predict the sentiment of IMDB reviews. You can use any architecture you want, but a good place to start would be to use an embedding layer followed by some recurrent layers (e.g., LSTM, GRU, etc.). Keep the number of parameters of the model below 2m.

In [15]:
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, SimpleRNN, GRU, Embedding
dropout_rate = 0.2

def create_compile_rnn(input_shape=[None], vocab_size=vocab_size, embedding_size=128, num_outputs=1, verbose=False):
    """Build and compile a GRU-based sentiment classifier for the IMDB reviews.

    Architecture: Embedding -> Dropout -> GRU(192, returning the full sequence)
    -> GRU(128) -> sigmoid Dense. Dropout rates come from the module-level
    `dropout_rate`.

    Args:
        input_shape: shape of one input example (a sequence of word indices).
        vocab_size: number of rows in the embedding table.
        embedding_size: dimension of each word embedding.
        num_outputs: units in the sigmoid output layer.
        verbose: if True, print the model summary.

    Returns:
        The model, compiled with binary cross-entropy, Adam with learning rate
        0.001, and the accuracy metric.
    """
    model = keras.models.Sequential(name='imdb-RNN')

    model.add(keras.Input(shape=input_shape, sparse=False, name='input'))

    # word indices -> dense vectors
    model.add(Embedding(vocab_size, embedding_size, name='embedding'))
    model.add(Dropout(dropout_rate))

    # the first GRU must return the whole sequence so the second GRU has a sequence to consume
    model.add(GRU(192, return_sequences=True, dropout=dropout_rate, recurrent_dropout=0.0, name='gru1'))
    model.add(GRU(128, recurrent_dropout=0.0, name='gru2'))

    # output
    model.add(Dense(num_outputs, activation='sigmoid', name='output'))

    if verbose:
        model.summary()

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    opt = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model

In [33]:
model = create_compile_rnn(verbose=True)

Model: "imdb-RNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1536000   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
gru1 (GRU)                   (None, None, 192)         185472    
_________________________________________________________________
gru2 (GRU)                   (None, 128)               123648    
_________________________________________________________________
output (Dense)               (None, 1)                 129       
Total params: 1,845,249
Trainable params: 1,845,249
Non-trainable params: 0
_________________________________________________________________


In [34]:
rnn_model_fp = './imdb-rnn.h5'

# let's load the data (padded sequences of word indices)
train_x, train_y, test_x, test_y, val_x, val_y = load_preprocess_imdb(vectorize=False)

early_stop_cb = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# feel free to tweak the batch size, number of epochs and callbacks.
max_epochs = 3
batch_size = 32

# Set to True to continue training a previously saved model instead of starting over.
resume_rnn_training = False

if resume_rnn_training and os.path.exists(rnn_model_fp):
    print("Loading model from %s" % rnn_model_fp)
    model = keras.models.load_model(rnn_model_fp)
    print("Training based on the previous trained model...")
else:
    print("Training from scratch...")
    # build a fresh model here so that "from scratch" holds even when this cell is
    # re-run, and so the cell does not depend on whatever `model` currently refers to
    model = create_compile_rnn(verbose=False)

hist = model.fit(train_x, train_y, epochs=max_epochs, batch_size=batch_size, validation_data=(val_x, val_y),
                 callbacks=[early_stop_cb])
model.save(rnn_model_fp)

Traing from scratch...
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [35]:
# let's evaluate the model on the test data
# (`model`, `test_x` and `test_y` are the notebook globals left by the training cell above;
#  the two returned values are the loss and the accuracy metric the model was compiled with)
loss, acc = model.evaluate(test_x, test_y, verbose=0)
print('[Model] Test accuracy: {:.2f}%'.format(100*acc))

[Model] Test accuracy: 90.71%


## [CIS6930 Additional Task -- Task 5] (25 points): DNN for Sentiment Analysis

### In the previous task, we used an RNN for sentiment analysis. In this task, you will use a neural network without any recurrent layers for the same task as a comparison.

### We'll use the data in vectorized form for this.

### [Task 5a] (20 points) Complete the code below to define an architecture of your choice *without* any recurrent layers. The goal is to get the best model with the fewest number of parameters. Keep the number of parameters of the model below 2m and ideally similar to the model of Task 4.

In [36]:
def create_compile_dnn(input_shape=[vocab_size], num_outputs=1, verbose=False):
    """Build and compile a feed-forward (non-recurrent) sentiment classifier.

    Architecture: two ReLU hidden layers with 150 and 50 units, followed by a
    sigmoid output layer. Intended for the vectorized (word-count) IMDB features.

    Args:
        input_shape: shape of one input example (one count per vocabulary entry).
        num_outputs: units in the sigmoid output layer.
        verbose: if True, print the model summary.

    Returns:
        The model, compiled with binary cross-entropy, Adam with learning rate
        0.001, and the accuracy metric.
    """
    model = keras.models.Sequential(name='imdb-DNN')

    model.add(Input(shape=input_shape, sparse=False, name='input'))

    hidden_widths = [150, 50]

    for i, hw in enumerate(hidden_widths):
        # weights are drawn from a normal distribution whose stddev shrinks with the layer's own width
        model.add(Dense(hw, activation='relu', name='hidden_{}'.format(i),
                        kernel_initializer=keras.initializers.RandomNormal(stddev=np.sqrt(1/hw)),
                        bias_initializer=keras.initializers.Zeros()))

    # output
    model.add(Dense(num_outputs, activation='sigmoid', name='output'))

    if verbose:
        model.summary()

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias
    opt = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

    return model

In [37]:
model = create_compile_dnn(verbose=True)

Model: "imdb-DNN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden_0 (Dense)             (None, 150)               1800150   
_________________________________________________________________
hidden_1 (Dense)             (None, 50)                7550      
_________________________________________________________________
output (Dense)               (None, 1)                 51        
Total params: 1,807,751
Trainable params: 1,807,751
Non-trainable params: 0
_________________________________________________________________


In [38]:
dnn_model_fp = './imdb-dnn.h5'

# Let's load the data in vectorized form (one word-count vector per review)
train_x, train_y, test_x, test_y, val_x, val_y = load_preprocess_imdb(vectorize=True)


early_stop_cb = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1)

# feel free to tweak the batch size, number of epochs and callbacks.
max_epochs = 50
batch_size = 100

# Set to True to continue training a previously saved model instead of starting over.
resume_dnn_training = False

if resume_dnn_training and os.path.exists(dnn_model_fp):
    print("Loading model from %s" % dnn_model_fp)
    model = keras.models.load_model(dnn_model_fp)
    print("Training based on the previous trained model...")
else:
    print("Training from scratch...")
    # build a fresh model here so that "from scratch" holds even when this cell is
    # re-run, and so the cell does not depend on whatever `model` currently refers to
    model = create_compile_dnn(verbose=False)

hist = model.fit(train_x, train_y, epochs=max_epochs, batch_size=batch_size, validation_data=(val_x, val_y),
                 callbacks=[early_stop_cb])
model.save(dnn_model_fp)

Training from scratch...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Restoring model weights from the end of the best epoch.
Epoch 00006: early stopping


In [39]:
# let's evaluate the model on the test data
# (`model`, `test_x` and `test_y` are the notebook globals left by the training cell above;
#  the two returned values are the loss and the accuracy metric the model was compiled with)
loss, acc = model.evaluate(test_x, test_y, verbose=0)
print('[Model] Test accuracy: {:.2f}%'.format(100*acc))

[Model] Test accuracy: 87.91%


### [Task 5b] (5 points) Compare this model to the model of Task 4. What do you conclude?

In [40]:
###* put your answer here *###
#
# The RNN reached a test accuracy of 90.71%, while the DNN reached 87.91% with a similar number of parameters.
# From this I conclude that, at a similar parameter count, the RNN performs slightly better than the DNN.
# This can be taken as evidence that an RNN is better at capturing features of sequence data than a DNN.
# However, the DNN trains much faster than the RNN.