## ODSC Workshop Part 2 - Models

In [None]:
import pickle
import numpy as np
import tensorflow as tf

#### Load the tokenized dataset from the previous notebook. The `.npy` files may have to be moved to your current working directory

In [None]:
# Read the four arrays from the current working directory in one pass.
X_train, y_train, X_test, y_test = (
    np.load(npy_path)
    for npy_path in ('X_train.npy', 'y_train.npy', 'X_test.npy', 'y_test.npy')
)

In [None]:
# Sanity-check the array shapes, one per line: train features/labels, then test features/labels.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

#### `input_dim` is equal to the length of `tokenizer.word_index` plus 1. This creates over 45,000 nodes in the first layer of our neural network, which will take too long to train. Luckily, as is the case with many text datasets, we can omit a large portion of the index, because the least frequently used words are very sparse, and still produce an accurate model that trains quickly

##### Begin executing the cell below, but note the ETA shown for each epoch's training time. This is too long, so stop the execution of the cell

In [None]:
# Size of the embedding table and the width of each learned word vector.
input_dim = 45785
embedding_dim = 1000

# Baseline classifier: embed each token, flatten the sequence, then a small
# dense layer feeding a single sigmoid unit for the binary label.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=input_dim,
        output_dim=embedding_dim,
        input_length=X_train.shape[1],
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.fit(
    X_train,
    y_train,
    batch_size=10,
    epochs=3,
    validation_data=(X_test, y_test),
    verbose=True,
)

# Report accuracy on the training split first, then on the held-out split.
for report_fmt, split_X, split_y in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = model.evaluate(split_X, split_y, verbose=False)
    print(report_fmt.format(accuracy))

#### Rebuild the tokenizer, this time with half as many words as the previous one. This will produce an index of the top 50% of words that occur most frequently

#### Rerun the same model and note the difference in training time and the effect on model accuracy

In [None]:
# Raw code samples and their labels, as saved by the previous notebook.
code = np.load('code.npy')
labels = np.load('labels.npy')

# Keep only the most frequent half of the original 45,785-entry vocabulary.
# num_words is a word count, so use integer division: 45785 * .5 would pass
# the float 22892.5 to the tokenizer.
num_words = 45785 // 2
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(code)

# tokenizer.word_index lists every word seen during fitting regardless of
# num_words, so len(word_index) + 1 would leave the embedding at full size.
# texts_to_sequences only emits indices below num_words (0 is the padding
# value), so num_words is the embedding input dimension actually required.
vocab_size = min(num_words, len(tokenizer.word_index) + 1)
maxlen = 1000

# Positional split: the first train_size samples train, the rest are held out.
train_size = 9180
code_train = code[:train_size]
code_test = code[train_size:]

X_train = tokenizer.texts_to_sequences(code_train)
X_test = tokenizer.texts_to_sequences(code_test)

# Pad (or truncate) every sequence to exactly maxlen tokens, padding at the end.
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, padding='post', maxlen=maxlen)

# Column vectors of labels; -1 lets numpy infer the row count instead of
# hard-coding the sizes of the two splits.
y_train = np.array(labels[:train_size]).reshape(-1, 1)
y_test = np.array(labels[train_size:]).reshape(-1, 1)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Same baseline architecture as before, now sized by vocab_size and maxlen
# from the rebuilt tokenizer and trained with a larger batch.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=maxlen,
    ),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=3,
    validation_data=(X_test, y_test),
    verbose=True,
)

# Report accuracy on the training split first, then on the held-out split.
for report_fmt, split_X, split_y in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = model.evaluate(split_X, split_y, verbose=False)
    print(report_fmt.format(accuracy))

#### Using tf.keras' embedding layer allows for the creation of 1D text CNNs, which can be built just like other famous CNNs. Below is an example of a popular CNN for image classification, LeNet, built to classify text. The network will be trained and saved along with the tokenizer for model serving

##### Execute the cell below and then use the remaining cells as a sandbox to test out different neural network architectures.

In [None]:
# LeNet-style 1D CNN over the embedded token sequence: two convolution +
# average-pooling stages, then dense layers of 120 and 84 units and a single
# sigmoid output for the binary label.
lenet = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=maxlen,
    ),
    tf.keras.layers.Conv1D(filters=6, kernel_size=3, activation='relu'),
    tf.keras.layers.AveragePooling1D(),
    tf.keras.layers.Conv1D(filters=16, kernel_size=3, activation='relu'),
    tf.keras.layers.AveragePooling1D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=120, activation='relu'),
    tf.keras.layers.Dense(units=84, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])
lenet.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

lenet.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=3,
    validation_data=(X_test, y_test),
    verbose=True,
)

# Report accuracy on the training split first, then on the held-out split.
for report_fmt, split_X, split_y in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = lenet.evaluate(split_X, split_y, verbose=False)
    print(report_fmt.format(accuracy))

# Export the trained network as a SavedModel under version directory "1".
tf.saved_model.save(lenet, "/tmp/lenet/1/")

# Persist the fitted tokenizer next to the notebook so the same word index
# can be reloaded later.
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### SANDBOX - try different model architectures below