# AST 7939 Week 8

### Deep Neural Networks

In [None]:
from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as plt 

# Load the Iris dataset and keep only the two petal measurements as features.
iris = load_iris()
X, y = iris.data[:, [2, 3]], iris.target  # columns 2, 3: petal length, petal width

# Plot petal width against petal length, colouring each point by its class label.
petal_length, petal_width = X.T
plt.scatter(petal_length, petal_width, c=y)

In [None]:
from sklearn.model_selection import train_test_split

# Split features and labels into training and test subsets.
# random_state=0 fixes the shuffle so the split is the same on every run;
# no test_size is passed, so scikit-learn's default split fraction applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# (n_samples, n_features) of the training split; n_features is 2 (the petal columns).
X_train.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler once, on the training split only, and reuse that same fitted
# object for both splits. The test data is therefore transformed with the
# training-set statistics and never influences them.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Let's build our first neural network. We will use sklearn's MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Print the class documentation to see the available hyperparameters.
help(MLPClassifier)

In [None]:
# Network with two hidden layers of 100 ReLU units each. fit() returns the
# estimator itself, so construction and training can be chained.
model = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    activation="relu",
    max_iter=1000,
    random_state=1,
    verbose=1,
).fit(X_train_scaled, y_train)

# Predicted class labels for the held-out samples.
y_pred = model.predict(X_test_scaled)

# score() reports the accuracy on the given data and labels.
test_accuracy = model.score(X_test_scaled, y_test)
print(test_accuracy)

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# (estimator, X, y, display_labels=...) arguments.
from sklearn.metrics import ConfusionMatrixDisplay

# `fig` is a ConfusionMatrixDisplay; its matplotlib figure is `fig.figure_`.
fig = ConfusionMatrixDisplay.from_estimator(
    model,
    X_test_scaled,
    y_test,
    display_labels=["Setosa", "Versicolor", "Virginica"],
)
fig.figure_.suptitle("Confusion Matrix")
plt.show()

In [None]:
# Class-probability estimates for every test sample (one column per class).
model.predict_proba(X_test_scaled)

In [None]:
# Shapes of the three weight matrices of the network with two hidden layers:
# input -> hidden 1, hidden 1 -> hidden 2, hidden 2 -> output.
print(model.coefs_[0].shape, model.coefs_[1].shape, model.coefs_[2].shape)

In [None]:
# Weights connecting the input features to the first hidden layer.
model.coefs_[0]

In [None]:
# Plot the training loss that the fitted MLPClassifier recorded in loss_curve_.
plt.plot(model.loss_curve_)
plt.xlabel('Iterations')
plt.ylabel('Training Loss')

### Hyperparameter Optimization

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Chain scaling and the classifier in one estimator so that the scaler is
# re-fitted together with the model on whatever data the pipeline is fitted on
# (here: the raw, unscaled training features).
pipe = Pipeline([
    ('sc', StandardScaler()),
    # Fixed seed so the weight initialisation, and therefore the search result,
    # is reproducible, matching the seeded MLPClassifier used earlier.
    ('mlp', MLPClassifier(random_state=1))
])

# Keys follow the "<step name>__<parameter>" convention to reach parameters of
# the 'mlp' step inside the pipeline.
param_grid = {
    'mlp__hidden_layer_sizes': [(300,200,100), (150,100,50), (75,50,25)],
    'mlp__max_iter': [100, 200, 300],
    'mlp__solver': ['sgd', 'adam'],
    'mlp__learning_rate': ['constant','adaptive']
}

# 3 * 3 * 2 * 2 = 36 parameter combinations, each evaluated with 5-fold
# cross-validation (cv=5) using up to 4 parallel jobs.
grid_search = GridSearchCV(pipe, param_grid, return_train_score=True, cv=5, n_jobs=4, verbose=3)
grid_search.fit(X_train, y_train)

In [None]:
# Summarise the search: winning parameter combination, the refitted best
# pipeline, and its score on the held-out test set.
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best model: {grid_search.best_estimator_}")
print(f"Test score: {grid_search.score(X_test, y_test):.2f}")

### Save and load a model.

In [None]:
import pickle

model_path = 'iris_model.pkl'

# Serialize the best pipeline found by the grid search to disk.
with open(model_path, 'wb') as sink:
    pickle.dump(grid_search.best_estimator_, sink)

# Read it back into a new object.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open(model_path, 'rb') as source:
    model_best = pickle.load(source)

In [None]:
# Inspect the hyperparameters of the reloaded pipeline.
model_best.get_params()

In [None]:
# Predict from the raw test features; the pipeline's own 'sc' step does the scaling.
model_best.predict(X_test)

In [None]:
# Score of the reloaded pipeline on the held-out test set.
model_best.score(X_test, y_test)

### Instead of sklearn's MLPClassifier, we can use Keras, which runs on top of TensorFlow. You will have to choose a TensorFlow kernel on HiPerGator.


Keras: https://keras.io/

Tensorflow: https://www.tensorflow.org/

In [None]:
import tensorflow.keras as keras

# Initialize an empty model; layers are stacked in the order they are added.
model = keras.models.Sequential()

# Add the input layer and specify its shape.
# `shape` is the per-sample shape as a tuple (batch axis excluded). A bare int
# is not a valid shape for newer Keras releases, so pass a 1-tuple.
model.add(keras.layers.Input(shape=(X_train.shape[1],)))

# Add the first hidden layer with 100 neurons and the ReLU activation function.
model.add(keras.layers.Dense(100, activation='relu'))

# Add the second hidden layer with 100 neurons and the ReLU activation function.
model.add(keras.layers.Dense(100, activation='relu'))

# Add the output layer with one neuron per class.
# softmax normalizes the output to a probability distribution over the classes.
model.add(keras.layers.Dense(np.unique(y).shape[0], activation='softmax'))

### We can use the following syntax instead.

In [None]:
n_features = X_train.shape[1]
n_classes = np.unique(y).shape[0]

model = keras.models.Sequential()

# Giving input_dim to the first Dense layer declares the input size there, so
# no separate Input layer is needed: this combines the input layer and the
# first hidden layer.
for layer in (
    keras.layers.Dense(100, input_dim=n_features, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(n_classes, activation='softmax'),
):
    model.add(layer)

### We can also use the following syntax.

In [None]:
# Same architecture, built by passing the full list of layers to the constructor.
model = keras.models.Sequential([
    # `shape` is the per-sample shape as a tuple (batch axis excluded); a bare
    # int is not a valid shape for newer Keras releases.
    keras.layers.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    # One output neuron per class; softmax turns the outputs into probabilities.
    keras.layers.Dense(np.unique(y).shape[0], activation='softmax')
])

### Let's check the "architecture" of our neural network. Note that the input layer does not show up here.

In [None]:
# Print a layer-by-layer overview of the network.
model.summary()

### We can also have a look at the weights.

In [None]:
# List the model's weight variables.
model.weights

### Now, we need to "compile" the model before we fit to the data.

Some useful links

Loss functions: https://keras.io/api/losses/ 

Optimizers: https://keras.io/api/optimizers/ 

Metrics: https://keras.io/api/metrics/

In [None]:
# sparse_categorical_crossentropy expects integer class labels (as in y_train)
# rather than one-hot vectors.
# `metrics` is passed as a list, the form the Keras API documents and the one
# used by the later compile calls in this notebook; a bare string is not
# accepted by newer Keras releases.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

### We are ready to train the neural network. Let's start training.

In [None]:
# `history` records the per-epoch loss, accuracy, and their validation counterparts
# (read back below through history.epoch and history.history).
# validation_split=0.2 holds out 20% of the training data for validation;
# the network is trained on the remaining 80%.
history = model.fit(X_train_scaled, y_train, epochs=1000, validation_split=0.2)

### Note that in each epoch, there are 3 mini batches.

In [None]:
# (n_samples, n_features) of the scaled training set that was passed to fit().
X_train_scaled.shape

In [None]:
# fit() was called with validation_split=0.2, so only 80% of the rows (rounded
# down) are actually trained on. With Keras' default batch size of 32, the
# number of mini batches per epoch is that count divided by 32, rounded up,
# because the last, partial batch still counts as one.
n_fit_samples = int(X_train_scaled.shape[0] * (1 - 0.2))
int(np.ceil(n_fit_samples / 32))

### Let's evaluate the model using the test dataset.

In [None]:
# evaluate() yields the loss followed by the compiled metric (accuracy);
# verbose=0 silences the progress bar.
test_metrics = model.evaluate(X_test_scaled, y_test, verbose=0)
loss, accuracy = test_metrics

for caption, value in (('Test loss:', loss), ('Test accuracy:', accuracy)):
    print(caption, value)

### Let's check how the loss and validation loss evolved over the epochs.

In [None]:
# Training and validation loss per epoch, drawn on the same axes.
for key, caption in (('loss', 'loss'), ('val_loss', 'validation loss')):
    plt.plot(history.epoch, history.history[key], '.-', label=caption)

plt.xlabel('epoch')
plt.ylabel('loss, validation loss')
plt.legend()

### Let's check how the accuracy and validation accuracy evolved over the epochs.

In [None]:
# Training and validation accuracy per epoch, drawn on the same axes.
for key, caption in (('accuracy', 'accuracy'), ('val_accuracy', 'validation accuracy')):
    plt.plot(history.epoch, history.history[key], '.-', label=caption)

plt.xlabel('epoch')
plt.ylabel('accuracy, validation accuracy')
plt.legend()

### Early stopping

In [None]:
# Stop training once the validation loss has not improved for 30 consecutive
# epochs ("val_loss" is also the default monitor).
# NOTE(review): restore_best_weights is not set, so the in-memory model
# presumably keeps the weights of the last epoch rather than the best — confirm.
early_stopping_cb = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)
# Write the model to "best_model.h5"; save_best_only=True keeps only the best
# model seen so far instead of overwriting the file every epoch.
checkpoint_cb = keras.callbacks.ModelCheckpoint("best_model.h5", save_best_only=True)

In [None]:
# Build a fresh, untrained copy of the network so early stopping starts from
# newly initialised weights.
model = keras.models.Sequential([
    # `shape` is the per-sample shape as a tuple (batch axis excluded); a bare
    # int is not a valid shape for newer Keras releases.
    keras.layers.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(100, activation='relu'),
    keras.layers.Dense(100, activation='relu'),
    # One output neuron per class; softmax turns the outputs into probabilities.
    keras.layers.Dense(np.unique(y).shape[0], activation='softmax')
])

# `metrics` is passed as a list, the form the Keras API documents and the one
# used by the later compile calls in this notebook.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Train on the standardized features, like the first Keras run, so that this
# model is comparable with it and can be evaluated on X_test_scaled.
# epochs=1000 is only an upper bound: early_stopping_cb ends training once the
# validation loss stops improving, and checkpoint_cb saves the best model.
history = model.fit(X_train_scaled, y_train, epochs=1000,
                    validation_split=0.2,
                    callbacks=[checkpoint_cb, early_stopping_cb])


In [None]:
# Training and validation loss per epoch for the early-stopped run.
for key, caption in (('loss', 'loss'), ('val_loss', 'validation loss')):
    plt.plot(history.epoch, history.history[key], '.-', label=caption)

plt.xlabel('epoch')
plt.ylabel('loss, validation loss')
plt.legend()

In [None]:
# Training and validation accuracy per epoch for the early-stopped run.
for key, caption in (('accuracy', 'accuracy'), ('val_accuracy', 'validation accuracy')):
    plt.plot(history.epoch, history.history[key], '.-', label=caption)

plt.xlabel('epoch')
plt.ylabel('accuracy, validation accuracy')
plt.legend()

### Let's check out weights and biases.

In [None]:
# List the layer objects that make up the model.
model.layers

In [None]:
# Take the first entry of model.layers.
# NOTE(review): presumably the first Dense (hidden) layer, since the Input layer
# is not expected to appear in model.layers — confirm with hidden1.name.
hidden1 = model.layers[0]

In [None]:
# Name of the layer (none was given when the layer was created).
hidden1.name

### Let's put weights and biases of the first hidden layer in w1 and b1.

In [None]:
# get_weights() returns the layer's parameters; they are unpacked here as the
# weight matrix (w1) and the bias vector (b1).
w1, b1 = hidden1.get_weights()

In [None]:
# Weight matrix of the first hidden layer.
w1

In [None]:
# Expected to be (n_input_features, 100) for a 100-unit Dense layer — TODO confirm.
w1.shape

In [None]:
# Bias vector of the first hidden layer.
b1

In [None]:
# Expected to be (100,), one bias per neuron — TODO confirm.
b1.shape

### We can save the model and then load it as needed.

In [None]:
# Save the trained model to an HDF5 (.h5) file.
model.save("iris_model_final.h5")

In [None]:
# Load the saved model back from disk into a new object.
model_new = keras.models.load_model("iris_model_final.h5")

In [None]:
# Print the architecture of the reloaded model to compare it with the original.
model_new.summary()

### Let's try a larger dataset: the MNIST dataset.

In [None]:
import tensorflow.keras as keras

# Load MNIST as (train images, train labels), (test images, test labels).
# This rebinds X_train, y_train, X_test and y_test, which held the Iris split until now.
(X_train, y_train), (X_test, y_test) = keras.datasets.mnist.load_data()

### Scale the images to 0 - 1.

In [None]:
# Divide both image sets by 255 to bring the pixel values into the 0 - 1 range.
X_train, X_test = (images / 255. for images in (X_train, X_test))

### For now, let's use 10% of the data.

In [None]:
# Keep every tenth sample. Images and labels are sliced with the same step so
# that they stay aligned.
every_tenth = slice(None, None, 10)
X_train, y_train = X_train[every_tenth], y_train[every_tenth]
X_test, y_test = X_test[every_tenth], y_test[every_tenth]

In [None]:
# Shape of the subsampled training images.
X_train.shape

In [None]:
import matplotlib.pyplot as plt 

# Display the first training image with the "binary" colormap and hide the
# axes, since pixel coordinates are not informative here.
plt.imshow(X_train[0], cmap="binary")
plt.axis('off')
plt.show()

### TODO: Make your own neural network and see how they perform. 

### Try Stochastic Gradient Descent with a learning rate of 1.0e-3.

In [None]:
# Compile with plain stochastic gradient descent at a learning rate of 1e-3.
# NOTE(review): `model` is whichever network was defined last; presumably an
# MNIST-sized model must be built first (see the TODO above) — confirm.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.SGD(learning_rate=1e-3),
              metrics=["accuracy"])

### Try also Adam optimizer with a learning rate of 1.0e-3.

In [None]:
# Compile with the Adam optimizer at a learning rate of 1e-3.
# NOTE(review): `model` is whichever network was defined last; presumably an
# MNIST-sized model must be built first (see the TODO above) — confirm.
model.compile(loss="sparse_categorical_crossentropy",
              optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              metrics=["accuracy"])

### TODO: Evaluate the model using the test dataset.

### TODO: Check how loss and accuracy have evolved over the training.