# Data Science Methods for Clean Energy Research 
## _Neural Networks_

## Outline

### 1. Load dataset & rescale
### 2. Build a NN model with Keras
### 3. Train NN and test
### 4. Increase complexity of NN

In [None]:
# Import plotting libraries
import matplotlib 
from matplotlib import pyplot as plt
# matplotlib.rcParams and plt.rcParams are the same object, so the font size
# only needs to be set once (the later value, 22, is the one that took effect)
matplotlib.rcParams.update({'font.size': 22})

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

print(keras.__version__)
import numpy as np

print(np.__version__)
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


%load_ext autoreload
%autoreload 2


### 1. Loading the data, splitting into train and test and rescaling

Set our random seed so that all computations are deterministic

In [None]:
# Single seed reused by the train/test split, NumPy seeding and KFold cells below
seed = 42

Read in the raw data for the HCEPDB into a pandas dataframe

In [None]:
# Download the zipped HCEPDB molecule table over HTTP and parse it into a DataFrame
df = pd.read_csv('http://faculty.washington.edu/dacb/HCEPDB_moldata.zip')
# Show the first rows as a quick sanity check of the columns
df.head()

In [None]:
# Draw a reproducible 1% random sample of the rows, using the notebook-wide
# `seed` rather than repeating the literal 42
df_smaller = df.sample(frac=0.01, random_state=seed)

In [None]:
# Summary statistics of the sampled rows
df_smaller.describe()

Separate out the input features from the output target `'pce'`

In [None]:
# Six input feature columns -> 2-D array with one column per feature
X = df_smaller[['mass', 'voc', 'jsc', 'e_homo_alpha', 'e_gap_alpha', 
        'e_lumo_alpha']].values
# Target column; the double brackets keep it 2-D (a single column) rather than flat
Y = df_smaller[['pce']].values

Let's create the test / train split from this data and keep 20% for testing.  

In [None]:
# Hold out 20% of the rows for testing; `seed` makes the shuffle reproducible.
# The `_pn` arrays are the features before scaling (scaled copies are made later).
X_train_pn, X_test_pn, y_train, y_test = train_test_split(X, Y,
                                                    test_size=0.20,
                                                    random_state=seed)

Now we need to rescale the input features of the training set using the `StandardScaler()` class [more info](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) and store the scaler so that we can reuse it on future test sets. We are rescaling the input features because we do not want one of them to matter more than the others simply because of its scale.

In [None]:
# Fit the standardizer on the training features only and keep the fitted
# scaler around so the same transform can be applied to other data later;
# fit_transform() fits the scaler and returns the scaled training data
X_train_scaler = StandardScaler()
X_train = X_train_scaler.fit_transform(X_train_pn)

Now let's reuse that scaler transform on the test set.  Because the scaler was fitted on the training data only, no information from the test set leaks into the preprocessing.  We'll start with a histogram of the testing data before scaling, just to prove to ourselves it is working.

In [None]:
# Histogram the unscaled test-set mass and voc columns side by side
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
panels = [(0, 'mass', 'mediumvioletred'), (1, 'voc', 'royalblue')]
for col, name, colour in panels:
    ax[col].hist(X_test_pn[:, col], alpha=0.6, color=colour)
    ax[col].set_xlabel(name)
    ax[col].set_ylabel('count')
    ax[col].set_title(name + ' distribution', fontsize=18)
plt.tight_layout()
plt.show()

OK, now apply the training scaler transform to the test and plot a histogram

In [None]:
# Scale the test features with the scaler that was fitted on the training set
X_test = X_train_scaler.transform(X_test_pn)

In [None]:
# Same two histograms as before, now for the scaled test-set columns
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
panels = [(0, 'mass', 'mediumvioletred'), (1, 'voc', 'royalblue')]
for col, name, colour in panels:
    ax[col].hist(X_test[:, col], alpha=0.6, color=colour)
    ax[col].set_xlabel(name)
    ax[col].set_ylabel('count')
    ax[col].set_title(name + ' distribution', fontsize=18)
plt.tight_layout()
plt.show()

### 2. Creating the neural network model

This is a simple neural network with one hidden layer and one output layer. Here we will use `Keras` functions [Keras documentation](https://keras.io/guides/). We will use `Dense` layers as defined [here](https://keras.io/api/layers/core_layers/dense/), and the `Adam` [optimizer](https://keras.io/api/optimizers/adam/) which relies on gradient descent.

In [None]:
def simple_network():
    """Exercise scaffold: build and compile a one-hidden-layer regression model.

    The steps below are left for the reader to implement. As written, nothing
    in this body assigns ``model``, so calling the function raises
    ``NameError`` unless a global ``model`` happens to exist.
    """
    # Define a sequential model object

    # Add a layer to your model - Note - THIS IS YOUR FIRST HIDDEN LAYER - input layer is defined by input_dim!
    # Use kernel_initializer='normal' and  activation='relu'

    # Add another layer to your model - this is the output layer

    # Compile your model using the mean_squared_error for your loss and the adam optimizer

    return model

In [None]:
# Create an instance of your simple_network() function

# Print its summary()


### 3. Training the NN

Train the neural network with the following

In [None]:
# Set the numpy random number generator - i.e. get reproducible starting weights 


# Create the NN estimator with the KerasRegressor and your simple_network function, 
# use 150 epochs and a batch_size of 10000


# Fit your estimator to your *** data with a validation_split of 0.3 and the same 
# epochs and batch size as above; save the results in an object you call "history"


The history object returned by the `fit` call contains the information in a fitting run.

In [None]:
# List which per-epoch quantities were recorded during fitting
print(history.history.keys())

In [None]:
# Report the last recorded training and validation loss
print("final MSE for train is %.2f and for validation is %.2f" % 
      (history.history['loss'][-1], history.history['val_loss'][-1]))

Let's plot it!

In [None]:
# Training vs. validation loss curves over the epochs
plt.plot(history.history['loss'], '--', c='b', label='train')
plt.plot(history.history['val_loss'], c='crimson', label='validation')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.title('model loss')
plt.legend(loc='upper left')
plt.show()

Let's get the MSE for the test set.

In [None]:
# Test set error: evaluate the fitted Keras model on the scaled held-out data
test_loss = estimator.model.evaluate(X_test, y_test)
# NOTE(review): the %.2f format assumes evaluate() returns a single scalar
# loss (i.e. the model was compiled without extra metrics) - confirm
print("test set mse is %.2f" % test_loss)

So our test mse is **very similar** to the training and validation mse at the final step!

## Exercise

Test the following

* 1) Change the **number of neurons in each layer** - do you get any errors?
* 2) Change the **optimizer** - see [here](https://keras.io/api/optimizers/) for a list of optimizers in Keras
* 3) Change the **activation functions** of both the hidden layer and the output layer, does your cost function final value change?
* 4) Test the model by predicting on the **unscaled `X_test_pn`**, how does the loss change? 
* 5) Train the model with the **unscaled input features** `X_train_pn` - how does the loss / cost function change?


In [None]:
# Define a new function which now takes activation functions as input
def simple_network_2(n1, n2, act1, act2):
    """Exercise scaffold: a network whose activations are passed in.

    Parameters
    ----------
    n1, n2 :
        Not referenced by the steps below - presumably the neuron counts of
        the first and second layer; confirm when implementing.
    act1, act2 :
        Activation functions for the first and second layer, per the steps
        below.

    As written, nothing in this body assigns ``model``, so calling the
    function raises ``NameError`` unless a global ``model`` happens to exist.
    """
    # Define the sequential model
    
    # Add first hidden layer and use activation=act1

    # Add second hidden layer and use activation=act2
    # NOTE(review): no separate output-layer step is listed here - confirm
    # whether this second layer is meant to be the output layer

    # Compile the model as we had done before

    return model

In [None]:
# Reset your random number generator


# Create your NN estimator using Keras Regressor - specify your two activation functions


# Fit your estimator to your *** data with the same validation split epochs etc as before - store the results in a variable


###  Let's look at another way to evaluate the set of models using cross validation

Use 10 fold cross validation to evaluate the models generated from our training set.  We'll use scikit-learn's tools for this.  Remember, this is only assessing our training set.  If you get negative values, it is because the Keras wrapper reports the negative of the loss as its score (scikit-learn expects larger scores to be better), so we have to flip the signs on the results of `cross_val_score` to recover the MSE.

In [None]:
# 10-fold split of the training data, shuffled reproducibly with `seed`
kfold = KFold(n_splits=10, random_state=seed, shuffle=True)

# One score per fold
results = cross_val_score(estimator, X_train, y_train, cv=kfold)
# The mean is multiplied by -1 so that a positive MSE is printed; this assumes
# the scores come back negated - confirm for your Keras version
print("Results: %.2f (%.2f) MSE" % (-1 * results.mean(), results.std()))

### 4. Increase complexity of NN

Let's add a second hidden layer this time. Note: it is worthwhile to test how the final loss changes with number of epochs of training as well as learning rate. 

In [None]:
def medium_network():
    """Build and compile a regression network with two hidden layers.

    Architecture: 6 inputs -> Dense(600, relu) -> Dense(120, relu) -> Dense(1),
    every layer using the 'normal' kernel initializer. The model is compiled
    with a mean-squared-error loss and an Adam optimizer at learning rate 0.9.

    Returns the compiled Sequential model.
    """
    layers = [
        Dense(600, input_dim=6, kernel_initializer='normal', activation='relu'),
        Dense(120, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_squared_error',
                  optimizer=keras.optimizers.Adam(learning_rate=0.9))
    return model

In [None]:
# Build one instance of the network and print its summary
my_model = medium_network()
my_model.summary()

In [None]:
# Seed NumPy's global RNG (whether this also fixes the Keras weight
# initialisation depends on the backend - TODO confirm)
np.random.seed(seed)
# Wrap the model-building function as a scikit-learn style estimator
estimator = KerasRegressor(build_fn=medium_network,
        epochs=150, batch_size=10000, verbose=0)
# Fit to the scaled training data, holding out 30% of it for validation
# (epochs, batch_size and verbose repeat the values given to the constructor)
history = estimator.fit(X_train, y_train, validation_split=0.30, epochs=150, 
        batch_size=10000, verbose=0)

# Report the last recorded training and validation loss
print("Final MSE for train is %.3e and for validation is %.3e" % 
      (history.history['loss'][-1], history.history['val_loss'][-1]))

In [None]:
# Loss curves for the medium network, zoomed in on the first 50 epochs
plt.plot(history.history['loss'], '--', c='b', label='train')
plt.plot(history.history['val_loss'], c='crimson', label='validation')
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.xlim([0, 50])
plt.ylim([0, 20])
plt.legend(loc='upper left')
plt.show()

In [None]:
# Evaluate the fitted Keras model on the scaled held-out test data
test_loss = estimator.model.evaluate(X_test, y_test)
print("test set mse is %.3e" % test_loss)

In [None]:
def medium_network(lr=0.8):
    """Build and compile a small two-hidden-layer regression network.

    This redefines ``medium_network`` with narrower layers and a configurable
    learning rate: 6 inputs -> Dense(6, relu) -> Dense(12, relu) -> Dense(1),
    every layer using the 'normal' kernel initializer.

    Parameters
    ----------
    lr : float, default 0.8
        Learning rate handed to the Adam optimizer.

    Returns the Sequential model compiled with a mean-squared-error loss.
    """
    layers = [
        Dense(6, input_dim=6, kernel_initializer='normal', activation='relu'),
        Dense(12, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_squared_error',
                  optimizer=keras.optimizers.Adam(learning_rate=lr))
    return model

In [None]:
# Seed NumPy's global RNG once before the sweep
np.random.seed(seed)

# Each entry of MSE is [final train loss, final validation loss, test loss]
# for the learning rate at the same position in lr_vals
MSE = []
lr_vals = [0.00001, 0.0001, 0.01, 0.1, 1.0]
for lrate in lr_vals:
    # `lr` is forwarded by the wrapper to medium_network(lr=...)
    estimator = KerasRegressor(build_fn=medium_network,
                               epochs=150, batch_size=10000,
                               verbose=0, lr=lrate)
    history = estimator.fit(X_train, y_train, validation_split=0.30,
                            epochs=150, batch_size=10000,
                            verbose=0)
    train_mse = history.history['loss'][-1]
    val_mse = history.history['val_loss'][-1]
    print("Final MSE for train is %.3e and for validation is %.3e" %
          (train_mse, val_mse))
    test_mse = estimator.model.evaluate(X_test, y_test)
    MSE.append([train_mse, val_mse, test_mse])
    
    

In [None]:
# Final train / validation / test loss for each learning rate that was tried
plt.plot(lr_vals, [m[0] for m in MSE], '-o', lw=3, label='train')
plt.plot(lr_vals, [m[1] for m in MSE], '-.s', lw=3, label='validation')
plt.plot(lr_vals, [m[2] for m in MSE], ':>', lw=3, label='test')
# The learning rates span several orders of magnitude, so use a logarithmic
# x-axis; on a linear axis the smallest values all pile up next to zero
plt.xscale('log')
plt.xlabel('learning rate')
plt.ylabel('loss')
plt.legend()
plt.show()