In [70]:
#################################################################
# Code      Mass_Suite_NNmodel_Keras v1.2
# Version   1.2
# Date      2020-04-28
# Author    Bowei Zhang, boweiz@uw.edu
# Copyright Bowei Zhang, University of Washington, Seattle, WA

# Note      This code can train a Neural Network using Keras
#           Python V3.7.4
#           Tensorflow V2.0.0
#################################################################

In [8]:
from datetime import datetime
import os

import numpy as np
import pandas as pd
import scipy.interpolate
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow import keras

In [9]:
# CSV file that is read as the training dataset
input_file = "dataset_training_2ndedit.csv"

# Feature columns fed to the network. These strings are used verbatim as
# DataFrame column keys, so they must match the CSV header exactly.
# NOTE(review): "assymetric factor" is presumably spelled this way in the CSV
# header -- do not correct it here without renaming the column in the file.
inputs = [
    "peak width in min",
    "half intensity width in min",
    "left width",
    "right width",
    "assymetric factor",
    "integration",
    "sn",
    "hw",
    "ab",
    "peak height",
    "ma",
    "mb",
    "broad rate",
    "skewness",
    "variance",
]

# Target column(s); the training cell builds one set of models per entry
outputs = ["label"]

In [19]:
# ---------------------------------------------------------------------------
# Hyperparameters and training options
# ---------------------------------------------------------------------------

# Feature scaling mode applied to the input columns:
#   0 -> no scaling
#   1 -> x_i = (x_i - x_mean) / std
#   2 -> x_i = (x_i - x_min) / (x_max - x_min)
#   3 -> x_i = 2 * (x_i - x_min) / (x_max - x_min) - 1
normalize = 1

# Network shape: number of hidden layers and units per hidden layer.
# Note: the training cell adds one Dense(nodes) layer with `input_dim` in front
# of these, so the network holds layers + 1 Dense layers before the output layer.
layers = 3
nodes = 8

# Activation function of every layer except the output layer
activation = "relu"

# Initializer used for both kernels and biases,
# e.g. glorot_uniform, he_normal, uniform, zeros ...
initializer = "glorot_uniform"

# Step size handed to the optimizer
learning_rate = 0.002

# Optimizer used for training (alternatives: SGD, RMSprop, Adagrad, Adam ...)
my_optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

# Loss function name passed to compile()
loss = "sparse_categorical_crossentropy"

# Strength of the L1 penalty, and the regularizer object built from it that is
# attached to every layer's kernel
weight = 0.01
regularizer = keras.regularizers.l1(weight)

# Training length. One epoch is made up of one or more batches, see
# https://machinelearningmastery.com/difference-between-a-batch-and-an-epoch/
epochs = 500
batch = 32

# How many models are trained per output column
num = 3

# Load the full dataset from the CSV file. No rows are filtered out here.
df = pd.read_csv(input_file)

# Total number of rows in the dataset
data_number = len(df)

# Scale the feature columns in place according to `normalize`
# (1: z-score, 2: min-max, 3: min-max rescaled with 2*x - 1; anything else: no scaling).
# A constant column has zero std / zero range; dividing by it would fill the
# column with NaN and make the training loss NaN, so a zero divisor is replaced
# by 1. Such a column then becomes all 0 (modes 1 and 2) or all -1 (mode 3).
# NOTE(review): the statistics are computed on the whole dataset before the
# train/validation split, so validation rows influence the scaling.
if normalize == 1:
    df[inputs] = (
        (df[inputs] - df[inputs].mean())
        / df[inputs].std().replace(0, 1)
    )
elif normalize == 2:
    df[inputs] = (
        (df[inputs] - df[inputs].min())
        / (df[inputs].max() - df[inputs].min()).replace(0, 1)
    )
elif normalize == 3:
    df[inputs] = (
        2 * (df[inputs] - df[inputs].min())
        / (df[inputs].max() - df[inputs].min()).replace(0, 1)
        - 1
    )

# Draft of a custom loss function (currently disabled).
# NOTE: the draft is wrapped in a bare triple-quoted string instead of `#`
# comments, so none of it is executed. Because the string is the last
# expression of the cell, the notebook echoes it as the cell output.
# The draft refers to `k`, which is not imported in the visible cells.
'''
def loss_func():
	def losss(y_true, y_pred):
		mean_squared_losss = k.mean(k.sum(k.square(y_true - y_pred)))
		layer = model.get_layer(index=-1)
		grad = k.gradients(y_true, y_pred)
		print(grad)
		return keras.losses.mean_squared_error(y_true, y_pred)+\
				keras.losses.mean_absolute_error(y_true, y_pred)
	return losss
'''

'\ndef loss_func():\n\tdef losss(y_true, y_pred):\n\t\tmean_squared_losss = k.mean(k.sum(k.square(y_true - y_pred)))\n\t\tlayer = model.get_layer(index=-1)\n\t\tgrad = k.gradients(y_true, y_pred)\n\t\tprint(grad)\n\t\treturn keras.losses.mean_squared_error(y_true, y_pred)+\t\t\t\tkeras.losses.mean_absolute_error(y_true, y_pred)\n\treturn losss\n'

In [20]:
# Split the data: 70% training rows, 30% validation rows.
# NOTE(review): no random_state is passed, so the split differs on every run.
train, validate = train_test_split(df, test_size=0.3)
x_train = train[inputs].values
x_validate = validate[inputs].values

# Maps each output column name to the list of models trained for it
trained_models = {}

for output in outputs:
    y_train = train[output].values

    # Build the reference network
    model = keras.models.Sequential()
    # First Dense layer; input_dim fixes the number of input features
    model.add(keras.layers.Dense(nodes,
                                 input_dim=len(inputs),
                                 kernel_initializer=initializer,
                                 bias_initializer=initializer,
                                 activation=activation,
                                 kernel_regularizer=regularizer))

    # `layers` further hidden Dense layers with the same settings
    for _ in range(layers):
        model.add(keras.layers.Dense(nodes,
                                     kernel_initializer=initializer,
                                     bias_initializer=initializer,
                                     activation=activation,
                                     kernel_regularizer=regularizer))

    # Output layer: softmax over 4 units.
    # NOTE(review): with sparse_categorical_crossentropy this assumes the target
    # column holds integer class ids 0-3 -- confirm against the dataset.
    model.add(keras.layers.Dense(4,
                                 kernel_initializer=initializer,
                                 bias_initializer=initializer,
                                 activation="softmax",
                                 kernel_regularizer=regularizer))

    # `num` models in total: the reference network plus num - 1 clones with the
    # same config. clone_model creates new layers and new weights; nothing is
    # shared with the reference network.
    models = [model]
    for _ in range(num - 1):
        models.append(keras.models.clone_model(model))

    for net in models:
        # Give every model its own optimizer, rebuilt from the configured one.
        # Sharing a single optimizer instance carries optimizer state (e.g. the
        # step counter) from one model's training into the next, and recent
        # Keras versions refuse to reuse an optimizer that was already built
        # for another model's variables.
        net.compile(loss=loss,
                    optimizer=type(my_optimizer).from_config(my_optimizer.get_config()),
                    metrics=["accuracy"])

        # Train; each epoch runs over the training rows in batches of `batch`
        net.fit(x_train, y_train, epochs=epochs, batch_size=batch,
                shuffle=True, verbose=0)

    trained_models[output] = models

In [22]:
# Evaluate every trained model on the validation set and keep the most
# accurate one for each output column.
model_evals = {}  # output name -> (index of best model, errors DataFrame)
best_models = {}  # output name -> best model

for output in outputs:
    models = trained_models[output]
    errors = pd.DataFrame()  # placeholder; nothing is written into it in this cell
    y_validate = validate[output].values

    best_model_num = 0
    max_accuracy = 0
    for i in range(num):
        # scores[0] is the loss, scores[1] the accuracy metric set in compile()
        scores = models[i].evaluate(x_validate, y_validate, verbose=0)
        print("Accuracy for validation set",i)
        print("%.3f" % (scores[1]))
        if scores[1] > max_accuracy:
            best_model_num = i
            max_accuracy = scores[1]

    model_evals[output] = (best_model_num, errors)
    best_models[output] = models[best_model_num]

    # The value reported is the maximum accuracy found, so call it the highest
    print("The highest accuracy is {:.3f}, from model {:d}".format(max_accuracy, best_model_num))

# Save the best model of the last output column processed above.
# The file name is built from (month, day, hour, minute, second); the year is
# not part of it.
time = datetime.now().timetuple()[1:6]
os.makedirs('./models', exist_ok=True)
best_models[output].save('./models/model_{}_{}_{}_{}_{}.h5'.format(*time))

Accuracy for validation set 0
0.629
Accuracy for validation set 1
0.678
Accuracy for validation set 2
0.679
The lowest accuracy is 0.679, from model 2


In [10]:
# for output in outputs:
#     evaluation = model_evals[output]
#     plt.figure(figsize=(8, 4), dpi=300)
#     ax = plt.boxplot(evaluation[1].values, vert=False)
#     ax = plt.title('Percent Error of Model for Predicting {}'.format(output), size=20)
#     ax = plt.xlabel('Percent error (%)', size=18)
#     ax = plt.ylabel('Model number', size=18)
#     ax = plt.xlim(x_limit)
#     ax = plt.xticks(size=18)
#     ax = plt.yticks(size=18)
#     plt.show()

In [11]:
# def cdf_func(data, bins=200):
#     hist_data, bin_edges = np.histogram(data, bins=bins, density=True)
#     cdf_bins = np.cumsum(hist_data * np.diff(bin_edges))
#     cdf_bins = np.insert(cdf_bins, 0, 0)
#     cdf = scipy.interpolate.interp1d(bin_edges, cdf_bins, bounds_error=False, fill_value=-1)
#     return cdf

In [12]:
# for output in outputs:
#     best_model_num, evaluation = model_evals[output]
#     best_evaluation = evaluation[best_model_num]
#     x_range = np.linspace(np.min(best_evaluation), np.max(best_evaluation), 1000)

#     fig, ax1 = plt.subplots(figsize=(8,6), dpi=300)

#     ax1.hist(best_evaluation, bins=100, density=True, color='#F25F5C')
#     ax1.tick_params(axis='y', labelcolor='#F25F5C')
#     ax1.set_ylabel('Probability', size=18, color='#F25F5C')
#     ax1.set_xlabel('Percent error (%)', size=18)
#     ax1.set_title('Percent error Distribution, {}'.format(output), size=20)
#     ax1.tick_params(labelsize=18)
#     ax1.set_xlim(x_limit[0], x_limit[1])
#     ax1.set_ylim(y_limit[0], y_limit[1])

#     ax2 = ax1.twinx()
#     ax2.tick_params(axis='y', labelcolor='#247BA0')
#     ax2.set_ylabel('Cumulative Density Function', size=18, color='#247BA0')
#     ax2.tick_params(labelsize=18)

#     ax2.plot(x_range, cdf_func(best_evaluation, 100)(x_range), label='Measured CDF from sample', color='#247BA0')