In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split as split
from tensorflow.keras.layers import SimpleRNN, Input, Dense
from tensorflow.keras.models import Model

from deap import base, creator, tools, algorithms
from scipy.stats import bernoulli
from bitstring import BitArray

In [4]:
np.random.seed(998)

In [5]:
#read data from csv
data = pd.read_csv('../Dataset/train.csv')
#use column wp2
data = np.reshape(np.array(data['wp2']), (len(data['wp2']), 1))
data = data[0:1500]

In [6]:
def format_dataset(data, w_size):
    #initialize as empty array
    X, Y = np.empty((0, w_size)), np.empty(0)
    #depending on the window size the data is separated in 2 arrays containing each of the sizes
    for i in range(len(data)-w_size-1):
        X = np.vstack([X,data[i:(i+w_size),0]])
        Y = np.append(Y, data[i+w_size,0])
    X = np.reshape(X,(len(X),w_size,1))
    Y = np.reshape(Y,(len(Y), 1))
    return X, Y


In [7]:
#use GA to identify the optimal window size for the array
def training_hyperparameters(ga_optimization):
    #decode GA solution to integer window size and number of units
    w_size_bit = BitArray(ga_optimization[0:6])
    n_units_bit = BitArray(ga_optimization[6:])
    w_size = w_size_bit.uint
    n_units = n_units_bit.uint
    print('\nWindow Size: ', w_size, '\nNumber of units: ',n_units)

    #return fitness score of 100 if the size or the units are 0
    if w_size == 0 or n_units == 0:
        return 100

    #segment train data on the window size splitting it into 90 train, 10 validation
    X,Y = format_dataset(data, w_size)
    X_train, X_validate, Y_train, Y_validate = split(X, Y, test_size= 0.10, random_state= 998)

    #train RNNSimple model and predict validation set
    input_features = Input(shape=(w_size,1))
    x = SimpleRNN(n_units,input_shape=(w_size,1))(input_features)
    output = Dense(1, activation='linear')(x)
    rnnmodel = Model(inputs=input_features, outputs = output)
    rnnmodel.compile(optimizer='adam', loss = 'mean_squared_error')
    rnnmodel.fit(X_train, Y_train, epochs=5, batch_size=4, shuffle = True)
    Y_predict = rnnmodel.predict(X_validate)

    # calculate RMSE score as fitness score for GA
    RMSE = np.sqrt(mean_squared_error(Y_validate, Y_predict))
    print('Validation RMSE: ', RMSE, '\n')

    return RMSE,

In [8]:

population_size = 4
generations = 5
gene = 10

In [9]:
creator.create('FitnessMax', base.Fitness, weights= (-1.0,))
creator.create('Individual', list, fitness = creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register('bernoulli', bernoulli.rvs, 0.5)
toolbox.register('chromosome', tools.initRepeat, creator.Individual, toolbox.bernoulli, n = gene)
toolbox.register('population', tools.initRepeat, list, toolbox.chromosome)

toolbox.register('mate', tools.cxTwoPoint)
toolbox.register('mutate', tools.mutFlipBit, indpb = 0.6)
toolbox.register('select', tools.selRandom)
toolbox.register('evaluate', training_hyperparameters)

population = toolbox.population(n = population_size)
algo = algorithms.eaSimple(population,toolbox,cxpb=0.4, mutpb=0.1, ngen=generations, verbose=False)



Window Size:  51 
Number of units:  15
Train on 1303 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation RMSE:  0.06586873044018556 


Window Size:  28 
Number of units:  4
Train on 1323 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation RMSE:  0.07353012736054065 


Window Size:  36 
Number of units:  4
Train on 1316 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation RMSE:  0.09677601218570575 


Window Size:  48 
Number of units:  2
Train on 1305 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation RMSE:  0.11537903745571343 


Window Size:  48 
Number of units:  10
Train on 1305 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation RMSE:  0.05835663344859566 


Window Size:  51 
Number of units:  7
Train on 1303 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation RMSE:  0.09420397211906538 


Window Size:  32 
Number of units:  4
Train on 1320 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
E

In [10]:
optimal_chromosome = tools.selBest(population, k = 1)
optimal_w_size = None
optimal_n_units = None

for op in optimal_chromosome:
    w_size_bit = BitArray(op[0:6])
    n_units_bit = BitArray(op[6:])
    optimal_w_size = w_size_bit.uint
    optimal_n_units = n_units_bit.uint
    print('\nOptimal window size:', optimal_w_size, '\n Optimal number of units:', optimal_n_units)


Optimal window size: 48 
 Optimal number of units: 15
