In [1]:
#import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from keras.models import Model, load_model # for creating a Neural Network Autoencoder model
from keras import Input # for instantiating a keras tensor
from keras.layers import Dense # for adding layers to AE model
from tensorflow.keras.utils import plot_model #for plotting  model charts
from tensorflow.keras import models,layers,activations,losses,optimizers,metrics
from sklearn.preprocessing import RobustScaler,StandardScaler
from keras import regularizers
from sklearn.model_selection import train_test_split,cross_validate
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, TransformerMixin
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeRegressor

In [2]:
#oad data
train_data = pd.read_pickle("EDA_train_mean.pkl")
test_data = pd.read_pickle("EDA_test_median.pkl")

In [3]:
#separate target variable
y = train_data['windmill_generated_power(kW/h)']
train_data.drop(['windmill_generated_power(kW/h)'], axis=1, inplace = True)

In [4]:
# Create an instance of the scaler
scaler = StandardScaler()

# Fit the scaler to your data
scaler.fit(train_data)

# Transform the data using the scaler
scaled_data = scaler.transform(train_data)

In [5]:
#split dataset 

X_train, X_test, y_train, y_test = train_test_split(scaled_data, y, random_state=0)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((21055, 33), (21055,), (7019, 33), (7019,))

### GENERAL AUTOENCODER

In [6]:
n_inputs = X_train.shape[1] # number of input neurons = the number of features in X_train
n_bottleneck= 50

#--- Input Layer 
visible = Input(shape=(n_inputs,), name='Input-Layer') # Specify input shape

#--- Encoder Layer
e = Dense(units=200, name='Encoder-Layer1',activation=activations.relu, activity_regularizer=regularizers.l1(1e-3))(visible)
e = Dense(units=100, name='Encoder-Layer2',activation=activations.relu)(e)

#--- Bottleneck
bottleneck = Dense(units=n_bottleneck, name='Bottleneck-Layer')(e)

#--- Decoder Layer
d = Dense(units=100, name='Decoder-Layer1',activation=activations.relu)(bottleneck)
d = Dense(units=200, name='Decoder-Layer2',activation=activations.relu)(d)

#--- Output layer
output = Dense(units=n_inputs, activation='relu', name='Output-Layer')(d)

# Define autoencoder model
model = Model(inputs=visible, outputs=output, name='Autoencoder-Model')

# Compile autoencoder model
model.compile(optimizer='adam', loss='mae')

# Print model summary
print(model.summary())

# Train the autoencoder model
model.fit(X_train, X_train, epochs=30, batch_size=16, verbose=2, validation_data=(X_test, X_test))

Model: "Autoencoder-Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input-Layer (InputLayer)    [(None, 33)]              0         
                                                                 
 Encoder-Layer1 (Dense)      (None, 200)               6800      
                                                                 
 Encoder-Layer2 (Dense)      (None, 100)               20100     
                                                                 
 Bottleneck-Layer (Dense)    (None, 50)                5050      
                                                                 
 Decoder-Layer1 (Dense)      (None, 100)               5100      
                                                                 
 Decoder-Layer2 (Dense)      (None, 200)               20200     
                                                                 
 Output-Layer (Dense)        (None, 33)          

<keras.callbacks.History at 0x27967b1db20>

### Random Forest and General Autoencoder

In [7]:
X_encoded = model.predict(train_data)

# encode the train data
X_train_encode = model.predict(X_train)

# encode the test data
X_test_encode = model.predict(X_test)

# Train the random forest model using the encoded features
rfr = RandomForestRegressor()
rfr.fit(X_train_encode, y_train)

#reshape test data
#X_test_encode_new = X_test_encode.reshape(-1, 1)
#y_test_new = y_test.to_numpy().reshape(-1, 1)

# prediction 
pred = rfr.predict(X_test_encode)

#metrics 
print('MAE:', mean_absolute_error(y_test,pred))
print('MSE:', mean_squared_error(y_test, pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))
print('Train score:',rfr.score(X_train_encode,y_train))
print('Test score:',rfr.score(X_test_encode,y_test))

MAE: 1.0244684988319697
MSE: 2.0675203153210084
RMSE: 1.4378874487667692
Train score: 0.9595379966699928
Test score: 0.7236431215879027


### Decision tree and Denoising Autoencoder

In [14]:
# Specify how much noise to add
level_of_noise = 0.01

# Add random noise based on sampling from Gaussian distribution
X_train_noisy = X_train + level_of_noise * np.random.normal(loc=0.0, scale=1.0, size=X_train.shape)
X_test_noisy = X_test + level_of_noise * np.random.normal(loc=0.0, scale=1.0, size=X_test.shape)

# Enforce min-max boundaries so it does not go beyond [0,1] range
X_train_noisy = np.clip(X_train_noisy, 0., 1.)
X_test_noisy = np.clip(X_test_noisy, 0., 1.)
# Print shapes
print("New shape of X_train: ", X_train.shape)
print("New shape of X_test: ", X_test.shape)

print("New shape of X_train_noisy: ", X_train_noisy.shape)
print("New shape of X_test_noisy: ", X_test_noisy.shape)


#--- Define Shapes
n_inputs = X_train_noisy.shape[1] # number of input neurons = the number of features in X_train
n_bottleneck= 50

#--- Input Layer 
visible = Input(shape=(n_inputs,), name='Input-Layer') # Specify input shape

#--- Encoder Layer
e = Dense(units=200, name='Encoder-Layer1',activation=activations.relu, activity_regularizer=regularizers.l1(1e-3))(visible)
e = Dense(units=100, name='Encoder-Layer2',activation=activations.relu)(e)

#--- Bottleneck
bottleneck = Dense(units=n_bottleneck, name='Bottleneck-Layer')(e)

#--- Decoder Layer
d = Dense(units=100, name='Decoder-Layer1',activation=activations.relu)(bottleneck)
d = Dense(units=200, name='Decoder-Layer2',activation=activations.relu)(d)

#--- Output layer
output = Dense(units=n_inputs, activation='relu', name='Output-Layer')(d)

# Define autoencoder model
model = Model(inputs=visible, outputs=output, name='Autoencoder-Model')

# Compile autoencoder model
model.compile(optimizer='adam', loss='mae')

# Print model summary
print(model.summary())

# Fit the autoencoder model to reconstruct input
history = model.fit(X_train_noisy, X_train_noisy, epochs=30, batch_size=32, verbose=1, validation_data=(X_test_noisy, X_test_noisy))

X_encoded_denoise = model.predict(train_data)

# encode the train data
X_train_encode_denoise = model.predict(X_train_noisy)

# encode the test data
X_test_encode_denoise = model.predict(X_test_noisy)


# Train the random forest model using the encoded features
rfr_denoise = RandomForestRegressor()
rfr_denoise.fit(X_train_encode_denoise, y_train)

#test prediction
prediction_denoise = rfr_denoise.predict(X_test_encode_denoise)

# Calculate the MAE and MSE
mae = mean_absolute_error(y_test, prediction_denoise)
mse = mean_squared_error(y_test, prediction_denoise)
print('RMSE:', np.sqrt(mean_squared_error(y_test, prediction_denoise)))
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print('Train score:',rfr_denoise.score(X_train_encode_denoise,y_train))
print('Test score:',rfr_denoise.score(X_test_encode_denoise,y_test))

New shape of X_train:  (21055, 33)
New shape of X_test:  (7019, 33)
New shape of X_train_noisy:  (21055, 33)
New shape of X_test_noisy:  (7019, 33)
Model: "Autoencoder-Model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input-Layer (InputLayer)    [(None, 33)]              0         
                                                                 
 Encoder-Layer1 (Dense)      (None, 200)               6800      
                                                                 
 Encoder-Layer2 (Dense)      (None, 100)               20100     
                                                                 
 Bottleneck-Layer (Dense)    (None, 50)                5050      
                                                                 
 Decoder-Layer1 (Dense)      (None, 100)               5100      
                                                                 
 Decoder-Layer2 (Dense)      (Non

  updates = self.state_updates


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  updates=self.state_updates,


RMSE: 1.5511600751607546
Mean Absolute Error: 1.1300489615264564
Mean Squared Error: 2.406097578772718
Train score: 0.953237496978968
Test score: 0.6783869009184109


### Random Forest and Variational Autoencoder


In [9]:
from tensorflow.python.framework.ops import disable_eager_execution
disable_eager_execution()
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras import backend as K
import tensorflow as tf


batch_size = 64
original_dim = (X_train.shape[1])
latent_dim = 50
intermediate_dim1 = 200
intermediate_dim2 = 100
epochs = 30
epsilon_std = 0.000001


# sampling from mean and sd in VAE
def sampling(args: tuple):
    # we grab the variables from the tuple
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim), mean=0.,
                              stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon

#########################
# input to our encoder
#########################
x = Input(shape=(original_dim,), name="input")

# intermediate layer
#h1 = Dense(intermediate_dim1, activation='tanh', name="encoding1")(x)
h1 = Dense(intermediate_dim1, activation="tanh", name="encoding1", activity_regularizer=regularizers.l1(10e-5))(x)
h = Dense(intermediate_dim2, activation='tanh', name="encoding")(h1)


# defining the mean of the latent space
z_mean = Dense(latent_dim, name="mean")(h)

# defining the log variance of the latent space
z_log_var = Dense(latent_dim, name="log-variance")(h)

# note that "output_shape" isn't necessary with the TensorFlow backend
z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])

# defining the encoder as a keras model
encoder = Model(x, [z_mean, z_log_var, z], name="encoder")

# print out summary of what we just did
encoder.summary()


#########################
# Input to the decoder
#########################

input_decoder = Input(shape=(latent_dim,), name="decoder_input")

# taking the latent space to intermediate dimension
decoder_h1 = Dense(intermediate_dim2, activation='relu', name="decoder_h2")(input_decoder)
decoder_h = Dense(intermediate_dim1, activation='relu', name="decoder_h")(decoder_h1)

# getting the mean from the original dimension
x_decoded = Dense(original_dim, activation='tanh', name="flat_decoded")(decoder_h)

# defining the decoder as a keras model
decoder = Model(input_decoder, x_decoded, name="decoder")
decoder.summary()

##########################
# Variational Autoencoder
##########################

# grab the output. Recall, that we need to grab the 3rd element our sampling z
output_combined = decoder(encoder(x)[2])

# link the input and the overall output
vae = Model(x, output_combined)

# print out what the overall model looks like
vae.summary()

# Defina VAE Loss Function
def vae_loss(x: tf.Tensor, x_decoded_mean: tf.Tensor,z_log_var=z_log_var, z_mean=z_mean, original_dim=original_dim):
    xent_loss = original_dim * metrics.mae(x, x_decoded_mean)
    kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
    vae_loss = K.mean(xent_loss + kl_loss)
    return vae_loss

vae.compile(optimizer='adam', loss=vae_loss, metrics=['mae'],experimental_run_tf_function=False)

history = vae.fit(X_train, X_train,
                    epochs=epochs,
                    batch_size=batch_size,
                    shuffle=True,
                    validation_data=(X_test, X_test),
                    verbose=1).history


X_encoded = vae.predict(train_data)

# encode the train data
X_train_encode_vae = vae.predict(X_train)

# encode the test data
X_test_encode_vae = vae.predict(X_test)

# Train the random forest model using the encoded features
rfr_vae = RandomForestRegressor()
rfr_vae.fit(X_train_encode_vae, y_train)

def get_error_term(v1, v2, _rmse=True):
    if _rmse:
        return np.sqrt(np.mean((v1 - v2) ** 2))
    #return MAE
    return np.mean(abs(v1 - v2))

X_train_pred = rfr_vae.predict(X_train_encode_vae)
mae_vector_train = get_error_term(X_train_pred, y_train, _rmse=False)
                   
X_pred = rfr_vae.predict(X_test_encode_vae)
mae_vector_test = get_error_term(X_pred, y_test, _rmse=False)
                   

#metrics 
print('MAE:', mean_absolute_error(y_test,X_pred))
print('MSE:', mean_squared_error(y_test, X_pred))
print('RMSE:', np.sqrt(mean_squared_error(y_test, X_pred)))
print('Train score:',rfr_vae.score(X_train_encode_vae,y_train))
print('Test score:',rfr_vae.score(X_test_encode_vae,y_test))

Model: "encoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input (InputLayer)             [(None, 33)]         0           []                               
                                                                                                  
 encoding1 (Dense)              (None, 200)          6800        ['input[0][0]']                  
                                                                                                  
 encoding (Dense)               (None, 100)          20100       ['encoding1[0][0]']              
                                                                                                  
 mean (Dense)                   (None, 50)           5050        ['encoding[0][0]']               
                                                                                            

  updates = self.state_updates


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


  updates=self.state_updates,


MAE: 0.6375983816259955
MSE: 1.0461761983753184
RMSE: 1.022827550653246
Train score: 0.9809442238069822
Test score: 0.860161960049641
