### Import libraries

In [None]:
import pandas as pd
import numpy as np
import math
from math import sqrt
import tensorflow as tf
from matplotlib import pyplot as plt
from keras.models import Model, load_model # for creating a Neural Network Autoencoder model
from keras.layers import Dense # for adding layers to AE model
from tensorflow.keras.utils import plot_model #for plotting  model charts
from tensorflow.keras import models,layers,activations,losses,optimizers,metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from keras import regularizers
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the train and test frames from their pickle files.
# NOTE(review): unpickling can execute arbitrary code — only read these
# files if they come from a trusted source.
train_data, test_data = (
    pd.read_pickle(pkl_path)
    for pkl_path in ("EDA_train_median.pkl", "EDA_test_median.pkl")
)

In [5]:
# Separate the target variable from the feature columns.
# The drop is non-inplace and guarded so the cell is idempotent: re-running it
# after the column has already been removed keeps the existing `y` instead of
# raising KeyError on the lookup.
TARGET_COL = 'windmill_generated_power(kW/h)'
if TARGET_COL in train_data.columns:
    y = train_data[TARGET_COL]
    train_data = train_data.drop(columns=[TARGET_COL])
elif 'y' not in globals():
    # Column is absent and no target was extracted earlier: fail loudly.
    raise KeyError(f"{TARGET_COL!r} not found in train_data")

In [6]:
# Hold out part of the labelled data for testing (sklearn's default split size),
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    y,
    random_state=0,
)
# Show the shape of each piece as a sanity check.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((21051, 33), (21051,), (7018, 33), (7018,))

### DECISION TREE USING MINMAX SCALER

In [7]:
# Declare the pipeline steps: min-max scaling followed by a decision tree.
# random_state is pinned so that the fit and every cross-validation run that
# uses this pipeline give repeatable scores (the RobustScaler pipeline in this
# notebook already pins its tree to 42).
# NOTE(review): this tree uses criterion='absolute_error' while the other
# pipelines in this notebook use the default criterion, so score differences
# between pipelines are not attributable to the scaler alone — confirm intended.
my_pipeline_minmax = Pipeline(steps=[
    ('MinMaxScaler', MinMaxScaler()),
    ('DecisionTreeRegressor',
     DecisionTreeRegressor(criterion='absolute_error', random_state=42)),
])

In [8]:
# Train the scaler + tree pipeline on the training split.
my_pipeline_minmax.fit(X=X_train, y=y_train)

### Mean absolute error

In [9]:
# 5-fold cross-validated MAE of the min-max pipeline on the training split.
# The scorer returns negated errors, so flip the sign to get positive MAE.
cvs_mae = cross_val_score(
    estimator=my_pipeline_minmax,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
scores_cvs_mae = -cvs_mae

In [10]:
# Report the per-fold MAE values and their mean.
print("MAE scores:\n", scores_cvs_mae)
print('Average MAE score:', np.mean(scores_cvs_mae))

MAE scores:
 [0.79876321 0.7754185  0.78904985 0.80449068 0.82413103]
Average MAE score: 0.798370652479516


### Root mean squared error

In [11]:
# 5-fold cross-validated RMSE of the min-max pipeline on the training split.
# The scorer returns negated errors, so flip the sign to get positive RMSE.
cvs_rmse = cross_val_score(
    estimator=my_pipeline_minmax,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
scores_cvs_rmse = -cvs_rmse

In [12]:
# Report the per-fold RMSE values and their mean.
print("RMSE scores:\n", scores_cvs_rmse)
print('Average RMSE score:', np.mean(scores_cvs_rmse))

RMSE scores:
 [1.34480482 1.36617817 1.3755875  1.40480126 1.45580512]
Average RMSE score: 1.3894353729714892


### Mean squared error 

In [13]:
# 5-fold cross-validated MSE of the min-max pipeline on the training split.
# The scorer returns negated errors, so flip the sign to get positive MSE.
cvs_mse = cross_val_score(
    estimator=my_pipeline_minmax,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
scores_cvs_mse = -cvs_mse

In [14]:
# Report the per-fold MSE values and their mean.
print("MSE scores:\n", scores_cvs_mse)
print('Average MSE score:', np.mean(scores_cvs_mse))

MSE scores:
 [2.13115305 1.8919114  1.89892491 1.93454582 2.11623082]
Average MSE score: 1.994553197524482


### Testing 

In [15]:
# Predict the held-out test split with the fitted min-max pipeline.
preds_test_minmax = my_pipeline_minmax.predict(X=X_test)

In [16]:
# Mean absolute error of the min-max pipeline on the test split.
score_minmax_mae = mean_absolute_error(
    y_true=y_test,
    y_pred=preds_test_minmax,
)
print('MAE is:', score_minmax_mae)

MAE is: 0.7779364022472891


In [17]:
# Mean squared error of the min-max pipeline on the test split.
score_minmax_mse = mean_squared_error(
    y_true=y_test,
    y_pred=preds_test_minmax,
)
print('MSE is:', score_minmax_mse)

MSE is: 1.893509096022794


In [18]:
# Root mean squared error on the test split: square root of the test MSE.
score_minmax_rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=preds_test_minmax)
)
print('RMSE is:', score_minmax_rmse)

RMSE is: 1.3760483625304722


In [19]:
# 5-fold cross-validated MAE over the full labelled set (train_data, y)
# rather than only X_train. The scorer yields negated MAE, hence the sign flip.
scores_minmax_cv = -cross_val_score(
    estimator=my_pipeline_minmax,
    X=train_data,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
print("MAE scores:\n", scores_minmax_cv)
print('Average MAE score:', np.mean(scores_minmax_cv))

MAE scores:
 [0.7890515  0.75997452 0.7874278  0.79679993 0.77372722]
Average MAE score: 0.7813961945045088


### DECISION TREE USING STANDARD SCALER

In [20]:
# Declare the pipeline steps: standardisation followed by a decision tree.
# random_state is pinned so that the fit and every cross-validation run that
# uses this pipeline give repeatable scores (the RobustScaler pipeline in this
# notebook already pins its tree to 42).
my_pipeline_standard = Pipeline(steps=[
    ('StandardScaler', StandardScaler()),
    ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42)),
])

In [21]:
# Train the scaler + tree pipeline on the training split.
my_pipeline_standard.fit(X=X_train, y=y_train)

### Cross validation and Mean absolute error

In [22]:
# 5-fold cross-validated MAE of the standard-scaler pipeline on the training split.
# The scorer returns negated errors, so flip the sign to get positive MAE.
cvs_mae = cross_val_score(
    estimator=my_pipeline_standard,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
scores_cvs_mae = -cvs_mae

In [23]:
# Report the per-fold MAE values and their mean.
print("MAE scores:\n", scores_cvs_mae)
print('Average MAE score:', np.mean(scores_cvs_mae))

MAE scores:
 [0.75460181 0.74025335 0.75143328 0.77176519 0.72776847]
Average MAE score: 0.7491644198149292


### Mean squared error

In [24]:
# 5-fold cross-validated MSE of the standard-scaler pipeline on the training split.
# The scorer returns negated errors, so flip the sign to get positive MSE.
cvs_mse = cross_val_score(
    estimator=my_pipeline_standard,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
scores_cvs_mse = -cvs_mse

In [25]:
# Report the per-fold MSE values and their mean.
print("MSE scores:\n", scores_cvs_mse)
print('Average MSE score:', np.mean(scores_cvs_mse))

MSE scores:
 [1.59156736 1.63639443 1.5922945  1.59180886 1.48883178]
Average MSE score: 1.5801793841899356


### Root mean squared error

In [26]:
# 5-fold cross-validated RMSE of the standard-scaler pipeline on the training split.
# The scorer returns negated errors, so flip the sign to get positive RMSE.
cvs_rmse = cross_val_score(
    estimator=my_pipeline_standard,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
scores_cvs_rmse = -cvs_rmse

In [27]:
# Report the per-fold RMSE values and their mean.
print("RMSE scores:\n", scores_cvs_rmse)
print('Average RMSE score:', np.mean(scores_cvs_rmse))

RMSE scores:
 [1.23793529 1.25352657 1.25005933 1.25090078 1.21923657]
Average RMSE score: 1.2423317084341368


### Testing

In [28]:
# Predict the held-out test split with the fitted standard-scaler pipeline.
preds_test_standard = my_pipeline_standard.predict(X=X_test)

In [29]:
# Mean absolute error of the standard-scaler pipeline on the test split.
score_mae_standard = mean_absolute_error(
    y_true=y_test,
    y_pred=preds_test_standard,
)
print('MAE is:', score_mae_standard)

MAE is: 0.7472489384423999


In [30]:
# Mean squared error of the standard-scaler pipeline on the test split.
score_mse_standard = mean_squared_error(
    y_true=y_test,
    y_pred=preds_test_standard,
)
print('MSE is:', score_mse_standard)

MSE is: 1.6161242698487102


In [31]:
# Root mean squared error on the test split: square root of the test MSE.
score_standard_rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=preds_test_standard)
)
print('RMSE is:', score_standard_rmse)

RMSE is: 1.271268763813817


In [32]:
# 5-fold cross-validated MAE over the full labelled set (train_data, y)
# rather than only X_train. The scorer yields negated MAE, hence the sign flip.
scores_standard_cv = -cross_val_score(
    estimator=my_pipeline_standard,
    X=train_data,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
print("MAE scores:\n", scores_standard_cv)
print('Average MAE score:', np.mean(scores_standard_cv))

MAE scores:
 [0.74551393 0.73481615 0.73963771 0.75620593 0.72662292]
Average MAE score: 0.7405593281232121


### DECISION TREE USING ROBUST SCALER

In [33]:
# Declare the pipeline steps: robust scaling followed by a seeded decision tree.
my_pipeline_robust = Pipeline(steps=[
    ('RobustScaler', RobustScaler()),
    ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=42)),
])

In [34]:
# Train the scaler + tree pipeline on the training split.
my_pipeline_robust.fit(X=X_train, y=y_train)

### Cross validation and Mean absolute error

In [35]:
# 5-fold cross-validated MAE of the robust-scaler pipeline on the training split.
# The scorer returns negated errors, so flip the sign to get positive MAE.
cvs_mae = cross_val_score(
    estimator=my_pipeline_robust,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
scores_cvs_mae = -cvs_mae

In [36]:
# Report the per-fold MAE values and their mean.
print("MAE scores:\n", scores_cvs_mae)
print('Average MAE score:', np.mean(scores_cvs_mae))

MAE scores:
 [0.75927256 0.7588373  0.75503921 0.78642334 0.73934774]
Average MAE score: 0.7597840284064041


### Mean squared error

In [37]:
# 5-fold cross-validated MSE of the robust-scaler pipeline on the training split.
# The scorer returns negated errors, so flip the sign to get positive MSE.
cvs_mse = cross_val_score(
    estimator=my_pipeline_robust,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
scores_cvs_mse = -cvs_mse

In [38]:
# Report the per-fold MSE values and their mean.
print("MSE scores:\n", scores_cvs_mse)
print('Average MSE score:', np.mean(scores_cvs_mse))

MSE scores:
 [1.55989048 1.6600326  1.60274779 1.60978703 1.54737019]
Average MSE score: 1.5959656189785238


### Root mean squared error

In [39]:
# 5-fold cross-validated RMSE of the robust-scaler pipeline on the training split.
# The scorer returns negated errors, so flip the sign to get positive RMSE.
cvs_rmse = cross_val_score(
    estimator=my_pipeline_robust,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
scores_cvs_rmse = -cvs_rmse

In [40]:
# Report the per-fold RMSE values and their mean.
print("RMSE scores:\n", scores_cvs_rmse)
print('Average RMSE score:', np.mean(scores_cvs_rmse))

RMSE scores:
 [1.24895576 1.28842252 1.26599676 1.26877383 1.24393335]
Average RMSE score: 1.263216444555662


### Testing

In [41]:
# Predict the held-out test split with the fitted robust-scaler pipeline.
preds_test_robust = my_pipeline_robust.predict(X=X_test)

In [42]:
# Mean absolute error of the robust-scaler pipeline on the test split.
score_robust_mae = mean_absolute_error(
    y_true=y_test,
    y_pred=preds_test_robust,
)
print('MAE is:', score_robust_mae)

MAE is: 0.7427134839074657


In [43]:
# Mean squared error (not MAE) of the robust-scaler pipeline on the test split.
score_robust_mse = mean_squared_error(
    y_true=y_test,
    y_pred=preds_test_robust,
)
print('MSE is:', score_robust_mse)

MSE is: 1.5522754135678147


In [44]:
# Root mean squared error on the test split: square root of the test MSE.
score_robust_rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=preds_test_robust)
)
print('RMSE is:', score_robust_rmse)

RMSE is: 1.2459034527473687


In [45]:
# 5-fold cross-validated MAE over the full labelled set (train_data, y)
# rather than only X_train. The scorer yields negated MAE, hence the sign flip.
scores_robust_cv = -cross_val_score(
    estimator=my_pipeline_robust,
    X=train_data,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
print("MAE scores:\n", scores_robust_cv)

MAE scores:
 [0.76347756 0.73598252 0.73514687 0.76242631 0.74097767]


In [46]:
# Mean of the per-fold MAE values from the full-data cross-validation.
print('Average MAE score:', np.mean(scores_robust_cv))

Average MAE score: 0.7476021825176635
