In [1]:
#import libraries
import pandas as pd
import numpy as np
import math
from math import sqrt
import tensorflow as tf
from matplotlib import pyplot as plt
from keras.models import Model, load_model # for creating a Neural Network Autoencoder model
from keras.layers import Dense # for adding layers to AE model
from tensorflow.keras.utils import plot_model #for plotting  model charts
from tensorflow.keras import models,layers,activations,losses,optimizers,metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from keras import regularizers
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the preprocessed train and test frames from their pickle files.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
train_data, test_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("EDA_train_mean.pkl", "EDA_test_mean.pkl")
)

In [5]:
# Split the target column off the feature frame.
# DataFrame.pop returns the column and removes it from `train_data` in place,
# so after this cell `train_data` holds only the remaining columns.
# Note: the cell cannot be re-run without reloading the data, because the
# target column is gone after the first run.
y = train_data.pop('windmill_generated_power(kW/h)')

In [6]:
# Hold out part of the labelled data for evaluation; random_state=0 keeps
# the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    train_data,
    y,
    random_state=0,
)
# Show the shapes of the four resulting pieces as the cell output.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((21055, 33), (21055,), (7019, 33), (7019,))

In [7]:
# Declare the steps in the pipeline: a single decision-tree regressor that
# uses the absolute-error split criterion.
# random_state is pinned because the tree permutes the features at each split;
# without a fixed seed every fit (including each cross-validation refit) can
# build a different tree, so the reported scores would change between runs.
my_pipeline = Pipeline(
    steps=[
        (
            'DecisionTreeRegressor',
            DecisionTreeRegressor(criterion='absolute_error', random_state=0),
        ),
    ]
)

In [8]:
# Train the pipeline on the training split; the fitted pipeline is returned
# and shown as the cell output.
my_pipeline.fit(X=X_train, y=y_train)

### Cross Validation and Mean absolute error

In [9]:
# 5-fold cross-validation of the pipeline on the training split, scored by MAE.
cvs = cross_val_score(
    estimator=my_pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
# The scorer returns negated errors, so flip the sign to get positive MAE values.
scores_cvs = -cvs

In [10]:
# Report the per-fold MAE values followed by their mean.
print("MAE scores:\n", scores_cvs)
mean_cv_mae = scores_cvs.mean()
print('Average MAE score:', mean_cv_mae)

MAE scores:
 [0.80526604 0.77287717 0.81616589 0.85125109 0.78320575]
Average MAE score: 0.8057531871569793


### Root mean squared error

In [11]:
# 5-fold cross-validation of the pipeline on the training split, scored by RMSE.
cvs_rmse = cross_val_score(
    estimator=my_pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
# Scores come back negated; negate again to obtain positive RMSE values.
scores_cvs_rmse = -cvs_rmse

In [12]:
# Report the per-fold RMSE values followed by their mean.
print("RMSE scores:\n", scores_cvs_rmse)
mean_cv_rmse = scores_cvs_rmse.mean()
print('Average RMSE score:', mean_cv_rmse)

RMSE scores:
 [1.39308483 1.36469455 1.40412576 1.55047836 1.38163245]
Average RMSE score: 1.4188031870003717


### Mean squared error 

In [13]:
# 5-fold cross-validation of the pipeline on the training split, scored by MSE.
cvs_mse = cross_val_score(
    estimator=my_pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
# Scores come back negated; negate again to obtain positive MSE values.
scores_cvs_mse = -cvs_mse

In [14]:
# Report the per-fold MSE values followed by their mean.
print("MSE scores:\n", scores_cvs_mse)
mean_cv_mse = scores_cvs_mse.mean()
print('Average MSE score:', mean_cv_mse)

MSE scores:
 [2.05174796 1.80780665 2.03988524 2.40134712 1.84950435]
Average MSE score: 2.030058262790908


### Evaluation on the held-out split (`X_test`, `y_test`)

In [15]:
# Predict the target for the held-out split using the fitted pipeline.
preds_test = my_pipeline.predict(X=X_test)

In [16]:
# Mean absolute error of the predictions on the held-out split.
score_mae = mean_absolute_error(y_true=y_test, y_pred=preds_test)
print('MAE is:', score_mae)

MAE is: 0.7753560305743317


In [17]:
# Mean squared error of the predictions on the held-out split.
score_mse = mean_squared_error(y_true=y_test, y_pred=preds_test)
print('MSE is:', score_mse)

MSE is: 1.7739353769875201


In [18]:
# Root mean squared error on the held-out split: the square root of the MSE.
mse_on_test = mean_squared_error(y_true=y_test, y_pred=preds_test)
score_rmse = np.sqrt(mse_on_test)
print('RMSE is:', score_rmse)

RMSE is: 1.3318916536218404


### Cross-validating the pipeline on the full dataset (all features and target)

In [19]:
# 5-fold cross-validation on the full feature frame and target (not only the
# training split), scored by MAE.
neg_mae_full = cross_val_score(
    estimator=my_pipeline,
    X=train_data,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
# sklearn reports *negative* MAE, so negate it to get positive error values.
scores_cv = -neg_mae_full
print("MAE scores:\n", scores_cv)
mean_full_mae = scores_cv.mean()
print('Average MAE score:', mean_full_mae)

MAE scores:
 [0.77721433 0.7991053  0.78366766 0.79905043 0.78527727]
Average MAE score: 0.7888629966719585
