In [1]:
#import libraries
import pandas as pd
import numpy as np
import math
from math import sqrt
import tensorflow as tf
from matplotlib import pyplot as plt
from keras.models import Model, load_model # for creating a Neural Network Autoencoder model
from keras.layers import Dense # for adding layers to AE model
from tensorflow.keras.utils import plot_model #for plotting  model charts
from tensorflow.keras import models,layers,activations,losses,optimizers,metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from keras import regularizers
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the cleaned train and test frames from their pickle files.
# NOTE(review): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.
train_pickle_path = "EDA_train_median.pkl"
test_pickle_path = "EDA_test_median.pkl"
train_data = pd.read_pickle(train_pickle_path)
test_data = pd.read_pickle(test_pickle_path)

In [3]:
# Separate the target variable from the feature columns.
TARGET_COLUMN = 'windmill_generated_power(kW/h)'

# Guard on the column still being present so that re-running this cell is a
# no-op instead of a KeyError (the previous in-place drop made the cell
# runnable only once per load).
if TARGET_COLUMN in train_data.columns:
    y = train_data[TARGET_COLUMN]
    # Rebind to a new frame rather than drop(inplace=True): the loaded object
    # is left untouched and the cell's result no longer depends on how many
    # times it has been executed.
    train_data = train_data.drop(columns=[TARGET_COLUMN])
elif 'y' not in globals():
    # Column is absent and the target was never extracted: fail loudly here
    # rather than with a NameError in a later cell.
    raise KeyError(f"{TARGET_COLUMN!r} not found in train_data")

In [4]:
# Hold out part of the labelled data for testing; the split is seeded
# (random_state=0) so the same rows land in each part on every run.
split_parts = train_test_split(train_data, y, random_state=0)
X_train, X_test, y_train, y_test = split_parts

# Show the shape of each part as the cell output.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((21051, 33), (21051,), (7018, 33), (7018,))

In [5]:
# Declare the steps in our pipeline: a single decision-tree regressor that
# chooses splits by absolute error.
# random_state is pinned (same seed value as the train/test split) because the
# tree permutes the features at every split and picks among equally good
# splits at random; without a seed the fitted tree, the cross-validation
# scores and the test metrics differ from one run to the next.
tree_regressor = DecisionTreeRegressor(criterion='absolute_error', random_state=0)
my_pipeline = Pipeline(steps=[('DecisionTreeRegressor', tree_regressor)])

In [6]:
# Train the pipeline on the training split. The fitted pipeline is also the
# cell's displayed output.
my_pipeline.fit(X=X_train, y=y_train)

### Cross Validation and Mean absolute error

In [7]:
# 5-fold cross-validated MAE of the pipeline on the training split.
# The scorer reports negated MAE (so that higher is better); flip the sign to
# get ordinary, positive errors.
cvs = cross_val_score(
    estimator=my_pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=5,
)
scores_cvs = -cvs

In [8]:
# Report the per-fold MAE values followed by their average.
mean_mae_cv = scores_cvs.mean()
print("MAE scores:\n", scores_cvs)
print('Average MAE score:', mean_mae_cv)

MAE scores:
 [0.80069092 0.78957528 0.80983147 0.79636753 0.81529842]
Average MAE score: 0.8023527247710671


### Root mean squared error

In [9]:
# 5-fold cross-validated RMSE of the pipeline on the training split.
# The scorer reports negated RMSE; flip the sign to get positive errors.
cvs_rmse = cross_val_score(
    estimator=my_pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
    cv=5,
)
scores_cvs_rmse = -cvs_rmse

In [10]:
# Report the per-fold RMSE values followed by their average.
mean_rmse_cv = scores_cvs_rmse.mean()
print("RMSE scores:\n", scores_cvs_rmse)
print('Average RMSE score:', mean_rmse_cv)

RMSE scores:
 [1.41460594 1.40780092 1.35736382 1.39923573 1.4376539 ]
Average RMSE score: 1.4033320613378777


### Mean squared error 

In [13]:
# 5-fold cross-validated MSE of the pipeline on the training split.
# The scorer reports negated MSE; flip the sign to get positive errors.
cvs_mse = cross_val_score(
    estimator=my_pipeline,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
scores_cvs_mse = -cvs_mse

In [14]:
# Report the per-fold MSE values followed by their average.
mean_mse_cv = scores_cvs_mse.mean()
print("MSE scores:\n", scores_cvs_mse)
print('Average MSE score:', mean_mse_cv)

MSE scores:
 [2.06602307 1.83804994 1.84351858 1.95240849 2.14747202]
Average MSE score: 1.9694944201667937


### Testing 

In [15]:
# Predict the target for the held-out split with the pipeline object.
preds_test = my_pipeline.predict(X=X_test)

In [16]:
# Mean absolute error of the predictions on the held-out split.
score_mae = mean_absolute_error(y_true=y_test, y_pred=preds_test)
print('MAE is:', score_mae)

MAE is: 0.7897544449385804


In [17]:
# Mean squared error of the predictions on the held-out split.
score_mse = mean_squared_error(y_true=y_test, y_pred=preds_test)
print('MSE is:', score_mse)

MSE is: 2.0643018485370477


In [18]:
# Root mean squared error on the held-out split: the square root of the MSE
# between the true targets and the predictions.
held_out_mse = mean_squared_error(y_true=y_test, y_pred=preds_test)
score_rmse = np.sqrt(held_out_mse)
print('RMSE is:', score_rmse)

RMSE is: 1.436767847822691


In [19]:
# 5-fold cross-validated MAE again, this time over the full labelled frame
# (train_data / y) rather than only the training split.
# sklearn's scorer yields *negative* MAE, hence the multiplication by -1.
neg_mae_full = cross_val_score(
    estimator=my_pipeline,
    X=train_data,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)
scores_cv = -1 * neg_mae_full
mean_mae_full = scores_cv.mean()
print("MAE scores:\n", scores_cv)
print('Average MAE score:', mean_mae_full)

MAE scores:
 [0.79458856 0.77729203 0.78600624 0.79433477 0.7623631 ]
Average MAE score: 0.7829169386600292
