In [1]:
import pandas as pd
import numpy as np
import glob
import time

import seaborn as sns
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as md
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls

from functools import reduce
from scipy import stats

from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [2]:
# Folder containing the exported dataset (raw string, so the backslashes
# are taken literally).
path = r"C:\Users\Roy\Desktop\\"

# Read the CSV and discard the 'Unnamed: 0' column in one chained expression.
df = pd.read_csv(path + 'df.csv').drop(columns='Unnamed: 0')

def SplitNovData(df, train_end=88665, test_start=88733, test_end=90928,
                 target='sup',
                 non_feature_cols=('date', 'sup', 'energy', 'sup_diff', 'kwh')):
    """Split ``df`` into train/test feature and target frames by row position.

    The defaults reproduce the fixed windows originally hard-coded here
    (described by the original author as a six week period). Note that with
    the defaults the rows between ``train_end`` and ``test_start`` belong to
    neither partition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns, the target column and the
        columns listed in ``non_feature_cols``.
    train_end : int
        Exclusive end position of the training window (it starts at row 0).
    test_start, test_end : int
        Positional bounds (start inclusive, end exclusive) of the test window.
    target : str
        Name of the column returned as the target.
    non_feature_cols : iterable of str
        Columns removed from the feature frames.

    Returns
    -------
    tuple
        ``(train_x, train_y, test_x, test_y)``; the ``*_y`` items are
        single-column DataFrames, not Series.

    Raises
    ------
    KeyError
        If any of ``non_feature_cols`` is missing from ``df``.
    """
    excluded = list(non_feature_cols)

    # .iloc makes the positional intent explicit; it selects the same rows
    # as the integer slices df[a:b] used previously.
    train_rows = df.iloc[0:train_end]
    test_rows = df.iloc[test_start:test_end]

    train_x = train_rows.drop(columns=excluded)
    train_y = train_rows.filter([target])

    test_x = test_rows.drop(columns=excluded)
    test_y = test_rows.filter([target])

    return train_x, train_y, test_x, test_y

# Split the loaded frame into train/test feature and target frames.
train_x, train_y, test_x, test_y = SplitNovData(df)

In [3]:
from sklearn.model_selection import GridSearchCV

def FindParameters(regressor, parameter_space, train_x, train_y,
                   cv=3, n_jobs=-1, scoring=None):
    """Grid-search the best hyper-parameters for a scikit-learn regressor.

    Works for any estimator accepted by ``GridSearchCV`` (not only neural
    networks).

    Parameters
    ----------
    regressor : estimator
        Unfitted scikit-learn regressor.
    parameter_space : dict
        Mapping of parameter name to the list of candidate values.
    train_x : array-like
        Training features.
    train_y : array-like
        Training target. A single-column 2-D target is flattened to 1-D
        before fitting so scikit-learn does not warn about column vectors.
    cv : int
        Number of cross-validation folds (default 3, as before).
    n_jobs : int
        Parallel jobs for the search (default -1, as before).
    scoring : str or callable or None
        Scorer passed to ``GridSearchCV``; ``None`` keeps the estimator's
        own ``score`` method, which is the previous behaviour.

    Returns
    -------
    dict
        The best parameter combination found (also printed).
    """
    target = np.asarray(train_y)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    search = GridSearchCV(regressor, parameter_space,
                          n_jobs=n_jobs, cv=cv, scoring=scoring)
    search.fit(train_x, target)

    print('Best parameters found:\n', search.best_params_)

    return search.best_params_

# Default-configured gradient-boosting regressor; it is the estimator handed
# to the (currently disabled) grid search at the bottom of this cell.
GB = GradientBoostingRegressor()

# Candidate values per hyper-parameter for the grid search.
parameter_space = dict(
    max_depth=[80, 90],
    max_features=[2, 3],
    min_samples_leaf=[3, 4],
    min_samples_split=[8, 10],
    n_estimators=[100, 200],
)

# The search itself is left switched off; uncomment to run it.
# gb_params = FindParameters(GB, parameter_space, train_x, train_y)

In [4]:
from numpy import mean
from numpy import std

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

from sklearn.ensemble import StackingRegressor

from matplotlib import pyplot

def StackData(data=None, n_rows=76588,
              non_feature_cols=('date', 'sup', 'energy', 'sup_diff', 'kwh'),
              target='sup'):
    """Return the feature frame and target frame for the first ``n_rows`` rows.

    Called without arguments it behaves as before: it reads the
    notebook-level ``df`` and uses the first 76588 rows.

    Parameters
    ----------
    data : pandas.DataFrame or None
        Source frame; ``None`` falls back to the notebook-level ``df``.
    n_rows : int
        Number of leading rows (by position) to keep.
    non_feature_cols : iterable of str
        Columns removed from the feature frame.
    target : str
        Name of the column returned as the target.

    Returns
    -------
    tuple
        ``(x, y)`` where ``y`` is a single-column DataFrame, not a Series.
    """
    if data is None:
        data = df  # notebook-level frame loaded earlier

    # .iloc selects the same leading rows as the integer slice data[0:n].
    head_rows = data.iloc[0:n_rows]

    x = head_rows.drop(columns=list(non_feature_cols))
    y = head_rows.filter([target])

    return x, y

# def GetStacking():

#     level0 = list()

#     level0.append(('svm', SVR()))
#     level0.append(('gb', GradientBoostingRegressor()))
#     level0.append(('rf', RandomForestRegressor()))

#     level1 = MLPRegressor()

#     model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

#     return model

def GetModels(random_state=22):
    """Build the dictionary of regressors that are compared in this notebook.

    The insertion order ('svm', 'gb', 'rf', 'nn') matters: a later cell
    reads the predictions back by position.

    Parameters
    ----------
    random_state : int or None
        Seed given to every stochastic model so that repeated runs give the
        same results. Previously only the neural network was seeded (with
        22); the gradient boosting and random forest models now share that
        seed. Pass ``None`` for unseeded models.

    Returns
    -------
    dict
        Mapping of short model name to an unfitted estimator.
    """
    models = dict()

    # SVR takes no seed.
    models['svm'] = SVR(kernel='linear', gamma='scale', C=1)
    models['gb'] = GradientBoostingRegressor(max_depth=80, max_features=3, min_samples_leaf=4,
                                             min_samples_split=8, n_estimators=100,
                                             random_state=random_state)
    models['rf'] = RandomForestRegressor(random_state=random_state)
    models['nn'] = MLPRegressor(activation='relu', alpha=0.0001,
                                hidden_layer_sizes=(50, 50, 50),
                                learning_rate='constant', solver='adam',
                                random_state=random_state)

    return models

def EvaluateModel(model, x, y, n_splits=10, n_repeats=3, random_state=22):
    """Score ``model`` with repeated k-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn regressor.
    x : array-like
        Feature matrix.
    y : array-like
        Target. A single-column 2-D target is flattened to 1-D so that
        scikit-learn does not warn about column vectors on every fold.
    n_splits, n_repeats : int
        Fold count and number of repetitions (defaults 10 and 3, as before).
    random_state : int or None
        Seed for the fold shuffling (default 22, as before).

    Returns
    -------
    numpy.ndarray
        One score per fold and repetition. The scorer is
        'neg_root_mean_squared_error', so the values are the NEGATED RMSE
        (zero or below); negate them to obtain the RMSE itself.
    """
    target = np.asarray(y)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats,
                       random_state=random_state)

    return cross_val_score(model, x, target,
                           scoring='neg_root_mean_squared_error', cv=cv)

# Notebook-level features and target: used for cross-validation below and
# as the fitting data inside PredictModels.
x, y = StackData()

In [4]:
models = GetModels()

results, names = list(), list()

for name, model in models.items():

    # EvaluateModel scores with 'neg_root_mean_squared_error', i.e. it
    # returns the negated RMSE; flip the sign so the numbers that are
    # printed and plotted as "RMSE" really are the (non-negative) RMSE.
    RMSE_score = -EvaluateModel(model, x, y)

    results.append(RMSE_score)

    names.append(name)

    print('>RMSE: %s %.3f (%.3f)' % (name, mean(RMSE_score), std(RMSE_score)))

# One box per model showing the spread of the per-fold RMSE values.
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.ylabel('RMSE')
pyplot.show()

In [None]:
def PredictModels():
    """Fit every model from ``GetModels`` and predict the hold-out rows.

    Each model is fitted on the notebook-level ``x`` / ``y`` and then used
    to predict rows 88733-90927 (by position) of the notebook-level ``df``.

    Returns
    -------
    list
        One prediction array per model, in the iteration order of the
        ``GetModels()`` dictionary.
    """
    # Select exactly the columns the models are trained on. The previous
    # version dropped an extra 'z' column here that the training frame does
    # not drop, so the train and predict feature sets could not match.
    X = df[88733:90928][list(x.columns)]

    # Flatten a single-column target so scikit-learn does not warn about
    # column vectors; the predictions are unaffected.
    target = np.asarray(y)
    if target.ndim == 2 and target.shape[1] == 1:
        target = target.ravel()

    predictions = list()

    # Bind the loop variable to `model`: the old loop rebound `models`
    # instead and kept refitting whatever global `model` was left over
    # from the evaluation cell.
    for name, model in GetModels().items():

        model.fit(x, target)
        predictions.append(model.predict(X))

    return predictions

# Run PredictModels and keep the returned list of per-model predictions.
predictions = PredictModels()

In [None]:
# Target values for the hold-out rows, re-indexed from 0.
# NOTE(review): Y is prepared here but never added to DAT or plotted -
# presumably it was meant as the ground-truth line; confirm.
Y = df[88733:90928].filter(['sup'])
Y.index = np.arange(0, len(Y))

# One column per model; the position in `predictions` follows the order in
# which GetModels() registers the models.
DAT = pd.DataFrame()
for position, label in enumerate(('svm', 'gb', 'rf', 'nn')):
    DAT[label] = predictions[position]

sns.lineplot(data=DAT)