In [16]:
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import datetime 
import numpy as np
import pickle 

from utils import train_val_test_split
from utils import get_model_metrics

In [2]:
# Load the raw data set; the first CSV column (index 0) is parsed as datetimes.
df = pd.read_csv(
    "all_stations_start_2016.csv",
    parse_dates=[0],
)

In [3]:
# Names of the four columns that are used as prediction targets.
target_vars = [
    "System total load in MAW",
    "Wind Offshore in MAW",
    "Wind Onshore in MAW",
    "Solar in MAW",
]

In [4]:
# Naive baseline features: for every target, the value of the previous row
# (shift by one period). The new column names are derived from target_vars so
# that the two lists cannot drift apart when a target is added or renamed.
naive_columns = ["naive_" + name for name in target_vars]
df[naive_columns] = df.loc[:, target_vars].shift(periods=1)

In [5]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

In [6]:
# Collapse groups of related columns into one row-wise mean per variable.
# A column belongs to a variable when the variable's name occurs as a substring
# of the column name; the matched columns are averaged into "Mean <variable>"
# and then dropped from the frame.
df_options = ["temp","dwpt","rhum","prcp","snow","wdir","wspd","wpgt","pres","tsun","coco","Particulate matter","Ozone","NO₂"]
for option in df_options:
    # Collect the matches before the new "Mean ..." column exists, so the
    # freshly created column is never part of its own average.
    column_names = [column for column in df.columns if option in column]
    df["Mean " + option] = df.loc[:, column_names].mean(axis=1)
    df.drop(columns=column_names, inplace=True)


In [7]:
df.columns  # year 2016+

Index(['Date', 'System total load in MAW', 'Wind Offshore in MAW',
       'Wind Onshore in MAW', 'Solar in MAW', 'predicted_Wind Offshore in MAW',
       'predicted_Wind Onshore in MAW', 'predicted_System total load in MAW',
       'predicted_Solar in MAW', 'Year', 'Month', 'Week', 'Weekday', 'Hour',
       'naive_System total load in MAW', 'naive_Wind Offshore in MAW',
       'naive_Wind Onshore in MAW', 'naive_Solar in MAW', 'Mean temp',
       'Mean dwpt', 'Mean rhum', 'Mean prcp', 'Mean snow', 'Mean wdir',
       'Mean wspd', 'Mean wpgt', 'Mean pres', 'Mean tsun', 'Mean coco',
       'Mean Particulate matter', 'Mean Ozone', 'Mean NO₂'],
      dtype='object')

In [8]:
# Discard four of the aggregated features in a single call, then check whether
# any missing value is left anywhere in the frame (displayed as the cell result).
unused_features = ["Mean dwpt", "Mean coco", "Mean snow", "Mean wpgt"]
df.drop(columns=unused_features, inplace=True)
df.isnull().any().any()

False

In [9]:
# Split the frame into train / validation / test features and targets.
# NOTE(review): the two trailing 90s are presumably the validation and test
# window lengths in days -- confirm against utils.train_val_test_split.
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = train_val_test_split(df, target_vars, 90, 90)

The shape of the data set is: (145559, 28)

--------------------------------------------
The shape of the train set is: (128279, 23)
The shape of the target variable is: (128279, 4)
--------------------------------------------

--------------------------------------------
The shape of the validation set is: (8640, 23)
The shape of the target variable for the validation set is: (8640, 4)
--------------------------------------------

--------------------------------------------
The shape of the test set is: (8640, 23)
The shape of the target variable for the test set is: (8640, 5)
--------------------------------------------


In [10]:
# Benchmark forecast columns (presumably the ENTSO-E forecasts), listed in the
# same order as target_vars because they are later matched by position.
list_entso = [
    "predicted_System total load in MAW",
    "predicted_Wind Offshore in MAW",
    "predicted_Wind Onshore in MAW",
    "predicted_Solar in MAW",
]
# Re-run the same split with the forecast columns as "targets"; only the
# test-set part of the result is needed.
_, _, _, _, _, y_entso = train_val_test_split(df, list_entso, 90, 90)
# Select the forecast columns by name instead of by position: the test-set
# target frame carries one extra column (5 instead of 4), so positional
# indexing silently depends on where that extra column sits. Selecting with
# list_entso also fixes the column order of the resulting array.
y_entso_e = y_entso.loc[:, list_entso].to_numpy()

The shape of the data set is: (145559, 28)

--------------------------------------------
The shape of the train set is: (128279, 23)
The shape of the target variable is: (128279, 4)
--------------------------------------------

--------------------------------------------
The shape of the validation set is: (8640, 23)
The shape of the target variable for the validation set is: (8640, 4)
--------------------------------------------

--------------------------------------------
The shape of the test set is: (8640, 23)
The shape of the target variable for the test set is: (8640, 5)
--------------------------------------------


In [11]:
def decision_tree(X_train, y_train, target, random_state=None, save_dir=None):
    """Fit a decision-tree regressor on the training data and return it.

    Parameters
    ----------
    X_train : array-like
        Feature matrix, passed unchanged to ``fit``.
    y_train : array-like
        Target values, passed unchanged to ``fit``.
    target : str
        Name of the target variable. Only used to build the file name when
        the model is saved.
    random_state : int or None, optional
        Seed forwarded to ``DecisionTreeRegressor``. The default ``None``
        keeps the previous, unseeded behaviour; pass an int for a
        reproducible fit.
    save_dir : str or None, optional
        When given, the fitted model is pickled to
        ``<save_dir>/DecisionTreeModel_<target>.pickle``. The directory must
        already exist. Nothing is written by default.

    Returns
    -------
    The fitted ``DecisionTreeRegressor``.
    """
    model = tree.DecisionTreeRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    if save_dir is not None:
        with open(f"{save_dir}/DecisionTreeModel_{target}.pickle", "wb") as f:
            pickle.dump(model, f)
    return model

def random_forest(X_train, y_train, target, random_state=None, save_dir=None):
    """Fit a random-forest regressor on the training data and return it.

    Parameters
    ----------
    X_train : array-like
        Feature matrix, passed unchanged to ``fit``.
    y_train : array-like
        Target values, passed unchanged to ``fit``.
    target : str
        Name of the target variable. Only used to build the file name when
        the model is saved.
    random_state : int or None, optional
        Seed forwarded to ``RandomForestRegressor``. The default ``None``
        keeps the previous, unseeded behaviour; pass an int for a
        reproducible fit.
    save_dir : str or None, optional
        When given, the fitted model is pickled to
        ``<save_dir>/RandomForestModel_<target>.pickle``. The directory must
        already exist. Nothing is written by default.

    Returns
    -------
    The fitted ``RandomForestRegressor``.
    """
    # n_jobs=-1 lets scikit-learn use all available processors for fitting.
    model = RandomForestRegressor(n_jobs=-1, random_state=random_state)
    model.fit(X_train, y_train)
    if save_dir is not None:
        with open(f"{save_dir}/RandomForestModel_{target}.pickle", "wb") as f:
            pickle.dump(model, f)
    return model

In [19]:
# For every target variable: fit a decision tree and a random forest on the
# training data, predict the test set, and print the metrics of both models
# followed by the metrics of the benchmark forecast for the same target.
# enumerate() supplies the column position of the matching benchmark forecast
# in y_entso_e, replacing a hand-maintained counter.
for idx, target in enumerate(target_vars):
    y_train_specific = y_train.loc[:, target]
    # The test frame handed to get_model_metrics keeps "Date" next to the target.
    y_test_specific = y_test.loc[:, ["Date", target]]

    tree_ = decision_tree(X_train, y_train_specific, target)
    RF = random_forest(X_train, y_train_specific, target)

    y_tree_pred = tree_.predict(X_test)
    y_rf_pred = RF.predict(X_test)

    # Same report for each source of predictions, in a fixed order.
    predictions_by_model = (
        ("Tree:", y_tree_pred),
        ("Random Forest:", y_rf_pred),
        ("Entso-e:", y_entso_e[:, idx]),
    )
    for model_label, y_pred in predictions_by_model:
        print(model_label)
        print(target)
        print("______======________")
        get_model_metrics(y_test_specific, y_pred)

Tree:
System total load in MAW
----------------------------------------------
The overall mean absolute error of the model in MW is: 387.73
----------------------------------------------
----------------------------------------------
The overall mean absolute scaled error of the model in MW is: 0.7367
Please note: to calculate the MASE, the prediction for the first observation was omitted
----------------------------------------------
Random Forest:
System total load in MAW
----------------------------------------------
The overall mean absolute error of the model in MW is: 267.67
----------------------------------------------
----------------------------------------------
The overall mean absolute scaled error of the model in MW is: 0.5085
Please note: to calculate the MASE, the prediction for the first observation was omitted
----------------------------------------------
Entso-e:
System total load in MAW
----------------------------------------------
The overall mean absolute error 