In [1]:
from datetime import datetime, timedelta
import jupyter_contrib_nbextensions
import random
import warnings
import sys 
import logging
import pandas as pd
import time
import numpy as np
import pickle
from tqdm import tqdm
import psycopg2 as pg
import sqlalchemy as sq
import networkx as nx
# Notebook-wide noise suppression.
# logging.disable() is called without an argument, i.e. with its default level,
# which turns off the output of logging calls for the whole kernel.
logging.disable()
# Only install the blanket "ignore" filter when no warning options (-W) were
# given to the interpreter, so an explicit command-line choice still wins.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier

def errorf(real, forecast):
    '''
    Compute the element-wise forecast error ``real[k] - forecast[k]``.

    Both sequences are read positionally with the integer keys
    ``0 .. len(real) - 1``; the length of ``real`` decides how many
    differences are produced.
    -----------
    output: list
        One difference per element of ``real``, in order.
    '''
    return [real[pos] - forecast[pos] for pos in range(len(real))]

def open_connection():
    '''
    Open a new psycopg2 connection to the PostgreSQL database.

    The connection settings are taken from the standard libpq environment
    variables (PGDATABASE, PGUSER, PGPASSWORD, PGHOST). When a variable is
    not set, the value that used to be hard-coded here is used as a
    fallback, so an existing local setup keeps working unchanged. Set
    PGPASSWORD in the environment instead of relying on the fallback.
    -----------
    output: object
        psycopg2 connection. The caller is responsible for closing it.
    '''
    import os  # local import: the notebook's import cell does not import os

    conn = pg.connect(
        dbname=os.environ.get('PGDATABASE', 'postgres'),
        user=os.environ.get('PGUSER', 'postgres'),
        # kept as a string fallback; the DSN is built from string values anyway
        password=os.environ.get('PGPASSWORD', '123'),
        host=os.environ.get('PGHOST', 'localhost'),
    )
    return conn
def get_connection():
    '''
    Build a SQLAlchemy engine for the PostgreSQL database.

    The settings come from the standard libpq environment variables
    (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE); each one falls back to
    the value that used to be hard-coded here, so the default URL is
    unchanged. User and password are URL-quoted so that characters such as
    '@', ':' or '/' in a password cannot corrupt the connection URL.
    -----------
    output: object
        SQLAlchemy engine object - PostgreSQL database connection.
        A new engine is created on every call; callers that are done with it
        can release its pooled connections with ``engine.dispose()``.
    '''
    import os  # local imports: not part of the notebook's import cell
    from urllib.parse import quote_plus

    user = os.environ.get('PGUSER', 'postgres')
    password = os.environ.get('PGPASSWORD', '123')
    host = os.environ.get('PGHOST', 'localhost')
    port = os.environ.get('PGPORT', '5432')
    database = os.environ.get('PGDATABASE', 'postgres')
    return sq.create_engine(url="postgresql://{0}:{1}@{2}:{3}/{4}".format(
        quote_plus(user), quote_plus(password), host, port, database))


def get_all_dates(pais):
    '''
    Return every distinct day present in the selected-features table of a country.

    pais: str
        Country suffix of the table ``pre_processed_data.dbn_features_selected_<pais>``.
        It is substituted into the SQL text as part of the table name, so it
        must be a trusted identifier, never user input.
    -----------
    output: list
        Values of the ``datas`` column ("Date" cast to DATE), in ascending order.
    '''
    q = '''select distinct cast("Date" as DATE) as datas from pre_processed_data.dbn_features_selected_{pais} order by datas'''.format(pais=pais)
    conn = open_connection()
    try:
        date = pd.read_sql(q,conn)
    finally:
        # always release the connection, even when the query fails
        conn.close()
    datas = date['datas'].tolist()
    return datas

def get_dataset(pais,date_ini, date_fin):
    '''
    Load the selected-features rows of a country for a closed date interval.

    pais: str
        Country suffix of the table ``pre_processed_data.dbn_features_selected_<pais>``.
        It becomes part of the table name in the SQL text, so it must be a
        trusted identifier.
    date_ini, date_fin: date-like
        Bounds of the ``"Date" between ... and ...`` filter (both inclusive).
        They are converted with ``str()`` and sent as bound query parameters
        instead of being pasted into the SQL text.
    -----------
    output: pandas.DataFrame
        All columns of the matching rows.
    '''
    q = '''select * 
    from pre_processed_data.dbn_features_selected_{pais} where "Date" between %(date_ini)s and %(date_fin)s '''.format(pais=pais)
    conn = open_connection()
    try:
        dataset = pd.read_sql(q, conn, params={'date_ini': str(date_ini), 'date_fin': str(date_fin)})
    finally:
        # always release the connection, even when the query fails
        conn.close()
    return dataset

def get_dataset_allfeatures(pais,date_ini, date_fin):
    '''
    Load the rows of a country's full feature table for a closed date interval.

    pais: str
        Country suffix of the table ``pre_processed_data.dbn_<pais>``. It
        becomes part of the table name in the SQL text, so it must be a
        trusted identifier.
    date_ini, date_fin: date-like
        Bounds of the ``"Date" between ... and ...`` filter (both inclusive).
        They are converted with ``str()`` and sent as bound query parameters
        instead of being pasted into the SQL text.
    -----------
    output: pandas.DataFrame
        All columns of the matching rows.
    '''
    q = '''select * 
    from pre_processed_data.dbn_{pais} where "Date" between %(date_ini)s and %(date_fin)s '''.format(pais=pais)
    conn = open_connection()
    try:
        dataset = pd.read_sql(q, conn, params={'date_ini': str(date_ini), 'date_fin': str(date_fin)})
    finally:
        # always release the connection, even when the query fails
        conn.close()
    return dataset


def bins_values(pais):
    '''
    Load the non-null "Emission" values of a country's bins table.

    pais: str
        Country suffix of the table ``pre_processed_data.bins_<pais>``. It
        becomes part of the table name in the SQL text, so it must be a
        trusted identifier.
    -----------
    output: pandas.DataFrame
        Single column "Emission". The query has no ORDER BY, so the row
        order is whatever the database returns.
    '''
    q = '''select "Emission" from pre_processed_data.bins_{pais} where "Emission" is not null'''.format(pais = pais)
    conn = open_connection()
    try:
        df = pd.read_sql(q,conn)
    finally:
        # always release the connection, even when the query fails
        conn.close()
    return df

def real_values(pais, data):
    '''
    Load the "Emission" values recorded for the day after ``data``.

    pais: str
        Name of the table inside the ``pre_processed_data`` schema. It becomes
        part of the SQL text, so it must be a trusted identifier.
    data: date-like
        Reference day; the query filters on ``data + 1 day``, formatted as
        ``YYYY/MM/DD`` and sent as a bound query parameter.
    -----------
    output: pandas.DataFrame
        Single column "Emission". The query has no ORDER BY, so the row
        order is whatever the database returns.
    '''
    dataf = (data+timedelta(days = 1)).strftime("%Y/%m/%d")
    q = '''select "Emission" from pre_processed_data.{pais} where "Date" = %(dataf)s '''.format(pais = pais)
    conn = open_connection()
    try:
        df = pd.read_sql(q, conn, params={'dataf': dataf})
    finally:
        # always release the connection, even when the query fails
        conn.close()
    return df

In [16]:
# Rolling day-ahead forecast with a linear regression ('ln'), one run per country.
# For every day i that has 6 earlier days and a following day in the data, a
# model is re-fitted for each row of day i+1 on the trailing window and used to
# predict that row. The forecasts and the elapsed time per day are written to
# the `results` schema once per country.
paises = ['Alemanha','Belgica','Espanha', 'Portugal']
for pais in paises:
    #initialize auxiliary variables
    target_variable = 'Emission'
    tau = 3 #forecast horizon: rows of day i+1 enter the fit window tau steps late
    #ln
    timeinferenceln = []   #seconds spent on each forecast day
    forecast_frames = []   #one DataFrame per forecast day, concatenated at the end

    #read all available dates
    dates = get_all_dates(pais)
    if len(dates) < 7:
        #not enough history for a single 7-day fit window: nothing to forecast
        continue
    available_dates = set(dates)   #O(1) membership test for "is day i+1 present?"
    first_fit_day = dates[6]

    #bins (do not depend on the day, so they are read once per country)
    bins = bins_values(pais)
    bin_edges = bins[target_variable]

    for i in tqdm(dates):
        next_day = i+timedelta(days = 1)
        if i < first_fit_day or next_day not in available_dates:
            continue

        #fit dataset (last 7 days)
        fit_data = get_dataset(pais,i-timedelta(days = 6), i)

        #predict data of the entire day
        predict_data_day = get_dataset(pais,next_day, next_day)
        predict_dataall = get_dataset_allfeatures(pais,next_day, next_day)

        #aux
        aux_foreln = []
        forecast_date = []
        forecast_hour = []

        #predict each point of day i+1
        ti_inf = time.time()
        for h in range(len(predict_data_day)):
            forecast_date.append(next_day)
            forecast_hour.append(h)
            predictall = predict_dataall.iloc[[h]]
            #.loc slices by label and includes the end label; an end label past the last row just selects everything
            fit_datah = fit_data.loc[0:len(fit_data)-tau+h]

            #drop all variable in time window T+1 (unknown values - future states):
            #every column without the '-1' suffix is overwritten with its '-1'
            #counterpart taken from the all-features row of the same position
            predict_data = predict_data_day.iloc[[h]].drop(columns=['Date', 'Hour', target_variable])
            for c in predict_data.columns:
                if '-1' not in c:
                    predict_data[c] = predictall[c+str('-1')]

            #ln model
            #the target holds bin indices; each one is replaced by the midpoint of bin edges v and v+1
            y = fit_datah[[target_variable]].copy()
            cvln = [(bin_edges[v]+bin_edges[v+1])/2 for v in y[target_variable]]
            y[target_variable] = cvln
            X = fit_datah.drop(columns=[target_variable,'Date', 'Hour'])
            clf = LinearRegression().fit(X, y)
            ln_predict = clf.predict(predict_data)
            aux_foreln.extend(ln_predict[0])

            #row h-tau of day i+1 joins the fit window for the next step
            #(for h < tau there is no such row, so nothing is added)
            if h >= tau:
                fit_data = pd.concat([fit_data, predict_data_day.loc[h-tau:h-tau]]).reset_index(drop = True)

        #dataset to save de forecast values
        forecast_auxln = pd.DataFrame({'Date': forecast_date,
                                       'Hour': forecast_hour,
                                       'Emissions Forecast': aux_foreln})
        real_value = real_values(pais, i)
        #NOTE(review): aligned by row position; real_values() has no ORDER BY - confirm the rows come back in the same order as predict_data_day
        forecast_auxln[target_variable] = real_value[target_variable]
        forecast_frames.append(forecast_auxln)
        tf_inf = time.time()
        timeinferenceln.append(tf_inf-ti_inf)

    #save the results on postgres (once per country, after all days are done)
    forecast_valuesln = pd.concat(forecast_frames) if forecast_frames else pd.DataFrame()
    df_time_inferenceln = pd.DataFrame()
    df_time_inferenceln['tempo'] = timeinferenceln
    engine = get_connection()
    try:
        df_time_inferenceln.to_sql(name='time_inference_'+str(pais)+str('ln'), con = engine,schema = 'results', if_exists = 'replace', chunksize = None, index = False)
        forecast_valuesln.to_sql(name='forecast_'+str(pais)+str('ln'), con = engine,schema = 'results', if_exists = 'replace', chunksize = None, index = False)
    finally:
        #release the engine's pooled connections
        engine.dispose()

100%|████████████████████████████████████████████████████████████████████████████| 1091/1091 [1:07:49<00:00,  3.73s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 1080/1080 [54:07<00:00,  3.01s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 1089/1089 [54:31<00:00,  3.00s/it]
100%|██████████████████████████████████████████████████████████████████████████████| 1090/1090 [54:26<00:00,  3.00s/it]


In [21]:
# Median absolute error between observed and forecast emissions stored in
# results."forecast_Alemanhaln" (the table written by the linear-regression loop).
q = '''select * 
from results."forecast_Alemanhaln" fa '''
conn = open_connection()
try:
    dataset = pd.read_sql(q,conn)
finally:
    # always release the connection, even when the query fails
    conn.close()
median_absolute_error(dataset['Emission'],dataset['Emissions Forecast'])

23.064854938766928

In [22]:
# Median absolute error between observed and forecast emissions stored in
# results."forecast_Alemanharf".
# NOTE(review): the 'rf' suffix presumably stands for a random-forest run; the
# cell that writes this table is not part of this notebook section - confirm.
q = '''select * 
from results."forecast_Alemanharf" fa '''
conn = open_connection()
try:
    dataset = pd.read_sql(q,conn)
finally:
    # always release the connection, even when the query fails
    conn.close()
median_absolute_error(dataset['Emission'],dataset['Emissions Forecast'])

24.985586403735482

In [23]:
# Median absolute error between observed and forecast emissions stored in
# results."forecast_Alemanhaann".
# NOTE(review): the 'ann' suffix presumably stands for a neural-network run; the
# cell that writes this table is not part of this notebook section - confirm.
q = '''select * 
from results."forecast_Alemanhaann" fa '''
conn = open_connection()
try:
    dataset = pd.read_sql(q,conn)
finally:
    # always release the connection, even when the query fails
    conn.close()
median_absolute_error(dataset['Emission'],dataset['Emissions Forecast'])

26.016727965185993

In [24]:
# Median absolute error between observed and forecast emissions stored in
# results.forecast_alemanha.
# NOTE(review): which model produced this table is not visible in this
# notebook section - confirm before comparing it with the figures above.
q = '''select * 
from results.forecast_alemanha fa '''
conn = open_connection()
try:
    dataset = pd.read_sql(q,conn)
finally:
    # always release the connection, even when the query fails
    conn.close()
median_absolute_error(dataset['Emission'],dataset['Emissions Forecast'])

44.29539922566826

In [37]:
# Scratch check: refit a linear regression on whatever X / y / predict_data are
# currently in the kernel (these names are only assigned inside the forecasting
# loop cell, so that cell must have run first) and show the prediction as an int.
reg = LinearRegression().fit(X, y)
teste = reg.predict(predict_data)
# .item() extracts the single predicted value explicitly; calling int() directly
# on an array with ndim > 0 is deprecated in recent NumPy releases
int(teste.item())

14

In [14]:
# Scratch check: print every value in the first row of `ln_predict`.
# `ln_predict` is only assigned inside the forecasting loop cell, so that cell
# must have run first. Note that the loop variable `i` is a kernel global and
# is also the name of the day variable used by the forecasting loop.
for i in ln_predict[0]:
    print(i)

184.2387140128314


In [6]:
# Scratch check: display `predict_data` as currently held by the kernel.
# The name is only assigned inside the forecasting loop cell, so the value shown
# is whatever the last executed loop step left behind.
predict_data

Unnamed: 0,Emission-1,Lignite,Lignite-1,Hard coal,Hard coal-1,W. Onshore,Fossil Gas,W. Onshore-1,Fossil Gas-1,Nuclear,Nuclear-1,Solar,W. Offshore,W. Offshore-1
23,7,13,13,9,9,33,11,33,11,33,33,0,18,18


In [9]:
# Scratch check: display `cvln` as currently held by the kernel.
# The name is only assigned inside the forecasting loop cell (the list of
# bin-midpoint target values built for the fit), so the value shown is whatever
# the last executed loop step left behind.
cvln

[204.48080529667115,
 177.66634897048567,
 177.66634897048567,
 177.66634897048567,
 164.25912080739295,
 164.25912080739295,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 137.44466448120744,
 137.44466448120744,
 137.44466448120744,
 137.44466448120744,
 137.44466448120744,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 150.8518926443002,
 164.25912080739295,
 164.25912080739295,
 164.25912080739295,
 164.25912080739295,
 177.66634897048567,
 191.07357713357842,
 217.8880334597639,
 217.8880334597639,
 204.48080529667115,
 204.48080529667115,
 204.48080529667115,
 231.29526162285663,
 258.10971794904214,
 284.92417427522764,
 298.33140243832037,
 311.7386306014131,
 325.1458587645059,
 338.5530869275986,
 338.5530869275986,
 351.9603150906913,
 338.5530869275986,
 338.55308692759