# Import the necessary libraries

In [1]:
import sqlite3
import numpy as np
import pandas as pd
#import plotly.express as px
from sklearn.preprocessing import StandardScaler

# Connect to the database

In [2]:
# Connection to the SQLite file `clean_database.db` (relative path); `conn` is handed to
# pandas for every read_sql / to_sql call in the cells below.
conn = sqlite3.connect('clean_database.db')
# Cursor for raw SQL on the same connection (used below to list the tables).
cursor = conn.cursor()

# Check all the tables available within the database

In [3]:
# sqlite_master is SQLite's schema catalogue; every fetched row is a 1-tuple with a table name.
# execute() returns the cursor itself, so the fetch can be chained directly.
print(cursor.execute("SELECT name FROM sqlite_master WHERE type ='table';").fetchall())

[('Meteostat_Data',), ('Entsoe_Data',), ('Entsoe_Meteostat_Data',), ('Entsoe_real_data',), ('Entsoe_real_values_and_Meteostat_data',), ('Entsoe_forecasted_data',), ('Entsoe_forecasted_data_and_Meteostat_data',), ('Feature_selected_real_data',), ('Feature_selected_forecasted_data',), ('X_test_real',), ('X_train_forecasted',), ('X_test_forecasted',), ('y_train',), ('y_test',), ('X_train_real',), ('RandomForest_Train_Real_Test_Forecast',), ('LinearRegresion_Train_Real_Test_Forecast',), ('LinearRegresion_Train_Forecast_Test_Forecast',), ('LinearRegresion_Train_Real_Test_Real',), ('RandomForest_Train_Forecast_Test_Forecast',), ('RandomForest_Train_Real_Test_Real',), ('Prophet_Train_Real_Test_Forecast',), ('Prophet_Train_Forecast_Test_Forecast',), ('Optimized_Prophet_Train_Forecast_Test_Forecast',), ('Prophet_Train_Real_Test_Real',), ('Optimized_Prophet_Train_Real_Test_Real',), ('Optimized_Prophet_Train_Real_Test_Forecast',)]


# Retrieving the necessary tables from the database

In [4]:
# Load the feature-selected real data, parse the `timestamp` column to datetimes
# and use it as the index so the frame can be sliced by date range.
entsoe_meteostat_real = (
    pd.read_sql("SELECT * FROM Feature_selected_real_data;", conn)
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .set_index("timestamp")
)

In [5]:
# Load the feature-selected forecasted data and index it by parsed timestamp.
entsoe_meteostat_forecast = (
    pd.read_sql("SELECT * FROM Feature_selected_forecasted_data;", conn)
    .assign(timestamp=lambda frame: pd.to_datetime(frame["timestamp"]))
    .set_index("timestamp")
)

# Replace the forecasted load with the real load and rename the column accordingly.
# The assignment aligns on the timestamp index, so timestamps that are missing from
# the real frame end up as NaN in this column.
entsoe_meteostat_forecast = (
    entsoe_meteostat_forecast
    .assign(forecasted_energy_load=entsoe_meteostat_real["real_energy_load"])
    .rename(columns={"forecasted_energy_load": "real_energy_load"})
)


# Split the data into train, validation and test sets

## Train data

In [6]:
def split_data(df: pd.DataFrame, training_start, training_end):
    """Return an independent copy of the rows of ``df`` inside a date window.

    The window is half-open: rows whose index is ``>= training_start`` and
    ``< training_end`` are kept. Despite the parameter names, the function is
    also used to cut the test window.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose index is comparable with the two bounds.
    training_start
        Inclusive lower bound of the window.
    training_end
        Exclusive upper bound of the window.

    Returns
    -------
    pd.DataFrame
        Deep copy of the selected rows; modifying it does not affect ``df``.
    """
    in_window = (df.index >= training_start) & (df.index < training_end)
    return df.loc[in_window].copy(deep=True)

In [7]:
# Bounds of the training window. split_data keeps rows with start <= index < end.
# NOTE: `start` and `end` are reassigned for the test window further down, so this
# cell must be re-run before the training splits are recomputed.
start = pd.Timestamp('2017-10-03 01:00:00')
end = pd.Timestamp('2021-08-03 01:00:00')

In [8]:
# Training slice of the real data (uses the `start`/`end` currently in scope).
train_real_data = split_data(df=entsoe_meteostat_real, training_start=start, training_end=end)

In [9]:
# Training slice of the forecasted data, cut with the same window as the real data.
train_forecasted_data = split_data(df=entsoe_meteostat_forecast, training_start=start, training_end=end)

## Test data

In [10]:
# Bounds of the test window; it begins exactly where the training window ends,
# and split_data keeps rows with start <= index < end, so the two do not overlap.
start = pd.Timestamp('2021-08-03 01:00:00')
end = pd.Timestamp('2022-01-01 01:00:00')

In [11]:
# Test slice of the real data (uses the `start`/`end` currently in scope).
test_real_data = split_data(df=entsoe_meteostat_real, training_start=start, training_end=end)

In [12]:
# Test slice of the forecasted data, cut with the same window as the real data.
test_forecasted_data = split_data(df=entsoe_meteostat_forecast, training_start=start, training_end=end)

In [13]:
# Target for training: the first column of the real training frame, kept as a
# one-column DataFrame. Presumably this column is `real_energy_load` — verify
# against the table's column order.
y_train = train_real_data.iloc[:, 0:1].copy(deep=True)

In [14]:
# Persist the training target (timestamp index included); an existing table is replaced.
y_train.to_sql("y_train", conn, if_exists='replace')

In [15]:
# Target for testing: the first column of the forecasted test frame.
# NOTE(review): y_train is taken from the real frame but y_test from the forecasted
# one. The forecasted frame's load column was overwritten with the real load when it
# was loaded, so the values presumably agree — confirm that this is the first column
# and that both frames cover the same timestamps.
y_test = test_forecasted_data.iloc[:, 0:1].copy(deep=True)
# Persist the test target (timestamp index included); an existing table is replaced.
y_test.to_sql("y_test", conn, if_exists='replace')

### Normalization

### Min - Max Normalization

### Log - Scaling

### Z - score

> To prevent leakage from the validation and test data into training, the scaler is fitted on the training set only (and used to transform it); the validation and test sets are only transformed with the already-fitted scaler.

In [16]:
# Scaler for the real-data features; it is fitted on the training split in the next cell
# and only applied (not refitted) to the test split.
scaler = StandardScaler()

In [17]:
# Train data for real data: every column after the first is a feature. The scaler
# is fitted here, on the training split only, and the scaled array is wrapped back
# into a DataFrame with the original feature names and timestamp index.
X_train_real = pd.DataFrame(
    scaler.fit_transform(train_real_data.iloc[:, 1:]),
    columns=train_real_data.columns[1:],
    index=train_real_data.index,
)

# Test data for real data: reuse the statistics fitted above (transform only, no refit).
X_test_real = pd.DataFrame(
    scaler.transform(test_real_data.iloc[:, 1:]),
    columns=test_real_data.columns[1:],
    index=test_real_data.index,
)

In [18]:
# Persist the scaled real-data features (timestamp index included); existing tables are replaced.
X_train_real.to_sql("X_train_real", conn, if_exists='replace')
X_test_real.to_sql("X_test_real", conn, if_exists='replace')


  sql.to_sql(


In [19]:
# Separate scaler for the forecasted-data features, so its statistics are independent of
# `scaler`; it is fitted on the forecasted training split in the next cell.
scaler_2 = StandardScaler()

In [20]:
# Train data for forecasted data: every column after the first is a feature, and
# scaler_2 is fitted on this training split only.
# NOTE(review): the column labels are taken from the *real* frames rather than the
# forecasted ones — presumably so both feature sets share the same names; confirm
# that the column count and order of the two tables really match.
X_train_forecasted = pd.DataFrame(
    scaler_2.fit_transform(train_forecasted_data.iloc[:, 1:]),
    columns=train_real_data.columns[1:],
    index=train_forecasted_data.index,
)

# Test data for forecasted data: reuse the statistics fitted above (transform only, no refit).
X_test_forecasted = pd.DataFrame(
    scaler_2.transform(test_forecasted_data.iloc[:, 1:]),
    columns=test_real_data.columns[1:],
    index=test_forecasted_data.index,
)

In [21]:
# Persist the scaled forecasted-data features (timestamp index included); existing tables are replaced.
X_train_forecasted.to_sql("X_train_forecasted", conn, if_exists='replace')
X_test_forecasted.to_sql("X_test_forecasted", conn, if_exists='replace')


  sql.to_sql(
