In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from src.data.data_fetcher import get_all_features, get_raw_data
from src.features.feature_engineering import prepare_data
from src.features.preprocess_data import get_preprocessed_test_data, fetch_preprocessed_data

%pip install prophet

from prophet import Prophet

from sklearn.metrics import mean_squared_error, mean_absolute_error

import warnings
# Silence every warning raised from here on in this kernel session.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical warnings;
# consider narrowing the filter to specific categories/modules.
warnings.filterwarnings("ignore")

# Two style sheets are applied one after the other; the second call is layered on
# top of the first (presumably overriding any settings both define - confirm).
plt.style.use('ggplot')
plt.style.use('fivethirtyeight')

def mean_absolute_percentage_error(y_true, y_pred):
    """Calculates MAPE (in percent) given y_true and y_pred.

    Observations whose true value is exactly zero are excluded from the
    average: their percentage error is undefined (division by zero) and would
    otherwise turn the whole score into inf/NaN.

    Args:
    y_true (array-like): Ground-truth values.
    y_pred (array-like): Predicted values, broadcastable against y_true.

    Returns:
    float: Mean of |y_true - y_pred| / |y_true| * 100 over the entries whose
        y_true is non-zero; NaN when no such entry exists (including empty input).
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    # The division is still evaluated for zero targets; suppress the floating
    # point warnings here and discard those positions right afterwards.
    with np.errstate(divide="ignore", invalid="ignore"):
        pct_error = np.abs((y_true - y_pred) / y_true)

    # Broadcast the mask so it matches pct_error even when y_pred widened the shape.
    usable = np.broadcast_to(y_true != 0, pct_error.shape)
    if not usable.any():
        return float("nan")
    return np.mean(pct_error[usable]) * 100

You should consider upgrading via the 'c:\Users\gunna\Documents\Maskinlæring\Prosjekt\power-predictor\venv\Scripts\python.exe -m pip install --upgrade pip' command.


Note: you may need to restart the kernel to use updated packages.


Importing plotly failed. Interactive plots will not work.


# Data


In [2]:
from typing import Tuple
import pandas as pd
from src.data.data_fetcher import get_raw_data, get_tests
from src.features.feature_engineering import (
    feature_engineer,
    prepare_data,
    temporal_alignment,
    add_location,
)

import src.features.preprocess_data
import src.features.feature_engineering

from datetime import datetime
import pandas as pd
from typing import List, Tuple
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import skew
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
import math

In [3]:
def prepare_prophet_data(
    train_observed: pd.DataFrame,
    train_estimated: pd.DataFrame,
    test_size=0.2,
    random_state=42,
) -> Tuple[
    pd.DataFrame,
    pd.DataFrame,
    pd.Series,
    pd.Series,
    pd.DataFrame,
    pd.DataFrame,
    pd.Series,
    pd.Series,
]:
    """
    Cleans, feature-engineers and splits the observed and estimated training frames.

    Rows containing missing values are dropped, `feature_engineer` is applied, the
    bookkeeping columns are removed ("time" for both frames, plus "date_calc" for the
    estimated frame) and "date_forecast" / "pv_measurement" are renamed to "ds" / "y".
    Note that the returned X frames therefore still contain the target as column "y";
    the same target is additionally returned on its own as the y series.

    Args:
    train_observed (pd.DataFrame): The aligned training DataFrame with observed features.
    train_estimated (pd.DataFrame): The aligned training DataFrame with estimated features.
    test_size (float): Forwarded to train_test_split as the validation proportion.
    random_state (int): Forwarded to train_test_split to make the split reproducible.

    Returns:
    X_train_obs (pd.DataFrame): Training frame (incl. "ds" and "y") from observed data.
    X_val_obs (pd.DataFrame): Validation frame (incl. "ds" and "y") from observed data.
    y_train_obs (pd.Series): Training target from observed data.
    y_val_obs (pd.Series): Validation target from observed data.
    X_train_est (pd.DataFrame): Training frame (incl. "ds" and "y") from estimated data.
    X_val_est (pd.DataFrame): Validation frame (incl. "ds" and "y") from estimated data.
    y_train_est (pd.Series): Training target from estimated data.
    y_val_est (pd.Series): Validation target from estimated data.
    """
    prophet_columns = {"date_forecast": 'ds', 'pv_measurement': 'y'}
    # Columns removed before modeling: observed frame first, estimated frame second.
    unused_columns = (["time"], ["time", "date_calc"])

    # Stage 1: discard incomplete rows, then run the shared feature engineering.
    complete_rows = [frame.dropna() for frame in (train_observed, train_estimated)]
    engineered = [feature_engineer(frame) for frame in complete_rows]

    # Stage 2: build the (frame, target) pair for each data source.
    datasets = []
    for frame, to_drop in zip(engineered, unused_columns):
        prophet_frame = frame.drop(columns=to_drop).rename(columns=prophet_columns)
        datasets.append((prophet_frame, frame["pv_measurement"]))

    # Stage 3: split each pair; results are laid out observed first, estimated second.
    # NOTE(review): the split goes through train_test_split rather than a chronological
    # cut - confirm that is what is wanted for time-series validation.
    splits = []
    for prophet_frame, target in datasets:
        splits.extend(
            train_test_split(
                prophet_frame, target, test_size=test_size, random_state=random_state
            )
        )

    return tuple(splits)




def fetch_prophet_preprocessed_data() -> (
    Tuple[
        pd.DataFrame,
        pd.DataFrame,
        pd.Series,
        pd.Series,
        pd.DataFrame,
        pd.DataFrame,
        pd.Series,
        pd.Series,
    ]
):
    """
    Fetch the preprocessed data for training and validation.

    Loads the raw data with `get_raw_data`, runs `temporal_alignment` for each of the
    locations a, b and c, passes every aligned frame through `add_location`, concatenates
    the three locations (with a fresh index) and hands the combined observed/estimated
    frames to `prepare_prophet_data` using its default split settings.

    Returns:
        X_train_obs_combined: The observed data for training
        X_val_obs_combined: The observed data for validation
        y_train_obs_combined: The observed labels for training
        y_val_obs_combined: The observed labels for validation
        X_train_est_combined: The estimated data for training
        X_val_est_combined: The estimated data for validation
        y_train_est_combined: The estimated labels for training
        y_val_est_combined: The estimated labels for validation
    """
    (
        train_a,
        train_b,
        train_c,
        X_train_estimated_a,
        X_train_estimated_b,
        X_train_estimated_c,
        X_train_observed_a,
        X_train_observed_b,
        X_train_observed_c,
        _,
        _,
        _,
    ) = get_raw_data()  # the last three values (test features) are not needed here

    # (targets, observed features, estimated features) per location, in a/b/c order.
    raw_by_location = {
        "a": (train_a, X_train_observed_a, X_train_estimated_a),
        "b": (train_b, X_train_observed_b, X_train_estimated_b),
        "c": (train_c, X_train_observed_c, X_train_estimated_c),
    }

    # Temporally align the data from all three locations to the same time.
    aligned_by_location = {
        location: temporal_alignment(*frames)
        for location, frames in raw_by_location.items()
    }

    # Add location data to every aligned frame.
    observed_frames = []
    estimated_frames = []
    for location, (observed, estimated) in aligned_by_location.items():
        observed_frames.append(add_location(observed, location))
        estimated_frames.append(add_location(estimated, location))

    # Combine the temporally aligned datasets from all three locations
    train_observed_combined = pd.concat(observed_frames, ignore_index=True)
    train_estimated_combined = pd.concat(estimated_frames, ignore_index=True)

    # Prepare the combined dataset by handling missing values and splitting the data
    return prepare_prophet_data(train_observed_combined, train_estimated_combined)






# Raw per-location targets and observed/estimated/test feature frames.
(
    train_a,
    train_b,
    train_c,
    X_train_estimated_a,
    X_train_estimated_b,
    X_train_estimated_c,
    X_train_observed_a,
    X_train_observed_b,
    X_train_observed_c,
    X_test_estimated_a,
    X_test_estimated_b,
    X_test_estimated_c,
) = get_raw_data()

# Combined (all locations) train/validation splits in Prophet column naming.
(
    X_train_obs_combined,
    X_val_obs_combined,
    y_train_obs_combined,
    y_val_obs_combined,
    X_train_est_combined,
    X_val_est_combined,
    y_train_est_combined,
    y_val_est_combined,
) = fetch_prophet_preprocessed_data()

(
    X_test_estimated_a_processed,
    X_test_estimated_b_processed,
    X_test_estimated_c_processed,
) = get_preprocessed_test_data()

pd.set_option('display.max_rows', 200)

# Number of training rows whose target is exactly zero ...
count = (X_train_obs_combined["y"] == 0).sum()
print(count)

# ... versus the number of rows with a non-zero target.
count = (X_train_obs_combined["y"] != 0).sum()
print(count)


2061
282


In [4]:
# Display the first 200 rows of the observed training split.
X_train_obs_combined.head(200)

Unnamed: 0,y,ds,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,...,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,location_a,location_b,location_c,sin_day_of_year,cos_day_of_year,sin_hour,cos_hour
93098,0.0,2021-12-26 10:00:00,4.4,1.278,1032.724976,38284.23,18.15,438.274994,0.0,271.849976,...,1.4,1.975,0.0,0,0,1,-0.107308,0.994226,0.5,-0.8660254
93170,0.0,2021-12-29 10:00:00,2.5,1.2955,1793.474976,40293.77,18.950001,1274.599976,-0.5,264.75,...,0.975,0.8,0.0,0,0,1,-0.055879,0.998438,0.5,-0.8660254
34577,0.0,2019-01-21 11:00:00,1.8,1.32875,743.900024,210171.0,63.474998,743.900024,0.0,260.75,...,-0.95,0.55,0.0,0,1,0,0.337301,0.941397,0.258819,-0.9659258
34646,0.0,2019-01-24 08:00:00,1.9,1.324,927.349976,904.35,2.375,927.349976,0.0,261.700012,...,-0.4,0.75,0.0,0,1,0,0.385413,0.922744,0.8660254,-0.5
85456,0.0,2021-02-10 22:00:00,0.9,1.402,28.125,0.0,0.0,28.075001,0.0,253.5,...,-0.35,1.525,0.0,0,0,1,0.635068,0.772456,-0.5,0.8660254
34697,6.9,2019-01-26 11:00:00,2.3,1.28925,2195.350098,270906.7,81.224998,2195.350098,0.0,263.700012,...,-4.8,2.55,0.0,0,1,0,0.416926,0.90894,0.258819,-0.9659258
85287,0.0,2021-02-03 21:00:00,1.05,1.3755,28.4,0.0,0.0,28.5,0.0,254.925003,...,-2.525,0.55,0.0,0,0,1,0.537677,0.843151,-0.7071068,0.7071068
52667,18.975,2021-02-13 11:00:00,3.525,1.317,3482.800049,614182.0,179.574997,1056.175049,0.0,268.875,...,6.975,2.0,0.0,0,1,0,0.674069,0.738668,0.258819,-0.9659258
42111,0.0,2019-12-01 12:00:00,4.75,1.275,923.699951,125974.8,27.450001,923.699951,0.0,272.950012,...,4.125,-1.45,0.0,0,1,0,-0.512055,0.858953,1.224647e-16,-1.0
85409,0.0,2021-02-08 23:00:00,0.9,1.403,28.299999,0.0,0.0,31.799999,0.0,253.050003,...,0.025,1.4,0.0,0,0,1,0.608121,0.793844,-0.258819,0.9659258


In [5]:
# NOTE(review): this cell repeats the loading/preprocessing already done in an
# earlier cell; it rebinds the same names before previewing the training frame.
(
    train_a,
    train_b,
    train_c,
    X_train_estimated_a,
    X_train_estimated_b,
    X_train_estimated_c,
    X_train_observed_a,
    X_train_observed_b,
    X_train_observed_c,
    X_test_estimated_a,
    X_test_estimated_b,
    X_test_estimated_c,
) = get_raw_data()

(
    X_train_obs_combined,
    X_val_obs_combined,
    y_train_obs_combined,
    y_val_obs_combined,
    X_train_est_combined,
    X_val_est_combined,
    y_train_est_combined,
    y_val_est_combined,
) = fetch_prophet_preprocessed_data()

(
    X_test_estimated_a_processed,
    X_test_estimated_b_processed,
    X_test_estimated_c_processed,
) = get_preprocessed_test_data()

# Show the first rows of the observed training split.
X_train_obs_combined.head()

Unnamed: 0,y,ds,absolute_humidity_2m:gm3,air_density_2m:kgm3,ceiling_height_agl:m,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,...,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,location_a,location_b,location_c,sin_day_of_year,cos_day_of_year,sin_hour,cos_hour
93098,0.0,2021-12-26 10:00:00,4.4,1.278,1032.724976,38284.226562,18.15,438.274994,0.0,271.849976,...,1.4,1.975,0.0,0,0,1,-0.107308,0.994226,0.5,-0.866025
93170,0.0,2021-12-29 10:00:00,2.5,1.2955,1793.474976,40293.773438,18.950001,1274.599976,-0.5,264.75,...,0.975,0.8,0.0,0,0,1,-0.055879,0.998438,0.5,-0.866025
34577,0.0,2019-01-21 11:00:00,1.8,1.32875,743.900024,210171.046875,63.474998,743.900024,0.0,260.75,...,-0.95,0.55,0.0,0,1,0,0.337301,0.941397,0.258819,-0.965926
34646,0.0,2019-01-24 08:00:00,1.9,1.324,927.349976,904.349976,2.375,927.349976,0.0,261.700012,...,-0.4,0.75,0.0,0,1,0,0.385413,0.922744,0.866025,-0.5
85456,0.0,2021-02-10 22:00:00,0.9,1.402,28.125,0.0,0.0,28.075001,0.0,253.5,...,-0.35,1.525,0.0,0,0,1,0.635068,0.772456,-0.5,0.866025
