In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [14]:
# Read in the data
data_path = '../../preprocessing/data'

# Name of the target column. It is selected together with the features when
# building the training frames, and is not selected from the test files.
target_column = "pv_measurement"

# Feature columns per location (A, B, C). Each list is defined exactly once and
# reused for both the training and the test selection, so the two can never
# drift apart. The order of the entries determines the column order of the
# resulting frames.
columns_A = [
    "direct_rad:W",
    "diffuse_rad:W",
    "direct_rad:W_rate_of_change",
    "clear_sky_rad:W",
    "date_forecast_fft_amplitude",
    "clear_sky_rad:W_rate_of_change_of_change",
    "clear_sky_rad:W_rate_of_change",
    "sun_azimuth:d",
    "direct_rad:W_rate_of_change_of_change",
    "diffuse_rad_1h:J",
    "t_1000hPa:K",
    "precip_5min:mm",
    "msl_pressure:hPa",
    "sun_elevation:d",
    "sun_elevation:d_fft_phase",
    "t_1000hPa:K_rate_of_change",
    "fresh_snow_24h:cm",
    "diffuse_rad:W_rate_of_change",
    "direct_rad_1h:J",
    "absolute_humidity_2m:gm3"
]
columns_B = [
    "date_forecast_fft_phase",
    "direct_rad:W",
    "diffuse_rad:W",
    "sun_elevation:d",
    "clear_sky_rad:W",
    "clear_sky_rad:W_rate_of_change",
    "date_forecast_fft_amplitude",
    "cloud_base_agl:m",
    "year",
    "t_1000hPa:K",
    "snow_drift:idx_fft_amplitude",
    "air_density_2m:kgm3",
    "diffuse_rad:W_rate_of_change",
    "clear_sky_rad:W_rate_of_change_of_change",
    "t_1000hPa:K_rate_of_change",
    "month",
    "diffuse_rad_1h:J",
    "direct_rad:W_rate_of_change",
    "visibility:m",
    "precip_5min:mm"
]
columns_C = [
    "direct_rad:W",
    "sun_elevation:d",
    "diffuse_rad:W",
    "t_1000hPa:K",
    "direct_rad_1h:J",
    "date_forecast_fft_amplitude",
    "clear_sky_rad:W",
    "clear_sky_energy_1h:J",
    "direct_rad:W_rate_of_change_of_change",
    "snow_melt_10min:mm",
    "direct_rad:W_rate_of_change",
    "precip_5min:mm",
    "relative_humidity_1000hPa:p",
    "msl_pressure:hPa",
    "precip_type_5min:idx_fft_amplitude",
    "wind_speed_u_10m:ms",
    "diffuse_rad_1h:J",
    "sfc_pressure:hPa",
    "dew_point_2m:K",
    "effective_cloud_cover:p"
]

# Training frames: for each location the obs_* and est_* files are stacked
# row-wise (original index values are kept, since ignore_index is not set) and
# reduced to the target column followed by that location's features.

# For A
obs_A = pd.read_parquet(f'{data_path}/obs_A.parquet')
est_A = pd.read_parquet(f'{data_path}/est_A.parquet')
A = pd.concat([obs_A, est_A])[[target_column, *columns_A]]

# For B
obs_B = pd.read_parquet(f'{data_path}/obs_B.parquet')
est_B = pd.read_parquet(f'{data_path}/est_B.parquet')
B = pd.concat([obs_B, est_B])[[target_column, *columns_B]]

# For C
obs_C = pd.read_parquet(f'{data_path}/obs_C.parquet')
est_C = pd.read_parquet(f'{data_path}/est_C.parquet')
C = pd.concat([obs_C, est_C])[[target_column, *columns_C]]

# For testing: only the feature columns are selected (no target column).
test_A = pd.read_parquet(f'{data_path}/test_A.parquet')[columns_A]
test_B = pd.read_parquet(f'{data_path}/test_B.parquet')[columns_B]
test_C = pd.read_parquet(f'{data_path}/test_C.parquet')[columns_C]


In [15]:
# Split each location's training frame into its predictors (X_*: every column
# except 'pv_measurement') and its target series (y_*: the 'pv_measurement'
# column). The source frames A, B and C are left unmodified.
X_A, y_A = A.drop(columns=['pv_measurement']), A['pv_measurement']
X_B, y_B = B.drop(columns=['pv_measurement']), B['pv_measurement']
X_C, y_C = C.drop(columns=['pv_measurement']), C['pv_measurement']