In [72]:
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import numpy as np


In [73]:
# Directory holding the preprocessed parquet files (relative to this notebook)
data_path = '../preprocessing/data'


def _load(name):
    """Read `<data_path>/<name>.parquet` and return it as a DataFrame."""
    return pd.read_parquet(f'{data_path}/{name}.parquet')


# Training data: one `obs_*` and one `est_*` frame per dataset (A, B, C)
obs_A, est_A = _load('obs_A'), _load('est_A')
obs_B, est_B = _load('obs_B'), _load('est_B')
obs_C, est_C = _load('obs_C'), _load('est_C')

# Test frames, with every row containing a missing value removed
test_A = _load('test_A').dropna()
test_B = _load('test_B').dropna()
test_C = _load('test_C').dropna()

# Stack obs on top of est for each dataset and discard the timestamp column.
# Chaining `.drop` returns a new frame instead of mutating in place.
A = pd.concat([obs_A, est_A]).drop(columns=['date_forecast'])
B = pd.concat([obs_B, est_B]).drop(columns=['date_forecast'])
C = pd.concat([obs_C, est_C]).drop(columns=['date_forecast'])


print(A.dtypes)

# Features are every remaining column except the target `pv_measurement`
X_A, y_A = A.drop(columns=['pv_measurement']), A['pv_measurement']
X_B, y_B = B.drop(columns=['pv_measurement']), B['pv_measurement']
X_C, y_C = C.drop(columns=['pv_measurement']), C['pv_measurement']

pv_measurement                                      float64
diffuse_rad:W                                       float32
direct_rad:W                                        float32
effective_cloud_cover:p                             float32
fresh_snow_24h:cm                                   float32
                                                     ...   
direct_rad:W_rate_of_change_of_change               float32
effective_cloud_cover:p_rate_of_change              float32
effective_cloud_cover:p_rate_of_change_of_change    float32
total_cloud_cover:p_rate_of_change                  float32
total_cloud_cover:p_rate_of_change_of_change        float32
Length: 263, dtype: object


In [74]:
# Display the raw obs_A frame (before `date_forecast` is dropped from the
# concatenated copy) as the cell's last expression, to eyeball its columns.
obs_A

Unnamed: 0,pv_measurement,date_forecast,diffuse_rad:W,direct_rad:W,effective_cloud_cover:p,fresh_snow_24h:cm,sun_elevation:d,absolute_humidity_2m:gm3,super_cooled_liquid_water:kgm2,t_1000hPa:K,...,clear_sky_rad:W_rate_of_change,clear_sky_rad:W_rate_of_change_of_change,diffuse_rad:W_rate_of_change,diffuse_rad:W_rate_of_change_of_change,direct_rad:W_rate_of_change,direct_rad:W_rate_of_change_of_change,effective_cloud_cover:p_rate_of_change,effective_cloud_cover:p_rate_of_change_of_change,total_cloud_cover:p_rate_of_change,total_cloud_cover:p_rate_of_change_of_change
0,0.00,2019-06-02 22:00:00,0.000,0.00,99.074997,0.0,-3.774250,7.700,0.0,286.225006,...,0.00,0.00,0.000,0.000,0.00,0.00,0.000000,0.000000,0.000000,0.000000
1,0.00,2019-06-02 23:00:00,0.000,0.00,99.750000,0.0,-4.357250,7.700,0.0,286.899994,...,0.00,0.00,0.000,0.000,0.00,0.00,0.675003,0.675003,0.000000,0.000000
2,0.00,2019-06-03 00:00:00,0.000,0.00,100.000000,0.0,-3.309500,7.875,0.0,286.950012,...,0.00,0.00,0.000,0.000,0.00,0.00,0.250000,-0.425003,0.000000,0.000000
3,0.00,2019-06-03 01:00:00,0.300,0.00,100.000000,0.0,-0.822500,8.425,0.0,286.750000,...,0.75,0.75,0.300,0.300,0.00,0.00,0.000000,-0.250000,0.000000,0.000000
4,19.36,2019-06-03 02:00:00,11.975,0.15,84.875000,0.0,3.051250,8.950,0.0,286.450012,...,22.35,21.60,11.675,11.375,0.15,0.15,-15.125000,-15.125000,-0.775002,-0.775002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29662,0.00,2022-10-20 20:00:00,0.000,0.00,38.125000,0.0,-29.704250,5.800,0.0,278.250000,...,0.00,0.00,0.000,0.000,0.00,0.00,-13.125000,5.775002,-8.274998,6.775002
29663,0.00,2022-10-20 21:00:00,0.000,0.00,16.275000,0.0,-33.860748,5.575,0.0,277.850006,...,0.00,0.00,0.000,0.000,0.00,0.00,-21.850000,-8.725000,-27.550003,-19.275005
29664,0.00,2022-10-20 22:00:00,0.000,0.00,9.700000,0.0,-36.270000,5.350,0.0,277.475006,...,0.00,0.00,0.000,0.000,0.00,0.00,-6.575000,15.275001,-9.800000,17.750004
29665,0.00,2022-10-20 23:00:00,0.000,0.00,4.525000,0.0,-36.614998,5.200,0.0,277.149994,...,0.00,0.00,0.000,0.000,0.00,0.00,-5.175000,1.400000,-5.175000,4.625000


In [75]:
# # Standardize the features (important for Lasso)
# scaler = StandardScaler()
# X_A_scaled = scaler.fit_transform(X_A)
# X_B_scaled = scaler.fit_transform(X_B)
# X_C_scaled = scaler.fit_transform(X_C)

# # Use LassoCV to automatically find the best alpha (regularization strength)
# lasso_A = LassoCV(cv=5).fit(X_A_scaled, y_A)
# lasso_B = LassoCV(cv=5).fit(X_B_scaled, y_B)
# lasso_C = LassoCV(cv=5).fit(X_C_scaled, y_C)

# # Extract features with non-zero coefficients for each dataset
# selected_features_A = X_A.columns[lasso_A.coef_ != 0]
# selected_features_B = X_B.columns[lasso_B.coef_ != 0]
# selected_features_C = X_C.columns[lasso_C.coef_ != 0]

# print("Selected features for A:", selected_features_A)
# print("Selected features for B:", selected_features_B)
# print("Selected features for C:", selected_features_C)

In [76]:
from sklearn.ensemble import RandomForestRegressor

# # Your existing code for fitting the Random Forest models
# rf_A = RandomForestRegressor().fit(X_A, y_A)
# rf_B = RandomForestRegressor().fit(X_B, y_B)
# rf_C = RandomForestRegressor().fit(X_C, y_C)

# # Extract feature importances
# importance_A = rf_A.feature_importances_
# importance_B = rf_B.feature_importances_
# importance_C = rf_C.feature_importances_

# # Sort features based on importance
# sorted_idx_A = importance_A.argsort()[::-1]
# sorted_idx_B = importance_B.argsort()[::-1]
# sorted_idx_C = importance_C.argsort()[::-1]

# # Print the 10 most important features for dataset A
# print("Features sorted by importance for A:")
# for idx in sorted_idx_A[:10]:  # Only consider the top 10 features
#     print(X_A.columns[idx], importance_A[idx])

# # Print the 10 most important features for dataset B
# print("\nFeatures sorted by importance for B:")
# for idx in sorted_idx_B[:10]:  # Only consider the top 10 features
#     print(X_B.columns[idx], importance_B[idx])




In [77]:
# Rebuild A/B/C from the obs/est frames (same steps as the loading cell), so
# this cell can be re-run on its own to reset X_*/y_* before modelling.
# `.drop` is chained so each frame is created in a single expression.
A = pd.concat([obs_A, est_A]).drop(columns=['date_forecast'])
B = pd.concat([obs_B, est_B]).drop(columns=['date_forecast'])
C = pd.concat([obs_C, est_C]).drop(columns=['date_forecast'])

print(A.columns)

# Separate the target column `pv_measurement` from the feature columns
X_A, y_A = A.drop(columns=['pv_measurement']), A['pv_measurement']
X_B, y_B = B.drop(columns=['pv_measurement']), B['pv_measurement']
X_C, y_C = C.drop(columns=['pv_measurement']), C['pv_measurement']

Index(['pv_measurement', 'diffuse_rad:W', 'direct_rad:W',
       'effective_cloud_cover:p', 'fresh_snow_24h:cm', 'sun_elevation:d',
       'absolute_humidity_2m:gm3', 'super_cooled_liquid_water:kgm2',
       't_1000hPa:K', 'total_cloud_cover:p',
       ...
       'clear_sky_rad:W_rate_of_change',
       'clear_sky_rad:W_rate_of_change_of_change',
       'diffuse_rad:W_rate_of_change',
       'diffuse_rad:W_rate_of_change_of_change', 'direct_rad:W_rate_of_change',
       'direct_rad:W_rate_of_change_of_change',
       'effective_cloud_cover:p_rate_of_change',
       'effective_cloud_cover:p_rate_of_change_of_change',
       'total_cloud_cover:p_rate_of_change',
       'total_cloud_cover:p_rate_of_change_of_change'],
      dtype='object', length=263)


In [78]:
import xgboost as xgb

# How many of the highest-ranked features to print per dataset.
# (The old comments said "top 10" while the code actually printed 20.)
TOP_N_FEATURES = 20

# Parameters shared by the three XGBoost models
params = {
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'reg:squarederror'
}


def _weight_importance(booster, feature_names):
    """Return 'weight' importances as an array aligned with `feature_names`.

    Parameters
    ----------
    booster : xgb.Booster
        A trained model.
    feature_names : iterable of str
        Feature names in the column order of the training frame.

    Returns
    -------
    np.ndarray of float32
        One score per entry of `feature_names`, in the same layout as
        sklearn's `feature_importances_`. Names missing from the booster's
        score dict are filled with 0.
    """
    scores = booster.get_score(importance_type='weight')
    return np.array([scores.get(f, 0.) for f in feature_names], dtype=np.float32)


def _print_top_features(header, feature_names, importance, order, top_n=TOP_N_FEATURES):
    """Print `header`, then the first `top_n` features of `order` with their score.

    `order` is an array of positions into `feature_names` / `importance`,
    already sorted from most to least important.
    """
    print(header)
    for idx in order[:top_n]:
        print(feature_names[idx], importance[idx])


# Wrap each dataset in a DMatrix for the native XGBoost training API
dtrain_A = xgb.DMatrix(X_A, label=y_A)
dtrain_B = xgb.DMatrix(X_B, label=y_B)
dtrain_C = xgb.DMatrix(X_C, label=y_C)

# Fit one XGBoost model per dataset
model_A = xgb.train(params, dtrain_A, num_boost_round=100)
model_B = xgb.train(params, dtrain_B, num_boost_round=100)
model_C = xgb.train(params, dtrain_C, num_boost_round=100)

# Feature importances, aligned with the columns of each feature frame
importance_A = _weight_importance(model_A, X_A.columns)
importance_B = _weight_importance(model_B, X_B.columns)
importance_C = _weight_importance(model_C, X_C.columns)

# Column positions sorted by descending importance
sorted_idx_A = importance_A.argsort()[::-1]
sorted_idx_B = importance_B.argsort()[::-1]
sorted_idx_C = importance_C.argsort()[::-1]

# Print the TOP_N_FEATURES most important features for each dataset
_print_top_features("Features sorted by importance for A:", X_A.columns, importance_A, sorted_idx_A)
_print_top_features("\nFeatures sorted by importance for B:", X_B.columns, importance_B, sorted_idx_B)
_print_top_features("\nFeatures sorted by importance for C:", X_C.columns, importance_C, sorted_idx_C)


Features sorted by importance for A:
direct_rad:W 96.0
diffuse_rad:W 54.0
direct_rad:W_rate_of_change 47.0
clear_sky_rad:W 31.0
date_forecast_fft_amplitude 28.0
clear_sky_rad:W_rate_of_change_of_change 24.0
clear_sky_rad:W_rate_of_change 24.0
sun_azimuth:d 19.0
direct_rad:W_rate_of_change_of_change 18.0
diffuse_rad_1h:J 18.0
t_1000hPa:K 16.0
precip_5min:mm 15.0
msl_pressure:hPa 15.0
sun_elevation:d 15.0
sun_elevation:d_fft_phase 14.0
t_1000hPa:K_rate_of_change 12.0
fresh_snow_24h:cm 11.0
diffuse_rad:W_rate_of_change 10.0
direct_rad_1h:J 10.0
absolute_humidity_2m:gm3 9.0

Features sorted by importance for B:
date_forecast_fft_phase 89.0
direct_rad:W 75.0
diffuse_rad:W 52.0
sun_elevation:d 36.0
clear_sky_rad:W 27.0
clear_sky_rad:W_rate_of_change 22.0
date_forecast_fft_amplitude 17.0
cloud_base_agl:m 17.0
year 16.0
t_1000hPa:K 15.0
snow_drift:idx_fft_amplitude 14.0
air_density_2m:kgm3 13.0
diffuse_rad:W_rate_of_change 12.0
clear_sky_rad:W_rate_of_change_of_change 12.0
t_1000hPa:K_rate_of_