In [94]:
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import numpy as np


In [95]:
# Directory holding the preprocessed parquet files
data_path = '../preprocessing/data'


def _load(path):
    """Read a single parquet file into a DataFrame."""
    return pd.read_parquet(path)


# Training frames per location: one `obs_*` and one `est_*` file each
obs_A, est_A = _load(f'{data_path}/obs_A.parquet'), _load(f'{data_path}/est_A.parquet')
obs_B, est_B = _load(f'{data_path}/obs_B.parquet'), _load(f'{data_path}/est_B.parquet')
obs_C, est_C = _load(f'{data_path}/obs_C.parquet'), _load(f'{data_path}/est_C.parquet')

# Test frames per location, with rows containing missing values removed
test_A, test_B, test_C = (
    _load(path).dropna()
    for path in (
        f'{data_path}/test_A.parquet',
        f'{data_path}/test_B.parquet',
        f'{data_path}/test_C.parquet',
    )
)

# Stack the obs_* rows on top of the est_* rows for each location
A, B, C = (
    pd.concat(pair)
    for pair in ([obs_A, est_A], [obs_B, est_B], [obs_C, est_C])
)


In [96]:

print(A.columns)

# Labels: the 'pv_measurement' column of each location's frame
y_A, y_B, y_C = (frame['pv_measurement'] for frame in (A, B, C))

# Features: every remaining column (drop returns a new frame; A/B/C are not modified)
X_A, X_B, X_C = (frame.drop(columns=['pv_measurement']) for frame in (A, B, C))

Index(['pv_measurement', 'clear_sky_rad:W', 'clear_sky_energy_1h:J',
       'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J',
       'effective_cloud_cover:p', 'fresh_snow_24h:cm', 'is_day:idx',
       'is_in_shadow:idx', 'sun_elevation:d', 't_1000hPa:K',
       'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms',
       'wind_speed_u_10m:ms', 'wind_speed_v_10m:ms', 'snow_drift:idx',
       'clear_sky_rad:W_rate_of_change',
       'clear_sky_rad:W_rate_of_change_of_change',
       'clear_sky_energy_1h:J_rate_of_change',
       'clear_sky_energy_1h:J_rate_of_change_of_change',
       'diffuse_rad:W_rate_of_change',
       'diffuse_rad:W_rate_of_change_of_change',
       'diffuse_rad_1h:J_rate_of_change',
       'diffuse_rad_1h:J_rate_of_change_of_change',
       'direct_rad:W_rate_of_change', 'direct_rad:W_rate_of_change_of_change',
       'direct_rad_1h:J_rate_of_change',
       'direct_rad_1h:J_rate_of_change_of_change',
       'effective_cloud_cover:p_rat

In [97]:
import xgboost as xgb

# Number of boosting rounds used for every location's model.
NUM_BOOST_ROUND = 100
# How many of the highest-ranked features to print per location.
TOP_N_FEATURES = 20

# Specify parameters for XGBoost
params = {
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'reg:squarederror'
}


def rank_features(model, feature_names):
    """Return the importance of each feature and the descending ranking.

    Parameters
    ----------
    model
        Trained booster exposing ``get_score``.
    feature_names
        Column names in the order of the training frame.

    Returns
    -------
    importance : np.ndarray
        float32 array aligned with ``feature_names``. Importance is the
        'weight' score; names missing from the booster's score dict get 0.
    sorted_idx : np.ndarray
        Positions into ``feature_names``, most important first.
    """
    scores = model.get_score(importance_type='weight')
    # Re-align the score dict with the column order, like sklearn's feature_importances_
    importance = np.array([scores.get(f, 0.) for f in feature_names], dtype=np.float32)
    return importance, importance.argsort()[::-1]


def print_top_features(location, feature_names, importance, sorted_idx,
                       top_n=TOP_N_FEATURES, leading_newline=False):
    """Print the ``top_n`` highest-ranked features for one location.

    Parameters
    ----------
    location
        Label inserted into the header line (e.g. "A").
    feature_names
        Indexable collection of feature names.
    importance
        Importance values aligned with ``feature_names``.
    sorted_idx
        Positions into ``feature_names``, most important first.
    top_n
        Number of features to print.
    leading_newline
        When True, a blank line is emitted before the header.
    """
    header = f"Features sorted by importance for {location}:"
    print(f"\n{header}" if leading_newline else header)
    for idx in sorted_idx[:top_n]:
        print(feature_names[idx], importance[idx])


# Create the DMatrix inputs for the XGBoost models
dtrain_A = xgb.DMatrix(X_A, label=y_A)
dtrain_B = xgb.DMatrix(X_B, label=y_B)
dtrain_C = xgb.DMatrix(X_C, label=y_C)

# Fit one XGBoost model per location
model_A = xgb.train(params, dtrain_A, num_boost_round=NUM_BOOST_ROUND)
model_B = xgb.train(params, dtrain_B, num_boost_round=NUM_BOOST_ROUND)
model_C = xgb.train(params, dtrain_C, num_boost_round=NUM_BOOST_ROUND)

# Feature importances (aligned with the X_* columns) and their descending order
importance_A, sorted_idx_A = rank_features(model_A, X_A.columns)
importance_B, sorted_idx_B = rank_features(model_B, X_B.columns)
importance_C, sorted_idx_C = rank_features(model_C, X_C.columns)

# Print the TOP_N_FEATURES most important features for each dataset
print_top_features("A", X_A.columns, importance_A, sorted_idx_A)
print_top_features("B", X_B.columns, importance_B, sorted_idx_B, leading_newline=True)
print_top_features("C", X_C.columns, importance_C, sorted_idx_C, leading_newline=True)


Features sorted by importance for A:
direct_rad:W 86.0
diffuse_rad:W 62.0
t_1000hPa:K 44.0
sun_elevation:d_rate_of_change_of_change 40.0
direct_rad_1h:J_rate_of_change 38.0
clear_sky_rad:W 34.0
t_1000hPa:K_rate_of_change 21.0
wind_speed_v_10m:ms 21.0
clear_sky_rad:W_rate_of_change_of_change 21.0
diffuse_rad_1h:J_rate_of_change 21.0
clear_sky_energy_1h:J_rate_of_change 20.0
direct_rad:W_rate_of_change 20.0
direct_rad:W_rate_of_change_of_change 18.0
direct_rad_1h:J_rate_of_change_of_change 18.0
effective_cloud_cover:p 17.0
wind_speed_u_10m:ms 16.0
fresh_snow_24h:cm 15.0
total_cloud_cover:p 15.0
wind_speed_10m:ms 12.0
visibility:m 12.0

Features sorted by importance for B:
direct_rad:W 80.0
t_1000hPa:K 59.0
diffuse_rad:W 46.0
clear_sky_rad:W 43.0
sun_elevation:d 36.0
clear_sky_rad:W_rate_of_change 25.0
direct_rad:W_rate_of_change 23.0
direct_rad_1h:J 22.0
clear_sky_energy_1h:J_rate_of_change 21.0
clear_sky_rad:W_rate_of_change_of_change 19.0
direct_rad_1h:J_rate_of_change 18.0
diffuse_rad