In [96]:
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import numpy as np


In [97]:
# Directory holding the preprocessed parquet files
data_path = '../preprocessing/data'


def _read_split(name):
    """Load ``<data_path>/<name>.parquet`` into a DataFrame."""
    return pd.read_parquet(f'{data_path}/{name}.parquet')


# "obs" and "est" tables for each of the three locations
obs_A, est_A = _read_split('obs_A'), _read_split('est_A')
obs_B, est_B = _read_split('obs_B'), _read_split('est_B')
obs_C, est_C = _read_split('obs_C'), _read_split('est_C')

# Test tables; rows containing any missing value are discarded
test_A = _read_split('test_A').dropna()
test_B = _read_split('test_B').dropna()
test_C = _read_split('test_C').dropna()

# Stack obs on top of est per location and remove the 'date_forecast' column
A, B, C = (
    pd.concat([obs, est]).drop(columns=['date_forecast'])
    for obs, est in ((obs_A, est_A), (obs_B, est_B), (obs_C, est_C))
)

print(A.columns)


Index(['pv_measurement', 'clear_sky_rad:W', 'clear_sky_energy_1h:J',
       'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J',
       'effective_cloud_cover:p', 'sun_elevation:d',
       'radiation_cloud_interaction',
       ...
       'cloud_cover_interaction_75', 'angle_radiation_75',
       'sun_elevation_interaction_85', 'sun_azimuth_interaction_85',
       'cloud_cover_interaction_85', 'angle_radiation_85',
       'sun_elevation_interaction_100', 'sun_azimuth_interaction_100',
       'cloud_cover_interaction_100', 'angle_radiation_100'],
      dtype='object', length=116)


In [98]:
# import numpy as np

# angles = [0,10,25,35,50,75,85,100]
# datasets = [A, B, C]  # A, B, and C are the concatenated DataFrames; they must contain the columns referenced below.

# for dataset in datasets:  # Loop over each dataset
#     for angle in angles:  # Loop over each angle
#         # Use 'dataset' instead of 'A' to access the elements
#         sun_elevation_interaction = np.cos(np.radians(angle - dataset['sun_elevation:d'])) * dataset['direct_rad:W']
#         sun_azimuth_interaction = np.cos(np.radians(dataset['sun_azimuth:d'] - 180)) * dataset['direct_rad:W']
#         cloud_cover_interaction = (1 - dataset['effective_cloud_cover:p']/100) * dataset['direct_rad:W']

#         # Combine interactions into a composite feature
#         composite_feature = sun_elevation_interaction + sun_azimuth_interaction + cloud_cover_interaction

#         # Assign the composite feature to the current angle key in the dataset
#         dataset[f'sun_elevation_interaction_{angle}'] = sun_elevation_interaction
#         dataset[f'sun_azimuth_interaction_{angle}'] = sun_azimuth_interaction
#         dataset[f'cloud_cover_interaction_{angle}'] = cloud_cover_interaction
#         dataset[f'angle_radiation_{angle}'] = composite_feature


In [99]:

print(A.columns)

# Name of the label column shared by all three location tables
target_column = 'pv_measurement'

# For each location: features are every column except the label,
# and the label is the 'pv_measurement' column on its own.
X_A, y_A = A.drop(columns=[target_column]), A[target_column]
X_B, y_B = B.drop(columns=[target_column]), B[target_column]
X_C, y_C = C.drop(columns=[target_column]), C[target_column]

Index(['pv_measurement', 'clear_sky_rad:W', 'clear_sky_energy_1h:J',
       'diffuse_rad:W', 'diffuse_rad_1h:J', 'direct_rad:W', 'direct_rad_1h:J',
       'effective_cloud_cover:p', 'sun_elevation:d',
       'radiation_cloud_interaction',
       ...
       'cloud_cover_interaction_75', 'angle_radiation_75',
       'sun_elevation_interaction_85', 'sun_azimuth_interaction_85',
       'cloud_cover_interaction_85', 'angle_radiation_85',
       'sun_elevation_interaction_100', 'sun_azimuth_interaction_100',
       'cloud_cover_interaction_100', 'angle_radiation_100'],
      dtype='object', length=116)


In [100]:
import xgboost as xgb


# angles = range(-90, 91, 5)

# def calculate_theoretical_max_capture(X, angle):
#     # Calculating the incidence angle based on the angle provided
#     incidence_angle = np.radians(90 - angle - X['sun_elevation:d'])
#     # Creating a feature name specific to this angle
#     feature_name = f'theoretical_max_capture_at_{angle}deg'
#     # Calculating the theoretical max capture and adding it to the dataframe
#     X[feature_name] = X['clear_sky_rad:W'] * np.cos(incidence_angle)

# # Loop through each angle and calculate for all three locations
# for angle in angles:
#     # Calculate for location A
#     calculate_theoretical_max_capture(X_A, angle)
    
#     # Calculate for location B
#     calculate_theoretical_max_capture(X_B, angle)
    
#     # Calculate for location C
#     calculate_theoretical_max_capture(X_C, angle)



# Wrap each location's features and labels in an XGBoost DMatrix
dtrain_A, dtrain_B, dtrain_C = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_A, y_A), (X_B, y_B), (X_C, y_C))
)

# Booster settings shared by all three models:
# depth-3 trees, learning rate 0.1, squared-error regression objective
params = dict(
    max_depth=3,
    eta=0.1,
    objective='reg:squarederror',
)

# Train one booster per location, 100 boosting rounds each
model_A, model_B, model_C = (
    xgb.train(params, dtrain, num_boost_round=100)
    for dtrain in (dtrain_A, dtrain_B, dtrain_C)
)

def _weight_importance(booster, feature_names):
    """Return the booster's 'weight' importance scores aligned to ``feature_names``.

    The scores come from ``booster.get_score(importance_type='weight')``, which
    yields a mapping keyed by feature name. Any name in ``feature_names`` that
    is absent from that mapping is given a score of 0. The result is a float32
    array in the same order as ``feature_names`` (the layout sklearn uses for
    ``feature_importances_``).
    """
    scores = booster.get_score(importance_type='weight')
    return np.array([scores.get(name, 0.) for name in feature_names], dtype=np.float32)


# One importance vector per location, ordered like the feature columns
importance_A = _weight_importance(model_A, X_A.columns)
importance_B = _weight_importance(model_B, X_B.columns)
importance_C = _weight_importance(model_C, X_C.columns)

# Column positions ranked from most to least important
sorted_idx_A = np.argsort(importance_A)[::-1]
sorted_idx_B = np.argsort(importance_B)[::-1]
sorted_idx_C = np.argsort(importance_C)[::-1]

# How many of the highest-ranked features to list per location.
# (The earlier comments said "top 10", but the code has always printed 40.)
TOP_N_FEATURES = 40


def print_top_features(site, X, importance, ranking, top_n=TOP_N_FEATURES):
    """Print the ``top_n`` highest-ranked features for one location.

    Parameters
    ----------
    site : str
        Location label used in the header line (e.g. "A").
    X : DataFrame
        Feature table; only ``X.columns`` is read, to map positions to names.
    importance : array-like
        Importance score per feature, in the same order as ``X.columns``.
    ranking : array-like of int
        Column positions ordered from most to least important.
    top_n : int, optional
        Number of leading entries of ``ranking`` to print. If ``ranking`` is
        shorter than ``top_n``, every entry is printed.
    """
    print(f"Features sorted by importance for {site}:")
    for idx in ranking[:top_n]:
        print(X.columns[idx], importance[idx])


# Report the top features for each location, separated by a blank line.
print_top_features("A", X_A, importance_A, sorted_idx_A)
print()
print_top_features("B", X_B, importance_B, sorted_idx_B)
print()
print_top_features("C", X_C, importance_C, sorted_idx_C)


Features sorted by importance for A:
sun_azimuth_interaction_0 37.0
radiation_cloud_interaction 31.0
total_radiation 31.0
date_forecast_fft_amplitude 30.0
angle_radiation_100 26.0
diffuse_rad:W 26.0
sun_elevation:d_fft_phase 24.0
sun_azimuth:d_lag_7 22.0
visibility:m_lag_-2 21.0
clear_sky_rad:W 20.0
relative_humidity_1000hPa:p_lag_-3 18.0
radiation_cloud_interaction_rate_of_change 17.0
precip_5min:mm 17.0
t_1000hPa:K_rolling_avg_24 16.0
diffuse_rad_1h:J 15.0
total_radiation_rate_of_change 15.0
sun_elevation:d_rolling_avg_6 13.0
sun_elevation:d 12.0
diffuse_rad:W_rate_of_change 11.0
average_wind_speed 11.0
wind_vector_magnitude 10.0
direct_rad:W_rate_of_change 10.0
direct_rad:W_rate_of_change_of_change 9.0
clear_sky_energy_1h:J 9.0
radiation_cloud_interaction_rate_of_change_of_change 8.0
angle_radiation_0 8.0
temp_dewpoint_diff_lag_-4 8.0
dew_point_2m:K_lag_19 8.0
air_density_2m:kgm3 8.0
snow_accumulation 7.0
sun_elevation:d_fft_amplitude 7.0
sfc_pressure:hPa_lag_8 7.0
t_1000hPa:K_lag_4