In [32]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit

import warnings
warnings.filterwarnings('ignore')

In [33]:
# Import data
obs_A = pd.read_parquet('../../preprocessing/data/obs_A.parquet')
obs_B = pd.read_parquet('../../preprocessing/data/obs_B.parquet')
obs_C = pd.read_parquet('../../preprocessing/data/obs_C.parquet')
est_A = pd.read_parquet('../../preprocessing/data/est_A.parquet')
est_B = pd.read_parquet('../../preprocessing/data/est_B.parquet')
est_C = pd.read_parquet('../../preprocessing/data/est_C.parquet')
test_A = pd.read_parquet('../../preprocessing/data/test_A.parquet')
test_B = pd.read_parquet('../../preprocessing/data/test_B.parquet')
test_C = pd.read_parquet('../../preprocessing/data/test_C.parquet')

# Columns to drop
columns = [
    'date_forecast',
    'super_cooled_liquid_water:kgm2',
    'air_density_2m:kgm3',
    'snow_water:kgm2',
    'precip_5min:mm',
    'precip_type_5min:idx',
    'rain_water:kgm2',
    'snow_melt_10min:mm',
    'dew_or_rime:idx',
    'snow_depth:cm'
]

columns += [
    't_1000hPa:K_rate_of_change',
    'dew_or_rime:idx_lag_11',
    'total_cloud_cover:p_rate_of_change',
    'visibility:m',
    'temp_dewpoint_diff',
    'diffuse_rad:W_rate_of_change',
    'total_radiation_rate_of_change_of_change',
    'sun_elevation:d_rolling_avg_6',
    'direct_rad:W_rate_of_change_of_change',
    'dew_point_2m:K_rate_of_change',
    'relative_humidity_1000hPa:p',
    'effective_cloud_cover:p_rate_of_change_of_change',
    'total_cloud_cover:p_rate_of_change_of_change',
    'dew_point_2m:K_rate_of_change_of_change',
    'diffuse_rad:W_rate_of_change_of_change',
    't_1000hPa:K_rate_of_change_of_change',
    'is_day:idx',
    'is_in_shadow:idx',
    'prob_rime:p'
]

# Drop columns
obs_A = obs_A.drop(columns=columns)
obs_B = obs_B.drop(columns=columns)
obs_C = obs_C.drop(columns=columns)
est_A = est_A.drop(columns=columns)
est_B = est_B.drop(columns=columns)
est_C = est_C.drop(columns=columns)
test_A = test_A.drop(columns=columns)
test_B = test_B.drop(columns=columns)
test_C = test_C.drop(columns=columns)

In [34]:
# Concatinate
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Data splits for submissions
# X_A = A.drop(columns='pv_measurement')
# y_A = A['pv_measurement']
# X_B = B.drop(columns='pv_measurement')
# y_B = B['pv_measurement']
# X_C = C.drop(columns='pv_measurement')
# y_C = C['pv_measurement']

# Data splits for testing
train_A, test_A = train_test_split(A, test_size=0.2, shuffle=True, random_state=42)
X_train_A = train_A.drop(columns='pv_measurement')
y_train_A = train_A['pv_measurement']
X_test_A = test_A.drop(columns='pv_measurement')
y_test_A = test_A['pv_measurement']

train_B, test_B = train_test_split(B, test_size=0.2, shuffle=True, random_state=42)
X_train_B = train_B.drop(columns='pv_measurement')
y_train_B = train_B['pv_measurement']
X_test_B = test_B.drop(columns='pv_measurement')
y_test_B = test_B['pv_measurement']

train_C, test_C = train_test_split(C, test_size=0.2, shuffle=True, random_state=42)
X_train_C = train_C.drop(columns='pv_measurement')
y_train_C = train_C['pv_measurement']
X_test_C = test_C.drop(columns='pv_measurement')
y_test_C = test_C['pv_measurement']



In [35]:
# Initalize the models
parameters = {'colsample_bytree': 0.664, 
              'gamma': 3, 
              'learning_rate': 0.012, 
              'max_depth': 12, 
              'min_child_weight': 15, 
              'n_estimators': 500, 
              'reg_alpha': 2, 
              'reg_lambda': 2, 
              'subsample': 0.912}

model_A = xgb.XGBRegressor(**parameters)
model_B = xgb.XGBRegressor(**parameters)
model_C = xgb.XGBRegressor(**parameters)

In [36]:
# Train prediction model
model_A.fit(X_train_A, X_train_A)


In [37]:
model_B.fit(X_train_B, X_train_B)


In [38]:
model_C.fit(X_train_C, X_train_C)

In [39]:
# Evaluate
print('MAE A:', mean_absolute_error(y_test_A, model_A.predict(X_test_A)))
print('MAE B:', mean_absolute_error(y_test_B, model_B.predict(X_test_B)))
print('MAE C:', mean_absolute_error(y_test_C, model_C.predict(X_test_C)))

ValueError: y_true and y_pred have different number of output (1!=59)

In [None]:
# Feature importance
feature_importances = model_A.feature_importances_
feature_importances = pd.DataFrame({'feature': list(X_train_A.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)

# Print feature importance
for i in range(feature_importances.shape[0]):
    print(f"{i} {feature_importances.iloc[i, 0]}: {feature_importances.iloc[i, 1]}")

0 clear_sky_energy_1h:J: 0.4887338876724243
1 sun_elevation:d: 0.15642227232456207
2 direct_rad_1h:J: 0.11852945387363434
3 clear_sky_rad:W: 0.10437404364347458
4 diffuse_rad_1h:J: 0.043675653636455536
5 direct_rad:W: 0.042580749839544296
6 diffuse_rad:W: 0.014687432907521725
7 total_radiation: 0.013117500580847263
8 clear_sky_rad:W_rolling_avg_6: 0.007018702104687691
9 total_radiation_rolling_avg_3: 0.006020738277584314
10 clear_sky_rad:W_rate_of_change: 0.0012543570483103395
11 sun_azimuth:d_lag_7: 0.0007235597004182637
12 effective_cloud_cover:p: 0.00039219530299305916
13 direct_rad:W_rate_of_change: 0.00037439141306094825
14 month: 0.00036176375579088926
15 clear_sky_rad:W_rate_of_change_of_change: 0.000346258602803573
16 sun_azimuth:d: 0.00030681650969199836
17 visibility:m_lag_-2: 0.00026883173268288374
18 effective_cloud_cover:p_rolling_avg_6: 0.00025041127810254693
19 total_radiation_rate_of_change: 0.00015927357890177518
20 total_cloud_cover:p_rolling_avg_6: 0.0001507050183136

In [None]:
# Create submission

output_file = 'submission.csv'

pred_A = model_A.predict(test_A)
pred_B = model_B.predict(test_B)
pred_C = model_C.predict(test_C)

pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Concatenate predictions
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Create an id array
ids = np.arange(0, len(predictions))

# Create a DataFrame
df = pd.DataFrame({
    'id': ids,
    'prediction': predictions
})

# Save to CSV
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

Submission saved to submission.csv
