In [33]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit

import warnings
# Suppress every warning for the rest of the notebook run.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below (e.g. for deprecated keyword arguments) - consider
# narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

In [34]:
# Load the preprocessed parquet files: one obs_*, est_* and test_* frame
# for each of the three locations A, B and C.
obs_A, obs_B, obs_C = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../../preprocessing/data/obs_A.parquet',
        '../../preprocessing/data/obs_B.parquet',
        '../../preprocessing/data/obs_C.parquet',
    )
)
est_A, est_B, est_C = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../../preprocessing/data/est_A.parquet',
        '../../preprocessing/data/est_B.parquet',
        '../../preprocessing/data/est_C.parquet',
    )
)
test_A, test_B, test_C = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../../preprocessing/data/test_A.parquet',
        '../../preprocessing/data/test_B.parquet',
        '../../preprocessing/data/test_C.parquet',
    )
)

In [35]:
# Concatenate the obs_* and est_* frames of each location into a single
# training frame (obs_* rows first, then est_* rows).
A, B, C = (
    pd.concat(frames)
    for frames in ([obs_A, est_A], [obs_B, est_B], [obs_C, est_C])
)

# Split every training frame into its features (X_*) and the
# 'pv_measurement' target (y_*) used to fit the submission models.
X_A, y_A = A.drop(columns='pv_measurement'), A['pv_measurement']
X_B, y_B = B.drop(columns='pv_measurement'), B['pv_measurement']
X_C, y_C = C.drop(columns='pv_measurement'), C['pv_measurement']

In [36]:
# Sanity check: print the shape of the location-A training features and of
# the location-A test set, one per line.
print(X_A.shape, test_A.shape, sep='\n')

(34085, 19)
(720, 19)


In [37]:
# Initialize the models.
# All three locations currently use the same hyperparameters. Each location
# still gets its own dict (an independent copy) so that it can be tuned
# separately without affecting the others.
parameters_A = dict(
    colsample_bytree=0.8,
    gamma=0.8,
    learning_rate=0.008,
    max_depth=20,
    min_child_weight=10,
    n_estimators=1200,
    reg_alpha=1,
    reg_lambda=3,
    subsample=0.912,
    random_state=0,
    booster='dart',
    n_jobs=-1,
)
parameters_B = dict(parameters_A)
parameters_C = dict(parameters_A)

# One regressor per location, built from that location's parameter dict.
model_A, model_B, model_C = (
    xgb.XGBRegressor(**location_parameters)
    for location_parameters in (parameters_A, parameters_B, parameters_C)
)

In [38]:
# Fit one model per location on that location's full training set.
#
# `eval_metric` is configured on the estimator through set_params() instead of
# being passed to fit(): the fit() keyword has been deprecated since
# XGBoost 1.6 and is no longer accepted by newer releases, where it raises a
# TypeError. set_params() works on both old and new versions.
model_A.set_params(eval_metric='mae')
model_B.set_params(eval_metric='mae')
model_C.set_params(eval_metric='mae')

# verbose=False keeps the training log quiet.
model_A.fit(X=X_A, y=y_A, verbose=False)
model_B.fit(X=X_B, y=y_B, verbose=False)
model_C.fit(X=X_C, y=y_C, verbose=False)

In [None]:
# Feature importance of model_C, sorted from most to least important.
# The importances are labelled with the columns of X_C - the frame model_C was
# actually fitted on - rather than X_A, so the labels stay correct even if the
# per-location feature sets ever differ in content or order.
feature_importances = pd.DataFrame({
    'feature': list(X_C.columns),
    'importance': model_C.feature_importances_,
}).sort_values('importance', ascending=False)

# Print feature importance as "<rank> <feature>: <importance>"
ranked_features = zip(feature_importances['feature'], feature_importances['importance'])
for rank, (feature, importance) in enumerate(ranked_features):
    print(f"{rank} {feature}: {importance}")

0 total_radiation: 0.5175920128822327
1 sun_elevation:d: 0.23989932239055634
2 clear_sky_rad:W: 0.06455494463443756
3 t_1000hPa:C: 0.02462759055197239
4 effective_cloud_cover:p: 0.02279519848525524
5 snow_accumulation: 0.017812466248869896
6 rain_water:kgm2: 0.013332683593034744
7 msl_pressure:hPa_scaled: 0.012126365676522255
8 sfc_pressure:hPa_scaled: 0.011091955937445164
9 total_cloud_cover:p: 0.01098593045026064
10 absolute_humidity_2m:gm3: 0.010929371230304241
11 binned_month: 0.009734206832945347
12 temp_dewpoint_diff: 0.009173420257866383
13 average_wind_speed: 0.008523290045559406
14 year: 0.008304593153297901
15 super_cooled_liquid_water:kgm2: 0.00682792067527771
16 sun_azimuth:d: 0.005962737835943699
17 hour: 0.005517392884939909
18 dew_or_rime:idx: 0.000208531113457866


In [None]:
# Create submission

output_file = 'xgb_submission.csv'

# Predict each location's test set and clamp negative predictions to zero
# (no upper bound is applied).
pred_A = np.clip(model_A.predict(test_A), 0, None)
pred_B = np.clip(model_B.predict(test_B), 0, None)
pred_C = np.clip(model_C.predict(test_C), 0, None)

# Stack the per-location predictions in A, B, C order
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Row ids are simply 0 .. len(predictions) - 1
ids = np.arange(len(predictions))

# Assemble the submission table
df = pd.DataFrame({'id': ids, 'prediction': predictions})

# Write the submission to CSV without the DataFrame index
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

Submission saved to xgb_submission.csv
