In [10]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit

import warnings
warnings.filterwarnings('ignore')

In [11]:
# Import data
# Read every parquet file in one pass; the order of the paths below matches
# the order of the names on the left-hand side of the assignment.
(
    obs_A, obs_B, obs_C,
    est_A, est_B, est_C,
    test_A, test_B, test_C,
) = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../../preprocessing/data/obs_A.parquet',
        '../../preprocessing/data/obs_B.parquet',
        '../../preprocessing/data/obs_C.parquet',
        '../../preprocessing/data/est_A.parquet',
        '../../preprocessing/data/est_B.parquet',
        '../../preprocessing/data/est_C.parquet',
        '../../preprocessing/data/test_A.parquet',
        '../../preprocessing/data/test_B.parquet',
        '../../preprocessing/data/test_C.parquet',
    )
)

In [12]:
# Concatenate the obs_* and est_* frames of each dataset (obs rows first)
A = pd.concat((obs_A, est_A))
B = pd.concat((obs_B, est_B))
C = pd.concat((obs_C, est_C))

# Data splits for submissions: X_* holds every column except the target,
# y_* holds the 'pv_measurement' target column.
X_A, y_A = A.drop(columns=['pv_measurement']), A['pv_measurement']
X_B, y_B = B.drop(columns=['pv_measurement']), B['pv_measurement']
X_C, y_C = C.drop(columns=['pv_measurement']), C['pv_measurement']

In [13]:
# Inspect data: training-feature shape first, test shape second, one per line
print(X_A.shape, test_A.shape, sep='\n')

(34085, 17)
(720, 17)


In [14]:
def custom_mae_objective(y_true, y_pred, under_prediction_factor=1.3, hessian_value=0.1):
    """
    Asymmetric MAE-style objective for XGBoost.

    Under-predictions (y_pred < y_true) are penalized `under_prediction_factor`
    times more than over-predictions. An exact prediction receives a zero
    gradient so it is not pushed away from the target.

    Parameters:
    y_true (array-like): The true values.
    y_pred (array-like): The predicted values, same shape as y_true.
    under_prediction_factor (float): Gradient magnitude applied to
        under-predictions; over-predictions always use magnitude 1.
        Defaults to 1.3.
    hessian_value (float): Constant used for every Hessian entry.
        Defaults to 0.1.

    Returns:
    grad (ndarray): The gradient, one entry per prediction.
    hess (ndarray): The Hessian (second derivative), one entry per prediction.
    """
    # Coerce to float arrays so plain lists/Series work and subtraction is elementwise
    residual = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)

    # Gradient: -factor for under-predictions, +1 for over-predictions and
    # 0 when the prediction is exact (np.sign yields 0 for a zero residual)
    grad = np.where(residual < 0, -float(under_prediction_factor), np.sign(residual))

    # Hessian: the true second derivative of MAE is zero, which a Newton-style
    # update cannot divide by, so a small positive constant is used instead
    hess = np.full(residual.shape, float(hessian_value))

    return grad, hess


In [15]:
# Initialize the models
# Hyperparameters for model A.
parameters_A = dict(
    colsample_bytree=0.8,
    gamma=0.8,
    learning_rate=0.008,
    max_depth=20,
    min_child_weight=10,
    n_estimators=1000,
    reg_alpha=1,
    reg_lambda=3,
    subsample=0.912,
    random_state=0,
    booster='gbtree',
    n_jobs=-1,
)

# Models B and C use the same values; each gets its own dict object so one
# can be changed without affecting the others.
parameters_B = dict(parameters_A)
parameters_C = dict(parameters_A)

# One regressor per dataset, built from its own parameter dict
model_A, model_B, model_C = (
    xgb.XGBRegressor(**params)
    for params in (parameters_A, parameters_B, parameters_C)
)

In [16]:
# Fit the models, verbose=False
# `eval_metric` is set on the estimator rather than passed to fit():
# passing it to fit() is deprecated in XGBoost and raises a TypeError in
# releases that removed the argument. set_params returns the estimator,
# so the fit call can be chained onto it.
model_A.set_params(eval_metric='mae').fit(
    X=X_A, y=y_A,
    verbose=False
)

model_B.set_params(eval_metric='mae').fit(
    X=X_B, y=y_B,
    verbose=False
)

model_C.set_params(eval_metric='mae').fit(
    X=X_C, y=y_C,
    verbose=False
)

In [17]:
# Feature importance of model A, paired with the training column names and
# ordered from most to least important
feature_importances = pd.DataFrame(
    {
        'feature': list(X_A.columns),
        'importance': model_A.feature_importances_,
    }
).sort_values('importance', ascending=False)

# Print feature importance as "<rank> <feature>: <importance>", rank starting at 0
ranked_rows = zip(feature_importances['feature'], feature_importances['importance'])
for i, (name, score) in enumerate(ranked_rows):
    print(f"{i} {name}: {score}")

0 total_radiation: 0.5771934390068054
1 clear_sky_rad:W: 0.15224255621433258
2 effective_cloud_cover:p: 0.03957810252904892
3 snow_accumulation: 0.034548476338386536
4 sun_elevation:d: 0.03267226368188858
5 rain_water:kgm2: 0.02706042304635048
6 total_cloud_cover:p: 0.01868657022714615
7 sun_azimuth:d: 0.016988426446914673
8 month: 0.015764674171805382
9 hour: 0.013749673962593079
10 year: 0.013071900233626366
11 average_wind_speed: 0.012946315109729767
12 t_1000hPa:C: 0.012570992112159729
13 absolute_humidity_2m:gm3: 0.01239833701401949
14 temp_dewpoint_diff: 0.011018941178917885
15 super_cooled_liquid_water:kgm2: 0.0094391955062747
16 dew_or_rime:idx: 6.97435243637301e-05


In [18]:
# Create submission

output_file = 'xgb_submission.csv'

# Predict on each test set and floor the result at zero (no upper bound)
pred_A = np.clip(model_A.predict(test_A), 0, None)
pred_B = np.clip(model_B.predict(test_B), 0, None)
pred_C = np.clip(model_C.predict(test_C), 0, None)

# Stack the predictions in A, B, C order
predictions = np.concatenate((pred_A, pred_B, pred_C))

# Row ids are consecutive integers starting at 0, one per prediction
ids = np.arange(len(predictions))

# Two-column frame: 'id' and 'prediction'
df = pd.DataFrame({'id': ids, 'prediction': predictions})

# Write the CSV without the DataFrame index and confirm
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

Submission saved to xgb_submission.csv
