In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Aggregate the data to hourly with some aggregation methods for each column

aggregation_methods = {
    'date_forecast': 'first',
    'diffuse_rad:W': 'sum',
    'direct_rad:W': 'last',
    'clear_sky_rad:W': 'sum',
    'diffuse_rad_1h:J': 'last',
    'direct_rad_1h:J': 'last',
    'clear_sky_energy_1h:J': 'last',
    'absolute_humidity_2m:gm3': 'mean',
    'air_density_2m:kgm3': 'mean',
    'ceiling_height_agl:m': 'max',
    'cloud_base_agl:m': 'mean',
    'dew_or_rime:idx': 'min',
    'dew_point_2m:K': 'mean',
    'effective_cloud_cover:p': 'sum',
    'elevation:m': 'first',
    'fresh_snow_12h:cm': 'max',
    'fresh_snow_1h:cm': 'sum',
    'fresh_snow_24h:cm': 'max',
    'fresh_snow_3h:cm': 'max',
    'fresh_snow_6h:cm': 'max',
    'is_day:idx': 'max',
    'is_in_shadow:idx': 'max',
    'msl_pressure:hPa': 'mean',
    'precip_5min:mm': 'sum',
    'precip_type_5min:idx': 'sum',
    'pressure_100m:hPa': 'mean',
    'pressure_50m:hPa': 'mean',
    'prob_rime:p': 'max',
    'rain_water:kgm2': 'sum',
    'relative_humidity_1000hPa:p': 'mean',
    'sfc_pressure:hPa': 'mean',
    'snow_density:kgm3': 'mean',
    'snow_depth:cm': 'max',
    'snow_drift:idx': 'max',
    'snow_melt_10min:mm': 'sum',
    'snow_water:kgm2': 'sum',
    'sun_azimuth:d': 'first',
    'sun_elevation:d': 'sum',
    'super_cooled_liquid_water:kgm2': 'sum',
    't_1000hPa:K': 'mean',
    'total_cloud_cover:p': 'mean',
    'visibility:m': 'mean',
    'wind_speed_10m:ms': 'mean',
    'wind_speed_u_10m:ms': 'mean',
    'wind_speed_v_10m:ms': 'mean',
    'wind_speed_w_1000hPa:ms': 'mean',
    'cloud_base_agl:m': 'max',
    'snow_density:kgm3': 'mean'
}


# Read in the data
x_target_A = pd.read_parquet('./data/A/train_targets.parquet')
x_train_obs_A = pd.read_parquet('./data/A/X_train_observed.parquet')
x_train_est_A = pd.read_parquet('./data/A/X_train_estimated.parquet')
x_test_est_A = pd.read_parquet('./data/A/X_test_estimated.parquet')

x_target_B = pd.read_parquet('./data/B/train_targets.parquet')
x_train_obs_B = pd.read_parquet('./data/B/X_train_observed.parquet')
x_train_est_B = pd.read_parquet('./data/B/X_train_estimated.parquet')
x_test_est_B = pd.read_parquet('./data/B/X_test_estimated.parquet')

x_target_C = pd.read_parquet('./data/C/train_targets.parquet')
x_train_obs_C = pd.read_parquet('./data/C/X_train_observed.parquet')
x_train_est_C = pd.read_parquet('./data/C/X_train_estimated.parquet')
x_test_est_C = pd.read_parquet('./data/C/X_test_estimated.parquet')

# Rename time to date_forecast in target
x_target_A.rename(columns={'time': 'date_forecast'}, inplace=True)
x_target_B.rename(columns={'time': 'date_forecast'}, inplace=True)
x_target_C.rename(columns={'time': 'date_forecast'}, inplace=True)

# Fix missing data for test set. Assumin NaN means 0 in these categories
x_test_est_A['effective_cloud_cover:p'] = x_test_est_A['effective_cloud_cover:p'].fillna(0)
x_test_est_B['effective_cloud_cover:p'] = x_test_est_B['effective_cloud_cover:p'].fillna(0)
x_test_est_C['effective_cloud_cover:p'] = x_test_est_C['effective_cloud_cover:p'].fillna(0)

x_test_est_A['total_cloud_cover:p'] = x_test_est_A['total_cloud_cover:p'].fillna(0)
x_test_est_B['total_cloud_cover:p'] = x_test_est_B['total_cloud_cover:p'].fillna(0)
x_test_est_C['total_cloud_cover:p'] = x_test_est_C['total_cloud_cover:p'].fillna(0)

x_test_est_A['cloud_base_agl:m'] = x_test_est_A['cloud_base_agl:m'].fillna(0)
x_test_est_B['cloud_base_agl:m'] = x_test_est_B['cloud_base_agl:m'].fillna(0)
x_test_est_C['cloud_base_agl:m'] = x_test_est_C['cloud_base_agl:m'].fillna(0)

x_test_est_A['ceiling_height_agl:m'] = x_test_est_A['ceiling_height_agl:m'].fillna(0)
x_test_est_B['ceiling_height_agl:m'] = x_test_est_B['ceiling_height_agl:m'].fillna(0)
x_test_est_C['ceiling_height_agl:m'] = x_test_est_C['ceiling_height_agl:m'].fillna(0)

x_test_est_A['snow_density:kgm3'] = x_test_est_A['snow_density:kgm3'].fillna(0)
x_test_est_B['snow_density:kgm3'] = x_test_est_B['snow_density:kgm3'].fillna(0)
x_test_est_C['snow_density:kgm3'] = x_test_est_C['snow_density:kgm3'].fillna(0)

x_test_est_A['snow_drift:idx'] = x_test_est_A['snow_drift:idx'].fillna(0)
x_test_est_B['snow_drift:idx'] = x_test_est_B['snow_drift:idx'].fillna(0)
x_test_est_C['snow_drift:idx'] = x_test_est_C['snow_drift:idx'].fillna(0)

# Resample
x_train_obs_A_resampled = x_train_obs_A.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_train_est_A_resampled = x_train_est_A.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_test_est_A_resampled = x_test_est_A.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)

x_train_obs_B_resampled = x_train_obs_B.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_train_est_B_resampled = x_train_est_B.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_test_est_B_resampled = x_test_est_B.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)

x_train_obs_C_resampled = x_train_obs_C.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_train_est_C_resampled = x_train_est_C.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)
x_test_est_C_resampled = x_test_est_C.groupby(pd.Grouper(key='date_forecast', freq='1H')).aggregate(aggregation_methods)

# Merge
split_value = x_train_est_A['date_forecast'].iloc[0]
split_index = x_target_A[x_target_A['date_forecast'] == split_value].index[0]

x_target_obs_A = x_target_A.iloc[:split_index]
x_target_est_A = x_target_A.iloc[split_index:]

obs_A = x_train_obs_A_resampled.merge(x_target_obs_A, left_index=True, right_on='date_forecast')
est_A = x_train_est_A_resampled.merge(x_target_est_A, left_index=True, right_on='date_forecast')

split_value = x_train_est_B['date_forecast'].iloc[0]
split_index = x_target_B[x_target_B['date_forecast'] == split_value].index[0]

x_target_obs_B = x_target_B.iloc[:split_index]
x_target_est_B = x_target_B.iloc[split_index:]

obs_B = x_train_obs_B_resampled.merge(x_target_obs_B, left_index=True, right_on='date_forecast')
est_B = x_train_est_B_resampled.merge(x_target_est_B, left_index=True, right_on='date_forecast')

split_value = x_train_est_C['date_forecast'].iloc[0]
split_index = x_target_C[x_target_C['date_forecast'] == split_value].index[0]

x_target_obs_C = x_target_C.iloc[:split_index]
x_target_est_C = x_target_C.iloc[split_index:]

obs_C = x_train_obs_C_resampled.merge(x_target_obs_C, left_index=True, right_on='date_forecast')
est_C = x_train_est_C_resampled.merge(x_target_est_C, left_index=True, right_on='date_forecast')

# Keep date_forecast in test dfs
test_A = x_test_est_A_resampled
test_B = x_test_est_B_resampled
test_C = x_test_est_C_resampled

# Drop all the NaNs
test_A = test_A.dropna()
test_B = test_B.dropna()
test_C = test_C.dropna()

In [5]:
def add_experimental_features(df):
    """
    Experimental feature engineering.
    """
    
    # Radiation Features
    df['total_radiation:W'] = df['direct_rad:W'] + df['diffuse_rad:W']

    # Temperature and Pressure Features
    df['t_1000hPa:C'] = df['t_1000hPa:K'] - 273.15
    df['msl_pressure:hPa_scaled'] = MinMaxScaler().fit_transform(df['msl_pressure:hPa'].values.reshape(-1, 1))

    # Wind Features
    df['average_wind_speed'] = (df['wind_speed_10m:ms'] + df['wind_speed_u_10m:ms']) / 2

    # Snow Features
    df['snow_accumulation'] = df[['fresh_snow_24h:cm', 'fresh_snow_12h:cm', 'fresh_snow_6h:cm', 'fresh_snow_3h:cm', 'fresh_snow_1h:cm']].sum(axis=1)

    # Time features
    if 'date_forecast' in df.columns:
        
        # Convert the 'date_forecast' column to datetime format
        df['date_forecast'] = pd.to_datetime(df['date_forecast'])
        
        # Extract month, year, hour and day
        df['month'] = df['date_forecast'].dt.month

    # Safeguard in case of inf values
    df.replace([np.inf, -np.inf], 0, inplace=True)
    
    return df

In [6]:
# Columns to keep based on EDA
columns_to_keep = [
    'date_forecast',
    'total_radiation:W',
    'snow_accumulation',
    'super_cooled_liquid_water:kgm2',
    'average_wind_speed',
    'sun_elevation:d',
    'sun_azimuth:d',
    'clear_sky_rad:W',
    'month',
    't_1000hPa:C',
    'msl_pressure:hPa_scaled',
    'rain_water:kgm2',
    'cloud_base_agl:m',
    'effective_cloud_cover:p',
    'dew_or_rime:idx'
]

# Concatinate
A = pd.concat([obs_A, est_A]).dropna()
B = pd.concat([obs_B, est_B]).dropna()
C = pd.concat([obs_C, est_C]).dropna()

# Add experimental features
test_A = add_experimental_features(test_A)
test_B = add_experimental_features(test_B)
test_C = add_experimental_features(test_C)

A = add_experimental_features(A)
B = add_experimental_features(B)
C = add_experimental_features(C)

test_A = test_A[columns_to_keep]
test_B = test_B[columns_to_keep]
test_C = test_C[columns_to_keep]

columns_to_keep.append('pv_measurement')

A = A[columns_to_keep]
B = B[columns_to_keep]
C = C[columns_to_keep]

X_A = A.drop(columns='pv_measurement')
y_A = A['pv_measurement']

X_B = B.drop(columns='pv_measurement')
y_B = B['pv_measurement']

X_C = C.drop(columns='pv_measurement')
y_C = C['pv_measurement']

# Drop date_forecast
X_A = X_A.drop(columns='date_forecast')
X_B = X_B.drop(columns='date_forecast')
X_C = X_C.drop(columns='date_forecast')
test_A = test_A.drop(columns='date_forecast')
test_B = test_B.drop(columns='date_forecast')
test_C = test_C.drop(columns='date_forecast')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['total_radiation:W'] = df['direct_rad:W'] + df['diffuse_rad:W']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['t_1000hPa:C'] = df['t_1000hPa:K'] - 273.15
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['msl_pressure:hPa_scaled'] = MinMaxScaler().fit_transform(df['msl_pressure:hPa'].values.

In [7]:
# Initalize the models
parameters_A = {
    'colsample_bytree': 0.8, 
    'gamma': 0.8, 
    'learning_rate': 0.008, 
    'max_depth': 15, 
    'min_child_weight': 10, 
    'n_estimators': 600, 
    'reg_alpha': 1, 
    'reg_lambda': 3, 
    'subsample': 0.912,
    'random_state': 0, 
    'booster': 'gbtree',
    'n_jobs': -1
}

parameters_B = {
    'colsample_bytree': 0.8, 
    'gamma': 0.8, 
    'learning_rate': 0.008, 
    'max_depth': 15, 
    'min_child_weight': 10, 
    'n_estimators': 600, 
    'reg_alpha': 1, 
    'reg_lambda': 3, 
    'subsample': 0.912,
    'random_state': 0, 
    'booster': 'gbtree',
    'n_jobs': -1
}

parameters_C = {
    'colsample_bytree': 0.8, 
    'gamma': 0.8, 
    'learning_rate': 0.008, 
    'max_depth': 15, 
    'min_child_weight': 10, 
    'n_estimators': 600, 
    'reg_alpha': 1, 
    'reg_lambda': 3, 
    'subsample': 0.912,
    'random_state': 0, 
    'booster': 'gbtree',
    'n_jobs': -1
}

model_A = xgb.XGBRegressor(**parameters_A)
model_B = xgb.XGBRegressor(**parameters_B)
model_C = xgb.XGBRegressor(**parameters_C)

In [8]:
# Fit the models, verbose=False
model_A.fit(
    X=X_A, y=y_A,
    verbose=False
)

model_B.fit(
    X=X_B, y=y_B,
    verbose=False
)

model_C.fit(
    X=X_C, y=y_C,
    verbose=False
)

In [9]:
# Create submission

output_file = 'xgb_submission.csv'

pred_A = model_A.predict(test_A)
pred_B = model_B.predict(test_B)
pred_C = model_C.predict(test_C)

pred_A = np.clip(pred_A, 0, None)
pred_B = np.clip(pred_B, 0, None)
pred_C = np.clip(pred_C, 0, None)

# Concatenate predictions
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Create an id array
ids = np.arange(0, len(predictions))

# Create a DataFrame
df = pd.DataFrame({
    'id': ids,
    'prediction': predictions
})

# Save to CSV
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

Submission saved to xgb_submission.csv
