In [26]:
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')

In [27]:
# Load the preprocessed parquet frames, grouped by kind (obs_*, est_*, test_*)
# with one frame per location A, B and C.
obs_A, obs_B, obs_C = (
    pd.read_parquet('../../preprocessing/data/obs_A.parquet'),
    pd.read_parquet('../../preprocessing/data/obs_B.parquet'),
    pd.read_parquet('../../preprocessing/data/obs_C.parquet'),
)
est_A, est_B, est_C = (
    pd.read_parquet('../../preprocessing/data/est_A.parquet'),
    pd.read_parquet('../../preprocessing/data/est_B.parquet'),
    pd.read_parquet('../../preprocessing/data/est_C.parquet'),
)
test_A, test_B, test_C = (
    pd.read_parquet('../../preprocessing/data/test_A.parquet'),
    pd.read_parquet('../../preprocessing/data/test_B.parquet'),
    pd.read_parquet('../../preprocessing/data/test_C.parquet'),
)

In [28]:
# Concatenate the obs_* and est_* frames of each location into one training frame
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Data splits for submissions: every row of each location, features vs. target
X_A = A.drop(columns='pv_measurement')
y_A = A['pv_measurement']
X_B = B.drop(columns='pv_measurement')
y_B = B['pv_measurement']
X_C = C.drop(columns='pv_measurement')
y_C = C['pv_measurement']

# Data splits for testing: an 80/20 hold-out per location.
# The hold-out frames are bound to holdout_* and NOT to test_*: test_A/B/C are
# the submission features loaded from parquet and are the frames passed to
# predict() when the submission is created, so they must not be overwritten
# with hold-out rows (which also still carry the 'pv_measurement' column).
# NOTE(review): shuffle=True mixes rows from the whole period. If the rows are
# time-ordered (TimeSeriesSplit is imported but unused), a chronological split
# may give a more honest error estimate -- confirm.
train_A, holdout_A = train_test_split(A, test_size=0.2, shuffle=True, random_state=42)
X_train_A = train_A.drop(columns='pv_measurement')
y_train_A = train_A['pv_measurement']
X_test_A = holdout_A.drop(columns='pv_measurement')
y_test_A = holdout_A['pv_measurement']

train_B, holdout_B = train_test_split(B, test_size=0.2, shuffle=True, random_state=42)
X_train_B = train_B.drop(columns='pv_measurement')
y_train_B = train_B['pv_measurement']
X_test_B = holdout_B.drop(columns='pv_measurement')
y_test_B = holdout_B['pv_measurement']

train_C, holdout_C = train_test_split(C, test_size=0.2, shuffle=True, random_state=42)
X_train_C = train_C.drop(columns='pv_measurement')
y_train_C = train_C['pv_measurement']
X_test_C = holdout_C.drop(columns='pv_measurement')
y_test_C = holdout_C['pv_measurement']



In [29]:
# Inspect data: print the column names, non-null counts and dtypes of the
# location-A training features -- a quick check for missing values and
# unexpected dtypes before modelling.
X_train_A.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27233 entries, 1851 to 15795
Data columns (total 38 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   snow_accumulation                        27233 non-null  float32
 1   total_radiation                          27233 non-null  float32
 2   sfc_pressure:hPa                         27233 non-null  float32
 3   month                                    27233 non-null  int32  
 4   year                                     27233 non-null  int32  
 5   date_forecast_fft_amplitude              27233 non-null  float64
 6   date_forecast_fft_phase                  27233 non-null  float64
 7   sun_elevation:d_fft_amplitude            27233 non-null  float64
 8   sun_elevation:d_fft_phase                27233 non-null  float64
 9   t_1000hPa:K_rate_of_change               27233 non-null  float32
 10  clear_sky_rad:W_rate_of_change           27233 n

In [30]:
# Seed handed to every learner that accepts one, so that re-running the
# notebook rebuilds the same ensemble (same value as the hold-out split uses).
SEED = 42

# XGBoost parameters
xgb_parameters = {
    'colsample_bytree': 0.664,
    'gamma': 3,
    'learning_rate': 0.012,
    'max_depth': 12,
    'min_child_weight': 15,
    'n_estimators': 400,
    'reg_alpha': 2,
    'reg_lambda': 2,
    'subsample': 0.912,
}

# Define base models: (name, estimator) pairs as expected by StackingRegressor
base_models = [
    ('ridge', Ridge()),
    ('rf', RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=SEED)),
    ('xgb', XGBRegressor(**xgb_parameters, n_jobs=-1, random_state=SEED)),
    ('svr', SVR()),
    ('cat', CatBoostRegressor(verbose=0, random_seed=SEED)),
    ('knn', KNeighborsRegressor()),
    ('mlp', MLPRegressor(hidden_layer_sizes=(100,), max_iter=500, random_state=SEED)),
]

# Define meta-model: combines the base models' cross-validated predictions
meta_model = Ridge()


def make_stack():
    """Return a new, unfitted stacking ensemble over ``base_models``.

    Every ensemble is built with identical settings (5-fold CV, all cores,
    verbose logging) and ``meta_model`` as the final estimator.
    StackingRegressor fits clones of the estimators it is given, so the
    ensembles can share the same ``base_models`` / ``meta_model`` objects.
    """
    return StackingRegressor(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5,
        n_jobs=-1,
        verbose=1,
    )


# Define stacking ensemble: one independent ensemble per location
stack_A = make_stack()
stack_B = make_stack()
stack_C = make_stack()

In [31]:
# Fit stacking ensemble
# Each location's ensemble is fit on all rows of that location (X_*/y_*) and
# pickled. All three are fit because the submission cell calls predict() on
# stack_A, stack_B and stack_C.
# NOTE(review): X_test_* are drawn from the same frames as X_*, so a hold-out
# metric computed with these fitted models is measured on rows they were
# trained on.
MODEL_DIR = Path('./stack_models')
# Create the output directory up front so a missing folder fails (or is fixed)
# before the long fit rather than after it.
MODEL_DIR.mkdir(parents=True, exist_ok=True)


def _fit_and_save(stack, X, y, path):
    """Fit ``stack`` on ``(X, y)`` and pickle the fitted estimator to ``path``."""
    stack.fit(X, y)
    # Context manager guarantees the file handle is flushed and closed.
    with open(path, 'wb') as model_file:
        pickle.dump(stack, model_file)


_fit_and_save(stack_A, X_A, y_A, MODEL_DIR / 'stack_A.pkl')
_fit_and_save(stack_B, X_B, y_B, MODEL_DIR / 'stack_B.pkl')
_fit_and_save(stack_C, X_C, y_C, MODEL_DIR / 'stack_C.pkl')

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [None]:
# Evaluate: mean absolute error of the location-A ensemble on the X_test_A /
# y_test_A hold-out split.
# NOTE(review): stack_A is fit on X_A, which contains every row of A including
# the rows in X_test_A, so this MAE is computed on data the model has already
# seen and will be optimistic. Fit on X_train_A / y_train_A to obtain an
# out-of-sample figure.
print('MAE A:', mean_absolute_error(y_test_A, stack_A.predict(X_test_A)))
# print('MAE B:', mean_absolute_error(y_test_B, stack_B.predict(X_test_B)))
# print('MAE C:', mean_absolute_error(y_test_C, stack_C.predict(X_test_C)))

# Total MAE for all three locations
# print('Total MAE:', (mean_absolute_error(y_test_A, stack_A.predict(X_test_A)) + mean_absolute_error(y_test_B, stack_B.predict(X_test_B)) + mean_absolute_error(y_test_C, stack_C.predict(X_test_C)))/3)

MAE A: 105.10166213678608
MAE B: 15.987515736930082
MAE C: 13.354463918146225
Total MAE: 44.814547263954125


In [None]:
# Score of the location-A ensemble on X_A / y_A, i.e. the same data it was fit
# on, so this is an in-sample figure. For scikit-learn regressors score()
# reports the coefficient of determination (R^2).
stack_A.score(X_A, y_A)

0.961812905434636

In [None]:
# Create submission: predict each location, stack the results and write a CSV

output_file = 'submission.csv'

# Predict per location and clip at a lower bound of 0 (no upper bound)
pred_A = np.clip(stack_A.predict(test_A), 0, None)
pred_B = np.clip(stack_B.predict(test_B), 0, None)
pred_C = np.clip(stack_C.predict(test_C), 0, None)

# Stack the three locations in A, B, C order
predictions = np.concatenate((pred_A, pred_B, pred_C))

# Row ids are simply 0..n-1 over the concatenated predictions
ids = np.arange(len(predictions))

# Two-column frame: id, prediction
df = pd.DataFrame({'id': ids, 'prediction': predictions})

# Write without the pandas index so only the two columns appear in the file
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

Submission saved to submission.csv
