In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import data
# Directory holding the preprocessed parquet files, relative to this notebook.
DATA_DIR = '../../preprocessing/data'


def load_frame(name):
    """Read `<DATA_DIR>/<name>.parquet` and return it without the 'date_forecast' column.

    Parameters
    ----------
    name : str
        File stem of the parquet file, e.g. 'obs_A'.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with the 'date_forecast' column dropped.
    """
    return pd.read_parquet(f'{DATA_DIR}/{name}.parquet').drop(columns='date_forecast')


# Frames that are later concatenated into the per-location training sets.
obs_A = load_frame('obs_A')
obs_B = load_frame('obs_B')
obs_C = load_frame('obs_C')
est_A = load_frame('est_A')
est_B = load_frame('est_B')
est_C = load_frame('est_C')

# Frames that are passed to `predict` when the submission is created.
test_A = load_frame('test_A')
test_B = load_frame('test_B')
test_C = load_frame('test_C')

# Translation table that deletes the characters [ ] , { } ( ) " ' : \ from a string.
# NOTE(review): presumably these are stripped because one of the boosting
# libraries rejects such characters in feature names -- confirm.
_COLUMN_CHAR_FILTER = str.maketrans('', '', '[],{}()"\':\\')


def clean_column_names(columns):
    """Return the column names with every character in the filter removed.

    Parameters
    ----------
    columns : iterable of str
        Original column labels.

    Returns
    -------
    list of str
        The labels in the same order, each with the filtered characters deleted.
    """
    return [col.translate(_COLUMN_CHAR_FILTER) for col in columns]


# Apply the same cleanup to every frame so all frames go through identical renaming.
for _frame in (obs_A, est_A, test_A,
               obs_B, est_B, test_B,
               obs_C, est_C, test_C):
    _frame.columns = clean_column_names(_frame.columns)

In [3]:
# Concatenate the obs_* and est_* frames of each location into a single frame
A, B, C = (
    pd.concat([obs_part, est_part])
    for obs_part, est_part in ((obs_A, est_A), (obs_B, est_B), (obs_C, est_C))
)

# Data splits for submissions: X_* holds every column except the target,
# y_* holds the 'pv_measurement' target column
X_A, y_A = A.drop(columns='pv_measurement'), A['pv_measurement']
X_B, y_B = B.drop(columns='pv_measurement'), B['pv_measurement']
X_C, y_C = C.drop(columns='pv_measurement'), C['pv_measurement']

# Data splits for testing
# NOTE: the hold-out frames are named holdout_* so that enabling this block does
# not overwrite the test_A / test_B / test_C submission features loaded earlier.
# train_A, holdout_A = train_test_split(A, test_size=0.2, shuffle=True, random_state=42)
# X_train_A = train_A.drop(columns='pv_measurement')
# y_train_A = train_A['pv_measurement']
# X_test_A = holdout_A.drop(columns='pv_measurement')
# y_test_A = holdout_A['pv_measurement']

# train_B, holdout_B = train_test_split(B, test_size=0.2, shuffle=True, random_state=42)
# X_train_B = train_B.drop(columns='pv_measurement')
# y_train_B = train_B['pv_measurement']
# X_test_B = holdout_B.drop(columns='pv_measurement')
# y_test_B = holdout_B['pv_measurement']

# train_C, holdout_C = train_test_split(C, test_size=0.2, shuffle=True, random_state=42)
# X_train_C = train_C.drop(columns='pv_measurement')
# y_train_C = train_C['pv_measurement']
# X_test_C = holdout_C.drop(columns='pv_measurement')
# y_test_C = holdout_C['pv_measurement']



In [4]:
# Inspect data
# X_train_A.info()

In [5]:
# Keyword arguments unpacked into XGBRegressor
xgb_parameters = {
    'colsample_bytree': 0.8,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'max_depth': 10,
    'min_child_weight': 10,
    'n_estimators': 450,
    'reg_alpha': 1,
    'reg_lambda': 3,
    'subsample': 0.912,
    'random_state': 0,
    'booster': 'gbtree',
}

# Keyword arguments unpacked into CatBoostRegressor
catboost_parameters = {
    'subsample': 0.7222222222222222,
    'rsm': 0.6111111111111112,
    'random_strength': 0.4111111111111111,
    'min_data_in_leaf': 19,
    'learning_rate': 0.020000000000000004,
    'l2_leaf_reg': 1.0,
    'iterations': 750,
    'grow_policy': 'Depthwise',
    'depth': 10,
    'border_count': 115,
    'bootstrap_type': 'MVS',
}

# Keyword arguments unpacked into LGBMRegressor
lgbm_parameters = {
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 100,
    'learning_rate': 0.05,
    'verbose': 1,
    'num_iterations': 300,
}
# Base learners of the stack: one (name, estimator) pair per boosting library
base_models = [
    ('xgb', XGBRegressor(n_jobs=-1, **xgb_parameters)),
    ('cat', CatBoostRegressor(**catboost_parameters)),
    ('lgbm', LGBMRegressor(n_jobs=-1, **lgbm_parameters)),
]

# Final estimator that combines the base learners' predictions
meta_model = Ridge()

# One stacking ensemble per location (A, B, C). All three are constructed with
# the same base_models list, the same meta_model instance, cv=5 and verbose=1.
stack_A, stack_B, stack_C = (
    StackingRegressor(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5,
        verbose=1,
    )
    for _ in range(3)
)

In [6]:
# Fit stacking ensemble
def fit_and_save(stack, X, y, path):
    """Fit `stack` on (X, y) and pickle the fitted estimator to `path`.

    The output file is opened in a `with` block so the handle is closed (and
    the data flushed) as soon as the dump finishes, instead of being left open
    until the file object happens to be garbage-collected.

    Parameters
    ----------
    stack : estimator
        Object exposing `fit(X, y)`; it is fitted in place.
    X, y :
        Training features and target passed straight to `stack.fit`.
    path : str
        Destination of the pickle file. The parent directory must already exist.
    """
    stack.fit(X, y)
    with open(path, 'wb') as model_file:
        pickle.dump(stack, model_file)


fit_and_save(stack_A, X_A, y_A, './stack_models/stack_A.pkl')
fit_and_save(stack_B, X_B, y_B, './stack_models/stack_B.pkl')
fit_and_save(stack_C, X_C, y_C, './stack_models/stack_C.pkl')

0:	learn: 1146.9882834	total: 74.2ms	remaining: 55.6s
1:	learn: 1128.0152087	total: 93.6ms	remaining: 35s
2:	learn: 1109.4311673	total: 112ms	remaining: 27.8s
3:	learn: 1091.2948385	total: 132ms	remaining: 24.6s
4:	learn: 1073.5286741	total: 150ms	remaining: 22.3s
5:	learn: 1056.1374786	total: 169ms	remaining: 20.9s
6:	learn: 1039.2257623	total: 185ms	remaining: 19.7s
7:	learn: 1022.7180324	total: 201ms	remaining: 18.7s
8:	learn: 1006.6246981	total: 216ms	remaining: 17.8s
9:	learn: 990.7910970	total: 237ms	remaining: 17.6s
10:	learn: 975.3351136	total: 257ms	remaining: 17.2s
11:	learn: 960.1460899	total: 279ms	remaining: 17.1s
12:	learn: 945.2899076	total: 301ms	remaining: 17.1s
13:	learn: 930.9435524	total: 322ms	remaining: 16.9s
14:	learn: 916.7928690	total: 339ms	remaining: 16.6s
15:	learn: 903.0048610	total: 361ms	remaining: 16.5s
16:	learn: 889.4958562	total: 383ms	remaining: 16.5s
17:	learn: 876.3009162	total: 404ms	remaining: 16.4s
18:	learn: 863.3713715	total: 428ms	remaining: 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   47.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 1175.1247055	total: 16.6ms	remaining: 12.5s
1:	learn: 1155.4904746	total: 36.1ms	remaining: 13.5s
2:	learn: 1136.2860308	total: 51.8ms	remaining: 12.9s
3:	learn: 1117.5384331	total: 67.5ms	remaining: 12.6s
4:	learn: 1099.1964848	total: 83.6ms	remaining: 12.5s
5:	learn: 1081.2581112	total: 97.8ms	remaining: 12.1s
6:	learn: 1063.7642653	total: 116ms	remaining: 12.3s
7:	learn: 1046.6701587	total: 132ms	remaining: 12.2s
8:	learn: 1029.9697524	total: 145ms	remaining: 11.9s
9:	learn: 1013.5256561	total: 166ms	remaining: 12.3s
10:	learn: 997.5233851	total: 183ms	remaining: 12.3s
11:	learn: 981.8699302	total: 200ms	remaining: 12.3s
12:	learn: 966.7348235	total: 216ms	remaining: 12.3s
13:	learn: 951.6767506	total: 234ms	remaining: 12.3s
14:	learn: 937.0706487	total: 251ms	remaining: 12.3s
15:	learn: 922.7942596	total: 264ms	remaining: 12.1s
16:	learn: 908.8562993	total: 281ms	remaining: 12.1s
17:	learn: 895.1268223	total: 300ms	remaining: 12.2s
18:	learn: 881.7816951	total: 318ms	rema

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   51.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2384
[LightGBM] [Info] Number of data points in the train set: 27233, number of used features: 13
[LightGBM] [Info] Start training from score 593.478290
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2378
[LightGBM] [Info] Number of data points in the train set: 27234, number of used features: 13
[LightGBM] [Info] Start training from score 586.969924
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2380
[LightGBM] [Info] Number of data points in the train set: 27234, number of used features: 13
[LightGBM] [Info] Start training from score 652.770596
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   57.6s finished


0:	learn: 208.8123156	total: 15.9ms	remaining: 11.9s
1:	learn: 205.1844820	total: 29.3ms	remaining: 11s
2:	learn: 201.6220538	total: 40ms	remaining: 9.95s
3:	learn: 198.1488001	total: 56.1ms	remaining: 10.5s
4:	learn: 194.7609066	total: 71ms	remaining: 10.6s
5:	learn: 191.4358304	total: 86.2ms	remaining: 10.7s
6:	learn: 188.1828689	total: 105ms	remaining: 11.1s
7:	learn: 184.9964796	total: 120ms	remaining: 11.1s
8:	learn: 181.8856747	total: 138ms	remaining: 11.3s
9:	learn: 178.8360706	total: 154ms	remaining: 11.4s
10:	learn: 175.8570062	total: 168ms	remaining: 11.3s
11:	learn: 172.9494627	total: 180ms	remaining: 11.1s
12:	learn: 170.0900496	total: 193ms	remaining: 10.9s
13:	learn: 167.3113362	total: 209ms	remaining: 11s
14:	learn: 164.5838874	total: 220ms	remaining: 10.8s
15:	learn: 161.9214744	total: 238ms	remaining: 10.9s
16:	learn: 159.3042442	total: 254ms	remaining: 10.9s
17:	learn: 156.7459952	total: 272ms	remaining: 11.1s
18:	learn: 154.2447935	total: 288ms	remaining: 11.1s
19:	l

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   39.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 190.8673622	total: 15.5ms	remaining: 11.6s
1:	learn: 187.5692458	total: 29.6ms	remaining: 11.1s
2:	learn: 184.3401356	total: 44.8ms	remaining: 11.2s
3:	learn: 181.1740767	total: 58.6ms	remaining: 10.9s
4:	learn: 178.0728855	total: 69.9ms	remaining: 10.4s
5:	learn: 175.0472773	total: 83.6ms	remaining: 10.4s
6:	learn: 172.0699724	total: 98.8ms	remaining: 10.5s
7:	learn: 169.1620368	total: 112ms	remaining: 10.4s
8:	learn: 166.3065419	total: 125ms	remaining: 10.3s
9:	learn: 163.5339001	total: 136ms	remaining: 10.1s
10:	learn: 160.8262731	total: 150ms	remaining: 10.1s
11:	learn: 158.1984546	total: 163ms	remaining: 10s
12:	learn: 155.6062001	total: 176ms	remaining: 10s
13:	learn: 153.0417519	total: 192ms	remaining: 10.1s
14:	learn: 150.5438851	total: 206ms	remaining: 10.1s
15:	learn: 148.1138082	total: 220ms	remaining: 10.1s
16:	learn: 145.7324423	total: 245ms	remaining: 10.6s
17:	learn: 143.3712453	total: 268ms	remaining: 10.9s
18:	learn: 141.0656792	total: 284ms	remaining: 10.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   42.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2376
[LightGBM] [Info] Number of data points in the train set: 20739, number of used features: 13
[LightGBM] [Info] Start training from score 119.855417
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2375
[LightGBM] [Info] Number of data points in the train set: 20739, number of used features: 13
[LightGBM] [Info] Start training from score 108.231068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2369
[LightGBM] [Info] Number of data points in the train set: 20739, number of used features: 13
[LightGBM] [Info] Start training from score 98.212859
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, yo

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min finished


0:	learn: 176.3221774	total: 26.6ms	remaining: 19.9s
1:	learn: 173.2735948	total: 40ms	remaining: 14.9s
2:	learn: 170.2750978	total: 54.5ms	remaining: 13.6s
3:	learn: 167.3465373	total: 71.1ms	remaining: 13.3s
4:	learn: 164.4962720	total: 87.2ms	remaining: 13s
5:	learn: 161.6979765	total: 99.4ms	remaining: 12.3s
6:	learn: 158.9874113	total: 112ms	remaining: 11.9s
7:	learn: 156.3161454	total: 125ms	remaining: 11.6s
8:	learn: 153.7086099	total: 141ms	remaining: 11.6s
9:	learn: 151.1668059	total: 155ms	remaining: 11.5s
10:	learn: 148.6675247	total: 170ms	remaining: 11.5s
11:	learn: 146.2161745	total: 183ms	remaining: 11.2s
12:	learn: 143.8226383	total: 198ms	remaining: 11.2s
13:	learn: 141.4778771	total: 211ms	remaining: 11.1s
14:	learn: 139.1706102	total: 227ms	remaining: 11.1s
15:	learn: 136.9346286	total: 242ms	remaining: 11.1s
16:	learn: 134.7410706	total: 257ms	remaining: 11.1s
17:	learn: 132.6094760	total: 271ms	remaining: 11s
18:	learn: 130.5229093	total: 287ms	remaining: 11s
19:	l

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   35.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 187.8683690	total: 13.1ms	remaining: 9.81s
1:	learn: 184.6443915	total: 24.3ms	remaining: 9.1s
2:	learn: 181.4624070	total: 37ms	remaining: 9.21s
3:	learn: 178.3566616	total: 46.9ms	remaining: 8.75s
4:	learn: 175.3043042	total: 59.4ms	remaining: 8.85s
5:	learn: 172.3285151	total: 69.6ms	remaining: 8.63s
6:	learn: 169.4294599	total: 81.8ms	remaining: 8.68s
7:	learn: 166.5732888	total: 95.9ms	remaining: 8.89s
8:	learn: 163.7925799	total: 107ms	remaining: 8.85s
9:	learn: 161.0642461	total: 119ms	remaining: 8.84s
10:	learn: 158.3961288	total: 132ms	remaining: 8.85s
11:	learn: 155.7950904	total: 144ms	remaining: 8.83s
12:	learn: 153.2259045	total: 156ms	remaining: 8.84s
13:	learn: 150.7135698	total: 169ms	remaining: 8.91s
14:	learn: 148.2631975	total: 182ms	remaining: 8.9s
15:	learn: 145.8787853	total: 195ms	remaining: 8.93s
16:	learn: 143.5385714	total: 206ms	remaining: 8.87s
17:	learn: 141.2447374	total: 220ms	remaining: 8.95s
18:	learn: 138.9949833	total: 234ms	remaining: 8.99s

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   39.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2381
[LightGBM] [Info] Number of data points in the train set: 16908, number of used features: 13
[LightGBM] [Info] Start training from score 75.687728
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2378
[LightGBM] [Info] Number of data points in the train set: 16908, number of used features: 13
[LightGBM] [Info] Start training from score 98.943891
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2372
[LightGBM] [Info] Number of data points in the train set: 16908, number of used features: 13
[LightGBM] [Info] Start training from score 91.513905
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   48.8s finished


In [7]:
# Evaluate
# print('MAE A:', mean_absolute_error(y_test_A, stack_A.predict(X_test_A)))
# print('MAE B:', mean_absolute_error(y_test_B, stack_B.predict(X_test_B)))
# print('MAE C:', mean_absolute_error(y_test_C, stack_C.predict(X_test_C)))

# # Total MAE for all three locations
# print('Total MAE:', (mean_absolute_error(y_test_A, stack_A.predict(X_test_A)) + mean_absolute_error(y_test_B, stack_B.predict(X_test_B)) + mean_absolute_error(y_test_C, stack_C.predict(X_test_C)))/3)

In [8]:
# Create submission

output_file = 'stack_submission.csv'

# Predict with each location's ensemble on its test frame and floor the
# predictions at zero
pred_A, pred_B, pred_C = (
    np.clip(model.predict(features), 0, None)
    for model, features in ((stack_A, test_A), (stack_B, test_B), (stack_C, test_C))
)

# Stack the three prediction arrays end to end, in A, B, C order
predictions = np.concatenate((pred_A, pred_B, pred_C))

# Row ids run 0..len(predictions)-1 over the concatenated predictions
ids = np.arange(len(predictions))

# Two-column frame: 'id' and 'prediction'
df = pd.DataFrame({'id': ids, 'prediction': predictions})

# Write the frame to CSV without the index column
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

Submission saved to stack_submission.csv
