In [1]:
import os
import pickle
import warnings

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

warnings.filterwarnings('ignore')

In [2]:
# Import data
DATA_DIR = '../../preprocessing/data'

# Translation table that deletes [ ] , { } ( ) " ' : and backslash from a string.
# NOTE(review): presumably needed because LightGBM rejects feature names that
# contain special JSON characters - confirm before relaxing this list.
_COLUMN_CHARS_TO_DROP = str.maketrans('', '', '[],{}()"\':\\')


def load_frame(name):
    """Read one preprocessed parquet table and clean up its column names.

    Parameters
    ----------
    name : str
        File stem inside ``DATA_DIR`` (for example ``'obs_A'`` or ``'test_C'``).

    Returns
    -------
    pandas.DataFrame
        The table without its ``date_forecast`` column and with the
        characters listed in ``_COLUMN_CHARS_TO_DROP`` removed from every
        column name.
    """
    frame = pd.read_parquet(f'{DATA_DIR}/{name}.parquet').drop(columns='date_forecast')
    frame.columns = [col.translate(_COLUMN_CHARS_TO_DROP) for col in frame.columns]
    return frame


# obs_* / est_* hold the training rows (they are concatenated per location
# further down); test_* hold the feature rows predicted for the submission.
obs_A, obs_B, obs_C = load_frame('obs_A'), load_frame('obs_B'), load_frame('obs_C')
est_A, est_B, est_C = load_frame('est_A'), load_frame('est_B'), load_frame('est_C')
test_A, test_B, test_C = load_frame('test_A'), load_frame('test_B'), load_frame('test_C')

In [3]:
TARGET_COLUMN = 'pv_measurement'


def split_features_target(frame, target=TARGET_COLUMN):
    """Split a training frame into a feature matrix and a target series.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table that contains the target column next to the features.
    target : str, optional
        Name of the target column (defaults to ``TARGET_COLUMN``).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``frame`` without the target column and
        ``y`` is the target column itself.
    """
    return frame.drop(columns=target), frame[target]


# Concatenate the observed and estimated rows of each location
A = pd.concat([obs_A, est_A])
B = pd.concat([obs_B, est_B])
C = pd.concat([obs_C, est_C])

# Data splits for submissions (every available row is used for fitting)
X_A, y_A = split_features_target(A)
X_B, y_B = split_features_target(B)
X_C, y_C = split_features_target(C)

# Data splits for testing
# NOTE: the hold-out frames are called holdout_* on purpose. Naming them
# test_A / test_B / test_C would overwrite the submission feature frames
# loaded above, which the submission cell passes to predict().
# train_A, holdout_A = train_test_split(A, test_size=0.2, shuffle=True, random_state=42)
# X_train_A = train_A.drop(columns='pv_measurement')
# y_train_A = train_A['pv_measurement']
# X_test_A = holdout_A.drop(columns='pv_measurement')
# y_test_A = holdout_A['pv_measurement']

# train_B, holdout_B = train_test_split(B, test_size=0.2, shuffle=True, random_state=42)
# X_train_B = train_B.drop(columns='pv_measurement')
# y_train_B = train_B['pv_measurement']
# X_test_B = holdout_B.drop(columns='pv_measurement')
# y_test_B = holdout_B['pv_measurement']

# train_C, holdout_C = train_test_split(C, test_size=0.2, shuffle=True, random_state=42)
# X_train_C = train_C.drop(columns='pv_measurement')
# y_train_C = train_C['pv_measurement']
# X_test_C = holdout_C.drop(columns='pv_measurement')
# y_test_C = holdout_C['pv_measurement']



In [4]:
# Inspect data
# X_train_A.info()

In [5]:
# Hyperparameters of the three base learners. Each mapping is unpacked as
# keyword arguments into the matching estimator constructor when the base
# models are built.
# NOTE(review): the long float literals look like grid/random-search results;
# they are kept digit for digit so the fitted models do not change.

# XGBoost
xgb_parameters = dict(
    colsample_bytree=0.8,
    gamma=0.8,
    learning_rate=0.008,
    max_depth=10,
    min_child_weight=10,
    n_estimators=450,
    reg_alpha=1,
    reg_lambda=3,
    subsample=0.912,
    random_state=0,
    booster='gbtree',
)

# CatBoost
catboost_parameters = dict(
    subsample=0.7222222222222222,
    rsm=0.6111111111111112,
    random_strength=0.4111111111111111,
    min_data_in_leaf=19,
    learning_rate=0.020000000000000004,
    l2_leaf_reg=1.0,
    iterations=750,
    grow_policy='Depthwise',
    depth=10,
    border_count=115,
    bootstrap_type='MVS',
)

# LightGBM
lgbm_parameters = dict(
    boosting_type='dart',
    objective='regression',
    metric='mae',
    num_leaves=100,
    learning_rate=0.05,
    verbose=1,
    num_iterations=300,
)
# Level-0 learners: (name, estimator) pairs in the form StackingRegressor expects.
base_models = [
    ('xgb', XGBRegressor(n_jobs=-1, **xgb_parameters)),
    ('cat', CatBoostRegressor(**catboost_parameters)),
    ('lgbm', LGBMRegressor(n_jobs=-1, **lgbm_parameters)),
]

# Level-1 learner that combines the base-model predictions.
meta_model = Ridge()

# One stacking ensemble per location. All three are configured from the same
# estimator prototypes; scikit-learn clones them inside fit(), so the fitted
# ensembles do not share state.
stack_settings = dict(estimators=base_models, final_estimator=meta_model, cv=5, verbose=1)
stack_A, stack_B, stack_C = (StackingRegressor(**stack_settings) for _ in range(3))

In [6]:
# Fit stacking ensemble
MODEL_DIR = './stack_models'

# Create the output folder before training starts: a missing directory would
# otherwise only raise after the first (long-running) fit has finished.
os.makedirs(MODEL_DIR, exist_ok=True)

for site, stack, features, target in (
    ('A', stack_A, X_A, y_A),
    ('B', stack_B, X_B, y_B),
    ('C', stack_C, X_C, y_C),
):
    stack.fit(features, target)
    # Persist each model right after it is fitted, so a failure on a later
    # location does not discard the work already done. The context manager
    # guarantees the file is flushed and closed.
    with open(f'{MODEL_DIR}/stack_{site}.pkl', 'wb') as model_file:
        pickle.dump(stack, model_file)

0:	learn: 1146.8598740	total: 104ms	remaining: 1m 17s
1:	learn: 1127.8122439	total: 149ms	remaining: 55.7s
2:	learn: 1109.1808273	total: 190ms	remaining: 47.3s
3:	learn: 1090.8846982	total: 227ms	remaining: 42.3s
4:	learn: 1073.0556020	total: 263ms	remaining: 39.2s
5:	learn: 1055.5983019	total: 301ms	remaining: 37.3s
6:	learn: 1038.3562883	total: 345ms	remaining: 36.6s
7:	learn: 1021.5747949	total: 394ms	remaining: 36.6s
8:	learn: 1005.1808426	total: 433ms	remaining: 35.7s
9:	learn: 989.2418034	total: 471ms	remaining: 34.8s
10:	learn: 973.6054388	total: 509ms	remaining: 34.2s
11:	learn: 958.3520738	total: 541ms	remaining: 33.3s
12:	learn: 943.3487776	total: 584ms	remaining: 33.1s
13:	learn: 928.6979347	total: 637ms	remaining: 33.5s
14:	learn: 914.4232006	total: 684ms	remaining: 33.5s
15:	learn: 900.4154493	total: 724ms	remaining: 33.2s
16:	learn: 886.6144989	total: 769ms	remaining: 33.1s
17:	learn: 873.1287675	total: 810ms	remaining: 32.9s
18:	learn: 859.9482876	total: 854ms	remaining:

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 1175.1988873	total: 42.4ms	remaining: 31.8s
1:	learn: 1155.5194934	total: 79.6ms	remaining: 29.8s
2:	learn: 1136.2687777	total: 113ms	remaining: 28.2s
3:	learn: 1117.4609754	total: 153ms	remaining: 28.6s
4:	learn: 1099.1516944	total: 185ms	remaining: 27.6s
5:	learn: 1081.0925612	total: 217ms	remaining: 26.9s
6:	learn: 1063.4637593	total: 253ms	remaining: 26.9s
7:	learn: 1046.1392224	total: 326ms	remaining: 30.2s
8:	learn: 1029.2542266	total: 361ms	remaining: 29.7s
9:	learn: 1012.6872933	total: 394ms	remaining: 29.2s
10:	learn: 996.6055214	total: 427ms	remaining: 28.7s
11:	learn: 980.8448963	total: 462ms	remaining: 28.4s
12:	learn: 965.4270879	total: 506ms	remaining: 28.7s
13:	learn: 950.3743441	total: 539ms	remaining: 28.3s
14:	learn: 935.6822662	total: 572ms	remaining: 28s
15:	learn: 921.2556474	total: 603ms	remaining: 27.7s
16:	learn: 907.1334376	total: 637ms	remaining: 27.5s
17:	learn: 893.1720160	total: 674ms	remaining: 27.4s
18:	learn: 879.5619264	total: 714ms	remaining:

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13206
[LightGBM] [Info] Number of data points in the train set: 27233, number of used features: 58
[LightGBM] [Info] Start training from score 659.182406
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13201
[LightGBM] [Info] Number of data points in the train set: 27233, number of used features: 58
[LightGBM] [Info] Start training from score 593.478290
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13180
[LightGBM] [Info] Number of data points in the train set: 27234, number of used features: 58
[LightGBM] [Info] Start training from score 586.969924
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bin

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.9min finished


0:	learn: 208.7886009	total: 39.7ms	remaining: 29.7s
1:	learn: 205.1711900	total: 76.4ms	remaining: 28.6s
2:	learn: 201.6185907	total: 107ms	remaining: 26.8s
3:	learn: 198.1375025	total: 141ms	remaining: 26.4s
4:	learn: 194.7459721	total: 177ms	remaining: 26.3s
5:	learn: 191.4089228	total: 207ms	remaining: 25.7s
6:	learn: 188.1388709	total: 241ms	remaining: 25.6s
7:	learn: 184.9570017	total: 279ms	remaining: 25.9s
8:	learn: 181.8365613	total: 309ms	remaining: 25.4s
9:	learn: 178.7687507	total: 342ms	remaining: 25.3s
10:	learn: 175.7664738	total: 378ms	remaining: 25.4s
11:	learn: 172.8177882	total: 413ms	remaining: 25.4s
12:	learn: 169.9498020	total: 454ms	remaining: 25.7s
13:	learn: 167.1436179	total: 493ms	remaining: 25.9s
14:	learn: 164.3807611	total: 527ms	remaining: 25.8s
15:	learn: 161.6769041	total: 571ms	remaining: 26.2s
16:	learn: 159.0265677	total: 605ms	remaining: 26.1s
17:	learn: 156.4447620	total: 633ms	remaining: 25.7s
18:	learn: 153.9024972	total: 668ms	remaining: 25.7s
1

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 190.8757064	total: 32.1ms	remaining: 24.1s
1:	learn: 187.5765581	total: 64.4ms	remaining: 24.1s
2:	learn: 184.3166019	total: 91ms	remaining: 22.7s
3:	learn: 181.1373700	total: 124ms	remaining: 23.2s
4:	learn: 178.0460961	total: 154ms	remaining: 23s
5:	learn: 174.9965475	total: 186ms	remaining: 23s
6:	learn: 171.9954494	total: 216ms	remaining: 22.9s
7:	learn: 169.0761448	total: 264ms	remaining: 24.5s
8:	learn: 166.2196110	total: 294ms	remaining: 24.2s
9:	learn: 163.3978971	total: 323ms	remaining: 23.9s
10:	learn: 160.6877137	total: 352ms	remaining: 23.6s
11:	learn: 158.0063306	total: 383ms	remaining: 23.6s
12:	learn: 155.3685720	total: 413ms	remaining: 23.4s
13:	learn: 152.7830255	total: 444ms	remaining: 23.4s
14:	learn: 150.2657997	total: 475ms	remaining: 23.3s
15:	learn: 147.8034170	total: 508ms	remaining: 23.3s
16:	learn: 145.3945332	total: 542ms	remaining: 23.4s
17:	learn: 143.0280982	total: 580ms	remaining: 23.6s
18:	learn: 140.7173585	total: 622ms	remaining: 23.9s
19:	le

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13193
[LightGBM] [Info] Number of data points in the train set: 20739, number of used features: 58
[LightGBM] [Info] Start training from score 90.977616
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13204
[LightGBM] [Info] Number of data points in the train set: 20739, number of used features: 58
[LightGBM] [Info] Start training from score 119.855417
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13184
[LightGBM] [Info] Number of data points in the train set: 20739, number of used features: 58
[LightGBM] [Info] Start training from score 108.231068
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13195
[LightGBM] [Info] Number of data points in the train set: 20739, number of used features: 58
[LightGBM] [Info] Start trai

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.8min finished


0:	learn: 176.3096288	total: 30.4ms	remaining: 22.7s
1:	learn: 173.2622388	total: 62.1ms	remaining: 23.2s
2:	learn: 170.2748720	total: 92.1ms	remaining: 22.9s
3:	learn: 167.3726151	total: 125ms	remaining: 23.2s
4:	learn: 164.5062390	total: 151ms	remaining: 22.5s
5:	learn: 161.7078426	total: 184ms	remaining: 22.8s
6:	learn: 158.9614709	total: 209ms	remaining: 22.2s
7:	learn: 156.2912977	total: 242ms	remaining: 22.4s
8:	learn: 153.6515274	total: 277ms	remaining: 22.8s
9:	learn: 151.0983740	total: 304ms	remaining: 22.5s
10:	learn: 148.5750871	total: 335ms	remaining: 22.5s
11:	learn: 146.0863694	total: 368ms	remaining: 22.6s
12:	learn: 143.6593030	total: 399ms	remaining: 22.6s
13:	learn: 141.2785541	total: 431ms	remaining: 22.6s
14:	learn: 138.9519017	total: 461ms	remaining: 22.6s
15:	learn: 136.6806719	total: 492ms	remaining: 22.6s
16:	learn: 134.4597314	total: 520ms	remaining: 22.4s
17:	learn: 132.2819382	total: 559ms	remaining: 22.7s
18:	learn: 130.1539791	total: 590ms	remaining: 22.7s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 26.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 187.9028188	total: 27.3ms	remaining: 20.5s
1:	learn: 184.6523125	total: 53.1ms	remaining: 19.9s
2:	learn: 181.4821436	total: 78.3ms	remaining: 19.5s
3:	learn: 178.3667638	total: 102ms	remaining: 18.9s
4:	learn: 175.3073876	total: 121ms	remaining: 18.1s
5:	learn: 172.3221469	total: 144ms	remaining: 17.9s
6:	learn: 169.3922207	total: 166ms	remaining: 17.6s
7:	learn: 166.5362089	total: 189ms	remaining: 17.5s
8:	learn: 163.7278451	total: 212ms	remaining: 17.5s
9:	learn: 160.9639958	total: 231ms	remaining: 17.1s
10:	learn: 158.2708418	total: 255ms	remaining: 17.1s
11:	learn: 155.6293548	total: 279ms	remaining: 17.2s
12:	learn: 153.0620567	total: 299ms	remaining: 17s
13:	learn: 150.5564670	total: 322ms	remaining: 16.9s
14:	learn: 148.0933695	total: 346ms	remaining: 16.9s
15:	learn: 145.6800244	total: 369ms	remaining: 16.9s
16:	learn: 143.3180778	total: 389ms	remaining: 16.8s
17:	learn: 140.9994686	total: 412ms	remaining: 16.8s
18:	learn: 138.7084427	total: 431ms	remaining: 16.6s
19

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13180
[LightGBM] [Info] Number of data points in the train set: 16908, number of used features: 58
[LightGBM] [Info] Start training from score 107.865206
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13174
[LightGBM] [Info] Number of data points in the train set: 16908, number of used features: 58
[LightGBM] [Info] Start training from score 75.687728
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13176
[LightGBM] [Info] Number of data points in the train set: 16908, number of used features: 58
[LightGBM] [Info] Start training from score 98.943891
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  7.3min finished


In [7]:
# Evaluate
# print('MAE A:', mean_absolute_error(y_test_A, stack_A.predict(X_test_A)))
# print('MAE B:', mean_absolute_error(y_test_B, stack_B.predict(X_test_B)))
# print('MAE C:', mean_absolute_error(y_test_C, stack_C.predict(X_test_C)))

# # Total MAE for all three locations
# print('Total MAE:', (mean_absolute_error(y_test_A, stack_A.predict(X_test_A)) + mean_absolute_error(y_test_B, stack_B.predict(X_test_B)) + mean_absolute_error(y_test_C, stack_C.predict(X_test_C)))/3)

In [8]:
# Create submission

output_file = 'stack_submission_all.csv'

# Predict each location with its own ensemble and floor negative values at zero.
pred_A = np.clip(stack_A.predict(test_A), 0, None)
pred_B = np.clip(stack_B.predict(test_B), 0, None)
pred_C = np.clip(stack_C.predict(test_C), 0, None)

# Stack the three locations in A, B, C order; the row position becomes the id.
predictions = np.concatenate((pred_A, pred_B, pred_C))
ids = np.arange(len(predictions))

# Assemble the two-column table and write it out without the pandas index.
df = pd.DataFrame({'id': ids, 'prediction': predictions})
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

Submission saved to stack_submission_all.csv
