In [1]:
import pickle
import warnings
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

warnings.filterwarnings('ignore')

In [2]:
# Import data
# Every frame lives in the same preprocessing output folder; keep the location
# in one place so it only has to be changed once.
DATA_DIR = '../../preprocessing/data'

# Characters stripped from every column name: [ ] { } ( ) , " ' : \
# NOTE(review): presumably removed because some of the boosting libraries used
# later reject these characters in feature names -- confirm.
_COLUMN_CHAR_FILTER = str.maketrans('', '', '[]{}(),"\':\\')


def _sanitize_columns(frame):
    """Strip the filtered punctuation from the column names of ``frame``.

    The frame is modified in place (its ``columns`` attribute is reassigned)
    and also returned so the call can be chained.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pandas.DataFrame
        The same object, with cleaned column names.
    """
    frame.columns = [col.translate(_COLUMN_CHAR_FILTER) for col in frame.columns]
    return frame


def _load_frame(name):
    """Read ``<DATA_DIR>/<name>.parquet`` and return it with cleaned column names."""
    return _sanitize_columns(pd.read_parquet(f'{DATA_DIR}/{name}.parquet'))


# One observed, one estimated and one test frame per location (A, B, C).
obs_A = _load_frame('obs_A')
obs_B = _load_frame('obs_B')
obs_C = _load_frame('obs_C')
est_A = _load_frame('est_A')
est_B = _load_frame('est_B')
est_C = _load_frame('est_C')
test_A = _load_frame('test_A')
test_B = _load_frame('test_B')
test_C = _load_frame('test_C')

In [3]:
# Concatenate the observed and estimated rows into one frame per location
A, B, C = (
    pd.concat([observed, estimated])
    for observed, estimated in ((obs_A, est_A), (obs_B, est_B), (obs_C, est_C))
)


def _split_features_target(frame):
    """Return ``(features, target)`` where the target column is 'pv_measurement'."""
    return frame.drop(columns='pv_measurement'), frame['pv_measurement']


# Data splits for submissions: all available rows are used for training
X_A, y_A = _split_features_target(A)
X_B, y_B = _split_features_target(B)
X_C, y_C = _split_features_target(C)

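# Data splits for testing
# NOTE: uncommenting the lines below rebinds test_A / test_B / test_C, which the
# submission cell uses as the feature frames loaded from test_*.parquet.
# Rename the hold-out frames (or reload the test data) before creating a submission.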
# train_A, test_A = train_test_split(A, test_size=0.2, shuffle=True, random_state=42)
# X_train_A = train_A.drop(columns='pv_measurement')
# y_train_A = train_A['pv_measurement']
# X_test_A = test_A.drop(columns='pv_measurement')
# y_test_A = test_A['pv_measurement']

# train_B, test_B = train_test_split(B, test_size=0.2, shuffle=True, random_state=42)
# X_train_B = train_B.drop(columns='pv_measurement')
# y_train_B = train_B['pv_measurement']
# X_test_B = test_B.drop(columns='pv_measurement')
# y_test_B = test_B['pv_measurement']

# train_C, test_C = train_test_split(C, test_size=0.2, shuffle=True, random_state=42)
# X_train_C = train_C.drop(columns='pv_measurement')
# y_train_C = train_C['pv_measurement']
# X_test_C = test_C.drop(columns='pv_measurement')
# y_test_C = test_C['pv_measurement']



In [4]:
# Inspect data
# X_train_A.info()

In [5]:
# Hyper-parameters for the three gradient-boosting base learners.

# XGBoost
xgb_parameters = dict(
    colsample_bytree=0.8,
    gamma=0.8,
    learning_rate=0.008,
    max_depth=15,
    min_child_weight=10,
    n_estimators=800,
    reg_alpha=1,
    reg_lambda=3,
    subsample=0.912,
    random_state=0,
    booster='gbtree',
)

# CatBoost
catboost_parameters = dict(
    subsample=0.7222222222222222,
    rsm=0.6111111111111112,
    random_strength=0.4111111111111111,
    min_data_in_leaf=19,
    learning_rate=0.020000000000000004,
    l2_leaf_reg=1.0,
    iterations=750,
    grow_policy='Depthwise',
    depth=10,
    border_count=115,
    bootstrap_type='MVS',
)

# LightGBM
lgbm_parameters = dict(
    boosting_type='dart',
    objective='regression',
    metric='mae',
    num_leaves=100,
    learning_rate=0.05,
    verbose=1,
    num_iterations=300,
)

# Base learners: (name, estimator) pairs in the form StackingRegressor expects.
# XGBoost and LightGBM are told to use every available core.
base_models = [
    ('xgb', XGBRegressor(n_jobs=-1, **xgb_parameters)),
    ('cat', CatBoostRegressor(**catboost_parameters)),
    ('lgbm', LGBMRegressor(n_jobs=-1, **lgbm_parameters)),
]

# Meta-model that combines the base learners' predictions
meta_model = Ridge()

# One stacking ensemble per location, all configured identically
# (same base learners, same meta-model, cv=5).
stack_A, stack_B, stack_C = (
    StackingRegressor(
        estimators=base_models,
        final_estimator=meta_model,
        cv=5,
        verbose=1,
    )
    for _ in range(3)
)

In [6]:
# Fit the stacking ensemble of each location and persist it to disk.
MODEL_DIR = Path('./stack_models')
# Create the output folder before training: fitting takes many minutes (see the
# captured log below), so a missing directory should fail fast rather than
# after the first ensemble has already been trained.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

for site, stack, X_site, y_site in (
    ('A', stack_A, X_A, y_A),
    ('B', stack_B, X_B, y_B),
    ('C', stack_C, X_C, y_C),
):
    stack.fit(X_site, y_site)
    # Use a context manager so the file handle is closed (and the pickle fully
    # flushed) as soon as the dump finishes, instead of leaking the handle.
    with open(MODEL_DIR / f'stack_{site}.pkl', 'wb') as model_file:
        pickle.dump(stack, model_file)

0:	learn: 1146.5624810	total: 84.7ms	remaining: 1m 3s
1:	learn: 1127.5759891	total: 113ms	remaining: 42.4s
2:	learn: 1109.0399276	total: 142ms	remaining: 35.3s
3:	learn: 1090.9191030	total: 167ms	remaining: 31.1s
4:	learn: 1073.1135288	total: 195ms	remaining: 29.1s
5:	learn: 1055.7441300	total: 221ms	remaining: 27.4s
6:	learn: 1038.7279548	total: 252ms	remaining: 26.8s
7:	learn: 1022.0991908	total: 287ms	remaining: 26.6s
8:	learn: 1005.8666216	total: 316ms	remaining: 26s
9:	learn: 989.9881667	total: 344ms	remaining: 25.4s
10:	learn: 974.5443370	total: 381ms	remaining: 25.6s
11:	learn: 959.3083428	total: 417ms	remaining: 25.7s
12:	learn: 944.4801855	total: 438ms	remaining: 24.8s
13:	learn: 929.9673690	total: 470ms	remaining: 24.7s
14:	learn: 915.7652883	total: 501ms	remaining: 24.6s
15:	learn: 901.9593604	total: 543ms	remaining: 24.9s
16:	learn: 888.3972340	total: 606ms	remaining: 26.1s
17:	learn: 875.2387956	total: 637ms	remaining: 25.9s
18:	learn: 862.2657407	total: 672ms	remaining: 2

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 1174.5945761	total: 21.3ms	remaining: 16s
1:	learn: 1154.9765122	total: 44.6ms	remaining: 16.7s
2:	learn: 1135.9423719	total: 66.6ms	remaining: 16.6s
3:	learn: 1117.1986082	total: 91.1ms	remaining: 17s
4:	learn: 1098.8563724	total: 117ms	remaining: 17.5s
5:	learn: 1081.0672673	total: 139ms	remaining: 17.2s
6:	learn: 1063.5863332	total: 161ms	remaining: 17.1s
7:	learn: 1046.3559491	total: 186ms	remaining: 17.3s
8:	learn: 1029.5463306	total: 216ms	remaining: 17.8s
9:	learn: 1013.2010763	total: 242ms	remaining: 17.9s
10:	learn: 997.2691562	total: 268ms	remaining: 18s
11:	learn: 981.6612530	total: 289ms	remaining: 17.8s
12:	learn: 966.3952323	total: 310ms	remaining: 17.6s
13:	learn: 951.4447543	total: 340ms	remaining: 17.9s
14:	learn: 936.7241050	total: 366ms	remaining: 17.9s
15:	learn: 922.3085486	total: 393ms	remaining: 18s
16:	learn: 908.3045819	total: 417ms	remaining: 18s
17:	learn: 894.5695153	total: 441ms	remaining: 17.9s
18:	learn: 881.0938913	total: 473ms	remaining: 18.2s

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3420
[LightGBM] [Info] Number of data points in the train set: 27268, number of used features: 19
[LightGBM] [Info] Start training from score 592.508243
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3414
[LightGBM] [Info] Number of data points in the train set: 27268, number of used features: 19
[LightGBM] [Info] Start training from score 586.618095
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3416
[LightGBM] [Info] Number of data points in the train set: 27268, number of used features: 19
[LightGBM] [Info] Start training from score 651.721411
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, y

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 37.7min finished


0:	learn: 190.7988612	total: 25ms	remaining: 18.7s
1:	learn: 187.7730228	total: 44.5ms	remaining: 16.6s
2:	learn: 184.7371706	total: 66.9ms	remaining: 16.7s
3:	learn: 181.7775708	total: 89.1ms	remaining: 16.6s
4:	learn: 178.8352558	total: 113ms	remaining: 16.9s
5:	learn: 176.0375071	total: 138ms	remaining: 17.1s
6:	learn: 173.2936846	total: 160ms	remaining: 17s
7:	learn: 170.5744917	total: 186ms	remaining: 17.3s
8:	learn: 167.9317103	total: 209ms	remaining: 17.2s
9:	learn: 165.3297076	total: 231ms	remaining: 17.1s
10:	learn: 162.7839437	total: 257ms	remaining: 17.3s
11:	learn: 160.2813348	total: 281ms	remaining: 17.3s
12:	learn: 157.9227532	total: 305ms	remaining: 17.3s
13:	learn: 155.5993809	total: 330ms	remaining: 17.4s
14:	learn: 153.2985265	total: 355ms	remaining: 17.4s
15:	learn: 150.9991762	total: 379ms	remaining: 17.4s
16:	learn: 148.7769724	total: 403ms	remaining: 17.4s
17:	learn: 146.6099558	total: 424ms	remaining: 17.3s
18:	learn: 144.5084001	total: 449ms	remaining: 17.3s
19:

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  3.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 172.1904653	total: 19.6ms	remaining: 14.7s
1:	learn: 169.3919041	total: 57.5ms	remaining: 21.5s
2:	learn: 166.6654841	total: 82.6ms	remaining: 20.6s
3:	learn: 164.0197213	total: 107ms	remaining: 20s
4:	learn: 161.4485470	total: 127ms	remaining: 18.9s
5:	learn: 158.8859134	total: 147ms	remaining: 18.2s
6:	learn: 156.4161528	total: 164ms	remaining: 17.4s
7:	learn: 153.9400067	total: 187ms	remaining: 17.3s
8:	learn: 151.5115146	total: 209ms	remaining: 17.2s
9:	learn: 149.1635965	total: 230ms	remaining: 17s
10:	learn: 146.8761766	total: 248ms	remaining: 16.7s
11:	learn: 144.6257710	total: 269ms	remaining: 16.5s
12:	learn: 142.4168579	total: 289ms	remaining: 16.4s
13:	learn: 140.3085969	total: 308ms	remaining: 16.2s
14:	learn: 138.2285521	total: 330ms	remaining: 16.1s
15:	learn: 136.1685717	total: 349ms	remaining: 16s
16:	learn: 134.1250307	total: 369ms	remaining: 15.9s
17:	learn: 132.1773627	total: 389ms	remaining: 15.8s
18:	learn: 130.2718059	total: 406ms	remaining: 15.6s
19:	le

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   54.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3414
[LightGBM] [Info] Number of data points in the train set: 26274, number of used features: 19
[LightGBM] [Info] Start training from score 82.446897
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3417
[LightGBM] [Info] Number of data points in the train set: 26274, number of used features: 19
[LightGBM] [Info] Start training from score 99.725226
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3416
[LightGBM] [Info] Number of data points in the train set: 26274, number of used features: 19
[LightGBM] [Info] Start training from score 106.327517
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.7min finished


0:	learn: 162.9796749	total: 19.5ms	remaining: 14.6s
1:	learn: 160.1643167	total: 41.7ms	remaining: 15.6s
2:	learn: 157.4230020	total: 59.7ms	remaining: 14.9s
3:	learn: 154.7120387	total: 81.1ms	remaining: 15.1s
4:	learn: 152.0616653	total: 107ms	remaining: 16s
5:	learn: 149.4798238	total: 128ms	remaining: 15.9s
6:	learn: 146.9468650	total: 151ms	remaining: 16s
7:	learn: 144.4651497	total: 172ms	remaining: 15.9s
8:	learn: 142.0341470	total: 188ms	remaining: 15.5s
9:	learn: 139.6530015	total: 206ms	remaining: 15.3s
10:	learn: 137.3176843	total: 223ms	remaining: 15s
11:	learn: 135.0538126	total: 244ms	remaining: 15s
12:	learn: 132.8207212	total: 262ms	remaining: 14.9s
13:	learn: 130.6338339	total: 283ms	remaining: 14.9s
14:	learn: 128.4991219	total: 301ms	remaining: 14.7s
15:	learn: 126.4063016	total: 316ms	remaining: 14.5s
16:	learn: 124.3712790	total: 332ms	remaining: 14.3s
17:	learn: 122.3693904	total: 351ms	remaining: 14.3s
18:	learn: 120.4025305	total: 371ms	remaining: 14.3s
19:	lea

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 175.3721976	total: 18.9ms	remaining: 14.2s
1:	learn: 172.3472781	total: 36.3ms	remaining: 13.6s
2:	learn: 169.3910554	total: 55.4ms	remaining: 13.8s
3:	learn: 166.4803985	total: 70.9ms	remaining: 13.2s
4:	learn: 163.6513937	total: 89.5ms	remaining: 13.3s
5:	learn: 160.8667055	total: 107ms	remaining: 13.3s
6:	learn: 158.1510723	total: 126ms	remaining: 13.4s
7:	learn: 155.4801708	total: 146ms	remaining: 13.5s
8:	learn: 152.8493228	total: 164ms	remaining: 13.5s
9:	learn: 150.2941679	total: 181ms	remaining: 13.4s
10:	learn: 147.7866068	total: 198ms	remaining: 13.3s
11:	learn: 145.3334513	total: 217ms	remaining: 13.3s
12:	learn: 142.9357412	total: 231ms	remaining: 13.1s
13:	learn: 140.5771417	total: 250ms	remaining: 13.1s
14:	learn: 138.2620659	total: 264ms	remaining: 12.9s
15:	learn: 136.0202909	total: 283ms	remaining: 13s
16:	learn: 133.8091118	total: 307ms	remaining: 13.2s
17:	learn: 131.6494621	total: 324ms	remaining: 13.2s
18:	learn: 129.5295911	total: 340ms	remaining: 13.1s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   49.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3413
[LightGBM] [Info] Number of data points in the train set: 20876, number of used features: 19
[LightGBM] [Info] Start training from score 58.768313
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3415
[LightGBM] [Info] Number of data points in the train set: 20876, number of used features: 19
[LightGBM] [Info] Start training from score 78.078200
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3406
[LightGBM] [Info] Number of data points in the train set: 20876, number of used features: 19
[LightGBM] [Info] Start training from score 76.859068
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.7min finished


In [7]:
# Evaluate
# print('MAE A:', mean_absolute_error(y_test_A, stack_A.predict(X_test_A)))
# print('MAE B:', mean_absolute_error(y_test_B, stack_B.predict(X_test_B)))
# print('MAE C:', mean_absolute_error(y_test_C, stack_C.predict(X_test_C)))

# # Total MAE for all three locations
# print('Total MAE:', (mean_absolute_error(y_test_A, stack_A.predict(X_test_A)) + mean_absolute_error(y_test_B, stack_B.predict(X_test_B)) + mean_absolute_error(y_test_C, stack_C.predict(X_test_C)))/3)

In [8]:
# Build the submission file from the three fitted ensembles

output_file = 'stack_submission.csv'

# Predict per location and floor the predictions at zero (no upper bound)
pred_A, pred_B, pred_C = (
    np.clip(model.predict(features), 0, None)
    for model, features in (
        (stack_A, test_A),
        (stack_B, test_B),
        (stack_C, test_C),
    )
)

# Stack the locations in A, B, C order
predictions = np.concatenate([pred_A, pred_B, pred_C])

# Row ids are simply 0 .. n-1 over the concatenated predictions
ids = np.arange(len(predictions))

# Two-column frame: id, prediction
df = pd.DataFrame({'id': ids, 'prediction': predictions})

# Write without the DataFrame index
df.to_csv(output_file, index=False)
print(f"Submission saved to {output_file}")

Submission saved to stack_submission.csv
