In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier, CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Import data: read the obs_*, est_* and test_* parquet files for locations A, B and C.
DATA_DIR = '../../preprocessing/data'


def _load(name):
    """Read `<DATA_DIR>/<name>.parquet` into a DataFrame."""
    return pd.read_parquet(f'{DATA_DIR}/{name}.parquet')


# obs_* and est_* frames are kept exactly as stored on disk.
obs_A, obs_B, obs_C = (_load(f'obs_{loc}') for loc in 'ABC')
est_A, est_B, est_C = (_load(f'est_{loc}') for loc in 'ABC')

# test_* frames are loaded without their `date_forecast` column.
test_A, test_B, test_C = (
    _load(f'test_{loc}').drop(columns='date_forecast') for loc in 'ABC'
)

In [None]:
# Concatenate the obs_* and est_* frames per location and build train/validation splits.
SEED = 42  # fixed so the sampled validation days are the same on every run
TRAIN_MONTHS = [3, 4, 5, 6, 7, 8, 9, 10]  # only March-October rows are kept
VAL_MONTHS = [5, 6, 7]  # validation days are drawn from May-July only
VAL_FRACTION = 0.2  # share of the unique May-July days held out for validation
TARGET = 'pv_measurement'

# Characters removed from column names: [ ] , { } ( ) " ' : and backslash.
_STRIP_CHARS = str.maketrans('', '', '[],{}()"\':\\')


def _prepare_location(obs, est, seed=SEED):
    """Build the filtered frame and the train/validation split for one location.

    Parameters
    ----------
    obs, est : pandas.DataFrame
        Frames to stack; both must contain a datetime `date_forecast` column
        and the `pv_measurement` target column.
    seed : int
        Seed for the day sampler, so the held-out days are reproducible.

    Returns
    -------
    tuple
        `(frame, train, val, X_train, y_train, X_val, y_val)` where `frame` is
        the stacked data restricted to TRAIN_MONTHS, `val` holds every row of
        the sampled days, `train` holds all remaining rows, and the X_* frames
        have both the target and `date_forecast` removed.
    """
    frame = pd.concat([obs, est])
    frame.columns = [col.translate(_STRIP_CHARS) for col in frame.columns]

    # Step 1: keep only rows whose forecast month is in TRAIN_MONTHS.
    frame = frame[frame['date_forecast'].dt.month.isin(TRAIN_MONTHS)]

    # Step 2: unique calendar days that fall inside VAL_MONTHS.
    forecast_day = frame['date_forecast'].dt.date
    in_val_months = frame['date_forecast'].dt.month.isin(VAL_MONTHS)
    candidate_days = forecast_day[in_val_months].unique()

    # Step 3: sample whole days (not rows), so a day is never split between
    # train and validation. A seeded generator makes the choice repeatable.
    rng = np.random.default_rng(seed)
    n_val_days = int(len(candidate_days) * VAL_FRACTION)
    val_days = rng.choice(candidate_days, size=n_val_days, replace=False)
    is_val = forecast_day.isin(val_days)

    # Step 4: everything that is not a sampled day is training data.
    val = frame[is_val]
    train = frame[~is_val]

    # Features exclude the target and the raw timestamp.
    non_features = [TARGET, 'date_forecast']
    X_train, y_train = train.drop(columns=non_features), train[TARGET]
    X_val, y_val = val.drop(columns=non_features), val[TARGET]
    return frame, train, val, X_train, y_train, X_val, y_val


A, train_A, val_A, X_train_A, y_train_A, X_val_A, y_val_A = _prepare_location(obs_A, est_A)
B, train_B, val_B, X_train_B, y_train_B, X_val_B, y_val_B = _prepare_location(obs_B, est_B)
C, train_C, val_C, X_train_C, y_train_C, X_val_C, y_val_C = _prepare_location(obs_C, est_C)

In [None]:
# print(A.columns)
# print(X_train_A['month'].describe())
# print(X_val_A['month'].describe())

In [None]:
# LightGBM datasets; each validation set is tied to its training set via `reference`.
train_data_A = lgb.Dataset(X_train_A, label=y_train_A)
val_data_A = lgb.Dataset(X_val_A, label=y_val_A, reference=train_data_A)

train_data_B = lgb.Dataset(X_train_B, label=y_train_B)
val_data_B = lgb.Dataset(X_val_B, label=y_val_B, reference=train_data_B)

train_data_C = lgb.Dataset(X_train_C, label=y_train_C)
val_data_C = lgb.Dataset(X_val_C, label=y_val_C, reference=train_data_C)

# Set the parameters for the LGBM models
params = {
    'boosting_type': 'dart',
    'objective': 'regression',
    'metric': 'mae',
    'num_leaves': 100,
    'learning_rate': 0.05,
    'verbose': 1
}

# Set the parameters for the XGBoost models.
# `eval_set` is deliberately NOT listed here: it is an argument of `fit()`, not a
# model hyper-parameter, and the training cell already passes it to `fit()`.
_xgb_shared = {
    'colsample_bytree': 0.8,
    'max_depth': 15,
    'min_child_weight': 10,
    'n_estimators': 600,
    'subsample': 0.912,
    'random_state': 0,
    'booster': 'gbtree',
    'n_jobs': -1,
}

# Location A has its own regularisation / learning rate and uses 5 parallel trees.
parameters_A = {
    **_xgb_shared,
    'gamma': 0.4,
    'learning_rate': 0.012,
    'reg_alpha': 0.8,
    'reg_lambda': 0.8,
    'num_parallel_tree': 5,
}

# Locations B and C use the same settings; kept as separate dicts so either
# can be tuned independently.
parameters_B = {
    **_xgb_shared,
    'gamma': 0.8,
    'learning_rate': 0.008,
    'reg_alpha': 1,
    'reg_lambda': 3,
}
parameters_C = dict(parameters_B)

xgb_A = xgb.XGBRegressor(**parameters_A)
xgb_B = xgb.XGBRegressor(**parameters_B)
xgb_C = xgb.XGBRegressor(**parameters_C)


def _make_catboost():
    """Create one CatBoost regressor; all three locations use the same settings."""
    return CatBoostRegressor(
        iterations=12000,        # The number of trees to build
        loss_function='MAE',     # Loss being optimised (MAE, matching the evaluation metric)
        eval_metric='MAE',       # Evaluation metric for the validation set
        # learning_rate, depth, random_seed and verbose are left at library defaults.
    )


cat_A = _make_catboost()
cat_B = _make_catboost()
cat_C = _make_catboost()


In [None]:
# Train the LGBM models
def _train_lgbm(train_set, valid_set):
    """Train one LightGBM booster with the shared `params`.

    Early stopping (50 rounds) and evaluation logging (every 50 rounds) are
    passed as callbacks; the `early_stopping_rounds=` / `verbose_eval=`
    keyword arguments of `lgb.train` no longer exist in LightGBM 4.x.

    NOTE(review): `params` selects boosting_type='dart'; LightGBM reports that
    early stopping is not available in dart mode, so the early-stopping
    callback is presumably inactive here - confirm if early stopping matters.
    """
    return lgb.train(
        params,
        train_set,
        num_boost_round=500,
        valid_sets=[valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=50),
        ],
    )


lgbm_A = _train_lgbm(train_data_A, val_data_A)
lgbm_B = _train_lgbm(train_data_B, val_data_B)
lgbm_C = _train_lgbm(train_data_C, val_data_C)

In [None]:
# Train the XGBoost models
def _fit_xgb(model, X_train, y_train, X_val, y_val):
    """Fit one XGBoost regressor, tracking MAE on the train and validation sets.

    The metric is set on the estimator with `set_params` rather than passed to
    `fit()`: the `eval_metric` keyword of `fit()` is deprecated and is rejected
    by newer XGBoost releases. Returns the fitted model.
    """
    model.set_params(eval_metric='mae')
    return model.fit(
        X=X_train, y=y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        verbose=False
    )


_fit_xgb(xgb_A, X_train_A, y_train_A, X_val_A, y_val_A)
_fit_xgb(xgb_B, X_train_B, y_train_B, X_val_B, y_val_B)
_fit_xgb(xgb_C, X_train_C, y_train_C, X_val_C, y_val_C)

In [None]:
# Train the CatBoost models
# Options shared by all three fits: `use_best_model=True` and `plot=True`.
_cat_fit_options = {'use_best_model': True, 'plot': True}

cat_A.fit(X_train_A, y_train_A, eval_set=(X_val_A, y_val_A), **_cat_fit_options)

cat_B.fit(X_train_B, y_train_B, eval_set=(X_val_B, y_val_B), **_cat_fit_options)

cat_C.fit(X_train_C, y_train_C, eval_set=(X_val_C, y_val_C), **_cat_fit_options)

In [None]:
# Evaluate the models
_val_targets = (y_val_A, y_val_B, y_val_C)


def _mean_mae(preds):
    """Average of the per-location MAEs; `preds` is ordered (A, B, C)."""
    scores = [mean_absolute_error(y, p) for y, p in zip(_val_targets, preds)]
    return sum(scores) / len(scores)


def _mean_error(preds):
    """Average of the per-location mean errors (target - prediction).

    A positive value means the predictions are below the target on average.
    """
    errors = [np.mean(y - p) for y, p in zip(_val_targets, preds)]
    return sum(errors) / len(errors)


# LightGBM: predict with the iteration count stored on each booster.
y_pred_lgbm_A = lgbm_A.predict(X_val_A, num_iteration=lgbm_A.best_iteration)
y_pred_lgbm_B = lgbm_B.predict(X_val_B, num_iteration=lgbm_B.best_iteration)
y_pred_lgbm_C = lgbm_C.predict(X_val_C, num_iteration=lgbm_C.best_iteration)
print('LGBM MAE:', _mean_mae((y_pred_lgbm_A, y_pred_lgbm_B, y_pred_lgbm_C)))
print('LGBM ME: ', _mean_error((y_pred_lgbm_A, y_pred_lgbm_B, y_pred_lgbm_C)))

# XGBoost
y_pred_xgb_A = xgb_A.predict(X_val_A)
y_pred_xgb_B = xgb_B.predict(X_val_B)
y_pred_xgb_C = xgb_C.predict(X_val_C)
print('XGBoost MAE:', _mean_mae((y_pred_xgb_A, y_pred_xgb_B, y_pred_xgb_C)))
print('XGBoost ME: ', _mean_error((y_pred_xgb_A, y_pred_xgb_B, y_pred_xgb_C)))

# CatBoost
y_pred_cat_A = cat_A.predict(X_val_A)
y_pred_cat_B = cat_B.predict(X_val_B)
y_pred_cat_C = cat_C.predict(X_val_C)
print('CatBoost MAE:', _mean_mae((y_pred_cat_A, y_pred_cat_B, y_pred_cat_C)))
print('CatBoost ME: ', _mean_error((y_pred_cat_A, y_pred_cat_B, y_pred_cat_C)))

# Ensemble: unweighted average of the three models per location.
y_pred_A = (y_pred_lgbm_A + y_pred_xgb_A + y_pred_cat_A) / 3
y_pred_B = (y_pred_lgbm_B + y_pred_xgb_B + y_pred_cat_B) / 3
y_pred_C = (y_pred_lgbm_C + y_pred_xgb_C + y_pred_cat_C) / 3
print('MAE:', _mean_mae((y_pred_A, y_pred_B, y_pred_C)))
print('ME: ', _mean_error((y_pred_A, y_pred_B, y_pred_C)))

# Correlation between models (location A only). Each print shows the value
# computed on the line above it.
corr_lgbm_xgb = np.corrcoef(y_pred_lgbm_A, y_pred_xgb_A)[0, 1]
print("LGBM X XGB:", corr_lgbm_xgb)

corr_lgbm_cat = np.corrcoef(y_pred_lgbm_A, y_pred_cat_A)[0, 1]
print("LGBM X CAT:", corr_lgbm_cat)

corr_xgb_cat = np.corrcoef(y_pred_xgb_A, y_pred_cat_A)[0, 1]
print("XGB X CAT:", corr_xgb_cat)

# Correlation with target (LightGBM predictions for location A)
corr_with_target = np.corrcoef(y_val_A, y_pred_lgbm_A)[0, 1]
print("Corr target:", corr_with_target)

# Summer months
# LGBM MAE: 148.28078911595432
# LGBM ME:  11.773495640279117
# XGBoost MAE: 145.50960966911052
# XGBoost ME:  0.8305439643785095
# MAE: 146.10835453794607
# ME:  6.3020198023288145

# LGBM MAE: 126.37001068523796
# LGBM ME:  5.5407880468098485
# XGBoost MAE: 124.35035350695496
# XGBoost ME:  -3.8881690714502617
# MAE: 124.70628429080047
# ME:  0.8263094876797941

# LGBM MAE: 113.34762612551407
# LGBM ME:  16.62480425349082
# XGBoost MAE: 111.98328058203373
# XGBoost ME:  6.948942789763294
# MAE: 112.08168399219078
# ME:  11.786873521627056

# LGBM MAE: 151.2805862765213
# LGBM ME:  -0.3526988319681325
# XGBoost MAE: 151.96840624926145
# XGBoost ME:  -9.558960567470928
# MAE: 150.68537589106043
# ME:  -4.955829699719531
# Corr models: 0.9964906912560275
# Corr target: 0.9196798206675115