In [1]:
# Import the necessary libraries
%matplotlib inline
import numpy as np
import pandas as pd

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import pickle

from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score as acs_score

from src.data.data_fetcher import get_raw_data
from src.features.preprocess_data import get_preprocessed_test_data, fetch_preprocessed_data
pd.set_option('display.max_columns', 200)

from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Prepare data
# Raw per-location frames, unpacked in the order get_raw_data() returns them.
(
    train_a, train_b, train_c,
    X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
    X_train_observed_a, X_train_observed_b, X_train_observed_c,
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c,
) = get_raw_data()

# Preprocessed train/validation splits for the observed and estimated sets,
# followed by the preprocessed test features.
(
    X_train_obs_combined, X_val_obs_combined,
    y_train_obs_combined, y_val_obs_combined,
    X_train_est_combined, X_val_est_combined,
    y_train_est_combined, y_val_est_combined,
) = fetch_preprocessed_data()
x_test_whole = get_preprocessed_test_data()

# Train + validation stacked back together; these keep the index of the splits.
x_whole_obs = pd.concat([X_train_obs_combined, X_val_obs_combined])
y_whole_obs = pd.concat([y_train_obs_combined, y_val_obs_combined])

x_whole_est = pd.concat([X_train_est_combined, X_val_est_combined])
y_whole_est = pd.concat([y_train_est_combined, y_val_est_combined])

# x_whole / y_whole hold the same rows as the observed frames above, but with a
# fresh 0..n-1 index so that positional (iloc) and label indexing coincide.
# NOTE(review): only the observed splits end up in x_whole / y_whole; the
# estimated frames are built here but not used by the cells visible in this
# notebook — confirm that is intended.
x_whole = x_whole_obs.reset_index(drop=True)
y_whole = y_whole_obs.reset_index(drop=True)

x_whole.head()

After temporal alignment
X_test_estimated_a.shape = (720, 47), X_test_estimated_b.shape = (720, 47), X_test_estimated_c.shape = (720, 47)
X_test_estimated_a_processed.shape = (720, 46), X_test_estimated_b_processed.shape = (720, 46), X_test_estimated_c_processed.shape = (720, 46)


Unnamed: 0,absolute_humidity_2m:gm3,air_density_2m:kgm3,clear_sky_energy_1h:J,clear_sky_rad:W,cloud_base_agl:m,dew_or_rime:idx,dew_point_2m:K,diffuse_rad:W,diffuse_rad_1h:J,direct_rad:W,direct_rad_1h:J,effective_cloud_cover:p,is_day:idx,is_in_shadow:idx,precip_5min:mm,precip_type_5min:idx,pressure_50m:hPa,prob_rime:p,rain_water:kgm2,relative_humidity_1000hPa:p,sun_azimuth:d,super_cooled_liquid_water:kgm2,t_1000hPa:K,total_cloud_cover:p,visibility:m,wind_speed_10m:ms,wind_speed_u_10m:ms,wind_speed_v_10m:ms,wind_speed_w_1000hPa:ms,location_a,location_b,location_c,sin_day_of_year,cos_day_of_year,sin_hour,cos_hour,sun_product,modified_solar_elevation,effective_radiation,time_since_prediction,cloud_ratio,cloud_cover_over_30%,sun_addition,is_freezing,is_snow,is_rain
0,15.2,1.18175,1351672.0,440.600006,7231.174805,0.0,291.25,95.5,294605.53125,271.225006,872357.5625,54.224998,1.0,0.0,0.0,0.0,1007.5,0.0,0.0,72.375,92.172501,0.0,294.625,98.925003,48152.375,2.125,-2.0,-0.75,0.0,1,0,0,0.02795,-0.999609,1.0,6.123234000000001e-17,25901.988281,0.455669,0.645392,0,0.548142,1,366.725006,0,0,0
1,13.7,1.1925,2456032.0,637.599976,1410.5,0.0,289.399994,145.525009,447652.6875,5.8,11749.875,99.675003,1.0,0.0,0.0,0.0,1001.400024,0.0,0.1,80.699997,236.257751,0.1,292.100006,100.0,5903.875,1.35,-1.225,0.525,0.0,0,1,0,-0.310857,-0.950457,-0.5,-0.8660254,844.045105,0.611489,0.004784,0,0.99675,1,151.325012,0,0,0
2,10.5,1.2015,217229.3,107.400002,2315.0,0.0,285.299988,42.200001,100314.65625,65.099998,135992.90625,23.299999,1.0,0.0,0.0,0.0,1006.849976,0.0,0.0,59.049999,85.390503,0.0,291.700012,43.549999,49974.699219,3.375,-0.4,3.35,0.0,1,0,0,-0.804826,-0.593511,0.965926,0.258819,2747.219971,0.164161,0.626034,0,0.535017,0,107.300003,0,0,0
3,6.3,1.248,0.0,0.0,131.375,0.0,277.100006,0.0,0.0,0.0,0.0,99.574997,0.0,1.0,0.0,0.0,992.949951,0.0,0.0,93.650002,127.701752,0.35,278.100006,99.974998,3399.375,6.0,5.95,-0.5,0.0,1,0,0,-0.141444,0.989946,0.965926,-0.258819,0.0,0.0,0.0,0,0.995999,1,0.0,0,0,0
4,6.3,1.27175,0.0,0.0,8580.474609,0.0,277.0,0.0,0.0,0.0,0.0,49.400002,0.0,1.0,0.0,0.0,1012.200012,0.0,0.0,81.649994,346.724243,0.0,278.600006,98.824997,27873.150391,1.45,1.45,-0.275,0.0,1,0,0,0.93121,-0.364483,-0.5,0.8660254,0.0,0.0,0.0,0,0.499874,1,0.0,0,0,0


In [3]:
import optuna



def compute_sample_weight(data, estimated_weight=2, observed_weight=1):
    """Return one sample weight per row of ``data``.

    Rows whose ``time_since_prediction`` is greater than 0 are treated as
    estimated data and receive ``estimated_weight``; every other row
    (including rows where the comparison is False because the value is NaN)
    is treated as observed data and receives ``observed_weight``.

    Parameters
    ----------
    data : mapping of column name to array-like (e.g. a pandas DataFrame)
        Must provide a ``"time_since_prediction"`` column.
    estimated_weight : scalar, default 2
        Weight assigned to rows with ``time_since_prediction > 0``.
    observed_weight : scalar, default 1
        Weight assigned to all remaining rows.

    Returns
    -------
    numpy.ndarray
        Array of weights with one entry per row of ``data``.
    """
    is_estimated = data["time_since_prediction"] > 0
    return np.where(is_estimated, estimated_weight, observed_weight)


def objective(trial):
    """Optuna objective: mean cross-validated MAE of a CatBoost regressor.

    Reads the notebook-level ``x_whole`` / ``y_whole`` frames, trains one
    ``CatBoostRegressor`` per fold of a shuffled 10-fold split and returns the
    average of the per-fold mean absolute errors.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``depth``, ``learning_rate``, ``subsample``,
        ``colsample_bylevel`` and ``min_data_in_leaf``.

    Returns
    -------
    float
        Mean of the (unweighted) validation MAE over all folds; lower is better.
    """
    param = {
        # --- search space ---
        'depth': trial.suggest_int('depth', 8, 11),  # Tree depth
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        # --- fixed settings shared by every trial ---
        'eval_metric': 'MAE',
        'random_seed': 42,
        'verbose': 200,
        'loss_function': 'MAE',
        "iterations": 1000,
    }

    fold_mae = []
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    for train_index, test_index in kf.split(x_whole):
        X_train_fold, X_val_fold = x_whole.iloc[train_index], x_whole.iloc[test_index]
        y_train_fold, y_val_fold = y_whole.iloc[train_index], y_whole.iloc[test_index]

        # Compute sample weights for training and validation data
        train_weight = compute_sample_weight(X_train_fold)
        test_weight = compute_sample_weight(X_val_fold)

        # The validation pool must be built from the validation features so
        # that its rows line up with y_val_fold and test_weight; early
        # stopping is driven by this pool.
        train_pool = Pool(data=X_train_fold, label=y_train_fold, weight=train_weight)
        test_pool = Pool(data=X_val_fold, label=y_val_fold, weight=test_weight)

        model = CatBoostRegressor(**param)
        model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50)

        # Fold score is the plain (unweighted) MAE on the held-out rows
        y_pred_fold = model.predict(X_val_fold)
        fold_mae.append(mean_absolute_error(y_val_fold, y_pred_fold))

    return np.mean(fold_mae)

# Run the hyperparameter search. The sampler is seeded so the sequence of
# proposed trials is repeatable (given a deterministic objective).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)  # You can increase n_trials to try more combinations

# Best hyperparameters
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

# Retrain with best parameters.
# study.best_trial.params only contains the values sampled through
# trial.suggest_*; the fixed settings used while tuning have to be added back,
# otherwise the final model would fall back to CatBoost's defaults (e.g. RMSE
# loss instead of MAE, no fixed seed) and differ from what was tuned.
best_params = {
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'random_seed': 42,
    'verbose': 200,
    'iterations': 1000,
    **study.best_trial.params,
}
best_model = CatBoostRegressor(**best_params)
# Use the same per-row weighting scheme that was applied during tuning.
best_model.fit(x_whole, y_whole, sample_weight=compute_sample_weight(x_whole))


[I 2023-11-08 10:56:52,857] A new study created in memory with name: no-name-b44eaa58-1805-4b81-ab6c-efdcf2f30ffe


0:	learn: 317.2289420	test: 300.6183069	best: 300.6183069 (0)	total: 194ms	remaining: 3m 13s
200:	learn: 119.8197981	test: 113.3479744	best: 113.3479744 (200)	total: 8.65s	remaining: 34.4s
400:	learn: 94.1398951	test: 89.9853160	best: 89.9853160 (400)	total: 17s	remaining: 25.4s
600:	learn: 89.3734164	test: 86.1446996	best: 86.1446996 (600)	total: 25.1s	remaining: 16.7s
800:	learn: 86.9771013	test: 84.4669128	best: 84.4669128 (800)	total: 33s	remaining: 8.2s
999:	learn: 85.7772325	test: 83.6763802	best: 83.6763802 (999)	total: 40.9s	remaining: 0us

bestTest = 83.67638019
bestIteration = 999

0:	learn: 316.4349897	test: 307.9115924	best: 307.9115924 (0)	total: 35ms	remaining: 35s
200:	learn: 119.6905474	test: 117.4364624	best: 117.4364624 (200)	total: 8s	remaining: 31.8s
400:	learn: 93.6166602	test: 92.8564406	best: 92.8564406 (400)	total: 16s	remaining: 23.9s
600:	learn: 88.8883023	test: 89.4252927	best: 89.4252927 (600)	total: 24.3s	remaining: 16.1s
800:	learn: 86.5871998	test: 87.876

[I 2023-11-08 11:03:43,647] Trial 0 finished with value: 88.58805826345916 and parameters: {'depth': 9, 'learning_rate': 0.009379374798893395, 'subsample': 0.7972302926833397, 'colsample_bylevel': 0.85427504475549, 'min_data_in_leaf': 94}. Best is trial 0 with value: 88.58805826345916.


999:	learn: 85.4184983	test: 85.1902836	best: 85.1902836 (999)	total: 40.5s	remaining: 0us

bestTest = 85.19028363
bestIteration = 999

0:	learn: 304.6524577	test: 288.1052545	best: 288.1052545 (0)	total: 46.6ms	remaining: 46.5s
200:	learn: 70.4578872	test: 81.2281929	best: 81.2281929 (200)	total: 8.87s	remaining: 35.3s
400:	learn: 58.3148377	test: 78.6022872	best: 78.5717444 (390)	total: 17.3s	remaining: 25.8s
600:	learn: 51.2855014	test: 77.8975657	best: 77.8924087 (599)	total: 25.4s	remaining: 16.9s
800:	learn: 45.1310979	test: 77.5566425	best: 77.4472104 (757)	total: 34.7s	remaining: 8.63s
999:	learn: 40.6478444	test: 77.0567482	best: 77.0553505 (997)	total: 43s	remaining: 0us

bestTest = 77.05535054
bestIteration = 997

Shrink model to first 998 iterations.
0:	learn: 304.0062617	test: 296.1809923	best: 296.1809923 (0)	total: 48.5ms	remaining: 48.5s
200:	learn: 69.7832152	test: 81.7674729	best: 81.7674729 (200)	total: 8.6s	remaining: 34.2s
400:	learn: 55.5621128	test: 77.8238313	be

[I 2023-11-08 11:11:30,403] Trial 1 finished with value: 78.390456705452 and parameters: {'depth': 11, 'learning_rate': 0.08912212073693533, 'subsample': 0.6704933398520662, 'colsample_bylevel': 0.1908015173117793, 'min_data_in_leaf': 99}. Best is trial 1 with value: 78.390456705452.


0:	learn: 318.0523663	test: 301.4021818	best: 301.4021818 (0)	total: 82.5ms	remaining: 1m 22s
200:	learn: 110.9743996	test: 105.0158633	best: 105.0158633 (200)	total: 16.5s	remaining: 1m 5s
400:	learn: 89.4790887	test: 87.0614419	best: 87.0614419 (400)	total: 33s	remaining: 49.3s
600:	learn: 84.6389453	test: 83.9695376	best: 83.9695376 (600)	total: 49.6s	remaining: 32.9s
800:	learn: 82.0147243	test: 82.5643898	best: 82.5643898 (800)	total: 1m 5s	remaining: 16.4s
999:	learn: 79.8888384	test: 81.5642456	best: 81.5642456 (999)	total: 1m 21s	remaining: 0us

bestTest = 81.56424565
bestIteration = 999

0:	learn: 317.2298131	test: 308.7133408	best: 308.7133408 (0)	total: 77.4ms	remaining: 1m 17s
200:	learn: 110.0680784	test: 108.2044987	best: 108.2044987 (200)	total: 15.9s	remaining: 1m 3s
400:	learn: 89.3415658	test: 89.8069739	best: 89.8069739 (400)	total: 31.3s	remaining: 46.8s
600:	learn: 84.2671333	test: 86.5410724	best: 86.5410724 (600)	total: 46.7s	remaining: 31s
800:	learn: 81.5765796

[W 2023-11-08 11:20:28,934] Trial 2 failed with parameters: {'depth': 10, 'learning_rate': 0.010642184659165081, 'subsample': 0.43140603303676367, 'colsample_bylevel': 0.7676736207322047, 'min_data_in_leaf': 33} because of the following error: KeyboardInterrupt('').
Traceback (most recent call last):
  File "c:\Users\gunna\Documents\Maskinlæring\Prosjekt\power-predictor\venv\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\gunna\AppData\Local\Temp\ipykernel_28692\358791120.py", line 27, in objective
    model.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], early_stopping_rounds=100)
  File "c:\Users\gunna\Documents\Maskinlæring\Prosjekt\power-predictor\venv\lib\site-packages\catboost\core.py", line 5703, in fit
    return self._fit(X, y, cat_features, text_features, embedding_features, None, sample_weight, None, None, None, None, baseline,
  File "c:\Users\gunna\Documents\Maskinlæring\Prosjekt\

KeyboardInterrupt: 

In [None]:
# Project helpers used to persist and post-process the predictions
from src.models.saving import save_predictions
from src.features.postprocess_data import postprocess_data

# Predict on the preprocessed test features with the retrained model
y_pred = best_model.predict(x_test_whole)

# Run the raw predictions (wrapped in a Series) through the project's
# post-processing step together with the test features
y_pred_series = pd.Series(y_pred)
y_predictions = postprocess_data(x_test_whole, y_pred_series)

# Save the predictions (not the model) under the given run name.
# NOTE(review): the name says "xgboost" although best_model is a
# CatBoostRegressor — confirm whether this label should be updated.
save_predictions(y_predictions, 'xgboost direct_rad_cloud_interaction')