In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import datetime
import pandas as pd
from pytz import utc
import os
import mlflow
import numpy as np

# mlflow imports
from mlflow.models.signature import infer_signature

# sklearn imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

# xgboost imports
from xgboost import XGBRegressor

# oma_tracking imports
from oma_tracking.data.utils import read_simulations_csv_files
from oma_tracking.data import make_dataset
from oma_tracking.data.preprocessing import AngleTransformer, sin_cos_angle_inputs
import oma_tracking.models.mlflow_functions as mlflow_f


from dotenv import load_dotenv
load_dotenv()

c:\Users\Max\anaconda3\envs\soiltwin\lib\site-packages\numpy\.libs\libopenblas.4SP5SUA7CBGXUEOC35YP2ASOICYYEQZZ.gfortran-win_amd64.dll
c:\Users\Max\anaconda3\envs\soiltwin\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


True

In [6]:
start =  datetime.datetime(2022,11,1,tzinfo=utc)
stop  = datetime.datetime(2023,3,1,tzinfo=utc)
location = 'nw2c02'
name_location = 'NW2_C02'

home_folder = "../../../"
# Data Paths
data_file_name = '_'.join([location, start.strftime("%Y%m%d"), stop.strftime("%Y%m%d")])
data_path = home_folder + "data/nw2/raw/" + data_file_name + ".parquet"
mvbc_path = home_folder + "data/nw2/mvbc_data.parquet"
tracked_frequencies_path = home_folder + "data/nw2/tracked_modes/harmonics_removed/" + location + ".parquet"
simulations_data_path = home_folder + "data/nw2/simulations/" + location + "/"

# Get all the data
data = pd.read_parquet(data_path)
mvbc_data = pd.read_parquet(mvbc_path)
tracked_frequencies = pd.read_parquet(tracked_frequencies_path)
#simulation_data = read_simulations_csv_files(simulations_data_path + "eigen_frequencies/")
#simulation_shifts = read_simulations_csv_files(simulations_data_path + "mean_shifts/")
#simulation_errors = pd.read_csv(simulations_data_path + "errors/Errors_No_scour.csv", index_col=0)

In [7]:
weather_inputs = make_dataset.get_weather_subset(mvbc_data)
scada_inputs = make_dataset.get_scada_subset(data)

inputs = pd.concat([
            weather_inputs,
            scada_inputs
        ],axis=1)

prediction_params = tracked_frequencies 

In [8]:
inputs

Unnamed: 0,mvbc_WandelaarBuoy_Wave_height,mvbc_WandelaarBuoy_Sea_water_temperature,mvbc_WandelaarMeasuringpile_Tide_TAW,mvbc_WandelaarMeasuringpile_Air_pressure,mvbc_WandelaarMeasuringpile_Air_temperature,mean_NW2_C02_rpm,mean_NW2_C02_yaw,mean_NW2_C02_pitch,mean_NW2_C02_power,mean_NW2_C02_windspeed,mean_NW2_C02_winddirection
2022-11-01 00:00:00+00:00,85.000000,15.0,106.000000,1002.800000,16.700000,10.445,162.459,10.519,9524.928,17.578,162.655
2022-11-01 00:10:00+00:00,83.666667,15.0,112.666667,1002.933333,16.700000,10.445,163.886,10.044,9524.737,17.750,165.761
2022-11-01 00:20:00+00:00,82.333333,15.0,119.333333,1003.066667,16.700000,10.445,176.979,9.932,9523.201,17.238,179.609
2022-11-01 00:30:00+00:00,81.000000,15.0,126.000000,1003.200000,16.700000,10.445,187.023,9.400,9524.224,16.852,188.897
2022-11-01 00:40:00+00:00,80.333333,15.0,134.666667,1003.333333,16.566667,10.445,193.094,12.394,9523.201,18.691,191.919
...,...,...,...,...,...,...,...,...,...,...,...
2023-03-19 23:20:00+00:00,,,,,,,,,,,
2023-03-19 23:30:00+00:00,,,,,,,,,,,
2023-03-19 23:40:00+00:00,,,,,,,,,,,
2023-03-19 23:50:00+00:00,,,,,,,,,,,


In [13]:
# hyperopt imports
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

hyperopt_folder = "../../../data/nw2/model_hyperopt/removed_harmonics/" + location + "_"

In [11]:
seed = 2
def objective_xgb(space):
    model = Pipeline(
            steps=[
                ('preprocessing_angles', AngleTransformer(angles = ['winddirection', 'yaw'])),
                ('regressor', XGBRegressor(
                                 n_estimators = space['n_estimators'],
                                 max_depth = space['max_depth'],
                                 learning_rate = space['learning_rate'],
                                 colsample_bytree = space['colsample_bytree'],
                                 )
                )
            ]
        )
    score = cross_val_score(model,  X_train, y_train, cv=5, scoring='r2').mean()
    # We aim to maximize r2 score, therefore we return it as a negative value
    return {'loss': -score, 'status': STATUS_OK }
def optimize_xgb(trial):
    space = {
        'n_estimators':hp.uniformint('n_estimators',10,500),
        'max_depth':hp.uniformint('max_depth',3,20),
        'learning_rate':hp.uniform('learning_rate',0.01,0.5),
        'colsample_bytree': hp.uniform('colsample_bytree',0.1, 1),
    }
    best = \
        fmin(
            fn = objective_xgb,
            space = space,
            algo = tpe.suggest,
            trials = trial,
            max_evals = 200,
            rstate = np.random.default_rng(seed)
            )
    return best

XGB_optimizations = {}
for mode in prediction_params.columns:
    y = prediction_params[mode].dropna()
    X = inputs.loc[y.index].dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)
    trial2=Trials()
    XGB_optimizations[mode]=optimize_xgb(trial2)
    print(XGB_optimizations[mode])
pd.DataFrame(XGB_optimizations).to_csv(hyperopt_folder + "xgb_optimizations.csv")

100%|██████████| 200/200 [27:24<00:00,  8.22s/trial, best loss: -0.2499389510542696]
{'colsample_bytree': 0.5434421655442623, 'learning_rate': 0.039209008928574644, 'max_depth': 5.0, 'n_estimators': 251.0}
100%|██████████| 200/200 [13:21<00:00,  4.01s/trial, best loss: -0.7582504537730598]
{'colsample_bytree': 0.5888792304277286, 'learning_rate': 0.04061735318315918, 'max_depth': 6.0, 'n_estimators': 249.0}
100%|██████████| 200/200 [29:57<00:00,  8.99s/trial, best loss: -0.4529683058764927]
{'colsample_bytree': 0.41558226149386285, 'learning_rate': 0.03053233765912594, 'max_depth': 9.0, 'n_estimators': 317.0}
100%|██████████| 200/200 [09:18<00:00,  2.79s/trial, best loss: -0.7839770371232696]
{'colsample_bytree': 0.5992765975130467, 'learning_rate': 0.04731625530396844, 'max_depth': 4.0, 'n_estimators': 281.0}


OSError: Cannot save file into a non-existent directory: '..\..\data\nw2\model_hyperopt\removed_harmonics'

In [14]:
pd.DataFrame(XGB_optimizations).to_csv(hyperopt_folder + "xgb_optimizations.csv")

In [15]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
seed = 2

def objective_rf(space):
    model = Pipeline(
            steps=[
                ('preprocessing_angles', AngleTransformer(angles = ['winddirection', 'yaw'])),
                ('regressor', RandomForestRegressor(
                                 n_estimators = space['n_estimators'],
                                 max_depth = space['max_depth'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split']
                                 )
            )
        ]
    )
    score = cross_val_score(model,  X_train, y_train, cv=5, scoring='r2').mean()
    # We aim to maximize r2 score, therefore we return it as a negative value
    return {'loss': -score, 'status': STATUS_OK }

def optimize_rf(trial):
    space = {
        'n_estimators': hp.uniformint('n_estimators',10,500),
        'max_depth': hp.uniformint('max_depth',3,20),
        'min_samples_leaf': hp.uniformint('min_samples_leaf',1,5),
        'min_samples_split': hp.uniformint('min_samples_split',2,6)
    }
    best = \
        fmin(
            fn = objective_rf,
            space = space,
            algo = tpe.suggest,
            trials = trial,
            max_evals = 100,
            rstate = np.random.default_rng(seed)
            )
    return best

RF_optimizations = {}
for mode in prediction_params.columns:
    y = prediction_params[mode].dropna()
    X = inputs.loc[y.index].dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    trial=Trials()
    RF_optimizations[mode]=optimize_rf(trial)
    print(RF_optimizations[mode])
pd.DataFrame(RF_optimizations).to_csv(hyperopt_folder + "rf_optimizations.csv")

100%|██████████| 100/100 [3:41:16<00:00, 132.76s/trial, best loss: -0.25336986124618094] 
{'max_depth': 8.0, 'min_samples_leaf': 5.0, 'min_samples_split': 5.0, 'n_estimators': 349.0}
100%|██████████| 100/100 [1:27:10<00:00, 52.31s/trial, best loss: -0.7524177072121373]
{'max_depth': 12.0, 'min_samples_leaf': 3.0, 'min_samples_split': 2.0, 'n_estimators': 300.0}
100%|██████████| 100/100 [3:14:02<00:00, 116.42s/trial, best loss: -0.4630959430231124] 
{'max_depth': 15.0, 'min_samples_leaf': 5.0, 'min_samples_split': 3.0, 'n_estimators': 375.0}
100%|██████████| 100/100 [26:54<00:00, 16.14s/trial, best loss: -0.7796793769502932]
{'max_depth': 14.0, 'min_samples_leaf': 3.0, 'min_samples_split': 3.0, 'n_estimators': 176.0}
