In [1]:
%load_ext autoreload
%autoreload 2

In [9]:
import datetime
import pandas as pd
from pytz import utc
import os
import mlflow
import numpy as np

# mlflow imports
from mlflow.models.signature import infer_signature

# sklearn imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score

# xgboost imports
from xgboost import XGBRegressor

# oma_tracking imports
from oma_tracking.data.utils import read_simulations_csv_files
from oma_tracking.data import make_dataset
from oma_tracking.data.preprocessing import AngleTransformer, sin_cos_angle_inputs
import oma_tracking.models.mlflow_functions as mlflow_f


from dotenv import load_dotenv
load_dotenv()

True

In [3]:
start =  datetime.datetime(2022,11,1,tzinfo=utc)
stop  = datetime.datetime(2023,3,1,tzinfo=utc)
location = 'nw2d01'
name_location = 'NW2_D01'

home_folder = "../../../"
# Data Paths
data_file_name = '_'.join([location, start.strftime("%Y%m%d"), stop.strftime("%Y%m%d")])
data_path = home_folder + "data/nw2/raw/" + data_file_name + ".parquet"
mvbc_path = home_folder + "data/nw2/mvbc_data.parquet"
tracked_frequencies_path = home_folder + "data/nw2/tracked_modes/harmonics_removed/" + location + ".parquet"
simulations_data_path = home_folder + "data/nw2/simulations/" + location + "/"

# Get all the data
data = pd.read_parquet(data_path)
mvbc_data = pd.read_parquet(mvbc_path)
tracked_frequencies = pd.read_parquet(tracked_frequencies_path)
simulation_data = read_simulations_csv_files(simulations_data_path + "eigen_frequencies/")
simulation_shifts = read_simulations_csv_files(simulations_data_path + "mean_shifts/")
simulation_errors = pd.read_csv(simulations_data_path + "errors/Errors_No_scour.csv", index_col=0)

In [4]:
weather_inputs = make_dataset.get_weather_subset(mvbc_data)
scada_inputs = make_dataset.get_scada_subset(data)

inputs = pd.concat([
            weather_inputs,
            scada_inputs
        ],axis=1)

prediction_params = tracked_frequencies 

In [11]:
# hyperopt imports
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

hyperopt_folder = "../../../data/nw2/model_hyperopt/removed_harmonics/" + location + "_"

In [10]:
seed = 2
def objective_xgb(space):
    model = Pipeline(
            steps=[
                ('preprocessing_angles', AngleTransformer(angles = ['winddirection', 'yaw'])),
                ('regressor', XGBRegressor(
                                 n_estimators = space['n_estimators'],
                                 max_depth = space['max_depth'],
                                 learning_rate = space['learning_rate'],
                                 colsample_bytree = space['colsample_bytree'],
                                 )
                )
            ]
        )
    score = cross_val_score(model,  X_train, y_train, cv=5, scoring='r2').mean()
    # We aim to maximize r2 score, therefore we return it as a negative value
    return {'loss': -score, 'status': STATUS_OK }
def optimize_xgb(trial):
    space = {
        'n_estimators':hp.uniformint('n_estimators',10,500),
        'max_depth':hp.uniformint('max_depth',3,20),
        'learning_rate':hp.uniform('learning_rate',0.01,0.5),
        'colsample_bytree': hp.uniform('colsample_bytree',0.1, 1),
    }
    best = \
        fmin(
            fn = objective_xgb,
            space = space,
            algo = tpe.suggest,
            trials = trial,
            max_evals = 200,
            rstate = np.random.default_rng(seed)
            )
    return best

XGB_optimizations = {}
for mode in prediction_params.columns:
    y = prediction_params[mode].dropna()
    X = inputs.loc[y.index].dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)
    trial2=Trials()
    XGB_optimizations[mode]=optimize_xgb(trial2)
    print(XGB_optimizations[mode])
pd.DataFrame(XGB_optimizations).to_csv(hyperopt_folder + "xgb_optimizations.csv")

100%|██████████| 200/200 [27:40<00:00,  8.30s/trial, best loss: -0.28386145886738084]
{'colsample_bytree': 0.8304655908264129, 'learning_rate': 0.10967314467145307, 'max_depth': 3.0, 'n_estimators': 144.0}
100%|██████████| 200/200 [14:49<00:00,  4.45s/trial, best loss: -0.6765819937078483]
{'colsample_bytree': 0.6814844732712775, 'learning_rate': 0.029332031839881534, 'max_depth': 5.0, 'n_estimators': 436.0}
100%|██████████| 200/200 [33:11<00:00,  9.96s/trial, best loss: -0.3597042542891683]
{'colsample_bytree': 0.33876447960126577, 'learning_rate': 0.02016761225642566, 'max_depth': 10.0, 'n_estimators': 497.0}
100%|██████████| 200/200 [08:59<00:00,  2.70s/trial, best loss: -0.5467622020134109]
{'colsample_bytree': 0.557090108708905, 'learning_rate': 0.06489736432455834, 'max_depth': 4.0, 'n_estimators': 228.0}


OSError: Cannot save file into a non-existent directory: '..\..\data\nw2\model_hyperopt\removed_harmonics'

In [12]:
pd.DataFrame(XGB_optimizations).to_csv(hyperopt_folder + "xgb_optimizations.csv")

In [13]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
seed = 2

def objective_rf(space):
    model = Pipeline(
            steps=[
                ('preprocessing_angles', AngleTransformer(angles = ['winddirection', 'yaw'])),
                ('regressor', RandomForestRegressor(
                                 n_estimators = space['n_estimators'],
                                 max_depth = space['max_depth'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split']
                                 )
            )
        ]
    )
    score = cross_val_score(model,  X_train, y_train, cv=5, scoring='r2').mean()
    # We aim to maximize r2 score, therefore we return it as a negative value
    return {'loss': -score, 'status': STATUS_OK }

def optimize_rf(trial):
    space = {
        'n_estimators': hp.uniformint('n_estimators',10,500),
        'max_depth': hp.uniformint('max_depth',3,20),
        'min_samples_leaf': hp.uniformint('min_samples_leaf',1,5),
        'min_samples_split': hp.uniformint('min_samples_split',2,6)
    }
    best = \
        fmin(
            fn = objective_rf,
            space = space,
            algo = tpe.suggest,
            trials = trial,
            max_evals = 100,
            rstate = np.random.default_rng(seed)
            )
    return best

RF_optimizations = {}
for mode in prediction_params.columns:
    y = prediction_params[mode].dropna()
    X = inputs.loc[y.index].dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    trial=Trials()
    RF_optimizations[mode]=optimize_rf(trial)
    print(RF_optimizations[mode])
pd.DataFrame(RF_optimizations).to_csv(hyperopt_folder + "rf_optimizations.csv")

100%|██████████| 100/100 [2:58:46<00:00, 107.26s/trial, best loss: -0.28038408152133554] 
{'max_depth': 10.0, 'min_samples_leaf': 5.0, 'min_samples_split': 6.0, 'n_estimators': 206.0}
100%|██████████| 100/100 [1:08:42<00:00, 41.23s/trial, best loss: -0.6694982186070642]
{'max_depth': 12.0, 'min_samples_leaf': 1.0, 'min_samples_split': 4.0, 'n_estimators': 194.0}
100%|██████████| 100/100 [3:11:23<00:00, 114.84s/trial, best loss: -0.37142684740623855] 
{'max_depth': 16.0, 'min_samples_leaf': 5.0, 'min_samples_split': 2.0, 'n_estimators': 464.0}
100%|██████████| 100/100 [30:19<00:00, 18.19s/trial, best loss: -0.5549033275267146]
{'max_depth': 8.0, 'min_samples_leaf': 3.0, 'min_samples_split': 2.0, 'n_estimators': 139.0}
