In [17]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import datetime
import pandas as pd
from pytz import utc
import os
import mlflow
import numpy as np

# mlflow imports
from mlflow.models.signature import infer_signature

# sklearn imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# xgboost imports
from xgboost import XGBRegressor

# hyperopt imports
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

# oma_tracking imports
from oma_tracking.data.utils import read_simulations_csv_files
from oma_tracking.data import make_dataset
from oma_tracking.data.preprocessing import AngleTransformer, sin_cos_angle_inputs
import oma_tracking.models.mlflow_functions as mlflow_f





from dotenv import load_dotenv
load_dotenv()

True

In [19]:
start =  datetime.datetime(2022,11,1,tzinfo=utc)
stop  = datetime.datetime(2023,3,1,tzinfo=utc)
location = 'nw2c02'
name_location = 'NW2_C02'

# Data Paths
data_file_name = '_'.join([location, start.strftime("%Y%m%d"), stop.strftime("%Y%m%d")])
data_path = "../../data/nw2/raw/" + data_file_name + ".parquet"
mvbc_path = "../../data/nw2/mvbc_data.parquet"
tracked_frequencies_path = "../../data/nw2/tracked_modes/" + location + ".parquet"
simulations_data_path = "../../data/nw2/simulations/" + location + "/"

# Get all the data
data = pd.read_parquet(data_path)
mvbc_data = pd.read_parquet(mvbc_path)
tracked_frequencies = pd.read_parquet(tracked_frequencies_path)
#simulation_data = read_simulations_csv_files(simulations_data_path + "eigen_frequencies/")
#simulation_shifts = read_simulations_csv_files(simulations_data_path + "mean_shifts/")
#simulation_errors = pd.read_csv(simulations_data_path + "errors/Errors_No_scour.csv", index_col=0)

In [20]:
AZURE_STORAGE_ACCESS_KEY = os.getenv('AZURE_STORAGE_ACCESS_KEY')
AZURE_STORAGE_CONNECTION_STRING = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
MLFLOW_TRACKING_URI = os.getenv('MLFLOW_TRACKING_URI')

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

artifact_root = 'wasbs://test@mlflowstoragev1.blob.core.windows.net'
mlflow_ui_string = mlflow_f.create_mlflow_ui(MLFLOW_TRACKING_URI, artifact_root)
database_url = 'http://127.0.0.1:5000'
mlflow_f.connect_mlflow_ui(mlflow_ui_string, database_url)

experiment_name = 'NW2_scour'
experiment = mlflow_f.run_mlflow_experiment(experiment_name = experiment_name)



mlflow experiment set to: NW2_scour


In [21]:
weather_inputs = make_dataset.get_weather_subset(mvbc_data)
scada_inputs = make_dataset.get_scada_subset(data)

inputs = pd.concat([
            weather_inputs,
            scada_inputs
        ],axis=1)

prediction_params = tracked_frequencies 

In [22]:
hyperopt_folder = "../../data/nw2/model_hyperopt/" + location + "_"

In [23]:
seed = 2
def objective_xgb(space):
    model = Pipeline(
            steps=[
                ('preprocessing_angles', AngleTransformer(angles = ['winddirection', 'yaw'])),
                ('regressor', XGBRegressor(
                                 n_estimators = space['n_estimators'],
                                 max_depth = space['max_depth'],
                                 learning_rate = space['learning_rate'],
                                 colsample_bytree = space['colsample_bytree'],
                                 )
                )
            ]
        )
    score = cross_val_score(model,  X_train, y_train, cv=5, scoring='r2').mean()
    # We aim to maximize r2 score, therefore we return it as a negative value
    return {'loss': -score, 'status': STATUS_OK }
def optimize_xgb(trial):
    space = {
        'n_estimators':hp.uniformint('n_estimators',10,500),
        'max_depth':hp.uniformint('max_depth',3,20),
        'learning_rate':hp.uniform('learning_rate',0.01,0.5),
        'colsample_bytree': hp.uniform('colsample_bytree',0.1, 1),
    }
    best = \
        fmin(
            fn = objective_xgb,
            space = space,
            algo = tpe.suggest,
            trials = trial,
            max_evals = 200,
            rstate = np.random.default_rng(seed)
            )
    return best

XGB_optimizations = {}
for mode in prediction_params.columns:
    y = prediction_params[mode].loc[inputs.index[0]:inputs.index[-1]].dropna()
    X = inputs.loc[y.index].dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)
    trial2=Trials()
    XGB_optimizations[mode]=optimize_xgb(trial2)
    print(XGB_optimizations[mode])
pd.DataFrame(XGB_optimizations).to_csv(hyperopt_folder + "xgb_optimizations.csv")

100%|██████████| 200/200 [24:50<00:00,  7.45s/trial, best loss: -0.20437119789601885]
{'colsample_bytree': 0.5041176658380875, 'learning_rate': 0.0253010280465614, 'max_depth': 7.0, 'n_estimators': 331.0}
100%|██████████| 200/200 [33:26<00:00, 10.03s/trial, best loss: -0.7403128766032342]
{'colsample_bytree': 0.7509143903507746, 'learning_rate': 0.03354811107736168, 'max_depth': 12.0, 'n_estimators': 459.0}
100%|██████████| 200/200 [29:07<00:00,  8.74s/trial, best loss: -0.42533298606718917]
{'colsample_bytree': 0.5235919674086554, 'learning_rate': 0.0435470685765176, 'max_depth': 13.0, 'n_estimators': 234.0}
100%|██████████| 200/200 [51:53<00:00, 15.57s/trial, best loss: -0.6736309360396051] 
{'colsample_bytree': 0.6333742869249575, 'learning_rate': 0.02959759563450521, 'max_depth': 10.0, 'n_estimators': 332.0}


In [24]:
trial2

<hyperopt.base.Trials at 0x16e84f825e0>

In [25]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
seed = 2

def objective_rf(space):
    model = Pipeline(
            steps=[
                ('preprocessing_angles', AngleTransformer(angles = ['winddirection', 'yaw'])),
                ('regressor', RandomForestRegressor(
                                 n_estimators = space['n_estimators'],
                                 max_depth = space['max_depth'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split']
                                 )
            )
        ]
    )
    score = cross_val_score(model,  X_train, y_train, cv=5, scoring='r2').mean()
    # We aim to maximize r2 score, therefore we return it as a negative value
    return {'loss': -score, 'status': STATUS_OK }

def optimize_rf(trial):
    space = {
        'n_estimators': hp.uniformint('n_estimators',10,500),
        'max_depth': hp.uniformint('max_depth',3,20),
        'min_samples_leaf': hp.uniformint('min_samples_leaf',1,5),
        'min_samples_split': hp.uniformint('min_samples_split',2,6)
    }
    best = \
        fmin(
            fn = objective_rf,
            space = space,
            algo = tpe.suggest,
            trials = trial,
            max_evals = 100,
            rstate = np.random.default_rng(seed)
            )
    return best

RF_optimizations = {}
for mode in prediction_params.columns:
    y = prediction_params[mode].loc[inputs.index[0]:inputs.index[-1]].dropna()
    X = inputs.loc[y.index].dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    trial=Trials()
    RF_optimizations[mode]=optimize_rf(trial)
    print(RF_optimizations[mode])
pd.DataFrame(RF_optimizations).to_csv(hyperopt_folder + "rf_optimizations.csv")

100%|██████████| 100/100 [2:31:42<00:00, 91.03s/trial, best loss: -0.22695911690307477] 
{'max_depth': 10.0, 'min_samples_leaf': 5.0, 'min_samples_split': 6.0, 'n_estimators': 439.0}
100%|██████████| 100/100 [2:30:53<00:00, 90.54s/trial, best loss: -0.7317887866598072]  
{'max_depth': 20.0, 'min_samples_leaf': 1.0, 'min_samples_split': 3.0, 'n_estimators': 459.0}
100%|██████████| 100/100 [3:01:48<00:00, 109.08s/trial, best loss: -0.4318561033908006]  
{'max_depth': 15.0, 'min_samples_leaf': 1.0, 'min_samples_split': 2.0, 'n_estimators': 233.0}
100%|██████████| 100/100 [3:42:40<00:00, 133.60s/trial, best loss: -0.6592766620871874] 
{'max_depth': 13.0, 'min_samples_leaf': 3.0, 'min_samples_split': 2.0, 'n_estimators': 452.0}


In [26]:
trial

<hyperopt.base.Trials at 0x16e84f27f10>