In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import pandas as pd
from pytz import utc
import os
import mlflow
import numpy as np

# mlflow imports
from mlflow.models.signature import infer_signature

# sklearn imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# xgboost imports
from xgboost import XGBRegressor

# hyperopt imports
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials

# oma_tracking imports
from oma_tracking.data.utils import read_simulations_csv_files
from oma_tracking.data import make_dataset
from oma_tracking.data.preprocessing import AngleTransformer, sin_cos_angle_inputs
import oma_tracking.models.mlflow_functions as mlflow_f

from dotenv import load_dotenv
load_dotenv()

c:\Users\Max\anaconda3\envs\soiltwin\lib\site-packages\numpy\.libs\libopenblas.4SP5SUA7CBGXUEOC35YP2ASOICYYEQZZ.gfortran-win_amd64.dll
c:\Users\Max\anaconda3\envs\soiltwin\lib\site-packages\numpy\.libs\libopenblas.XWYDX2IKJW2NMTWSFYNGFUWKQU3LYTCZ.gfortran-win_amd64.dll


True

In [3]:
start =  datetime.datetime(2022,11,1,tzinfo=utc)
stop  = datetime.datetime(2023,3,1,tzinfo=utc)
location = 'nw2d01'
name_location = 'NW2_D01'

# Data Paths
data_path = "../../data/nw2/raw/nw2d01_" + start.strftime("%Y%m%d") + "_" + stop.strftime("%Y%m%d") + ".parquet"
mvbc_path = "../../data/nw2/mvbc_data.parquet"
tracked_frequencies_path = "../../data/nw2/tracked_modes/" + location + ".parquet"
simulations_data_path = "../../data/nw2/simulations/" + location + "/"

# Get all the data
data = pd.read_parquet(data_path)
mvbc_data = pd.read_parquet(mvbc_path)
tracked_frequencies = pd.read_parquet(tracked_frequencies_path)
simulation_data = read_simulations_csv_files(simulations_data_path + "eigen_frequencies/")
simulation_shifts = read_simulations_csv_files(simulations_data_path + "mean_shifts/")
simulation_errors = pd.read_csv(simulations_data_path + "errors/Errors_No_scour.csv", index_col=0)

In [4]:
AZURE_STORAGE_ACCESS_KEY = os.getenv('AZURE_STORAGE_ACCESS_KEY')
AZURE_STORAGE_CONNECTION_STRING = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
MLFLOW_TRACKING_URI = os.getenv('MLFLOW_TRACKING_URI')

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

artifact_root = 'wasbs://test@mlflowstoragev1.blob.core.windows.net'
mlflow_ui_string = mlflow_f.create_mlflow_ui(MLFLOW_TRACKING_URI, artifact_root)
database_url = 'http://127.0.0.1:5000'
mlflow_f.connect_mlflow_ui(mlflow_ui_string, database_url)

experiment_name = 'NW2_scour'
experiment = mlflow_f.run_mlflow_experiment(experiment_name = experiment_name)

(psycopg2.OperationalError) connection to server at "10.0.0.139", port 5432 failed: Connection timed out (0x0000274C/10060)
	Is the server running on that host and accepting TCP/IP connections?

(Background on this error at: https://sqlalche.me/e/14/e3q8)
Operation will be retried in 0.1 seconds


mlflow experiment set to: NW2_scour


In [5]:
weather_inputs = make_dataset.get_weather_subset(mvbc_data)
scada_inputs = make_dataset.get_scada_subset(data)

inputs = pd.concat([
            weather_inputs,
            scada_inputs
        ],axis=1)

prediction_params = tracked_frequencies 

In [8]:
hyperopt_folder = "../../data/nw2/model_hyperopt/" + location + "_"

In [9]:
seed = 2
def objective_xgb(space):
    model = Pipeline(
            steps=[
                ('preprocessing_angles', AngleTransformer(angles = ['winddirection', 'yaw'])),
                ('regressor', XGBRegressor(
                                 n_estimators = space['n_estimators'],
                                 max_depth = space['max_depth'],
                                 learning_rate = space['learning_rate'],
                                 colsample_bytree = space['colsample_bytree'],
                                 )
                )
            ]
        )
    score = cross_val_score(model,  X_train, y_train, cv=5, scoring='r2').mean()
    # We aim to maximize r2 score, therefore we return it as a negative value
    return {'loss': -score, 'status': STATUS_OK }
def optimize_xgb(trial):
    space = {
        'n_estimators':hp.uniformint('n_estimators',10,500),
        'max_depth':hp.uniformint('max_depth',3,20),
        'learning_rate':hp.uniform('learning_rate',0.01,0.5),
        'colsample_bytree': hp.uniform('colsample_bytree',0.1, 1),
    }
    best = \
        fmin(
            fn = objective_xgb,
            space = space,
            algo = tpe.suggest,
            trials = trial,
            max_evals = 200,
            rstate = np.random.default_rng(seed)
            )
    return best

XGB_optimizations = {}
for mode in prediction_params.columns:
    y = prediction_params[mode].dropna()
    X = inputs.loc[y.index].dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    X_train_val, X_val, y_train_val, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=seed)
    trial2=Trials()
    XGB_optimizations[mode]=optimize_xgb(trial2)
    print(XGB_optimizations[mode])
pd.DataFrame(XGB_optimizations).to_csv(hyperopt_folder + "xgb_optimizations.csv")

100%|██████████| 200/200 [40:27<00:00, 12.14s/trial, best loss: -0.28305908969794374]
{'colsample_bytree': 0.5249699361570449, 'learning_rate': 0.028744322686588067, 'max_depth': 6.0, 'n_estimators': 275.0}
100%|██████████| 200/200 [47:29<00:00, 14.25s/trial, best loss: -0.5340336897612492]
{'colsample_bytree': 0.48684749114948356, 'learning_rate': 0.021045327649482887, 'max_depth': 8.0, 'n_estimators': 441.0}
100%|██████████| 200/200 [39:31<00:00, 11.86s/trial, best loss: -0.365799277323665]  
{'colsample_bytree': 0.45826301668701863, 'learning_rate': 0.020502094520931994, 'max_depth': 10.0, 'n_estimators': 456.0}
100%|██████████| 200/200 [38:01<00:00, 11.41s/trial, best loss: -0.5417091611145215]
{'colsample_bytree': 0.5257177720137283, 'learning_rate': 0.06075855570180731, 'max_depth': 6.0, 'n_estimators': 242.0}


In [10]:
trial2

<hyperopt.base.Trials at 0x1e8bb8d6640>

In [11]:
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
seed = 2

def objective_rf(space):
    model = Pipeline(
            steps=[
                ('preprocessing_angles', AngleTransformer(angles = ['winddirection', 'yaw'])),
                ('regressor', RandomForestRegressor(
                                 n_estimators = space['n_estimators'],
                                 max_depth = space['max_depth'],
                                 min_samples_leaf = space['min_samples_leaf'],
                                 min_samples_split = space['min_samples_split']
                                 )
            )
        ]
    )
    score = cross_val_score(model,  X_train, y_train, cv=5, scoring='r2').mean()
    # We aim to maximize r2 score, therefore we return it as a negative value
    return {'loss': -score, 'status': STATUS_OK }

def optimize_rf(trial):
    space = {
        'n_estimators': hp.uniformint('n_estimators',10,500),
        'max_depth': hp.uniformint('max_depth',3,20),
        'min_samples_leaf': hp.uniformint('min_samples_leaf',1,5),
        'min_samples_split': hp.uniformint('min_samples_split',2,6)
    }
    best = \
        fmin(
            fn = objective_rf,
            space = space,
            algo = tpe.suggest,
            trials = trial,
            max_evals = 100,
            rstate = np.random.default_rng(seed)
            )
    return best

RF_optimizations = {}
for mode in prediction_params.columns:
    y = prediction_params[mode].dropna()
    X = inputs.loc[y.index].dropna()
    y = y.loc[X.index]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
    trial=Trials()
    RF_optimizations[mode]=optimize_rf(trial)
    print(RF_optimizations[mode])
pd.DataFrame(RF_optimizations).to_csv(hyperopt_folder + "rf_optimizations.csv")

100%|██████████| 100/100 [4:31:12<00:00, 162.73s/trial, best loss: -0.2776385007194554]  
{'max_depth': 11.0, 'min_samples_leaf': 5.0, 'min_samples_split': 3.0, 'n_estimators': 301.0}
100%|██████████| 100/100 [3:46:24<00:00, 135.85s/trial, best loss: -0.5375054044244332] 
{'max_depth': 12.0, 'min_samples_leaf': 5.0, 'min_samples_split': 5.0, 'n_estimators': 458.0}
100%|██████████| 100/100 [2:33:48<00:00, 92.29s/trial, best loss: -0.37939700141767807] 
{'max_depth': 18.0, 'min_samples_leaf': 5.0, 'min_samples_split': 2.0, 'n_estimators': 276.0}
100%|██████████| 100/100 [2:54:05<00:00, 104.45s/trial, best loss: -0.5311810475298762] 
{'max_depth': 11.0, 'min_samples_leaf': 5.0, 'min_samples_split': 2.0, 'n_estimators': 294.0}


In [12]:
trial

<hyperopt.base.Trials at 0x1e8bd8cf7f0>