In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import datetime
import pandas as pd
from pytz import utc
import os
import mlflow
import numpy as np

# mlflow imports
from mlflow.models.signature import infer_signature

# sklearn imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# oma_tracking imports
from oma_tracking.data.utils import read_simulations_csv_files
from oma_tracking.data import make_dataset
from oma_tracking.data.preprocessing import AngleTransformer, sin_cos_angle_inputs
import oma_tracking.models.mlflow_functions as mlflow_f


from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# Analysis window (timezone-aware, UTC) and turbine identifiers
start = datetime.datetime(2022, 11, 1, tzinfo=utc)
stop = datetime.datetime(2023, 5, 4, tzinfo=utc)
location = 'nw2d01'
name_location = 'NW2_D01'

# Data Paths
# Every path is derived from `location`, so switching turbine only requires
# changing the identifier above (the raw-data path used to hard-code it).
data_root = "../../../"
nw2_root = data_root + "data/nw2/"
date_tag = f"{start:%Y%m%d}_{stop:%Y%m%d}"
data_path = f"{nw2_root}raw/{location}_{date_tag}.parquet"
mvbc_path = nw2_root + "mvbc_data.parquet"
tracked_frequencies_path = f"{nw2_root}tracked_modes/{location}.parquet"
simulations_data_path = f"{nw2_root}simulations/{location}/"

# Get all the data
data = pd.read_parquet(data_path)
mvbc_data = pd.read_parquet(mvbc_path)
tracked_frequencies = pd.read_parquet(tracked_frequencies_path)
simulation_data = read_simulations_csv_files(simulations_data_path + "eigen_frequencies/")
simulation_shifts = read_simulations_csv_files(simulations_data_path + "mean_shifts/")
simulation_errors = pd.read_csv(simulations_data_path + "errors/Errors_No_scour.csv", index_col=0)

In [4]:
# Input subsets selected by the project helpers: weather signals from the
# MVBC data and SCADA signals from the raw location data.
weather_inputs = make_dataset.get_weather_subset(mvbc_data)
scada_inputs = make_dataset.get_scada_subset(data)

# Column-wise concatenation of both input sources
inputs = pd.concat([weather_inputs, scada_inputs], axis=1)

# The tracked frequencies are the prediction targets
prediction_params = tracked_frequencies

In [5]:
# --- Configuration ---
mode = 'SS1'
random_state = 24
test_size = 0.2

# --- Angle handling ---
# Two variants of the inputs: one produced by sin_cos_angle_inputs and one
# with the yaw / wind-direction columns removed altogether.
angle_transformed_inputs = sin_cos_angle_inputs(inputs)
angle_columns = [
    column
    for column in inputs.columns
    if any(keyword in column for keyword in ('yaw', 'winddirection'))
]
angle_removed_inputs = inputs.drop(columns=angle_columns)

# --- Align target and features ---
# Keep only rows where the tracked mode is available, then only rows where
# every input is available, and finally trim the target to those rows.
y = prediction_params[mode].dropna()
X = angle_transformed_inputs.loc[y.index].dropna()
y = y.loc[X.index]

# --- Train / test split ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

from sklearn.preprocessing import StandardScaler
def data_preprocessing(X_train, X_test):
    """Apply standard scaling to the train and test feature frames.

    The scaler is fitted on ``X_train`` only; the same fitted scaler is then
    applied to ``X_test``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; also used to fit the scaler.
    X_test : pandas.DataFrame
        Test features, transformed with the scaler fitted on ``X_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(scaled_train, scaled_test)``, each carrying the column labels and
        index of the corresponding input frame.
    """
    scaler = StandardScaler()

    def _as_frame(values, template):
        # Re-attach the labels of the source frame to the scaler's output.
        return pd.DataFrame(values, columns=template.columns, index=template.index)

    scaled_train = _as_frame(scaler.fit_transform(X_train), X_train)
    scaled_test = _as_frame(scaler.transform(X_test), X_test)
    return scaled_train, scaled_test
    
# Scaled copies of both splits; X_train / X_test themselves stay unscaled
X_tr_prep, X_test_prep = data_preprocessing(X_train=X_train, X_test=X_test)


In [9]:
# Separate the column(s) whose name contains 'pitch' from the other predictors
pitch = X_train.filter(like='pitch')
x_c = X_train.drop(columns=pitch.columns)
x_c

Unnamed: 0_level_0,mvbc_WandelaarBuoy_Wave_height,mvbc_WandelaarBuoy_Sea_water_temperature,mvbc_WandelaarMeasuringpile_Tide_TAW,mvbc_WandelaarMeasuringpile_Air_pressure,mvbc_WandelaarMeasuringpile_Air_temperature,mean_NW2_D01_rpm,mean_NW2_D01_power,mean_NW2_D01_windspeed,sin_mean_NW2_D01_yaw,cos_mean_NW2_D01_yaw,sin_mean_NW2_D01_winddirection,cos_mean_NW2_D01_winddirection
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-11-27 15:20:00+00:00,79.666667,10.600000,438.000000,1011.133333,7.600000,10.445,4450.944,8.879,0.292055,-0.956402,0.270970,-0.962588
2023-02-08 10:20:00+00:00,15.000000,6.600000,82.333333,1035.266667,2.633333,1.194,-99.648,4.625,0.558657,-0.829399,0.608983,-0.793183
2022-12-15 09:10:00+00:00,93.333333,6.600000,150.000000,1007.600000,4.800000,6.565,735.104,5.707,-0.576732,0.816934,-0.624343,0.781150
2023-02-06 20:10:00+00:00,44.333333,6.600000,66.333333,1037.900000,5.700000,1.194,-108.544,4.574,0.923177,-0.384376,0.906020,-0.423235
2022-12-06 22:20:00+00:00,112.666667,9.000000,377.000000,1018.200000,7.733333,9.848,3983.360,8.926,-0.438638,0.898664,-0.453057,0.891481
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-27 05:20:00+00:00,93.000000,10.633333,235.333333,1016.933333,7.533333,10.445,9524.033,15.672,0.229846,-0.973227,0.216048,-0.976383
2023-01-03 06:30:00+00:00,60.000000,8.400000,191.000000,1022.400000,7.100000,8.952,2725.568,7.926,0.077067,-0.997026,0.077067,-0.997026
2023-02-16 12:40:00+00:00,46.333333,6.800000,179.333333,1019.600000,9.000000,10.445,9525.056,15.531,-0.344381,-0.938830,-0.355352,-0.934733
2022-11-07 18:10:00+00:00,86.000000,13.700000,44.333333,1006.300000,14.300000,10.445,7600.576,11.238,-0.209857,-0.977732,-0.200787,-0.979635


In [13]:
# Number of feature columns in the training frame
X_train.shape[1]

13

In [24]:
# Values of the first column whose name contains 'windspeed'
X_train.filter(like='windspeed').iloc[:, 0].to_numpy()

array([ 8.879,  4.625,  5.707, ..., 15.531, 11.238, 15.754])

In [26]:
from pymc import HalfCauchy, Model, Normal, sample

# Single predictor: the first training column whose name contains 'windspeed'
windspeed_train = X_train[X_train.filter(like='windspeed').columns[0]].values

with Model() as model:  # model specifications in PyMC are wrapped in a with-statement
    # Define priors
    sigma = HalfCauchy("sigma", beta=10)
    intercept = Normal("Intercept", 0, sigma=20)
    slope = Normal("slope", 0, sigma=20)

    # Define likelihood: linear in windspeed with Gaussian noise
    likelihood = Normal(
        "y",
        mu=intercept + slope * windspeed_train,
        sigma=sigma,
        observed=y_train,
    )

    # Inference!
    # draw 3000 posterior samples using NUTS sampling; seeded with the
    # notebook-wide random_state so the draws are reproducible across runs
    idata = sample(3000, random_seed=random_state)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [sigma, Intercept, slope]


In [None]:
import pymc as pm

# Everything fed to the model comes from the training split so that the
# predictors, the pitch signal and the target all have the same number of rows.
pitch_train = X_train.filter(like='pitch').iloc[:, 0].to_numpy()
obs_idx = np.arange(len(X_train))
feature_names = list(x_c.columns)

# NOTE(review): the prior locations/scales below (e.g. mu=120, sigma=50) are
# kept unchanged; confirm they suit the scale of the unscaled X_train columns
# and of the target before trusting the posterior.
with pm.Model(coords={"obs": obs_idx, "feature": feature_names}) as gaussian_model:
    # --- Data Containers ---
    # x_c holds several predictors, so it needs an explicit "feature" dimension
    x_ = pm.MutableData(name="x", value=x_c.to_numpy(), dims=("obs", "feature"))
    pitch_ = pm.MutableData(name="pitch", value=pitch_train, dims="obs")
    y_ = pm.MutableData(name="y", value=y_train.to_numpy(), dims="obs")
    # --- Priors ---
    b_intercept = pm.Normal(name="b_intercept", mu=120, sigma=50)
    # one main-effect and one pitch-interaction coefficient per predictor
    b_x = pm.Normal(name="b_x", mu=0, sigma=4, dims="feature")
    b_d = pm.Normal(name="b_d", mu=0, sigma=50)
    b_dx = pm.Normal(name="b_dx", mu=0, sigma=4, dims="feature")
    sigma = pm.Exponential(name="sigma", lam=1 / 50)
    # --- Deterministic Variables ---
    # mu_i = b_intercept + x_i . b_x + b_d * pitch_i + pitch_i * (x_i . b_dx)
    mu = pm.Deterministic(
        name="mu",
        var=(
            b_intercept
            + pm.math.dot(x_, b_x)
            + b_d * pitch_
            + pitch_ * pm.math.dot(x_, b_dx)
        ),
        dims="obs",
    )
    # --- Likelihood ---
    pm.Normal(
        name="likelihood",
        mu=mu,
        sigma=sigma,
        observed=y_,
        dims="obs",
    )

pm.model_to_graphviz(model=gaussian_model)