In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#IMPORTS
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from sklearn.metrics import mean_squared_error, r2_score
from src.models.metrics import calculate_aic_bic

In [3]:
loc = 'nw2'
turbine = 'c02'
mode = 'SS2'

# GET THE DATA
package_folder = os.path.dirname(os.path.dirname(os.path.dirname(os.getcwd())))
data_folder = os.path.join(package_folder, 'data')
models_folder = os.path.join(package_folder, 'models')
ss2_selected = pd.read_csv(os.path.join(data_folder, 'processed','nw2', turbine+'_ss2_selected_data_large.csv'))
ss2_selected['timestamp'] = pd.to_datetime(ss2_selected['timestamp'])
ss2_selected.set_index('timestamp', inplace=True)

SS1_dbscan = pd.read_parquet(os.path.join(data_folder, 'interim',loc,'tracked_modes', 'dbscan_based', loc+turbine+'_SS1_mode.parquet'))
SS2_dbscan = pd.read_parquet(os.path.join(data_folder, 'interim',loc,'tracked_modes', 'dbscan_based', loc+turbine+'_SS2_mode.parquet'))
FA1_dbscan = pd.read_parquet(os.path.join(data_folder, 'interim',loc,'tracked_modes', 'dbscan_based', loc+turbine+'_FA1_mode.parquet'))
FA2_dbscan = pd.read_parquet(os.path.join(data_folder, 'interim',loc,'tracked_modes', 'dbscan_based', loc+turbine+'_FA2_mode.parquet'))

rfe_selected_data = pd.read_parquet(os.path.join(data_folder, 'interim', loc, 'rfe_selected_data', loc+turbine+'_rfe_selected_data.parquet'))

In [8]:
##Prepare the training and test data

#choose y_ to be SS1_dbscan but uniquely indexed keeping the index with hghest value in size column when duplicated
y_ = ss2_selected.copy()
y_ = y_.sort_values(by=['size'], ascending=False)
y_ = y_.loc[~y_.index.duplicated(keep='last')]
y_ = y_.sort_index()

#Synchronize data
Xy = pd.DataFrame(y_['mean_frequency'])
for col in rfe_selected_data.columns:
    Xy[col] = rfe_selected_data[col]
Xy.dropna(inplace=True)
y = Xy.iloc[:,0]
X_ = Xy[rfe_selected_data.columns]

#preprocess the data
from src.data.preprocessing import sin_cos_angle_inputs
from sklearn.model_selection import train_test_split

X = sin_cos_angle_inputs(X_)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# MinMaxnormalization of the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

mean = y_train.mean()
mse = mean_squared_error(y_test, [mean]*len(y_test))
r2 = r2_score(y_test, [mean]*len(y_test))
print(f'Mean: {mean}, MSE: {mse}, R2: {r2}')

Mean: 1.0355808857258562, MSE: 0.00020077205778828714, R2: -0.0011423574298663475


In [9]:
##Prepare the training and test data

#choose y_ to be SS1_dbscan but uniquely indexed keeping the index with hghest value in size column when duplicated
y_ = SS1_dbscan.copy()
y_ = y_.sort_values(by=['size'], ascending=False)
y_ = y_.loc[~y_.index.duplicated(keep='last')]
y_ = y_.sort_index()

#Synchronize data
Xy = pd.DataFrame(y_['mean_frequency'])
for col in rfe_selected_data.columns:
    Xy[col] = rfe_selected_data[col]
Xy.dropna(inplace=True)
y = Xy.iloc[:,0]
X_ = Xy[rfe_selected_data.columns]

#preprocess the data
from src.data.preprocessing import sin_cos_angle_inputs
from sklearn.model_selection import train_test_split

X = sin_cos_angle_inputs(X_)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# MinMaxnormalization of the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

mean = y_train.mean()
mse = mean_squared_error(y_test, [mean]*len(y_test))
r2 = r2_score(y_test, [mean]*len(y_test))
print(f'Mean: {mean}, MSE: {mse}, R2: {r2}')

Mean: 0.21912574448306682, MSE: 9.147559528369654e-06, R2: -3.7997905095954465e-05


In [10]:
##Prepare the training and test data

#choose y_ to be SS1_dbscan but uniquely indexed keeping the index with hghest value in size column when duplicated
y_ = FA1_dbscan.copy()
y_ = y_.sort_values(by=['size'], ascending=False)
y_ = y_.loc[~y_.index.duplicated(keep='last')]
y_ = y_.sort_index()

#Synchronize data
Xy = pd.DataFrame(y_['mean_frequency'])
for col in rfe_selected_data.columns:
    Xy[col] = rfe_selected_data[col]
Xy.dropna(inplace=True)
y = Xy.iloc[:,0]
X_ = Xy[rfe_selected_data.columns]

#preprocess the data
from src.data.preprocessing import sin_cos_angle_inputs
from sklearn.model_selection import train_test_split

X = sin_cos_angle_inputs(X_)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

# MinMaxnormalization of the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

mean = y_train.mean()
mse = mean_squared_error(y_test, [mean]*len(y_test))
r2 = r2_score(y_test, [mean]*len(y_test))
print(f'Mean: {mean}, MSE: {mse}, R2: {r2}')

Mean: 0.21958441198426598, MSE: 6.04736317478754e-05, R2: -0.0005598517765166289
