In [3]:
%load_ext autoreload

In [6]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
sns.set(color_codes=True)
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Make the sibling data_processing directory importable, adding it to the
# module search path only once even if this cell is re-run.
DATA_PROCESSING_DIR = "../data_processing"
if DATA_PROCESSING_DIR not in sys.path:
    sys.path.append(DATA_PROCESSING_DIR)

%autoreload
import build_dataset

In [9]:
# NOTE(review): absolute path on an external volume -- the notebook will not
# run on another machine; consider a configurable data directory.
path = '/Volumes/OsvikExtra/VibrationData/RMS_dataset/GnDe_RMS_power>2500_WTG03.csv'
# NOTE(review): the CSV's first column is loaded as an ordinary 'Unnamed: 0'
# column (visible in the preview), so it is later scaled and fed to PCA along
# with the real features; reading with index_col=0 would exclude it -- confirm.
data = pd.read_csv(filepath_or_buffer=path)
data.head(n=5)

Unnamed: 0,AvgPower,ActPower,WindSpeed,NacelleDirection,GnDe_RMS_0,GnDe_RMS_1,GnDe_RMS_2,GnDe_RMS_3,GnDe_RMS_4,GnDe_RMS_5,...,GnDe_RMS_40,GnDe_RMS_41,GnDe_RMS_42,GnDe_RMS_43,GnDe_RMS_44,GnDe_RMS_45,GnDe_RMS_46,GnDe_RMS_47,GnDe_RMS_48,GnDe_RMS_49
0,2953.059326,3062.600098,12.2,211.300003,0.300703,0.167599,0.080561,0.084499,0.121249,0.425802,...,0.164977,0.308687,0.150392,1.18342,2.292909,1.25464,1.781284,1.916941,1.22213,0.666716
1,2506.83374,2213.600098,9.8,210.300003,0.309377,0.232027,0.09428,0.094344,0.164666,0.303873,...,0.147899,0.18464,0.349825,1.127339,2.018184,1.615926,2.21849,2.089519,1.052369,0.651631
2,2638.40332,2538.399902,10.2,36.900002,0.278201,0.184347,0.07855,0.078943,0.150086,0.297047,...,0.277046,0.294337,0.325631,0.940822,1.85859,1.692963,1.860862,1.666716,1.469655,0.640885
3,2603.288818,2243.100098,9.9,110.800003,0.25952,0.205391,0.072722,0.077394,0.130983,0.273404,...,0.292905,0.287657,0.343236,0.813018,1.648452,1.68313,1.240347,1.144751,1.500817,0.621034
4,3289.359131,3477.300049,13.1,229.699997,0.318589,0.182687,0.067799,0.075636,0.138208,0.368046,...,0.117334,0.218183,0.168848,0.913204,0.795317,1.880269,2.054144,1.126004,1.44303,0.711852


In [10]:
def train_test_split(df, percentage):
    """Split ``df`` into a leading train part and a trailing test part.

    Rows are not shuffled: the first ``floor(len(df) * percentage)`` rows
    form the training set and all remaining rows form the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; row order is preserved.
    percentage : float
        Fraction of the rows, in ``[0, 1]``, assigned to the training set.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``. ``train`` keeps its original index labels, while
        ``test`` is re-indexed starting from 0.

    Raises
    ------
    ValueError
        If ``percentage`` is not within ``[0, 1]``.
    """
    # A negative fraction would otherwise produce a negative slice bound and
    # silently split from the END of the frame; NaN also fails this check.
    if not 0 <= percentage <= 1:
        raise ValueError(
            f'percentage must be between 0 and 1, got {percentage!r}'
        )
    # Floor the product (not just the row count) to get the boundary row.
    split_index = int(np.floor(df.shape[0] * percentage))
    # .iloc makes the positional (not label-based) slicing explicit.
    train = df.iloc[:split_index]
    test = df.iloc[split_index:].reset_index(drop=True)
    return train, test

# First 80% of the rows are used for fitting; the remaining 20% are held out.
train, test = train_test_split(df=data, percentage=0.8)

In [11]:
# Report (rows, columns) of the train split, then of the test split.
for split in (train, test):
    print(split.shape)

(227, 54)
(57, 54)


In [12]:
# Vectorised minimum of the ActPower column; unlike the Python builtin min(),
# Series.min() skips NaN instead of giving an order-dependent result.
# NOTE(review): the recorded minimum is negative although the file name
# contains 'power>2500' -- presumably the filter was applied to another
# column (e.g. AvgPower); confirm.
data['ActPower'].min()

-1000.0

## Loss function MAE (Mean Absolute Error)

In [41]:
def custom_mae(y_true, y_pred):
    """Return the mean absolute error between predictions and targets.

    The absolute element-wise difference ``|y_pred - y_true|`` is averaged
    with ``np.mean`` (for NumPy arrays: over every element, yielding a
    single scalar).
    """
    absolute_errors = np.abs(y_pred - y_true)
    return np.mean(absolute_errors)

## Scale with StandardScaler

In [20]:
# Learn the per-column mean and standard deviation from the training rows
# only, then apply those same statistics to both splits.
scaler = StandardScaler().fit(train)
train_scaled = scaler.transform(train)
test_scaled = scaler.transform(test)

## PCA 

In [109]:
# A float n_components in (0, 1) asks PCA to keep the smallest number of
# components whose cumulative explained variance reaches that fraction;
# here 95% of the variance of the scaled training data is retained.
pca = PCA(n_components=0.95)
pca.fit(train_scaled)
print(f'The number of components is: {pca.n_components_}')

The number of components is: 26


In [110]:
# Project the scaled training data onto the retained components, then map it
# back to the scaled feature space.
train_pca = pca.transform(train_scaled)
train_pca_reconstructed = pca.inverse_transform(train_pca)
# Variance (over all elements) of the residual between the scaled data and its
# reconstruction; the stored reconstruction is reused rather than recomputed.
train_reconstruction_variance = np.var(train_scaled - train_pca_reconstructed)
print(f'The training reonstruction variance is: {train_reconstruction_variance}')

The training reonstruction variance is: 0.045689461151772436


In [111]:
# Project the scaled held-out data with the PCA fitted on the training split,
# then map it back to the scaled feature space.
test_pca = pca.transform(test_scaled)
test_pca_reconstructed = pca.inverse_transform(test_pca)
# Variance (over all elements) of the residual between the scaled data and its
# reconstruction; the stored reconstruction is reused rather than recomputed.
test_reconstruction_variance = np.var(test_scaled - test_pca_reconstructed)
print(f'The testing reonstruction variance is: {test_reconstruction_variance}')

The testing reonstruction variance is: 0.08720389829829704


In [112]:
# Mean absolute reconstruction error on the scaled training split.
train_loss = custom_mae(y_true=train_scaled, y_pred=train_pca_reconstructed)
print(f'MAE loss on train data using pca: {train_loss}')

MAE loss on train data using pca: 0.16498336707746594


In [113]:
# Mean absolute reconstruction error on the scaled held-out split.
test_loss = custom_mae(y_true=test_scaled, y_pred=test_pca_reconstructed)
print(f'MAE loss on test data using pca: {test_loss}')

MAE loss on test data using pca: 0.22540199152569734
