In [None]:
# Usual libs
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

# For heatmap creation
import seaborn as sns

# For LSTM model
from keras.models import Sequential
from keras.layers import LSTM,Dense, Activation, Dropout
from keras.callbacks import EarlyStopping

# To save and load model from disk
from tensorflow import keras

# For normalization of data into [0..1]
from sklearn.preprocessing import MinMaxScaler

# To play with Timestamp which is str at the beginning
import datetime

## Read inputs and peek at the data

In [None]:
# Load the two input tables; paths are relative to the notebook's working directory.
original = pd.read_csv(filepath_or_buffer="./data/features.csv")
power = pd.read_csv(filepath_or_buffer="./data/power.csv")

In [None]:
# Show the first rows of the table read from features.csv.
original.head()

In [None]:
# Show the first rows of the table read from power.csv.
power.head()

Select every column except the first one (`Timestamp`). See the [documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html) for how `iloc` works and this [Stack Overflow answer](https://stackoverflow.com/a/56311678/15160666) for an example of how it is used.

In [None]:
# Every row, every column but the first, as a plain NumPy array.
values = original.iloc[:, 1:].to_numpy()

In [None]:
# Re-wrap the raw array in a DataFrame that carries the matching column labels.
feature_names = original.columns[1:]
features = pd.DataFrame(data=values, columns=feature_names)

In [None]:
# Display the feature table (pandas truncates the rendering of large frames).
features

## Make a correlation map between features and generate a heatmap of it.

In [None]:
# Pairwise correlation between all feature columns.
corrs = features.corr()

# The diagonal is the trivial self-correlation (1.0). Overwrite ONLY the diagonal
# with the smallest observed value so it cannot dominate `vmax` below; any genuine
# off-diagonal correlation of exactly 1.0 is left untouched.
diagonal = np.eye(len(corrs), dtype=bool)
corrs = corrs.mask(diagonal, corrs.min().min())

# Boolean mask hiding the upper-right triangle (diagonal included). A boolean mask
# is used rather than the correlation values themselves, so cells whose correlation
# happens to be exactly 0 are hidden as well.
heatmap_mask = np.triu(np.ones_like(corrs, dtype=bool))

# Generate heatmap
heatmap = sns.heatmap(
    data=corrs,
    annot=True,
    vmin=0,
    vmax=corrs.max().max(),
    mask=heatmap_mask,
    annot_kws={"size": 8},
)
heatmap.set_title("Correlation Between Features", fontdict={"fontsize": 24}, pad=16)
heatmap.set_autoscale_on(True)

The highest correlation between any two features is `0.025`, which is too low to justify feature reduction or feature merging during preprocessing.

#### Save the heatmap

In [None]:
# Fetch the Figure that owns the heatmap axes, enlarge it so the per-cell
# annotations have room, and write it to disk with tight bounding box.
fig = heatmap.get_figure()
fig.set_size_inches(84.0, 42.0)
fig.savefig("./data/heatmap.png", bbox_inches="tight")

## Correlation between three random features

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for three chosen columns.
attributes = [
    'Blade-1 Actual Value_Angle-B',
    'Temperature Ambient',
    'Nacelle Revolution',
]
pd.plotting.scatter_matrix(original.loc[:, attributes], figsize=(12, 12))

The cell below outputs an array of `<AxesSubplot:>` objects, one per histogram. The last 5 entries of that array have no `title`.

In [None]:
# One 50-bin histogram per feature column.
features.hist(figsize=(15, 14), bins=50)

# Filling the missing data

In [None]:
# Forward-fill gaps from the previous row, then back-fill whatever is still missing
# at the very start of a column. `.ffill()` / `.bfill()` replace the deprecated
# `fillna(method=...)` spelling and produce the same result.
updated_features = original.ffill().bfill()

# Remaining missing values per column (expected to be all zeros).
updated_features.isnull().sum()

# Merge DataFrames

In [None]:
# Join the filled feature table with the power table on their shared Timestamp
# column (default join type), then persist the joined table as CSV without the index.
merged = pd.merge(updated_features, power, on='Timestamp')
merged.to_csv("./data/final.csv", index=False)

# Make features usable

#### Timestamp

In [None]:
# Move the Timestamp column out of `merged` and parse it once into datetimes.
# The guard makes the cell safe to re-run: after the first run the column is gone,
# and popping it again would raise a KeyError.
if "Timestamp" in merged.columns:
    date = pd.to_datetime(merged.pop("Timestamp"))

In [None]:
# `date` holds parsed datetimes, which are not directly usable as a numeric model
# input. Convert them to seconds since the epoch and encode the time as periodic
# sine/cosine signals.
# `pd.Timestamp.timestamp` interprets timezone-naive values as UTC, so the result
# does not depend on the timezone of the machine running the notebook.
timestamp_s = date.map(pd.Timestamp.timestamp)

# Period lengths in seconds.
day = 24*60*60
year = (365.2425)*day
month = year / 12  # average month length; `day*12` would be a 12-day period

merged['Day sin'] = np.sin(timestamp_s * (2 * np.pi / day))
merged['Day cos'] = np.cos(timestamp_s * (2 * np.pi / day))
merged['Month sin'] = np.sin(timestamp_s * (2 * np.pi / month))
merged['Month cos'] = np.cos(timestamp_s * (2 * np.pi / month))

# Year is 2019 for the entirety of input data so this is redundant.
# merged['Year sin'] = np.sin(timestamp_s * (2 * np.pi / year))
# merged['Year cos'] = np.cos(timestamp_s * (2 * np.pi / year))

In [None]:
# Plot the first 500 samples of both time-of-day signals on the same axes.
for signal_name in ('Day sin', 'Day cos'):
    plt.plot(merged[signal_name].to_numpy()[:500])
plt.xlabel('Time [h]')
plt.title('Time of day signal')

#### Merge 3 temperature features into one

In [None]:
# Pairwise scatter plots of the three gearbox shaft temperature columns.
attributes = [
    "Gearbox_T1_High_Speed_Shaft_Temperature",
    "Gearbox_T1_Intermediate_Speed_Shaft_Temperature",
    "Gearbox_T3_High_Speed_Shaft_Temperature",
]
pd.plotting.scatter_matrix(original.loc[:, attributes], figsize=(12, 12))

In [None]:
# Replace the three gearbox shaft temperature columns with their row-wise mean.
gearbox_temperature_columns = [
    "Gearbox_T1_High_Speed_Shaft_Temperature",
    "Gearbox_T1_Intermediate_Speed_Shaft_Temperature",
    "Gearbox_T3_High_Speed_Shaft_Temperature",
]
merged['Gearbox_Temperature_Average'] = merged[gearbox_temperature_columns].mean(axis=1)

# Drop the source columns from `merged` itself (same effect as `del` on each one).
merged.drop(columns=gearbox_temperature_columns, inplace=True)

# Separate out train, val, test data

In [None]:
# Chronological 70 % / 20 % / 10 % split by row position (no shuffling).
n = len(merged)
train_end = int(n*0.7)
val_end = int(n*0.9)

train_merged = merged.iloc[:train_end]
val_merged = merged.iloc[train_end:val_end]
test_merged = merged.iloc[val_end:]

# Number of columns in the merged table.
num_features = merged.shape[1]

In [None]:
# Column names, non-null counts and dtypes of the merged table.
merged.info()

In [None]:
# Summary statistics for every numeric column of the merged table.
merged.describe()

In [None]:
# Draws every cell of `merged` (rows x columns) as a colour.
# NOTE(review): the table is not scaled at this point, so columns with a large value
# range will presumably dominate the colour map, and one heatmap row per data row may
# be slow to render — confirm this plot is still wanted.
sns.heatmap(merged)

In [None]:
# Single column at position 77, kept two-dimensional (n_rows, 1).
# NOTE(review): 77 is a hard-coded position; which column it selects depends on the
# columns added/removed above — TODO confirm, and prefer selecting by column name.
# `merged_train` is not referenced by any later cell visible here.
merged_train = merged.iloc[:, 77:78].values

# Feature Scaling 

In [None]:
# (rows, columns) of the merged table before scaling.
merged.shape

In [None]:
# Scale every column into [0, 1].
# The scaler is fitted on the training split only: fitting it on the whole table
# would leak the min/max of the validation and test rows into training.
ms = MinMaxScaler()
train_set_with_s = ms.fit_transform(train_merged)

# Building the Model

Use each window of 50 consecutive samples to learn to predict the sample that follows it.

In [None]:
# Build (window, next value) training pairs from column 0 of the scaled data:
# each input is 50 consecutive values and the target is the value right after them.
# NOTE(review): column 0 is both the input and the target — confirm this is the
# column the model is supposed to forecast.
window_length = 50
series = train_set_with_s[:, 0]
window_ends = range(window_length, len(train_set_with_s))

X_train = np.array([series[end - window_length:end] for end in window_ends])
y_train = np.array([series[end] for end in window_ends])

# Keras LSTM layers expect (samples, timesteps, features); there is one feature.
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))

## Training

#### If there are no changes to the model, just load it up

In [None]:
# model = keras.models.load_model("./model")

#### Create a SELU-activated LSTM model and train it with a batch size of 128 for up to 30 epochs

In [None]:
# Four stacked LSTM layers (20 -> 30 -> 40 -> 50 units, SELU activation), each
# followed by dropout, feeding a single-unit dense output.
# All but the last LSTM return full sequences so the next LSTM receives one
# vector per timestep.
# Ideas noted for later: hyperparameter tuning (grid search), dimensionality
# reduction (PCA, regularization), other activations (relu, leakyrelu), rmsprop.
model = Sequential([
    LSTM(units=20, activation="selu", return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(units=30, activation="selu", return_sequences=True),
    Dropout(0.2),
    LSTM(units=40, activation="selu", return_sequences=True),
    Dropout(0.3),
    LSTM(units=50, activation="selu"),
    Dropout(0.3),
    Dense(units=1),
])
model.compile(optimizer="adam", loss="mean_squared_error")

In [None]:
# Stop once the validation loss has not improved for 25 consecutive epochs.
# NOTE(review): with only 30 epochs in total, a patience of 25 can trigger in the
# last few epochs at most — confirm this is intended.
earlyStopping = EarlyStopping(monitor="val_loss", mode="min", patience=25, verbose=1)

# The last 20 % of the training windows are held out for validation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=30,
    validation_split=0.2,
    callbacks=[earlyStopping],
    verbose=1,
)

Only save a model after benchmarking. This is a reminder.

In [None]:
# model.save("./model")

# Prediction

In [None]:
# NOTE(review): `reshaped_test` is only created in a later cell of this notebook, so
# on a fresh kernel that cell must be run first — consider moving this cell below it.

# Shape the windows as (samples, timesteps, 1), the layout the LSTM was built for,
# deriving the sizes from the array instead of a hard-coded element count.
test_windows = np.asarray(reshaped_test)
test_windows = test_windows.reshape(test_windows.shape[0], -1, 1)
scaled_test_prediction = model.predict(test_windows)

# The model outputs one column (the scaled column 0 it was trained on), whereas `ms`
# is fitted on every column of the table, so `ms.inverse_transform` would reject the
# one-column array. Undo the min-max scaling for column 0 only:
# X_scaled = X * scale_ + min_  =>  X = (X_scaled - min_) / scale_
test_prediction = (scaled_test_prediction - ms.min_[0]) / ms.scale_[0]

In [None]:
# Model output (still in scaled units) for every training window.
train_prediction = model.predict(X_train)

In [None]:
# Window length must match the one used to build the training windows.
window_length = 50

# Reuse the already-fitted scaler. Re-fitting it on the test split would scale the
# test data differently from the data the model was trained on and would overwrite
# the parameters needed to map predictions back to original units.
fit_test = ms.transform(test_merged)

# The model was trained on windows of column 0 only, so take that column instead of
# flattening the whole table (which would interleave every feature row by row).
test_series = fit_test[:, 0]

# One window per position that has `window_length` preceding samples, mirroring the
# construction of the training windows.
reshaped_test = np.array(
    [test_series[end - window_length:end] for end in range(window_length, len(test_series))]
)

# (number of windows, window_length)
reshaped_test.shape


In [None]:
# Map `test_prediction` back from scaled units using the shared scaler `ms`.
# NOTE(review): the earlier prediction cell already assigns an inverse-transformed
# `test_prediction`, so this looks like a second inverse transform — confirm which of
# the two is intended. The cell is also not idempotent: every re-run rescales again.
# NOTE(review): `inverse_transform` needs as many columns as `ms` was fitted on —
# verify the shape of `test_prediction` against the scaler.
test_prediction = ms.inverse_transform(test_prediction)

In [None]:
# Keep one row per sample and let NumPy infer the remaining dimension, turning the
# flat target vector into a column.
y_train = np.reshape(y_train, (len(y_train), -1))


In [None]:
# Rescale the targets with a dedicated scaler. Re-fitting the shared `ms` here would
# silently replace the parameters it learned from the feature table, which the
# test-set scaling and the inverse transforms of the predictions depend on.
# NOTE(review): `y_train` is already in scaled units — confirm this extra rescaling
# is needed at all.
newy_train = MinMaxScaler().fit_transform(y_train)