In [None]:
import mlflow
import pickle
import sys
sys.path.insert(1, '../library')
import database_helper
from sklearn.preprocessing import StandardScaler
import pandas as pd
import os
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense
from sklearn.model_selection import train_test_split


In [None]:
# MLflow tracking configuration.
# The MLflow client reads MLFLOW_TRACKING_USERNAME / MLFLOW_TRACKING_PASSWORD
# from the environment when authenticating against the tracking server.
# setdefault() keeps credentials that were already exported (shell, .env,
# secret manager) and only falls back to the placeholders below, so real
# secrets never have to be written into the notebook.
os.environ.setdefault("MLFLOW_TRACKING_USERNAME", "user")
os.environ.setdefault("MLFLOW_TRACKING_PASSWORD", "password")

mlflow.set_tracking_uri(uri="http://mlflow.local:80")
mlflow.set_experiment("DWD Autoencoder Anomaly Detection")

# Automatically log parameters, metrics and models of supported libraries.
mlflow.autolog()

In [None]:
# Read the list of field names to query.
# NOTE(review): pickle.load can execute arbitrary code, so
# 'selected_columns.pkl' must come from a trusted source.
with open('selected_columns.pkl', 'rb') as columns_file:
    selected_columns = pickle.load(columns_file)

# Query the selected fields, index the rows by timestamp, keep only the
# float64 columns and fill gaps by (default, linear) interpolation.
dataframe = (
    database_helper.query_data(field_list=selected_columns)
    .assign(_time=lambda frame: pd.to_datetime(frame['_time']))
    .set_index('_time')
    .select_dtypes(include='float64')
    .interpolate()
)
display(dataframe)

In [None]:
# Summary statistics, transposed so that each feature is one row.
dataframe.describe().T

In [None]:
# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the complete frame, i.e. before the
# train/test split performed further down, so the test rows influence the
# scaling statistics.
scaler = StandardScaler()
scaler.fit(dataframe)
data_scaled = scaler.transform(dataframe)
display(data_scaled)

In [None]:
# Split the scaled rows into a training and a hold-out part. The rows are
# shuffled; uncomment random_state to make the split reproducible.
train_rows, test_rows = train_test_split(data_scaled, test_size=0.2) #, random_state=42

# Keras LSTM layers expect 3-D input of shape (samples, time_steps, n_features).
# Every scaled row is therefore wrapped as a sequence of length 1.
train_data = train_rows[:, np.newaxis, :]
test_data = test_rows[:, np.newaxis, :]

time_steps = train_data.shape[1]   # sequence length (1: one row per sequence)
n_features = train_data.shape[2]   # number of feature columns per time step

# Define the LSTM autoencoder model
input_dim = train_data.shape[1:]  # (time_steps, n_features)

# Encoder: compress every sequence into a single 64-dimensional vector.
input_layer = Input(shape=input_dim)
encoder = LSTM(128, activation="relu", return_sequences=True)(input_layer)
encoder = LSTM(64, activation="relu", return_sequences=False)(encoder)
# Repeat the encoding once per time step so the decoder can unroll it again.
encoder_output = RepeatVector(time_steps)(encoder)

# Decoder: mirror of the encoder, followed by a per-time-step projection back
# to the original feature dimension.
decoder = LSTM(64, activation="relu", return_sequences=True)(encoder_output)
decoder = LSTM(128, activation="relu", return_sequences=True)(decoder)
decoder_output = TimeDistributed(Dense(n_features))(decoder)

# The model output must be the projected reconstruction (decoder_output), not
# the raw 128-unit LSTM activations, so that it has the same shape as the input.
autoencoder = Model(inputs=input_layer, outputs=decoder_output)
# Stop once val_loss has not improved for 5 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# summary() prints the table itself and returns None, so it is not wrapped in display().
autoencoder.summary()

optimizer = Adam(learning_rate=0.0001)
# Reconstruction is a regression task; classification accuracy is not meaningful here.
autoencoder.compile(optimizer=optimizer, loss='mse', metrics=['mse'])
# Train the autoencoder to reproduce its own input
autoencoder.fit(train_data, train_data,
                epochs=150,
                batch_size=32,
                validation_split=0.2,
                verbose=1,
                callbacks=[early_stopping])

In [None]:
# Use the autoencoder to reconstruct the test data
reconstructed_data = autoencoder.predict(test_data)

# Calculate one reconstruction error per sample: the mean squared error over
# every non-batch axis. For 2-D arrays this equals a mean over axis 1; for
# sequence-shaped (samples, time_steps, n_features) arrays it also averages
# over the time axis instead of leaving a per-feature matrix behind.
squared_error = np.square(test_data - reconstructed_data)
non_batch_axes = tuple(range(1, squared_error.ndim))
reconstruction_error = np.mean(squared_error, axis=non_batch_axes)

# Define a threshold for anomaly detection (this is a simple way, more advanced methods can be used)
# NOTE(review): the threshold is the 95th percentile of the test errors
# themselves, so roughly 5 % of the test samples are flagged by construction.
threshold = np.percentile(reconstruction_error, 95)

# Identify anomalies
anomalies = reconstruction_error > threshold

# Print results
print("Number of anomalies detected:", np.sum(anomalies))