**Import Libraries**

In [None]:
import warnings
# Blanket filter: every Python warning raised after this cell is suppressed,
# including pandas/matplotlib deprecation warnings from the cells below.
warnings.filterwarnings('ignore')

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import eif
from kando import kando_client
import pickle
%run utils.ipynb
# import libraries
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest

%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d   

from keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector
from keras.models import Model
from keras import regularizers

import eif as iso

**Global Variables**

In [2]:
# Number of top-scoring rows flagged as anomalies by the autoencoder and the
# extended isolation forest cells; also reused as the autoencoder batch size.
WINDOW = 24
# Point id used to locate the pickled inputs (Data_and_Mps/{NODE}_*.pkl).
NODE = 883

In [3]:
def load_point_mps(point_id):
    """
    Load the pickled ``mps`` object stored for a point.

    :param point_id: identifier interpolated into the file name
        ``Data_and_Mps/{point_id}_mps.pkl`` (relative to the working directory).
    :return: the object that was pickled in that file.
    :raises FileNotFoundError: if the file does not exist.
    """
    # NOTE: pickle.load can execute arbitrary code -- only use on trusted files.
    # The context manager closes the handle even when unpickling fails.
    with open(f"Data_and_Mps/{point_id}_mps.pkl", 'rb') as infile:
        return pickle.load(infile)

In [4]:
def load_point_data(point_id):
    """
    Load the pickled data object stored for a point.

    :param point_id: identifier interpolated into the file name
        ``Data_and_Mps/{point_id}_data.pkl`` (relative to the working directory).
    :return: the object that was pickled in that file.
    :raises FileNotFoundError: if the file does not exist.
    """
    # NOTE: pickle.load can execute arbitrary code -- only use on trusted files.
    # The context manager closes the handle even when unpickling fails.
    with open(f"Data_and_Mps/{point_id}_data.pkl", 'rb') as infile:
        return pickle.load(infile)

In [5]:
# Load the pickled `mps` object for the configured point (NODE); it is passed
# to add_matrix_profile_anomalies_labels further down.
mps = load_point_mps(NODE)

In [6]:
# Load the pickled data for the configured point (NODE). Later cells index it
# by the 'EC', 'PH', 'ORP' and 'TEMPERATURE' columns.
df = load_point_data(NODE)

In [7]:
# `json` is not imported by the import cell above; import it here so this cell
# does not depend on a name leaked into the namespace by `%run utils.ipynb`.
import json

# Parse water_authority.json (read from the working directory).
# NOTE(review): the variable name is misspelled ("aouthority"); it is kept
# unchanged in case other cells or utils.ipynb refer to it.
with open('water_authority.json') as f:
    active_by_aouthority = json.load(f)

## MATRIX PROFILE ANOMALIES

In [8]:
# add_matrix_profile_anomalies_labels is not defined in this notebook;
# presumably it is provided by `%run utils.ipynb` -- TODO confirm.
# `df` is rebound to the helper's return value.
df = add_matrix_profile_anomalies_labels(df, mps)

In [9]:
# plot__matrix_profile_anomalies is not defined in this notebook;
# presumably it is provided by `%run utils.ipynb` -- TODO confirm.
plot__matrix_profile_anomalies(df)

<IPython.core.display.Javascript object>

<div class="alert alert-info"><b>&emsp;<u>ISOLATION FOREST ANOMALIES </u></b></div>

In [10]:
# add_isolation_forest_anomalies_labels is not defined in this notebook;
# presumably it is provided by `%run utils.ipynb` -- TODO confirm.
# `df` is rebound to the helper's return value.
df = add_isolation_forest_anomalies_labels(df)

In [11]:
# plot__isolation_forest_anomalies is not defined in this notebook;
# presumably it is provided by `%run utils.ipynb` -- TODO confirm.
plot__isolation_forest_anomalies(df)

<IPython.core.display.Javascript object>

<div class="alert alert-info"><b>&emsp;<u>AUTOENCODER ANOMALIES </u></b></div>

In [12]:
def vae_input_preprocessing(df):
    """
    Build the 3-D array that is fed to the LSTM autoencoder.

    The 'EC', 'PH', 'ORP' and 'TEMPERATURE' columns are passed through a
    MinMaxScaler fitted on ``df`` itself, then a singleton timestep axis is
    inserted between the sample and feature axes.

    :param df: frame holding the four sensor columns.
    :return: array shaped (number of rows, 1, 4).
    """
    sensor_columns = ['EC', 'PH', 'ORP', 'TEMPERATURE']
    scaled = MinMaxScaler().fit_transform(df[sensor_columns].values)

    # LSTM layers expect [samples, timesteps, features]; every row becomes a
    # one-step sequence.
    n_samples, n_features = scaled.shape
    return scaled.reshape(n_samples, 1, n_features)

In [13]:
def autoencoder_model(X):
    """
    Assemble the (uncompiled) LSTM autoencoder for inputs shaped like ``X``.

    Layout: LSTM(16) -> LSTM(4) -> RepeatVector -> LSTM(4) -> LSTM(16) ->
    TimeDistributed(Dense), so the output has the same
    (timesteps, features) shape as the input.

    :param X: 3-D array; only ``X.shape[1]`` (timesteps) and ``X.shape[2]``
        (features) are read.
    :return: a Keras ``Model`` mapping the input to its reconstruction.
    """
    n_timesteps, n_features = X.shape[1], X.shape[2]

    net_input = Input(shape=(n_timesteps, n_features))

    # Encoder: squeeze each sequence down to a 4-unit vector.
    # The l2 factor is 0.00, i.e. the regularizer adds no penalty as written.
    encoded = LSTM(
        16,
        activation='relu',
        return_sequences=True,
        kernel_regularizer=regularizers.l2(0.00),
    )(net_input)
    encoded = LSTM(4, activation='relu', return_sequences=False)(encoded)

    # Decoder: repeat the code once per timestep and expand it back.
    decoded = RepeatVector(n_timesteps)(encoded)
    decoded = LSTM(4, activation='relu', return_sequences=True)(decoded)
    decoded = LSTM(16, activation='relu', return_sequences=True)(decoded)
    reconstruction = TimeDistributed(Dense(n_features))(decoded)

    return Model(inputs=net_input, outputs=reconstruction)

In [14]:
def add_vae_anomalies_labels(df, model, vae_input=None, n_anomalies=None):
    """
    Flag the rows that the autoencoder reconstructs worst.

    The per-row loss is the mean absolute difference between the model's
    prediction and its input; the ``n_anomalies`` rows with the largest loss
    get ``df['auto encoder Anomaly'] = 1``, all others 0.

    :param df: frame to label; it is modified in place and also returned.
    :param model: object exposing ``predict(array)``; the result must have the
        same (samples, 1, features) shape as the input.
    :param vae_input: array shaped (len(df), 1, features) that the model is
        evaluated on. When omitted it is rebuilt with
        ``vae_input_preprocessing(df)`` instead of being read from a
        notebook-level global.
    :param n_anomalies: how many rows to flag; defaults to ``WINDOW``.
    :return: ``df`` with the 'auto encoder Anomaly' column added/overwritten.
    """
    if vae_input is None:
        vae_input = vae_input_preprocessing(df)
    if n_anomalies is None:
        n_anomalies = WINDOW

    # Drop the single-timestep axis: (samples, 1, features) -> (samples, features).
    reconstructed = model.predict(vae_input)
    reconstructed = reconstructed.reshape(reconstructed.shape[0],
                                          reconstructed.shape[2])
    original = vae_input.reshape(vae_input.shape[0], vae_input.shape[2])

    # Row-wise mean absolute reconstruction error, aligned with df's index.
    abs_error = pd.DataFrame(np.abs(reconstructed - original), index=df.index)
    loss_mae = abs_error.mean(axis=1)
    worst_rows = loss_mae.nlargest(n=n_anomalies, keep='first').index

    df['auto encoder Anomaly'] = 0
    # Single .loc assignment: chained `df[col][idx] = 1` may write to a
    # temporary copy and is ignored under pandas Copy-on-Write.
    df.loc[worst_rows, 'auto encoder Anomaly'] = 1
    return df

In [15]:
def plot_vae_anomalies(df):
    """
    Draw a 3-D scatter (PH, EC, TEMPERATURE) of normal vs. anomalous rows.

    Rows with ``df['auto encoder Anomaly'] == 0`` are drawn as faint blue
    dots, rows with value 1 as black stars.

    :param df: frame with 'PH', 'EC', 'TEMPERATURE' and
        'auto encoder Anomaly' columns.
    :return: None; the figure is shown with ``plt.show()``.
    """
    # (flag value, legend label, colour, marker, size, alpha, edge colour)
    groups = [
        (0, 'NORMAL', 'blue', '.', 90, 0.05, None),
        (1, 'ALL DIMS', 'black', '*', 130, 0.8, 'black'),
    ]

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')

    for flag, label, color, marker, size, alpha, edgecolor in groups:
        subset = df[df['auto encoder Anomaly'] == flag]
        # No `cmap`: each group uses one fixed colour, so a colormap would be
        # ignored (and triggers a Matplotlib warning).
        ax.scatter(subset['PH'],
                   subset['EC'],
                   subset['TEMPERATURE'],
                   c=color,
                   marker=marker,
                   alpha=alpha,
                   s=size,
                   edgecolor=edgecolor,
                   label=label)
    ax.legend(title=' TYPE')

    ax.set_xlabel('PH')
    ax.set_ylabel('EC')
    ax.set_zlabel('TEMPERATURE')

    # Text properties must be lower-case; `FontSize=` / `FontWeight=` are
    # rejected by current Matplotlib releases.
    ax.set_title('AUTO ENCODER ANOMALIES\n', fontsize=16, fontweight='bold')

    plt.show()

In [16]:
# Scale the sensor columns and shape them as (samples, 1, features).
vae_input = vae_input_preprocessing(df)

# Build and compile the LSTM autoencoder.
# NOTE(review): 'accuracy' is tracked although the target is a continuous
# reconstruction -- confirm the metric is actually wanted.
model = autoencoder_model(vae_input)
model.compile(loss='mae', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the network to reproduce its own input; the last 15% of the rows are
# held out for validation by Keras.
nb_epochs = 50
batch_size = WINDOW
history = model.fit(
    x=vae_input,
    y=vae_input,
    batch_size=batch_size,
    epochs=nb_epochs,
    validation_split=0.15,
).history

# Label the worst-reconstructed rows and plot them.
df = add_vae_anomalies_labels(df, model)
plot_vae_anomalies(df)

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1, 4)]            0         
_________________________________________________________________
lstm (LSTM)                  (None, 1, 16)             1344      
_________________________________________________________________
lstm_1 (LSTM)                (None, 4)                 336       
_________________________________________________________________
repeat_vector (RepeatVector) (None, 1, 4)              0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 1, 4)              144       
_________________________________________________________________
lstm_3 (LSTM)                (None, 1, 16)             1344      
_________________________________________________________________
time_distributed (TimeDistri (None, 1, 4)             

<IPython.core.display.Javascript object>

<div class="alert alert-info"><b><u><center>EXTENDED ISOLATION FOREST ANOMALIES </center></u></b></div>

In [17]:
def add_extended_isolation_forest_anomalies_labels(df, n_anomalies=None):
    """
    Add extended-isolation-forest anomaly flags to ``df``.

    Columns written (all 0/1 integers):

    * ``ALL_DIMS_eif`` -- top-scoring rows of a forest fitted on the four
      min-max scaled sensor columns together (ExtensionLevel=3).
    * ``EC_eif``, ``PH_eif``, ``ORP_eif``, ``TEMPERATURE_eif`` -- top-scoring
      rows of a separate forest fitted on (row position, raw column value)
      pairs (ExtensionLevel=1).
    * ``NORMAL_eif`` -- 1 where none of the five flags above is set.

    :param df: frame with 'EC', 'PH', 'ORP' and 'TEMPERATURE' columns; it is
        modified in place and also returned.
    :param n_anomalies: rows flagged per detector; defaults to ``WINDOW``.
    :return: ``df`` with the flag columns added/overwritten.
    """
    if n_anomalies is None:
        n_anomalies = WINDOW

    feature_cols = ['EC', 'PH', 'ORP', 'TEMPERATURE']
    n_rows = df.shape[0]
    # Each tree is grown on `sample_size` rows; never ask for more rows than
    # the frame holds.
    sample_size = min(256, n_rows)

    def top_score_flags(scores):
        """Return a 0/1 vector marking the positions of the highest scores."""
        flags = np.zeros(n_rows, dtype=np.int64)
        if n_anomalies > 0:  # argsort()[-0:] would select every row
            flags[np.argsort(scores)[-n_anomalies:]] = 1
        return flags

    # --- all four dimensions together, on min-max scaled values ---
    x_scaled = MinMaxScaler().fit_transform(df[feature_cols].values)
    forest_all = iso.iForest(x_scaled,
                             ntrees=100,
                             sample_size=sample_size,
                             ExtensionLevel=3)
    # Flags are assigned as whole columns (positional), avoiding chained
    # assignment and any dependence on df's index labels.
    df['ALL_DIMS_eif'] = top_score_flags(forest_all.compute_paths(X_in=x_scaled))

    # --- one forest per sensor column on (row position, value) points ---
    positions = np.arange(n_rows)
    for col in feature_cols:
        points = np.column_stack([positions, df[col].values]).astype(float)
        forest_col = iso.iForest(points,
                                 ntrees=100,
                                 sample_size=sample_size,
                                 ExtensionLevel=1)
        # Score with the forest fitted on these 2-D points, not with the
        # 4-D forest above.
        df[f'{col}_eif'] = top_score_flags(forest_col.compute_paths(X_in=points))

    flag_cols = [f'{col}_eif' for col in feature_cols] + ['ALL_DIMS_eif']
    any_flag = df[flag_cols].eq(1).any(axis=1)
    df['NORMAL_eif'] = (~any_flag).astype(np.int64)
    return df

In [18]:
def plot_extended_isolation_forest(df):
    """
    Draw a 3-D scatter (PH, EC, TEMPERATURE) of the extended isolation
    forest flags.

    One scatter series is drawn per ``*_eif`` column, using the rows where
    that column equals 1: normal rows as faint blue dots, every anomaly
    type as a coloured star with a black edge.

    :param df: frame with 'PH', 'EC', 'TEMPERATURE' and the six ``*_eif``
        flag columns.
    :return: None; the figure is shown with ``plt.show()``.
    """
    # (flag column, legend label, colour) -- drawn in this order.
    groups = [
        ('NORMAL_eif', 'NORMAL', 'blue'),
        ('ALL_DIMS_eif', 'ALL DIMS', 'black'),
        ('EC_eif', 'EC', 'red'),
        ('PH_eif', 'PH', 'green'),
        ('ORP_eif', 'ORP', 'yellow'),
        ('TEMPERATURE_eif', 'TEMPERATURE', 'orange'),
    ]

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')

    for flag_col, label, color in groups:
        subset = df[df[flag_col] == 1]
        if flag_col == 'NORMAL_eif':
            style = dict(marker='.', s=90, alpha=0.03, edgecolor=None)
        else:
            style = dict(marker='*', s=130, alpha=0.8, edgecolor='black')

        # No `cmap`: each series uses one fixed colour, so a colormap would
        # be ignored (and triggers a Matplotlib warning).
        ax.scatter(subset['PH'],
                   subset['EC'],
                   subset['TEMPERATURE'],
                   c=color,
                   label=label,
                   **style)
    ax.legend(title=' TYPE')

    ax.set_xlabel('PH')
    ax.set_ylabel('EC')
    ax.set_zlabel('TEMPERATURE')

    # Text properties must be lower-case; `FontSize=` / `FontWeight=` are
    # rejected by current Matplotlib releases.
    ax.set_title('EXTENDED ISOLATION FOREST ANOMALIES\n',
                 fontsize=16,
                 fontweight='bold')

    plt.show()

In [20]:
# Add the *_eif flag columns to df, then draw the 3-D scatter of those flags.
df = add_extended_isolation_forest_anomalies_labels(df)
plot_extended_isolation_forest(df)

<IPython.core.display.Javascript object>

In [None]:
def save_df_with_anomalies(df, point_id):
    """
    Pickle ``df`` to ``{point_id}_data_with_anomalies.pkl``.

    The file is written to the current working directory and overwritten if
    it already exists.

    :param df: object to pickle (the labelled frame in this notebook).
    :param point_id: identifier interpolated into the output file name.
    :return: None.
    """
    # The context manager flushes and closes the file even if pickling fails.
    with open(f"{point_id}_data_with_anomalies.pkl", "wb") as b_file:
        pickle.dump(df, b_file)

In [None]:
# Save under the id of the point that was actually loaded (NODE) rather than
# a hard-coded id, so the file name always matches the data it contains.
save_df_with_anomalies(df, NODE)