In [3]:
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import stumpy
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle
from tqdm.notebook import tqdm

In [1]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest

%matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d   

from keras.layers import Input, Dropout, Dense, LSTM, TimeDistributed, RepeatVector
from keras.models import Model
from keras import regularizers

import eif as iso

In [2]:
def simm(motif, matrix_profile):
    """Return the row positions whose column-1 value equals ``motif[1]``.

    :param motif: indexable whose element 1 is the value searched for
        (``discover_motifs_groups`` passes ``[distance, index, neighbour]``,
        so element 1 is the motif's own row index)
    :param matrix_profile: 2-D array; only column 1 is scanned
        (presumably the nearest-neighbour index column of a ``stumpy.stump``
        result -- confirm against the caller)
    :return: list of matching row positions, in ascending order
    """
    return [
        position
        for position, neighbour in enumerate(matrix_profile[:, 1])
        if neighbour == motif[1]
    ]

In [3]:
def simms(motif, matrix_profile):
    """Expand the direct matches of ``simm`` into a transitive group.

    Starts from the rows returned by ``simm(motif, matrix_profile)`` and keeps
    adding every row whose column-1 value equals a row already in the group,
    until a full pass over the group adds nothing new.

    :param motif: forwarded unchanged to ``simm``
    :param matrix_profile: 2-D array; column 1 is scanned on every pass
    :return: list of row positions, seeds first, then in discovery order
    """
    members = simm(motif, matrix_profile)
    neighbours = matrix_profile[:, 1]
    cursor = 0
    # `members` grows while it is being walked; the walk ends once the cursor
    # has visited every entry, including the ones appended along the way.
    while cursor < len(members):
        current = members[cursor]
        for position, neighbour in enumerate(neighbours):
            if neighbour == current and position not in members:
                members.append(position)
        cursor += 1
    return members

In [99]:
def discover_motifs_groups(matrix_profile, treshold):
    """Group rows of a matrix profile around its deepest local minima.

    A row ``i`` is a candidate when column 0 strictly decreases over
    ``i-2 -> i-1 -> i`` and strictly increases over ``i -> i+1 -> i+2``.
    Candidates are ranked by their column-0 value, the best few are kept, and
    each is expanded with ``simms``. Groups with fewer than two members, or
    with the same member set as an earlier group, are dropped.

    :param matrix_profile: 2-D array; column 0 is the value being minimised and
        column 1 is the value stored and scanned by ``simm``/``simms``
        (presumably a ``stumpy.stump`` result -- confirm against the caller)
    :param treshold: fraction of the candidates to keep (spelling kept for
        backward compatibility); the number kept is capped at 10
    :return: list of groups, each a list of row positions as produced by
        ``simms``
    """
    def dist(i):
        return matrix_profile[i][0]

    # Strict local minima with a two-sample slope on either side.
    candidates = [
        [dist(i), i, matrix_profile[i][1]]
        for i in range(2, len(matrix_profile) - 2)
        if dist(i - 2) > dist(i - 1) > dist(i) < dist(i + 1) < dist(i + 2)
    ]

    groups_size = min(int(len(candidates) * treshold), 10)
    motifs = sorted(candidates, key=lambda entry: entry[0])[:groups_size]

    seen_member_sets = []
    groups = []
    for motif in motifs:
        # simms() rescans the whole profile for every member it finds, so it is
        # evaluated once per motif and the result reused.
        members = simms(motif, matrix_profile)
        member_set = set(members)
        if len(members) > 1 and member_set not in seen_member_sets:
            seen_member_sets.append(member_set)
            groups.append(members)

    return groups

In [5]:
def plot_groups(df, groups, col, m):
    """Overlay the windows of every group, one figure per group.

    :param df: dataframe holding the series in column ``col``
    :param groups: iterable of groups; each group is an iterable of integer
        start positions into the series
    :param col: name of the column to draw
    :param m: window length; each window is the ``m`` rows starting at a
        start position
    :return: None; every figure is displayed with ``plt.show()``
    """
    for group in groups:
        plt.figure(figsize=(15, 5))
        plt.title(f'{col} motifs', fontsize=20)
        for start in group:
            window = df[f'{col}'].iloc[start:start + m]
            # Size the x-range from the slice itself: a window that runs past
            # the end of the series is shorter than m and would otherwise
            # raise a length-mismatch error.
            plt.plot(range(1, len(window) + 1), window,
                     label=f'[{start}:{start + m}]')
        # One legend per figure, built after all lines exist.
        if len(group):
            plt.legend()
        plt.show()

In [3]:
def plot_anomalies(df, mps, cols_for_matrix_profile, m=24):
    """Plot the window with the largest matrix-profile value.

    First draws one stacked figure showing every requested column over the
    window selected from ``mps['All_dimensions'][0][:, 0]``; then draws one
    figure per column using that column's own profile ``mps['mp_<col>'][:, 0]``.

    :param df: dataframe holding the plotted columns
    :param mps: dictionary with an ``'All_dimensions'`` entry and one
        ``'mp_<col>'`` entry per column
    :param cols_for_matrix_profile: column names to plot
    :param m: number of rows shown from the selected start position
        (presumably the window length the profiles were built with -- note the
        default here is 24 while ``create_matrix_profile_dictionary`` defaults
        to 96; confirm)
    :return: None; figures are displayed with ``plt.show()``
    """
    # NOTE(review): whether [0][:, 0] is a profile over time depends on the
    # array orientation returned by the installed stumpy.mstump -- confirm.
    multi_profile = np.asarray(mps['All_dimensions'][0][:, 0], dtype=float)
    max_index = int(np.argmax(multi_profile))  # first occurrence of the maximum
    max_distance = multi_profile[max_index]

    n_cols = len(cols_for_matrix_profile)
    if n_cols == 0:
        # Nothing to draw; plt.subplots(0) would raise.
        return

    colors = ['blue', 'green', 'orange', 'black']
    # squeeze=False keeps `axes` indexable even for a single column.
    fig, axes = plt.subplots(n_cols,
                             figsize=(15, 6),
                             gridspec_kw={'hspace': 0},
                             squeeze=False)
    axes = axes[:, 0]
    axes[0].set_title(
        f'All_dimensions most anomalous behavior \nindices[{max_index}:{max_index+m}]\ndistance to closest neighbors {round(max_distance,2)}',
        fontsize=20)
    for i, col_ in enumerate(cols_for_matrix_profile):
        # Cycle the palette so more than four columns do not raise IndexError.
        axes[i].plot(df[f'{col_}'].iloc[max_index:max_index + m],
                     color=colors[i % len(colors)])
        axes[i].set_ylabel(f'{col_} \n level', fontsize=16)
    plt.show()

    for col_ in cols_for_matrix_profile:
        col_profile = np.asarray(mps[f'mp_{col_}'][:, 0], dtype=float)
        col_start = int(np.argmax(col_profile))

        plt.figure(figsize=(15, 5))
        plt.title(
            f'{col_} most anomalous behavior \n distance to closest neighbors {round(col_profile[col_start],2)}',
            fontsize=20)
        window = df[f'{col_}'].iloc[col_start:col_start + m]
        # The label previously formatted one-element lists and rendered as
        # "[[5]:[29]]"; it now matches the "[start:stop]" form used elsewhere.
        window.plot(label=f'[{col_start}:{col_start + m}]')
        plt.legend(title='Indices')
        plt.show()

In [7]:
def plot_multi_variant_matrix_profile(mps, cols_to_plot):
    """Draw every per-column profile and the multi-dimensional one on one axes.

    :param mps: dictionary with one ``'mp_<col>'`` entry per column (column 0 is
        plotted) and an ``'All_dimensions'`` entry whose element 0, column 0 is
        plotted as a dashed black line
    :param cols_to_plot: column names whose profiles are drawn
    :return: None; the figure is displayed with ``plt.show()``
    """
    plt.figure(figsize=(30, 6))
    for col_ in cols_to_plot:
        plt.plot(mps[f'mp_{col_}'][:, 0], label=col_)
    plt.plot(mps['All_dimensions'][0][:, 0], 'k--', linewidth=3, label='all')
    # The legend only lists lines that exist when it is created, so it is
    # built once, after the 'all' line has been added.
    # NOTE(review): the legend title is the literal text 'title' -- looks like
    # a placeholder; confirm the intended wording.
    plt.legend(title='title', bbox_to_anchor=(-0.05, 1))
    plt.title('All dimensions 1 day window matrix profile', fontsize=20)
    plt.xlabel('index')
    plt.ylabel('Distance to \n nearest neighbor', fontsize=18)
    plt.show()

In [16]:
def plot_univariant_matrix_profiles_dictionary(df, mps, cols_to_plot):
    """For each column, plot the raw series above its matrix profile.

    :param df: dataframe holding the series; each column is drawn against the
        dataframe index
    :param mps: dictionary with one ``'mp_<col>'`` entry per column; column 0
        of each entry is plotted
    :param cols_to_plot: column names to draw, one figure per column
    :return: None; figures are displayed with ``plt.show()``
    """
    for col_ in cols_to_plot:
        # Create both panels in one call. The previous plt.subplots() +
        # plt.subplot(2, 1, k) sequence first created a full-figure axes that
        # current matplotlib no longer removes automatically. The two panels
        # keep independent x-axes: the series is indexed by the dataframe
        # index, the profile by position.
        fig, (series_ax, profile_ax) = plt.subplots(2, 1, figsize=(30, 8))

        df[f'{col_}'].plot(ax=series_ax)
        series_ax.set_title(
            f'{col_} OVER TIME \n And {col_} 1 day window matrix profile ',
            fontsize=20)
        series_ax.set_xlabel('Time')
        series_ax.set_ylabel(f'{col_} level', fontsize=16)

        # `color=` instead of `C=`: capitalised property names are rejected by
        # current matplotlib.
        profile_ax.plot(mps[f'mp_{col_}'][:, 0], color='g')
        profile_ax.set_xlabel('index')
        profile_ax.set_ylabel('Distance to \n nearest \n neighbor', fontsize=16)
        plt.show()

In [103]:
def plot_univariant_matrix_profiles_and_motifs(df, mps, cols_to_plot, window=24, threshold=0.05):
    """Plot each series above its matrix profile and highlight motif groups.

    Motif groups come from ``discover_motifs_groups(mps['mp_<col>'], threshold)``.
    Every member of a group is shaded on the series panel and marked with a
    dashed line and a star on the profile panel; members of one group share a
    colour.

    :param df: dataframe holding the series; its index is reset so the series
        is drawn against row positions, matching the profile's x-axis
    :param mps: dictionary with one ``'mp_<col>'`` entry per column
    :param cols_to_plot: column names to draw, one figure per column
    :param window: width, in rows, of each shaded rectangle
    :param threshold: forwarded to ``discover_motifs_groups``
    :return: None; figures are displayed with ``plt.show()``
    """
    colors = ['red', 'blue', 'green', 'orange', 'gray', 'purple', 'teal', 'pink', 'brown', 'yellow']
    new_df = df.reset_index()

    for col_ in cols_to_plot:
        profile = mps[f'mp_{col_}']
        groups = discover_motifs_groups(profile, threshold)
        series = new_df[f'{col_}']
        low, high = series.min(), series.max()

        fig, ax = plt.subplots(2, figsize=(240, 8), gridspec_kw={'hspace': 0})

        ax[0].plot(series)
        ax[0].set_title(f'{col_} OVER TIME \n And {col_} 1 day window matrix profile ',
                        fontsize=20)
        ax[0].set_ylabel(f'{col_} level', fontsize=16)

        # `color=` instead of `C=`: capitalised property names are rejected by
        # current matplotlib.
        ax[1].plot(profile[:, 0], color='g')
        ax[1].set_xlabel('Time', fontsize=15)
        ax[1].set_ylabel('Distance to \n nearest \n neighbor', fontsize=15)

        for i, group in enumerate(groups):
            # Cycle the palette so an unexpected extra group cannot raise.
            motif_color = colors[i % len(colors)]
            for motif in group:
                # Rectangle takes (x, y), width, height. The height must be the
                # data range; passing the maximum made the patch end at
                # min + max instead of at max.
                rect = Rectangle((motif, low),
                                 window, high - low,
                                 facecolor=motif_color, alpha=0.2)
                ax[0].add_patch(rect)
                ax[1].axvline(motif, linestyle="dashed", color=motif_color)
                ax[1].scatter(motif, profile[motif, 0], color=motif_color, marker='*', s=200)

        plt.show()

In [9]:
# Sensor columns for which matrix profiles are computed and plotted.
cols_for_matrix_profile = [
    'EC',
    'PH',
    'ORP',
    'TEMPERATURE',
]

In [4]:
def create_matrix_profile_dictionary(df, target_cols, m=96):
    """Compute one matrix profile per column plus a multi-dimensional one.

    :param df: dataframe holding the series
    :param target_cols: list of column names to process
    :param m: window size passed to ``stumpy.stump`` and ``stumpy.mstump``
    :return: dictionary with ``'mp_<col>'`` -> ``stumpy.stump(df[col], m)`` for
        every column, and ``'All_dimensions'`` ->
        ``stumpy.mstump(df[target_cols], m)``

    NOTE(review): other cells read ``mps['All_dimensions'][0][:, 0]``; whether
    that is a profile over time depends on the array orientation returned by
    the installed stumpy version -- confirm.
    """
    # tqdm renders a progress bar while the per-column profiles are computed.
    profiles = {
        f'mp_{name}': stumpy.stump(df[f'{name}'], m)
        for name in tqdm(target_cols)
    }
    profiles['All_dimensions'] = stumpy.mstump(df[target_cols], m)
    return profiles

In [11]:
def plot_columns(df, cols_to_plot, sampling_unit='D'):
    """
    Plot each requested column as a resampled time series, one figure per column.

    :param df: pandas dataframe; each plotted column is resampled, so the
        dataframe needs a datetime-like index
    :param cols_to_plot: list of columns to plot
    :param sampling_unit: resampling unit size, default is day; values inside
        each bin are averaged (resampling is always applied)
    :return: None; every figure is displayed with plt.show()
    """
    data_dic = {
        'PI': 'Polution_Index ',
        'EC': 'Electrical_Conductivity',
        'PH': 'pH',
        'WL': 'Water_Level',
        'ORP': 'Oxidation_reduction_potential',
        'TEMPERATURE': 'Temperature',
        'COD': 'Chemical_Oxygen_Demand',
        'TSS': 'Total_suspended_solids',
        'FLOW': 'Flow',
        'Battery': 'Battery_Level',
        'Signal': 'Signal',
        'MS': 'MS',
        'gaps': 'gaps'
    }

    for col_ in cols_to_plot:
        key = f'{col_}'
        # Resample only the series being drawn, so unrelated (possibly
        # non-numeric) columns are neither averaged nor able to raise.
        resampled = df[key].resample(sampling_unit).mean()

        plt.figure(figsize=(15, 5))
        resampled.plot()
        # Columns missing from the label table fall back to their own name
        # instead of raising KeyError.
        plt.title(data_dic.get(key, key) + ' OVER TIME', fontsize=18)
        plt.xlabel('Time')
        plt.ylabel(str(col_))
        plt.show()

In [12]:
def create_heat_map(data):
    """
    Plot a heatmap of the pairwise correlation between the features.

    :param data: pandas dataframe
    :return: None; the figure is displayed with plt.show()
    """
    # Compute the correlation matrix once and reuse it for mask and heatmap.
    corr = data.corr()

    # np.tril keeps the diagonal, so the diagonal and everything below it are
    # masked out and only the upper triangle is drawn. The builtin `bool` is
    # used because the `np.bool` alias was removed from NumPy.
    mask = np.tril(np.ones_like(corr, dtype=bool))

    plt.figure(figsize=(10, 10))
    sns.heatmap(corr,
                annot=True,
                fmt=".2f",
                mask=mask,
                square=True,
                linecolor='white',
                linewidths=1)
    if data.shape[1] > 10:
        # Tilt the tick labels once there are many features.
        plt.xticks(rotation=55)
        plt.yticks(rotation=0)

    plt.title('\nFeatures Correlation', fontsize=18)
    plt.show()

In [13]:
def impute_nulls_with_time_interpolation(df, columns_to_impute, unit):
    """
    Resample the dataframe and fill gaps in the chosen columns by time interpolation.

    :param df: pandas dataframe; it is resampled, so it needs a datetime-like index
    :param columns_to_impute: list of columns names
    :param unit: resampling frequency; values inside each bin are averaged
    :return: the resampled dataframe with the listed columns interpolated
        (the dataframe passed in is not modified)
    """
    resampled = df.resample(f'{unit}').mean()
    for name in columns_to_impute:
        key = f'{name}'
        resampled[key] = resampled[key].interpolate(method='time')
    return resampled

## anomalies

In [2]:
def load_point_mps(point_id):
    """Load the pickled object stored in ``<point_id>_mps.pkl``.

    :param point_id: prefix of the file name; the file is opened relative to
        the current working directory unless the prefix contains a path
    :return: the unpickled object (presumably the dictionary built by
        ``create_matrix_profile_dictionary`` -- confirm against the writer)
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were produced by a trusted source.
    # The context manager closes the file even when unpickling fails.
    with open(f"{point_id}_mps.pkl", 'rb') as infile:
        return pickle.load(infile)

In [3]:
def load_point_data(point_id):
    """Load the pickled object stored in ``<point_id>_data.pkl``.

    :param point_id: prefix of the file name; the file is opened relative to
        the current working directory unless the prefix contains a path
    :return: the unpickled object (presumably a pandas dataframe -- confirm
        against the writer)
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were produced by a trusted source.
    # The context manager closes the file even when unpickling fails.
    with open(f"{point_id}_data.pkl", 'rb') as infile:
        return pickle.load(infile)

In [5]:
def add_matrix_profile_anomalies_labels(df, mps, window=None):
    """Add 0/1 label columns marking the window with the largest profile value.

    For each of EC, PH, ORP and TEMPERATURE a ``<col>_mp`` column is added that
    is 1 on the ``window`` rows starting at the first maximum of
    ``mps['mp_<col>'][:, 0]`` and 0 elsewhere. ``ALL_DIMS_mp`` does the same
    for ``mps['All_dimensions'][0][:, 0]``. ``NORMAL_mp`` is 1 on the rows that
    none of the other label columns flag.

    :param df: dataframe to label; it is modified in place
    :param mps: dictionary with ``'mp_<col>'`` entries and an
        ``'All_dimensions'`` entry
    :param window: number of rows to flag from each start position; when
        omitted, the notebook-level ``WINDOW`` constant is used
    :return: the same dataframe object, with the label columns added
    """
    if window is None:
        window = WINDOW  # notebook-level constant; must exist when `window` is omitted
    n_rows = len(df)

    def window_flags(profile):
        # Position of the first maximum, then `window` rows flagged from
        # there; the slice is clipped at the end of the frame.
        start = int(np.argmax(np.asarray(profile, dtype=float)))
        flags = np.zeros(n_rows, dtype=np.int64)
        flags[start:start + window] = 1
        return flags

    # Labels are built as whole arrays and assigned as columns. The previous
    # chained form (df[col].iloc[a:b] = 1, df[col][mask] = 0) writes to a
    # temporary object under pandas Copy-on-Write and leaves df unchanged.
    anomalous = np.zeros(n_rows, dtype=bool)
    for col_ in ['EC', 'PH', 'ORP', 'TEMPERATURE']:
        flags = window_flags(mps[f'mp_{col_}'][:, 0])
        df[f'{col_}_mp'] = flags
        anomalous |= flags.astype(bool)

    # NOTE(review): whether [0][:, 0] is a profile over time depends on the
    # array orientation returned by the installed stumpy.mstump -- confirm.
    flags = window_flags(mps['All_dimensions'][0][:, 0])
    df['ALL_DIMS_mp'] = flags
    anomalous |= flags.astype(bool)

    df['NORMAL_mp'] = (~anomalous).astype(np.int64)
    return df

In [6]:
def plot__matrix_profile_anomalies(df):
    """3-D scatter of PH / EC / TEMPERATURE coloured by matrix-profile label.

    One scatter series is drawn per label column, using the rows where that
    column equals 1. The ``NORMAL_mp`` series is drawn as small, nearly
    transparent dots; every other series uses large, black-edged stars.

    :param df: dataframe with ``PH``, ``EC``, ``TEMPERATURE`` and the
        ``*_mp`` label columns written by
        ``add_matrix_profile_anomalies_labels``
    :return: None; the figure is displayed with ``plt.show()``
    """
    # (label column, colour, legend entry); the first entry is the background.
    series_specs = [
        ('NORMAL_mp', 'blue', 'NORMAL'),
        ('ALL_DIMS_mp', 'black', 'ALL DIMS'),
        ('EC_mp', 'red', 'EC'),
        ('PH_mp', 'green', 'PH'),
        ('ORP_mp', 'yellow', 'ORP'),
        ('TEMPERATURE_mp', 'orange', 'TEMPERATURE'),
    ]

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')

    for position, (flag_col, color, label) in enumerate(series_specs):
        flagged = df[df[flag_col] == 1]
        if position == 0:
            style = dict(s=90, alpha=0.05, edgecolor=None, marker='.')
        else:
            style = dict(s=130, alpha=0.8, edgecolor='black', marker='*')
        # No cmap: each series has one fixed colour, so a colormap would be
        # ignored (and warned about) by matplotlib.
        ax.scatter(flagged['PH'],
                   flagged['EC'],
                   flagged['TEMPERATURE'],
                   c=color,
                   label=label,
                   **style)
    ax.legend(title='ANOMALY\n   TYPE')

    ax.set_xlabel('PH')
    ax.set_ylabel('EC')
    ax.set_zlabel('TEMPERATURE')

    # Lower-case keyword names: capitalised property names (FontSize,
    # FontWeight) are rejected by current matplotlib.
    ax.set_title('MATRIX PROFILE ANOMALIES\n', fontsize=16, fontweight='bold')

    plt.show()

In [8]:
def add_isolation_forest_anomalies_labels(df, window=None):
    """Add 0/1 label columns marking the lowest isolation-forest scores.

    ``ALL_DIMS_if`` flags the ``window`` rows with the lowest
    ``decision_function`` score of a forest fitted on the min-max scaled
    EC / PH / ORP / TEMPERATURE values. Each ``<col>_if`` column flags the
    ``window`` lowest-scoring rows of a forest fitted on (row position, column
    value). ``NORMAL_if`` is 1 on the rows that none of the other label
    columns flag.

    :param df: dataframe with the four sensor columns; it is modified in place
    :param window: number of rows to flag per label column; when omitted, the
        notebook-level ``WINDOW`` constant is used
    :return: the same dataframe object, with the label columns added
    """
    if window is None:
        window = WINDOW  # notebook-level constant; must exist when `window` is omitted
    sensor_cols = ['EC', 'PH', 'ORP', 'TEMPERATURE']
    n_rows = df.shape[0]

    def lowest_score_flags(scores):
        # argpartition places the `window` smallest scores first (unordered).
        lowest = np.argpartition(scores, window)[:window]
        flags = np.zeros(n_rows, dtype=np.int64)
        flags[lowest] = 1
        return flags

    # Labels are built as positional arrays and assigned as whole columns.
    # The previous chained form (df[col][positions] = 1) writes to a temporary
    # object under pandas Copy-on-Write, and indexing a Series with integer
    # positions is ambiguous when the index is not positional.
    clf = IsolationForest(random_state=0)

    # normalize the data
    x_scaled = MinMaxScaler().fit_transform(df[sensor_cols].values)
    clf.fit(x_scaled)
    multi_flags = lowest_score_flags(clf.decision_function(x_scaled))
    df['ALL_DIMS_if'] = multi_flags
    anomalous = multi_flags.astype(bool)

    positions = np.arange(n_rows)
    for col in sensor_cols:
        # 'ph' is only the feature name given to the value column; the same
        # two-column layout is used for every sensor.
        features = pd.DataFrame({
            'time': positions,
            'ph': df[col].values,
        })
        clf.fit(features)
        flags = lowest_score_flags(clf.decision_function(features))
        df[f'{col}_if'] = flags
        anomalous |= flags.astype(bool)

    df['NORMAL_if'] = (~anomalous).astype(np.int64)
    return df

In [9]:
def plot__isolation_forest_anomalies(df):
    """3-D scatter of PH / EC / TEMPERATURE coloured by isolation-forest label.

    One scatter series is drawn per label column, using the rows where that
    column equals 1. The ``NORMAL_if`` series is drawn as small, nearly
    transparent dots; every other series uses large, black-edged stars.

    :param df: dataframe with ``PH``, ``EC``, ``TEMPERATURE`` and the
        ``*_if`` label columns written by
        ``add_isolation_forest_anomalies_labels``
    :return: None; the figure is displayed with ``plt.show()``
    """
    # (label column, colour, legend entry); the first entry is the background.
    series_specs = [
        ('NORMAL_if', 'blue', 'NORMAL'),
        ('ALL_DIMS_if', 'black', 'ALL DIMS'),
        ('EC_if', 'red', 'EC'),
        ('PH_if', 'green', 'PH'),
        ('ORP_if', 'yellow', 'ORP'),
        ('TEMPERATURE_if', 'orange', 'TEMPERATURE'),
    ]

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')

    for position, (flag_col, color, label) in enumerate(series_specs):
        flagged = df[df[flag_col] == 1]
        if position == 0:
            style = dict(s=90, alpha=0.05, edgecolor=None, marker='.')
        else:
            style = dict(s=130, alpha=0.8, edgecolor='black', marker='*')
        # No cmap: each series has one fixed colour, so a colormap would be
        # ignored (and warned about) by matplotlib.
        ax.scatter(flagged['PH'],
                   flagged['EC'],
                   flagged['TEMPERATURE'],
                   c=color,
                   label=label,
                   **style)
    ax.legend(title='ANOMALY\n   TYPE')

    ax.set_xlabel('PH')
    ax.set_ylabel('EC')
    ax.set_zlabel('TEMPERATURE')

    # Lower-case keyword names: capitalised property names (FontSize,
    # FontWeight) are rejected by current matplotlib.
    ax.set_title('ISOLATION FOREST ANOMALIES\n', fontsize=16, fontweight='bold')

    plt.show()