In [1]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from lstm_data_prep import LstmDataPrep
from lstm_model import LstmModel
from utils_data import data_splitter
from utils_lstm import load_lstm_dfs, load_train_test_lstm
from Vanilla_LSTM import Vanilla_LSTM

2024-02-26 15:26:18.801625: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def split_sequences(sequences, n_steps):
    """Slice a time series into sliding windows and their next-step targets.

    Window ``i`` is ``sequences[i:i + n_steps]`` and its target is the row
    that immediately follows it, ``sequences[i + n_steps]``.

    Args:
        sequences: array-like whose first axis is time (ndarray, nested list,
            DataFrame, ...). It is converted with ``np.asarray``; any trailing
            axes are carried through unchanged, so a 1-D series works too.
        n_steps: number of consecutive rows in each window; must be >= 1.

    Returns:
        Tuple ``(X, y)`` with ``X.shape == (n_windows, n_steps, *trailing)``
        and ``y.shape == (n_windows, *trailing)``, where
        ``n_windows = max(len(sequences) - n_steps, 0)``. When there are no
        windows the arrays are empty but keep the trailing dimensions.

    Raises:
        ValueError: if ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError(f"n_steps must be >= 1, got {n_steps}")

    sequences = np.asarray(sequences)
    trailing_shape = sequences.shape[1:]
    n_windows = max(len(sequences) - n_steps, 0)

    if n_windows == 0:
        # Keep the trailing dimensions so downstream code that inspects
        # X.shape[1:] / y.shape[1:] still sees a consistent layout.
        return (
            np.empty((0, n_steps) + trailing_shape, dtype=sequences.dtype),
            np.empty((0,) + trailing_shape, dtype=sequences.dtype),
        )

    X = np.stack([sequences[start:start + n_steps] for start in range(n_windows)])
    # The target of window i is row i + n_steps, i.e. everything from n_steps on.
    # Copy so the result does not alias the caller's array.
    y = sequences[n_steps:].copy()
    return X, y

In [3]:
# Load the list of DataFrames from the project helper; later cells index into it and iterate over it.
list_of_df = load_lstm_dfs()

In [4]:
# Display the first DataFrame in the list.
list_of_df[0]

Unnamed: 0_level_0,Accelerometer1RMS,Accelerometer2RMS,Current,Pressure,Temperature,Thermocouple,Voltage,Volume Flow RateRMS,anomaly,changepoint
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-03-09 12:14:36,0.027429,0.040353,0.770310,0.382638,71.2129,25.0827,219.789,32.0000,0.0,0.0
2020-03-09 12:14:37,0.027269,0.040226,1.096960,0.710565,71.4284,25.0863,233.117,32.0104,0.0,0.0
2020-03-09 12:14:38,0.027040,0.039773,1.140150,0.054711,71.3468,25.0874,234.745,32.0000,0.0,0.0
2020-03-09 12:14:39,0.027563,0.040313,1.108680,-0.273216,71.3258,25.0897,205.254,32.0104,0.0,0.0
2020-03-09 12:14:41,0.026570,0.039566,0.704404,0.382638,71.2725,25.0831,212.095,33.0000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2020-03-09 12:34:31,0.028051,0.039835,1.061810,0.054711,69.9380,24.9068,223.742,32.9875,0.0,0.0
2020-03-09 12:34:32,0.027184,0.039945,1.206770,0.054711,69.9818,24.9166,227.789,32.0129,0.0,0.0
2020-03-09 12:34:33,0.027617,0.039430,1.309070,0.054711,69.9444,24.9103,230.527,32.9875,0.0,0.0
2020-03-09 12:34:34,0.027669,0.039402,1.303750,-0.273216,69.9516,24.9103,232.127,32.9875,0.0,0.0


In [5]:
# Build the preprocessing pipeline via the project's LstmDataPrep helper; it is fitted in a later cell.
preprocess_pipeline = LstmDataPrep().get_preprocess_pipeline()

In [6]:
# Fit the pipeline on the first 400 rows of the first DataFrame and keep the transformed result.
# NOTE(review): 400 is a magic number that is repeated in the per-DataFrame training loop further down.
data = preprocess_pipeline.fit_transform(list_of_df[0][0:400])

In [7]:
# Sanity check of the transformed array's shape.
data.shape

(400, 8)

In [9]:
# Display the transformed array.
data

array([[ 0.14336242, -0.21437812, -0.77086653, ...,  0.57160501,
        -0.95271795, -0.61428113],
       [-0.24423838, -0.30555175,  0.41459621, ...,  0.77054428,
         0.29866113, -0.59166427],
       [-0.80240301, -0.63091507,  0.57133932, ...,  0.83133128,
         0.45151566, -0.61428113],
       ...,
       [ 0.16818055,  0.25342603,  0.08408927, ..., -2.37380136,
        -0.40786855, -0.61428113],
       [-1.00946407, -0.34732768,  0.58019445, ..., -2.84904516,
        -0.4084319 , -0.58992451],
       [-0.40360971, -0.38090662,  0.91654433, ..., -2.41801008,
         1.35822092,  1.53693072]])

In [8]:
# Hyperparameters shared by every model in this notebook.
N_STEPS = 5        # window length handed to split_sequences
EPOCHS = 254
BATCH_SIZE = 32
VAL_SPLIT = 0.2

# Positional parameter list passed to Vanilla_LSTM.
PARAMS = [N_STEPS, EPOCHS, BATCH_SIZE, VAL_SPLIT]
# Independent copy, so changing MODEL_HP never alters PARAMS.
MODEL_HP = list(PARAMS)

# Quantile of the training residuals used for the upper control limit (UCL).
Q = 0.99

In [None]:
# model = Vanilla_LSTM(PARAMS

In [None]:
class LstmNew:
    """Residual-based anomaly detector around ``Vanilla_LSTM`` with a fit/transform interface.

    ``fit`` trains a ``Vanilla_LSTM`` on one partition of the training data and
    derives an upper control limit (UCL) from the training residuals.
    ``transform`` flags every window whose residual exceeds that UCL.

    Args:
        sequences_length: window length handed to ``split_sequences``.
        num_splits: forwarded to ``data_splitter`` as ``num_splits``.
        partition_num: index of the partition returned by ``data_splitter``
            that is used for training.
        model_hp: hyperparameters for ``Vanilla_LSTM``; a falsy value falls
            back to the notebook-level ``MODEL_HP``.
        quantile: quantile of the training residuals used for the UCL;
            ``None`` falls back to the notebook-level ``Q`` at fit time.
        ucl_multiplier: factor applied to that quantile to obtain the UCL.

    Attributes set by ``fit``: ``trained_model``, ``ucl``.
    Attributes set by ``transform``: ``residuals``, ``prediction``.
    """

    def __init__(self, sequences_length, num_splits, partition_num, model_hp=None,
                 quantile=None, ucl_multiplier=5) -> None:
        self.sequences_length = sequences_length
        self.model_hp = model_hp if model_hp else MODEL_HP
        self.num_splits = num_splits
        self.partition_num = partition_num
        self.quantile = quantile
        self.ucl_multiplier = ucl_multiplier
        # Stays None until fit() has run, so transform() can detect misuse.
        self.trained_model = None
        self.ucl = None
        self.prediction = None
        self.residuals = None

    def _residuals(self, windows, targets):
        """Return the per-window sum of absolute errors of the trained model."""
        forecast = self.trained_model.predict(windows)
        return pd.DataFrame(targets - forecast).abs().sum(axis=1)

    def fit(self, X, y=None):
        """Train the LSTM on the selected partition of ``X`` and compute the UCL.

        Args:
            X: preprocessed training data; it is partitioned with
                ``data_splitter`` and windowed with ``split_sequences``.
            y: ignored; accepted only for fit/transform API compatibility.

        Returns:
            ``self``, to allow call chaining.
        """
        X_part = data_splitter(X, num_splits=self.num_splits)[self.partition_num]
        windows, targets = split_sequences(X_part, n_steps=self.sequences_length)

        lstm = Vanilla_LSTM(self.model_hp)
        lstm.fit(windows, targets)
        self.trained_model = lstm

        train_residuals = self._residuals(windows, targets)
        quantile = Q if self.quantile is None else self.quantile
        self.ucl = train_residuals.quantile(quantile) * self.ucl_multiplier
        return self

    def transform(self, X):
        """Flag the windows of ``X`` whose residual exceeds the fitted UCL.

        Args:
            X: preprocessed data, windowed with ``split_sequences``.

        Returns:
            ``pd.Series`` of 0/1 flags, one per window, with a default integer
            index. The same Series is stored in ``self.prediction`` and the
            residuals in ``self.residuals``.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.trained_model is None:
            raise RuntimeError("LstmNew.transform() was called before fit()")

        windows, targets = split_sequences(X, n_steps=self.sequences_length)
        self.residuals = self._residuals(windows, targets)
        self.prediction = pd.Series((self.residuals > self.ucl).astype(int).values)
        return self.prediction

In [None]:
# Instantiate the detector with the notebook-level window length and hyperparameters.
# NOTE(review): num_splits=0 is forwarded to data_splitter inside fit() — confirm that helper accepts 0.
model = LstmNew(sequences_length=N_STEPS,model_hp=MODEL_HP,num_splits=0,partition_num=0)

In [None]:
# Train the detector on the preprocessed data; fit() also sets model.ucl.
model.fit(data)

In [None]:
# Inspect the upper control limit computed by fit().
model.ucl

In [None]:
# Transform the whole first DataFrame with the already-fitted pipeline (no refit).
# NOTE(review): this includes the 400 rows the pipeline and the model were fitted on.
data_test = preprocess_pipeline.transform(list_of_df[0])

In [None]:
# Flag the windows whose residual exceeds model.ucl; the result is a Series of 0/1 values.
preds = model.transform(data_test)

In [None]:
# Count how many windows were flagged (1) versus not flagged (0).
preds.value_counts()

In [None]:
# Largest residual from the most recent transform() call.
model.residuals.max()

#### Combining the model and the pipeline

In [None]:
# NOTE(review): `transformers_list` is not defined or imported anywhere in the visible notebook,
# so this cell presumably raises NameError on a fresh kernel — define it or remove the cell.
transformers_list

In [None]:
# Empty accumulators for the training/inference loops below:
# per-DataFrame prediction Series, flattened ground-truth labels, and per-model records.
predicted_outlier, y_test, models = [], [], {}

In [None]:
%%time
# inference
for i,df in enumerate(list_of_df):
    y_test += list(df['anomaly'][5:].values)
    
    X_train = df[:400].drop(['anomaly','changepoint'], axis=1)
    
    # scaler init and fitting
    StSc = StandardScaler()
    StSc.fit(X_train)
    
    # convert into input/output
    X, y = split_sequences(StSc.transform(X_train), N_STEPS)
    
    # model fitting
    model.fit(X, y)
    # results predicting
    residuals_train = pd.DataFrame(y - model.predict(X)).abs().sum(axis=1)
    UCL = residuals_train.quantile(Q) * 5
    print("UCL: ", UCL)
    
    models[f"model_{i}"] = {
        'model': model,
        'ucl': UCL,
        'residuals' : list() ,
        'prediction' : list()
    }
    

    # results predicting
    X, y = split_sequences(StSc.transform(df.drop(['anomaly','changepoint'], axis=1)), N_STEPS)
    lstm_residuals = pd.DataFrame(y - model.predict(X)).abs().sum(axis=1)
    print('lstm_residuals', lstm_residuals)
    prediction = pd.Series((lstm_residuals > UCL).astype(int).values, 
                                index=df[N_STEPS:].index).fillna(0)
    
    # predicted outliers saving
    predicted_outlier.append(prediction)
    
    if i >= 2: 
        break

In [None]:
# # true outlier indices selection
# true_outlier = [df.anomaly for df in list_of_df]

# predicted_outlier[0].plot(figsize=(12,3), label='predictions', marker='o', markersize=5)
# true_outlier[0].plot(marker='o', markersize=2)
# plt.legend();

In [None]:
# models['model_0']

In [None]:
%%time
# inference
for i,df in enumerate(list_of_df):
    
    X, y = split_sequences(StSc.transform(df.drop(['anomaly','changepoint'], axis=1)), N_STEPS)

    
    for  model_name in models.keys():
        print("df i : ", i)
        lstm_residuals = pd.DataFrame(y - models[model_name]['model'].predict(X)).abs().sum(axis=1)
        print("lstm_residuals", lstm_residuals)
        print(f"models[{model_name}]['ucl']: ", models[model_name]['ucl'])
        prediction = pd.Series((lstm_residuals > models[model_name]['ucl']).astype(int).values, index=df[N_STEPS:].index).fillna(0)
        print("prediction: ", prediction)
        models[model_name]['residuals'] += list(lstm_residuals.values)
        models[model_name]['prediction'] += list(prediction.values)
        print("lstm_residuals", max(models[model_name]['residuals']))

    if i >= 2: 
        break


In [None]:
# UCL stored for the first model.
models['model_0']['ucl']

In [None]:
# NOTE: lstm_residuals only holds the value from the last iteration of the inference loop above.
lstm_residuals.min()

In [None]:
# One column per model, holding that model's concatenated 0/1 predictions.
preds_df = pd.DataFrame(
    {model_name: entry['prediction'] for model_name, entry in models.items()}
)

In [None]:
# Row-wise ensemble statistics, computed from the per-model columns only.
# DataFrame.assign evaluates callables in order against the frame that already
# contains the earlier new columns, so aggregating over "all columns" would fold
# avg_prediction into the median and would also change the result on a re-run.
model_cols = [col for col in preds_df.columns if col in models]
model_preds = preds_df[model_cols]

preds_df = preds_df.assign(
    avg_prediction=model_preds.mean(axis=1),
    median_prediction=model_preds.median(axis=1),
    max_prediction=model_preds.max(axis=1),
)

In [None]:
# Rows where the first two models disagree.
preds_df[preds_df['model_0']!=preds_df['model_1']]

In [None]:
# Use the avg_prediction column as the ensemble score that is passed to calc_metrics below.
y_pred = preds_df['avg_prediction']

In [None]:
# train_set, test_set = load_train_test_lstm(list_of_dfs=load_lstm_dfs())

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix , ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
import numpy as np

In [None]:
def calc_metrics(y_true, y_pred, th=0.99):
    """Compute classification metrics for a score vector at a decision threshold.

    Args:
        y_true: ground-truth binary labels.
        y_pred: predicted scores (array-like); a score ``>= th`` counts as class 1.
        th: decision threshold used for accuracy, confusion matrix and F1.
            The ROC curve and AUC use the raw scores and do not depend on it.

    Returns:
        Tuple ``(accuracy, cm, f1, fpr, tpr, thresholds, roc_auc)`` where
        ``fpr``, ``tpr`` and ``thresholds`` come from ``roc_curve``.
    """
    # np.asarray lets plain lists be compared against the threshold as well.
    y_score = np.asarray(y_pred)
    # Binarise once and reuse the labels for all threshold-dependent metrics.
    y_label = np.where(y_score >= th, 1, 0)

    accuracy = accuracy_score(y_true, y_label)
    cm = confusion_matrix(y_true, y_label)
    f1 = f1_score(y_true, y_label)
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    return accuracy, cm, f1, fpr, tpr, thresholds, roc_auc

def plot_metrics(cm, fpr, tpr, roc_auc, thresholds, title = 'Model Evaluation Metrics'):
    """Show a confusion matrix and a ROC curve side by side in one figure.

    Args:
        cm: confusion matrix handed to ``ConfusionMatrixDisplay``.
        fpr: false positive rates for the ROC curve.
        tpr: true positive rates for the ROC curve.
        roc_auc: area under the ROC curve, shown in the legend.
        thresholds: accepted for call compatibility; not used by the plot.
        title: figure-level title.
    """
    fig, (cm_axis, roc_axis) = plt.subplots(nrows=1, ncols=2, figsize=(16, 8))
    fig.suptitle(title, fontsize=16, y=1.02)

    # Left panel: confusion matrix
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
    cm_display.plot(ax=cm_axis, cmap='Blues')
    cm_axis.set(
        title='Confusion Matrix',
        xlabel='Predicted labels',
        ylabel='True labels',
    )

    # Right panel: ROC curve with the chance diagonal for reference
    auc_label = f'AUC = {roc_auc:.2f}'
    roc_axis.plot(fpr, tpr, color='darkorange', lw=2, label=auc_label)
    roc_axis.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    roc_axis.set(
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title='Receiver Operating Characteristic (ROC) Curve',
    )
    roc_axis.legend(loc='lower right')

    # Render the finished figure
    plt.tight_layout()
    plt.show()

In [None]:
# Evaluate the averaged ensemble score against the labels collected in the training loop, then plot.
# NOTE(review): y_test and y_pred must have the same length and row order — both are built
# by concatenating the same DataFrames from row N_STEPS on; re-check after changing either loop.
accuracy, cm, f1, fpr, tpr, thresholds, roc_auc = calc_metrics(y_test,y_pred)
plot_metrics(cm, fpr, tpr, roc_auc, thresholds, title = 'Average Probability')

In [None]:
# Distribution of the averaged ensemble scores.
y_pred.value_counts()

In [None]:
# Confusion matrix returned by calc_metrics.
cm

In [None]:
# NOTE(review): hard-coded counts, presumably copied by hand from a previous run's `cm`;
# they go stale when the models are retrained — TODO derive the ratio from `cm` instead.
1005/(1005+12062)

In [None]:
# NOTE(review): hard-coded counts, presumably copied by hand from a previous run's `cm`;
# they go stale when the models are retrained — TODO derive the ratio from `cm` instead.
22022/(22022+12062)

In [None]:
# F1 score returned by calc_metrics.
f1