In [1]:
%matplotlib notebook
import sys
sys.path.append("/home/stachu/Projects/Anomaly_detection/Forecasting_models")

from scipy.stats import pearsonr, spearmanr
import numpy as np

from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

import torch
from torch.nn.functional import mse_loss
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.nn import functional as F
import pytorch_lightning as pl

from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.offline import plot
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

from predpy.experimentator import Experimentator, load_experimentator
from predpy.dataset import MultiTimeSeriesDataloader
from predpy.plotter.plotter import plot_3d_embeddings
from tsad.noiser import apply_noise_on_dataframes, white_noise
from tsad.error_regresor import NNRegressor, RegressionDataModule

In [2]:
exp = load_experimentator("../saved_experiments/2022-01-06_21:33:23.pkl")
model = exp.load_pl_model(
    model_idx=0,
    dir_path="../checkpoints/household_power_consumption/LSTMVAE_h200_l1",
    # file_name="2022-01-06_19:51:39.ckpt",
    # dir_path="./checkpoints",
    # file_name="epoch=0-step=23542",
    find_last=False
)

In [3]:
tsm = exp.load_time_series_module(0)

normal_dfs = tsm.get_data_from_range(start=-10000, end=-3000, copy=True)
anomaly_dfs = tsm.get_data_from_range(start=-3000, copy=True)

apply_noise_on_dataframes(
    anomaly_dfs, make_noise=white_noise, negativity="abs", loc=1, scale=0.35);

In [4]:
normal_dl = MultiTimeSeriesDataloader(
    tsm.get_data_from_range(start=-10000, end=-3000, copy=True),
    tsm.window_size, tsm.target, batch_size=tsm.batch_size)
dataloaders = [normal_dl]

loc_scale_pairs = [
    (0, 0.02), (0, 0.07), (0, 0.2), (0, 0.5),
    (0.2, 0.5), (0.2, 0.8), (0.3, 1.2),]

for loc, scale in loc_scale_pairs:
    dataloaders += [MultiTimeSeriesDataloader(
        apply_noise_on_dataframes(
            tsm.get_data_from_range(start=-10000, end=-3000, copy=True),
            make_noise=white_noise, negativity="abs", loc=1, scale=0.35),
        tsm.window_size, tsm.target, batch_size=tsm.batch_size)]

In [5]:
def max_err(x, x_tilda):
    return torch.max(torch.abs(x - x_tilda)).item()

In [6]:
zs, z_distances, recon_errs = [], [], []
for dataloader in tqdm(dataloaders, desc="Datasets"):
    for batch in tqdm(dataloader, desc="Records", leave=False):
        input_, _ = model.get_Xy(batch)
        z = model.encode(input_)
        rec = model.decode(z)
        for i in range(len(z)):
            zs += [z[i]]
            z_distances += [torch.norm(z[i]).item()]
            recon_errs += [max_err(input_[i], rec[i])]
zs = torch.cat(zs)
zs = zs.reshape(zs.shape[0], -1).detach().cpu().numpy()

Datasets:   0%|          | 0/8 [00:00<?, ?it/s]

Records:   0%|          | 0/104 [00:00<?, ?it/s]

Records:   0%|          | 0/104 [00:00<?, ?it/s]

Records:   0%|          | 0/104 [00:00<?, ?it/s]

Records:   0%|          | 0/104 [00:00<?, ?it/s]

Records:   0%|          | 0/104 [00:00<?, ?it/s]

Records:   0%|          | 0/104 [00:00<?, ?it/s]

Records:   0%|          | 0/104 [00:00<?, ?it/s]

Records:   0%|          | 0/104 [00:00<?, ?it/s]

In [7]:
def plot_scatter(x, y, ds_names, ds_lens=None, title=None, file_path=None):
    data = []
    start, end = 0, 0
    if ds_lens is not None:
        for i, ds_name in enumerate(ds_names):
            start = end
            end += ds_lens[i]
            data += [
                go.Scatter(
                    x=x[start:end],
                    y=y[start:end],
                    mode='markers', name=ds_name,
                )
            ]
    else:
        for i, ds_name in enumerate(ds_names):
            data += [
                go.Scatter(
                    x=x[i],
                    y=y[i],
                    mode='markers', name=ds_name,
                )
            ]

    layout = go.Layout(
        title=title,
        yaxis=dict(title='reconstruction error'),
        xaxis=dict(title='z norm')
    )

    fig = go.Figure(data=data, layout=layout)
    if file_path:
        plot(fig, filename=file_path)
    else:
        fig.show()

In [8]:
print("Pearson %.3f, p-val %.2f" % pearsonr(z_distances, recon_errs))
print("Spearman %.3f, p-val %.2f" % spearmanr(z_distances, recon_errs))

plot_scatter(
    z_distances, recon_errs, ds_lens=[len(dl) for dl in dataloaders],
    ds_names=["normal_data"] + [str(el) for el in loc_scale_pairs],
    title="Z norms and reconstruction errors"
)

Pearson 0.948, p-val 0.00
Spearman 0.431, p-val 0.00


In [9]:
pca = PCA(n_components=3)
embs_3d = pca.fit_transform(zs)

In [10]:
data = []
start, end = 0, 0
for i, dl in enumerate(dataloaders):
    start = end
    end += len(dl)
    if i == 0:
        name = "normal data"
    else:
        name = str(loc_scale_pairs[i-1])
    data += [
        go.Scatter3d(
            x=embs_3d[start:end, 0].tolist(),
            y=embs_3d[start:end, 1].tolist(),
            z=embs_3d[start:end, 2].tolist(),
            mode='markers', name=name,
        )
    ]

layout = go.Layout(
    title="VAE embeddings for white noise added (PCA reduction)",
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [11]:
# class NN(pl.LightningModule):
#     def __init__(self, c_in: int):
#         super().__init__()

#         self.layer_1 = nn.Linear(c_in, 128)
#         self.layer_2 = nn.Linear(128, 256)
#         self.layer_3 = nn.Linear(256, 1)

#     def forward(self, x):
#         x = F.relu(self.layer_1(x))
#         x = F.relu(self.layer_2(x))
#         x = self.layer_3(x)
#         return x

#     def step(self, batch):
#         x, y = batch
#         preds = self(x)
#         loss = F.mse_loss(preds, y)
#         return loss

#     def training_step(self, batch, batch_idx):
#         loss = self.step(batch)
#         self.log("train_loss", loss, prog_bar=True, logger=True)
#         return loss

#     def validation_step(self, batch, batch_idx):
#         loss = self.step(batch)
#         self.log("val_loss", loss, prog_bar=True, logger=True)
#         return loss

#     def test_step(self, batch, batch_idx):
#         loss = self.step(batch)
#         self.log("test_loss", loss, prog_bar=True, logger=True)
#         return loss

#     def configure_optimizers(self):
#         return torch.optim.Adam(self.parameters(), lr=1e-4)


# class DataModule(pl.LightningDataModule):
#     def __init__(self, x, y, splits=[0.8, 0.1, 0.1], batch_size=64):
#         super().__init__()
#         self.x = x
#         self.y = y
#         self.splits = splits

#     def setup(self, stage):
#         dataset = TensorDataset(self.x, self.y)
#         len_ = len(dataset)
#         splits = [int(frac * len_) for frac in self.splits]
#         if len(dataset) != sum(splits):
#             splits[0] += len(dataset) - sum(splits)
#         self.train_ds, self.val_ds, self.test_ds =\
#             torch.utils.data.random_split(
#                 dataset, splits)

#     def train_dataloader(self):
#         return DataLoader(self.train_ds, batch_size=64)

#     def val_dataloader(self):
#         return DataLoader(self.val_ds, batch_size=64)

#     def test_dataloader(self):
#         return DataLoader(self.test_ds, batch_size=64)

In [12]:
z_nn_path = None#"./z_nn.ckpt"

if z_nn_path is not None:
    z_nn = Regressor.load_from_checkpoint(
        checkpoint_path=z_nn_path, c_in=zs.shape[-1], h_sizes=[128, 256])
else:
    checkpoint = ModelCheckpoint(dirpath=".", filename="z_nn", save_top_k=1)
    early_stopping = EarlyStopping(monitor="val_loss", patience=2)

    z_data_module = RegressionDataModule(
        torch.tensor(zs), torch.unsqueeze(torch.tensor(recon_errs), -1))

    z_nn = NNRegressor(c_in=zs.shape[-1], h_sizes=[128, 256])
    trainer = pl.Trainer(
        max_epochs=20, gpus=1, callbacks=[checkpoint, early_stopping])

    trainer.fit(z_nn, z_data_module)


Checkpoint directory . exists and is not empty.

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 59.0 K
--------------------------------------
59.0 K    Trainable params
0         Non-trainable params
59.0 K    Total params
0.236     Total estimated model params size (MB)


                                                              


The dataloader, val dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 12 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.


The dataloader, train dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 12 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Epoch 19: 100%|██████████| 747/747 [00:05<00:00, 140.94it/s, loss=0.0181, v_num=17, train_loss=0.0164, val_loss=0.0238]


In [13]:
z_norm_nn_path = None#"./z_norm_nn.ckpt"

if z_norm_nn_path is not None:
    z_norm_nn = Regressor.load_from_checkpoint(
        checkpoint_path=z_norm_nn_path, c_in=1, h_sizes=[128, 256])
else:
    checkpoint = ModelCheckpoint(dirpath=".", filename="z_norm_nn", save_top_k=1)
    early_stopping = EarlyStopping(monitor="val_loss", patience=2)

    z_norm_data_module = RegressionDataModule(
        torch.unsqueeze(torch.tensor(z_distances), -1), torch.unsqueeze(torch.tensor(recon_errs), -1))

    z_norm_nn = NNRegressor(c_in=1, h_sizes=[128, 256])
    trainer = pl.Trainer(
        max_epochs=20, gpus=1, callbacks=[checkpoint, early_stopping])

    trainer.fit(z_norm_nn, z_norm_data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | layers | Sequential | 33.5 K
--------------------------------------
33.5 K    Trainable params
0         Non-trainable params
33.5 K    Total params
0.134     Total estimated model params size (MB)


Epoch 3: 100%|██████████| 747/747 [00:04<00:00, 174.61it/s, loss=0.0348, v_num=18, train_loss=0.0387, val_loss=0.036]


In [14]:
lr_z = LinearRegression().fit(zs, recon_errs)
lr_z_norm = LinearRegression().fit(np.array(z_distances).reshape(-1, 1), recon_errs)

In [15]:
z_lr_preds = lr_z.predict(zs)
z_norm_lr_preds = lr_z_norm.predict(np.array(z_distances).reshape(-1, 1))
z_nn_pred = z_nn(torch.tensor(zs)).cpu().detach().numpy().squeeze()
z_norm_nn_pred = z_norm_nn(torch.unsqueeze(torch.tensor(z_distances), -1)).cpu().detach().numpy().squeeze()

In [16]:
preds = [z_lr_preds, z_norm_lr_preds, z_nn_pred, z_norm_nn_pred]
preds_names = ["z_lr_preds", "z_norm_lr_preds", "z_nn_pred", "z_norm_nn_pred"]

In [17]:
plt.scatter(z_distances, recon_errs, label="Normal data")
plt.scatter(z_distances, z_lr_preds, label="z_lr_preds")
plt.scatter(z_distances, z_norm_lr_preds, label="z_norm_lr_preds")
plt.scatter(z_distances, z_nn_pred, label="z_nn_pred")
plt.scatter(z_distances, z_norm_nn_pred, label="z_norm_nn_pred")
plt.legend();

# plot_scatter(
#     x=[z_distances]*(1 + len(preds_names)), y=[recon_errs] + preds,
#     # ds_lens=[sum([len(dl) for dl in dataloaders])]*(1 + len(preds_names)),
#     ds_names=["true_values"] + preds_names,
#     title="Error predictions", file_path="./error_predictions.html"
# )

<IPython.core.display.Javascript object>