In [1]:
import logging
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import sys
sys.path.append('..')
import tensorflow as tf
tf.keras.backend.clear_session()
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Dense, Reshape, InputLayer
from tensorflow.keras.losses import mse
from tqdm import tqdm
from typing import Tuple

from odcd.datasets import fetch_kdd
from odcd.utils.data import create_outlier_batch
from odcd.utils.saving import save_od, load_od
from odcd.utils.visualize import plot_instance_outlier, plot_feature_outlier_tabular

# Raise TensorFlow's logger threshold to ERROR so lower-severity messages are not emitted.
logger = tf.get_logger()
logger.setLevel(logging.ERROR)

## Load dataset

We only keep a number of continuous (18 out of 41) features.

In [2]:
# percent10=True: only load 10% of the dataset.
kddcup = fetch_kdd(percent10=True)
# Sanity check: data and target shapes (their first dimensions should agree).
print(kddcup.data.shape, kddcup.target.shape)

(494021, 18) (494021,)


Assume that a model is trained on *normal* instances of the dataset (not outliers) and standardization is applied:

In [3]:
# Seed NumPy's global RNG before drawing the batch.
np.random.seed(0)

# Draw a batch without any outliers (perc_outlier=0).
normal_batch = create_outlier_batch(
    kddcup.data,
    kddcup.target,
    n_samples=400000,
    perc_outlier=0,
)
data = normal_batch.data.astype('float')
target = normal_batch.target

print(data.shape, target.shape)
print('{}% outliers'.format(100 * target.mean()))

(400000, 18) (400000,)
0.0% outliers


In [4]:
# Per-feature (axis=0) statistics of the outlier-free batch, used for standardization.
mean = data.mean(axis=0)
stdev = data.std(axis=0)

Generate a batch of data with 10% outliers:

In [5]:
# Seed NumPy's global RNG before drawing the batch.
np.random.seed(1)

# Draw a batch in which 10% of the instances are outliers (perc_outlier=10).
outlier_batch = create_outlier_batch(
    kddcup.data,
    kddcup.target,
    n_samples=100000,
    perc_outlier=10,
)
X_train = outlier_batch.data.astype('float')
y_train = outlier_batch.target

print(X_train.shape, y_train.shape)
print('{}% outliers'.format(100 * y_train.mean()))

(100000, 18) (100000,)
10.0% outliers


Apply standardization:

In [6]:
# Standardize using `mean` and `stdev` (must already be defined in the kernel).
# NOTE(review): this overwrites X_train, so re-running the cell standardizes the
# data a second time; re-create X_train first when re-executing.
X_train = (X_train - mean) / stdev

## Define model

In [None]:
def relative_euclidean_distance(x: tf.Tensor,
                                y: tf.Tensor,
                                axis: int = -1,
                                eps: float = 1e-12) -> tf.Tensor:
    """
    Euclidean distance between `x` and `y`, relative to the Euclidean norm of `x`.

    Parameters
    ----------
    x
        Reference tensor; its norm along `axis` is the denominator.
    y
        Tensor compared against `x`; must be broadcastable with `x`.
    axis
        Axis along which the norms are computed.
    eps
        Lower bound for the denominator. Prevents a division by zero (inf/NaN)
        when `x` has zero norm along `axis`; results for norms larger than
        `eps` are unaffected.

    Returns
    -------
    Tensor with `axis` reduced, holding ||x - y|| / max(||x||, eps).
    """
    # TODO: make sure also works for higher dim enc eg for images
    x_norm = tf.norm(x, axis=axis)
    # clamp instead of adding eps so that non-degenerate inputs give the exact ratio
    dist = tf.norm(x - y, axis=axis) / tf.maximum(x_norm, eps)
    return dist


class DAGMM(tf.keras.Model):
    """  Deep Autoencoding Gaussian Mixture Model.  """

    def __init__(self,
                 encoder_net: tf.keras.Sequential,
                 decoder_net: tf.keras.Sequential,
                 gmm_density_net: tf.keras.Sequential,
                 n_gmm: int = 5,
                 latent_dim: int = 10,  # TODO: infer from encoder_net
                 name: str = 'dagmm') -> None:
        """
        Store the sub-networks and hyperparameters of the model.

        Parameters
        ----------
        encoder_net
            Network stored as `self.encoder`.
        decoder_net
            Network stored as `self.decoder`.
        gmm_density_net
            Network stored as `self.gmm_density`.
        n_gmm
            Stored as `self.n_gmm`; presumably the number of mixture components.
        latent_dim
            Stored as `self.latent_dim`; presumably the encoder output dimension.
        name
            Name passed on to the `tf.keras.Model` constructor.
        """
        super().__init__(name=name)
        # sub-networks, assigned in the order encoder -> decoder -> density net
        self.encoder, self.decoder, self.gmm_density = encoder_net, decoder_net, gmm_density_net
        # plain hyperparameters
        self.n_gmm, self.latent_dim = n_gmm, latent_dim

    def call(self, x: tf.Tensor) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
        """
        Forward pass: encode, reconstruct and compute the mixture memberships.

        Parameters
        ----------
        x
            Batch of instances, first axis is the batch axis.

        Returns
        -------
        enc
            Encoder output (flattened to 2D if it has more than 2 axes).
        x_recon
            Decoder output (flattened to 2D if `x` has more than 2 axes).
        z
            Concatenation along the last axis of `enc` and the two
            reconstruction features (cosine similarity as returned by
            `tf.keras.losses.cosine_similarity` and relative Euclidean
            distance between `x` and `x_recon`).
        gamma
            Output of the density network evaluated on `z`.
        """
        enc = self.encoder(x)
        x_recon = self.decoder(enc)

        # everything below works on [batch, features] tensors
        # (Flatten is accessed via `tf` since it is not imported by name)
        if len(x.shape) > 2:
            x = tf.keras.layers.Flatten()(x)
            x_recon = tf.keras.layers.Flatten()(x_recon)
        if len(enc.shape) > 2:
            enc = tf.keras.layers.Flatten()(enc)

        # both reconstruction features reduce the last axis and are rank 1;
        # reshape them to column vectors so they can be concatenated with enc
        rec_cos = tf.reshape(tf.keras.losses.cosine_similarity(x, x_recon, -1), (-1, 1))
        rec_euc = tf.reshape(relative_euclidean_distance(x, x_recon, -1), (-1, 1))

        z = tf.concat([enc, rec_cos, rec_euc], -1)
        gamma = self.gmm_density(z)
        # TODO: check whether reshaping before returning is needed for x_recon
        return enc, x_recon, z, gamma

    def gmm_params(self,
                   z: tf.Tensor,
                   gamma: tf.Tensor) \
            -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
        """
        Estimate the Gaussian mixture parameters from a batch.

        Shape notation: N = batch size, K = number of mixture components,
        D = number of features of `z`.

        Parameters
        ----------
        z
            Batch of mixture inputs, N x D.
        gamma
            Membership weights of each instance for each component, N x K.

        Returns
        -------
        phi
            Mixture weights (batch average of `gamma`), shape K.
        mu
            Component means (gamma-weighted average of `z`), K x D.
        cov
            Component covariances (gamma-weighted average of the outer
            products of `z - mu`), K x D x D.
        """
        # nb of samples in batch; read from the dynamic shape because the static
        # batch dimension is unknown (None) when the batch size is not fixed
        N = tf.cast(tf.shape(gamma)[0], gamma.dtype)

        # K
        sum_gamma = K.sum(gamma, 0)

        # K; rank 1 is sufficient, consumers can broadcast it against N x K tensors
        phi = sum_gamma / N

        # K x D
        mu = (K.sum(tf.expand_dims(gamma, -1) * tf.expand_dims(z, 1), 0)
              / tf.expand_dims(sum_gamma, -1))

        # N x K x D
        z_mu = tf.expand_dims(z, 1) - tf.expand_dims(mu, 0)

        # N x K x D x D
        z_mu_outer = tf.expand_dims(z_mu, -1) * tf.expand_dims(z_mu, -2)

        # K x D x D
        cov = (K.sum(tf.expand_dims(tf.expand_dims(gamma, -1), -1) * z_mu_outer, 0)
               / tf.expand_dims(tf.expand_dims(sum_gamma, -1), -1))

        return phi, mu, cov

    def gmm_energy(self,
                   z: tf.Tensor,
                   phi: tf.Tensor,
                   mu: tf.Tensor,
                   cov: tf.Tensor,
                   return_mean: bool = True) \
            -> Tuple[tf.Tensor, tf.Tensor]:
        """
        Compute the sample energy (negative log-likelihood under the mixture)
        and the covariance diagonal penalty.

        Shape notation: N = batch size, K = number of mixture components,
        D = number of features of `z`.

        Parameters
        ----------
        z
            Batch of mixture inputs, N x D.
        phi
            Mixture weights, shape K.
        mu
            Component means, K x D.
        cov
            Component covariances, K x D x D.
        return_mean
            If True the energy is averaged over the batch, otherwise the
            per-instance energies (shape N) are returned.

        Returns
        -------
        sample_energy
            Scalar batch mean, or per-instance energies if `return_mean` is False.
        cov_diag
            Scalar: sum over all components of the reciprocals of the diagonal
            entries of the (regularized) covariance matrices.
        """
        dtype = cov.dtype
        eps = tf.constant(1e-12, dtype=dtype)
        n_features = tf.shape(cov)[-1]

        # N x K x D
        z_mu = tf.expand_dims(z, 1) - tf.expand_dims(mu, 0)

        # K x D x D; the identity broadcasts over the component axis
        cov_reg = cov + tf.eye(n_features, dtype=dtype) * eps

        # K x D x D; inv and cholesky operate on batches of matrices, so no
        # python loop over the components is needed
        cov_inverse = tf.linalg.inv(cov_reg)
        chol = tf.linalg.cholesky(cov_reg * (2. * np.pi))

        # K; with A = L L^T: det(A) = prod(diag(L)) ** 2
        det_cov = tf.square(tf.reduce_prod(tf.linalg.diag_part(chol), axis=-1))

        # scalar; diag_part extracts the diagonals (K x D)
        cov_diag = tf.reduce_sum(1. / tf.linalg.diag_part(cov_reg))

        # N x K; -1/2 (z - mu)^T cov^-1 (z - mu) for every instance and component
        exp_term_tmp = -.5 * tf.reduce_sum(
            tf.reduce_sum(tf.expand_dims(z_mu, -1) * tf.expand_dims(cov_inverse, 0), -2) * z_mu, -1)

        # use logsumexp trick for stability: shift by the row-wise maximum so
        # the largest exponent is exactly 0 and exp() cannot underflow for all
        # components at once (the exponents themselves are non-positive)
        max_val = tf.reduce_max(exp_term_tmp, axis=1, keepdims=True)
        exp_term = tf.exp(exp_term_tmp - max_val)

        # N; squeeze only the component axis so a batch of size 1 keeps its shape
        sample_energy = (- tf.squeeze(max_val, axis=1) -
                         tf.math.log(tf.reduce_sum(tf.expand_dims(phi, 0) * exp_term /
                                                   tf.expand_dims(tf.sqrt(det_cov), 0), 1) + eps))

        if return_mean:
            sample_energy = tf.reduce_mean(sample_energy)

        return sample_energy, cov_diag

    def loss_fn(self,
                x: tf.Tensor,
                recon_x: tf.Tensor,
                z: tf.Tensor,
                gamma: tf.Tensor,
                w_energy: float = .1,
                w_cov_diag: float = .005) \
            -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
        """
        Total loss: reconstruction error plus weighted energy and covariance terms.

        Parameters
        ----------
        x
            Batch of instances.
        recon_x
            Reconstruction of `x`.
        z
            Mixture inputs passed to `gmm_params` and `gmm_energy`.
        gamma
            Membership weights passed to `gmm_params`.
        w_energy
            Weight of the sample energy term.
        w_cov_diag
            Weight of the covariance diagonal term.

        Returns
        -------
        Tuple (loss, sample_energy, recon_error, cov_diag).
        """
        # mean squared reconstruction error over all elements
        residual = x - recon_x
        mse_recon = K.mean(residual ** 2)

        # fit the mixture on the batch, then evaluate the batch under it
        mixture = self.gmm_params(z, gamma)
        energy, cov_penalty = self.gmm_energy(z, *mixture)

        total = mse_recon + w_energy * energy + w_cov_diag * cov_penalty
        return total, energy, mse_recon, cov_penalty