In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from load_data import *
from flow_catlog import *
import flow_manager as fm
import autoencoder as ae
import classifier as clfr
from sampler import *
from utl import *
import pandas as pd
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

from mpl_toolkits.axes_grid1.axes_divider import make_axes_locatable
from tensorflow.compat.v1 import logging
import math
logging.set_verbosity(logging.ERROR)

In [None]:
# Numeric precision: NP_DTYPE is used when casting the loaded feature matrix,
# DTYPE when building TensorFlow tensors for the base distribution.
NP_DTYPE = np.float32
DTYPE = tf.float32

# Flow-model settings. `num_blocks` and `lr` are passed to the flow
# constructor below; `steps` is not referenced in the visible cells.
num_blocks = 10
lr = 1e-3
steps = 5000

# Input CSV files (read without a header row): features and per-row labels.
x_datafile = 'X_train.csv'
y_datafile = 'y_train.csv'

In [None]:
# Features: read the header-less CSV, cast to NP_DTYPE and standardise each
# column. NOTE(review): the scaler is fitted on the full dataset, i.e. before
# the train/test split performed in the next cell.
raw_features = pd.read_csv(x_datafile, header=None).values.astype(NP_DTYPE)
# Labels: kept exactly as returned by `.values` (a 2-D array, one row per sample).
y_data = pd.read_csv(y_datafile, header=None).values

X_data = StandardScaler().fit_transform(raw_features)
# Number of feature columns; used as the dimensionality of the models below.
input_dims = X_data.shape[1]

In [None]:
# Split features and labels in a single call so both are cut at exactly the
# same row and a length mismatch between X_data and y_data raises immediately
# instead of silently producing misaligned splits.
# shuffle=False keeps the original row order: the leading 90% of rows form the
# training split and the trailing 10% the test split (random_state is kept for
# parity with the previous cell but has no effect without shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.1, random_state=42, shuffle=False
)

In [None]:
def local_ploter(data, cols=4):
    """Plot every column of ``data`` as a line chart in a grid of subplots.

    Parameters
    ----------
    data : 2-D array-like
        Must expose ``shape[1]`` and support ``data[:, k]`` slicing. Column
        ``k`` is drawn in panel ``k``; panels are filled row by row.
    cols : int, optional
        Maximum number of panels per row (default 4). The grid never uses
        more columns than ``data`` has, and at least one.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``. Nothing is drawn when
        ``data`` has zero columns.
    """
    n_series = data.shape[1]
    if n_series == 0:
        # Nothing to plot; also avoids a division by zero when sizing the grid.
        return

    cols = max(1, min(n_series, cols))
    nrows = math.ceil(n_series / cols)

    # squeeze=False keeps `axes` a 2-D array for every grid shape, including
    # 1x1 and single-row grids, so no special-casing is needed when indexing.
    fig, axes = plt.subplots(nrows=nrows, ncols=cols,
                             figsize=(5*cols, 4*nrows), squeeze=False)

    # `axes.flat` walks the grid row by row, matching the column order of `data`.
    for idx, panel in enumerate(axes.flat):
        if idx < n_series:
            panel.plot(data[:, idx])
        else:
            # Hide trailing panels that have no column to display.
            panel.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Draw every column of the training split in its own panel.
local_ploter(X_train)

# Train an AutoEncoder

In [None]:
%%script false --no-raise-error
# NOTE: the `%%script false` cell magic on the first line hands this cell to
# the `false` command instead of the Python kernel, so nothing below runs.
# Remove that first line to re-enable the autoencoder experiment.

# Parameters for AutoEncoder
latent_dims = 2
encoder_units = [64, 32]
decoder_units = [32, 64]

# Build encoder/decoder with the project's `autoencoder` module and wrap them
# in an AutoEncoder with no flow model or flow optimiser attached.
encoder = ae.build_encoder(input_dims, latent_dims, hidden_units=encoder_units)
decoder = ae.build_decoder(input_dims, latent_dims, hidden_units=decoder_units)
autoencoder = ae.AutoEncoder(encoder, decoder, flow_model=None, flow_opt=None)
autoencoder.compile()
# Fit on the training split, holding out 33% of it for validation.
history = autoencoder.fit(X_train, epochs=30, batch_size=64, verbose=1, validation_split=0.33, shuffle=True)
plt.plot(history.history['reconstruction_loss'])

# Plot the model's output for the training data, column by column.
X_train_decoded = autoencoder.predict(X_train)
local_ploter(X_train_decoded)

# Smooth Data

In [None]:
%%script false --no-raise-error
# NOTE: the `%%script false` cell magic on the first line hands this cell to
# the `false` command instead of the Python kernel, so nothing below runs.
# Remove that first line to re-enable the smoothing experiment.

# import libraries
# (`sim_seasonal_data` is imported but not used in this cell.)
import numpy as np
import matplotlib.pyplot as plt
from tsmoothie.utils_func import sim_seasonal_data
from tsmoothie.smoother import ConvolutionSmoother
from tsmoothie.bootstrap import BootstrappingWrapper

# operate bootstrap
# Draws 100 bootstrap samples of the first feature column only.
bts = BootstrappingWrapper(ConvolutionSmoother(window_len=8, window_type='ones'), 
                           bootstrap_type='mbb', block_length=24)
bts_samples = bts.sample(X_train[:, 0], n_samples=100)

# plot the bootstrapped timeseries
# Bootstrapped series in orange, the original first column on top in blue.
plt.figure(figsize=(13,5))
plt.plot(bts_samples.T, alpha=0.3, c='orange')
plt.plot(X_train[:, 0], c='blue', linewidth=0.5)

# Training a Flow Model

In [None]:
# Base distribution of the flow: a diagonal Gaussian with zero mean and unit
# scale in each of the `input_dims` dimensions.
base_dist = tfd.MultivariateNormalDiag(
    loc=tf.zeros([input_dims], DTYPE),
    scale_diag=tf.ones([input_dims], DTYPE),
)

In [None]:
# Choose the flow architecture. The MAF and IAF variants are kept below as
# commented-out alternatives; RealNVP is the one actually instantiated.

# model = MAF(base_dist, 
#             num_blocks, 
#             hidden_units=[32, 32],
#             ndims=input_dims, 
#             activation=tf.nn.relu,
#             learning_rate=lr, 
#             use_batchnorm=True)

# model = IAF(base_dist, 
#             num_blocks, 
#             hidden_units=[512, 512],
#             ndims=input_dims, 
#             activation=tf.nn.relu,
#             learning_rate=lr, 
#             use_batchnorm=True)

# Active model: RealNVP with `num_blocks` blocks on top of `base_dist`,
# configured with the learning rate from the config cell.
model = RealNVP(base_dist, 
                num_blocks, 
                hidden_units=[512, 512],
                ndims=input_dims, 
                activation=tf.nn.relu,
                learning_rate=lr, 
                use_batchnorm=True)

# Alternative training path through flow_manager, currently disabled:
# losses = fm.train_dist_routine(X_train, model.flow, lr, 1000)
# plt.plot(losses)

In [None]:
# Compile the flow model and fit it on the training split. Rows are fed in
# their original order (shuffle=False).
model.compile()
fit_options = dict(batch_size=128, epochs=300, shuffle=False)
history = model.fit(x=X_train, **fit_options)

# Training-loss curve, one point per epoch.
plt.plot(history.history['loss'])

In [None]:
# Draw samples from the trained flow and plot them column by column, for
# visual comparison with the real training data plotted earlier.
n_flow_samples = 1000
X_train_samples = model.flow.sample(n_flow_samples)
local_ploter(X_train_samples)

In [None]:
# Log-likelihood of every training row under the fitted flow.
train_log_prob_tensor = model.flow.log_prob(X_train)
x_train_log_prob = train_log_prob_tensor.numpy()

# Histogram of the log-likelihoods; the x-axis is clipped to [-25, 25].
plt.hist(x_train_log_prob, bins=3000)
plt.xlim((-25, 25))
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, roc_curve

# Rows whose log-likelihood under the flow falls below this value are
# flagged as anomalies.
threshold = -20
y_predicted = model.flow.log_prob(X_train).numpy() < threshold

# precision_recall_fscore_support returns (precision, recall, fscore, support),
# in that order; print each entry under its matching label.
metrics = precision_recall_fscore_support(y_train, y_predicted)
report_templates = ('precision: {}', 'recall: {}', 'fscore: {}', 'support: {}')
for report_template, metric_values in zip(report_templates, metrics):
    print(report_template.format(metric_values))

In [None]:
# Overlay predicted and true anomalies on the first two feature columns,
# one column per panel.
fig, axes = plt.subplots(2, figsize=(30,10))
x = np.arange(X_train.shape[0])
# Row indices of labelled and predicted anomalies. Both take `[0]` so they are
# plain 1-D index arrays (np.where returns a tuple of arrays).
anomaly_true = np.where(y_train == 1)[0]
anomaly_predicted_index = np.where(y_predicted == 1)[0]

for col, panel in enumerate(axes):
    series = X_train[:, col]
    # Every sample in red, predicted anomalies on top in blue, and the true
    # anomaly positions as green vertical lines behind both.
    panel.scatter(x, series, color='red', zorder=1, label='col-%d' % col, s=10)
    panel.scatter(anomaly_predicted_index, series[anomaly_predicted_index], s=10, color='blue', zorder=2, label='anomaly_predicted')
    panel.vlines(anomaly_true, ymin=series.min(), ymax=series.max(), linewidth=1, zorder=0, label='anomaly_true', color='green')
    panel.legend()

plt.show()

# Generate Data for the Classifier

In [None]:
factor = 10

anomalies = sample_anomalies(model, factor=factor, n_samples=1000, sample_shape=input_dims)
normals = sample_normals(model, factor=factor, n_samples=10000, sample_shape=input_dims)
clr_x = pd.DataFrame(np.concatenate((anomalies, normals)), columns=['col%d' % i for i in range(anomalies.shape[1])])
clr_y = pd.DataFrame(np.concatenate(([1]*len(anomalies), [0]*len(normals))), columns=['label'])
clr_data = pd.concat([clr_x, clr_y], axis=1)
clr_data.dropna(inplace=True)
clr_data = shuffle(clr_data)
clr_x = clr_data[clr_data.columns[:-1]]
clr_x = MinMaxScaler().fit_transform(clr_x)
clr_y = clr_data['label']

clr_X_train, clr_X_test, clr_y_train, clr_y_test = train_test_split(clr_x, clr_y, test_size=0.33, random_state=42)
clf, score = clfr.RFClassifier(clr_X_train, clr_y_train, clr_X_test, clr_y_test)

print('precision: {}'.format(score[0]))
print('recall: {}'.format(score[1]))
print('fscore: {}'.format(score[2]))
print('support: {}'.format(score[3]))

# Apply Classifier on Real Data

In [None]:
clr_y_data = clf.predict(X_train)
y_data_anomalies = np.where(clr_y_data == 1)[0]
y_data_anomalies.shape

In [None]:
# Show the classifier's anomaly predictions on the first two feature columns
# of the real training data, one column per panel.
fig, axes = plt.subplots(2, figsize=(30,10))
# The x axis must match the rows of X_train, which is the series being
# plotted, not the size of the synthetic classifier training set.
x = np.arange(X_train.shape[0])

for col, panel in enumerate(axes):
    # Full series as a red line, predicted anomalies on top as blue points.
    panel.plot(x, X_train[:, col], color='red', zorder=0)
    panel.scatter(y_data_anomalies, X_train[y_data_anomalies, col], s=10, color='blue', zorder=1)

plt.show()