In [None]:
import os
import pathlib

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (roc_curve,
                             roc_auc_score,
                             average_precision_score,
                             precision_recall_fscore_support)
import tensorflow as tf
import keras
import pyod
from pyod.models.auto_encoder import AutoEncoder
from pyod.models.vae import VAE

In [None]:
# Root of the project tree; taken from the PROJECT_DIR environment variable
# when it is set, otherwise ~/ml4logs.
PROJECT_DIR = pathlib.Path(os.getenv('PROJECT_DIR', default=pathlib.Path.home() / 'ml4logs'))
DATASET_PATH = PROJECT_DIR / 'data/processed/hdfs_1-fasttext-max.npz'
REPORT_DIR = PROJECT_DIR / "temporary"

# Fail early, naming the offending path, instead of erroring deep inside a
# later cell. is_file()/is_dir() are already False for a missing path, so a
# separate exists() check is not needed.
assert DATASET_PATH.is_file(), f"Dataset file not found: {DATASET_PATH}"
assert REPORT_DIR.is_dir(), f"Report directory not found: {REPORT_DIR}"

## Load data

In [None]:
# Open the dataset archive located at DATASET_PATH (an .npz file).
npzfile = np.load(DATASET_PATH)

In [None]:
# Maximum number of leading samples kept for this experiment.
N = 5 * 10**4

# Read the arrays out of the loaded archive before slicing. X and Y were
# previously sliced from themselves without ever being defined, which raises
# NameError on a fresh kernel (Restart & Run All).
# NOTE(review): the archive keys 'X' and 'Y' are assumed -- confirm them with
# `npzfile.files`.
X = npzfile['X'][:N]
Y = npzfile['Y'][:N]

## Train/test split

In [None]:
# Fixed seed so the stratified 50/50 split -- and every metric computed from
# it -- is identical across kernel restarts.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.5, stratify=Y, random_state=SPLIT_SEED)

## Scale

In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

## Train and evaluate models

In [None]:
# Detector under evaluation, trained on the scaled training features.
# AutoEncoder(epochs=10) can be swapped in here as an alternative model.
model = VAE(epochs=10)
# Labels are intentionally not passed: PyOD documents the `y` argument of
# `fit` as ignored for its unsupervised detectors, so supplying y_train only
# suggested supervised training that does not happen.
model.fit(x_train_scaled)

In [None]:
# Class predictions of the fitted model on the scaled test split.
c_pred = model.predict(x_test_scaled)
# Continuous score per test sample: column index 1 of predict_proba's output.
# NOTE(review): presumably column 1 is the outlier-class probability --
# confirm against the PyOD documentation.
y_pred = model.predict_proba(x_test_scaled)[:, 1]

In [None]:
# Score-based metrics are computed from the continuous scores in y_pred.
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=y_pred)
auc = roc_auc_score(y_true=y_test, y_score=y_pred)
ap = average_precision_score(y_true=y_test, y_score=y_pred)

# Label-based metrics are computed from the class predictions in c_pred;
# the fourth returned value is discarded.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=y_test,
    y_pred=c_pred,
    average='binary',
    zero_division=0,
)

In [None]:
# Show the metrics with their names attached so the displayed result can be
# read without mapping tuple positions back to metric names.
pd.Series({'auc': auc, 'ap': ap, 'precision': precision, 'recall': recall, 'f1': f1},
          name='test metrics')