In [None]:
import os
import pathlib

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (roc_curve,
                             roc_auc_score,
                             average_precision_score,
                             precision_recall_fscore_support)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import IsolationForest
from sklearn.svm import LinearSVC

In [None]:
# Project root: taken from the PROJECT_DIR environment variable when it is set,
# otherwise defaults to ~/ml4logs.
PROJECT_DIR = pathlib.Path(os.getenv('PROJECT_DIR', default=pathlib.Path.home() / 'ml4logs'))
# Input archive read by this notebook and the directory its CSV reports go to.
DATASET_PATH = PROJECT_DIR / 'data/processed/bgl.npz'
REPORT_DIR = PROJECT_DIR / "reports/results"

# Fail fast and name the offending path; a bare AssertionError gives no hint
# about which location is missing. is_file()/is_dir() are already False for
# non-existent paths, so a separate exists() check is unnecessary.
assert DATASET_PATH.is_file(), f'Dataset file not found: {DATASET_PATH}'
assert REPORT_DIR.is_dir(), f'Report directory not found: {REPORT_DIR}'

In [None]:
# Open the preprocessed archive located at DATASET_PATH (a .npz file).
npzfile = np.load(DATASET_PATH)

# NOTE(review): the train/test split cell that follows reads `X` and `Y`, but no
# visible cell assigns them. Presumably they were extracted from `npzfile` by the
# code that preceded this marker -- restore that extraction (inspect
# `npzfile.files` to confirm the array names) so the notebook survives
# Restart Kernel -> Run All.
## END TODO

In [None]:
# Stratified 50/50 split of the samples and labels. random_state is pinned so
# that re-running the notebook yields the same partition (and therefore the same
# reported metrics); 2019 is the seed this notebook already uses for its
# IsolationForest model.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, train_size=0.5, stratify=Y, random_state=2019)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transformation to both splits.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
# Candidate detectors, keyed by the name written to the report CSVs.
# The evaluation loop looks for the key "IsolationForest" to decide whether
# scores and labels need to be flipped, so that key must not be renamed.
models = {
    # Seeded so repeated runs build the same tree; uses the same seed as the
    # IsolationForest entry.
    'Tree': DecisionTreeClassifier(random_state=2019),
    'LogisticRegression': LogisticRegression(C=100, tol=1e-2, max_iter=10**3),
    # The L1 penalty requires the primal formulation, hence dual=False.
    'LinearSVC': LinearSVC(penalty='l1', tol=0.1, dual=False),
    'IsolationForest': IsolationForest(random_state=2019, max_samples=0.9999, contamination=0.03, n_jobs=1),
}

In [None]:
# Fit each model on the scaled training split and evaluate it on the scaled test
# split. Per model, one frame of ROC points is appended to `roc` and one
# single-row frame of summary metrics is appended to `score`.
roc = []
score = []
for name, model in models.items():
    display(name)
    model.fit(x_train_scaled, y_train)
    display(f'{name} is fitted')

    # Hard class predictions, used for precision / recall / F1.
    c_pred = model.predict(x_test_scaled)

    # Continuous scores, used for the ROC curve, AUC and average precision.
    has_proba = hasattr(model, 'predict_proba')
    if not has_proba and not hasattr(model, 'decision_function'):
        raise NotImplementedError()
    if has_proba:
        # Second column of predict_proba is used as the score.
        y_pred = model.predict_proba(x_test_scaled)[:, 1]
    else:
        y_pred = model.decision_function(x_test_scaled)
        if name in {"IsolationForest", "OneClassSVM"}:
            # For these models the score sign is flipped and the 1 / -1
            # predictions are remapped to 0 / 1 to match the test labels.
            y_pred = -y_pred
            was_pos_one = c_pred == 1
            was_neg_one = c_pred == -1
            c_pred[was_pos_one] = 0
            c_pred[was_neg_one] = 1

    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)
    ap = average_precision_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, c_pred, average='binary', zero_division=0)

    stats = {
        'Model': name,
        'AUC': auc,
        'AP': ap,
        'Precision': precision,
        'Recall': recall,
        'F1': f1
    }
    roc_frame = pd.DataFrame({'Model': name, 'FPR': fpr, 'TPR': tpr})
    score_frame = pd.DataFrame([stats])
    roc.append(roc_frame)
    score.append(score_frame)
    display(f'{name} is evaluated')

In [None]:
# Write the collected ROC points and the per-model metric rows to CSV files
# inside REPORT_DIR, without the DataFrame index.
roc_path = REPORT_DIR / 'bgl-fasttext-loglizer-roc.csv'
pd.concat(roc).to_csv(roc_path, index=False)
score_path = REPORT_DIR / 'bgl-fasttext-loglizer-score.csv'
pd.concat(score).to_csv(score_path, index=False)