In [1]:
import warnings #suppress warnings
import torch
import random
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from scipy.io import arff
from pyod.models.anogan import AnoGAN
from pyod.models.mo_gaal import MO_GAAL
from pyod.models.lof import LOF
from pyod.models.gmm import GMM
import sklearn.metrics as sk

warnings.simplefilter("ignore")

2023-05-15 18:04:07.216780: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
class CustomDataset(Dataset):
    def __init__(self, path):
        # start preprocessing 
        self.arff_data = arff.loadarff(path)
        self.df = pd.DataFrame(self.arff_data[0])
        # 1 is outlier, 0 is normal data
        self.df["outlier"] = pd.factorize(self.df["outlier"])[0] - 1
        self.df["outlier"] = self.df["outlier"].abs()
        #end preprocessing
        
        self.data_tensor = torch.tensor(self.df.to_numpy()).float()
        self.data_numpy = self.df.to_numpy()
        
    def __len__(self):
        return len(self.data_tensor)
    
    def __getitem__(self, i):
        return self.data_tensor[i]

In [18]:
internet_ads_path = "./Resources/Datasets/InternetAds_withoutdupl_norm_02_v01.arff" #invert outlier-normal-labels
arrythmia_path = "./Resources/Datasets/Arrhythmia_withoutdupl_norm_02_v01.arff"
wave_path = "./Resources/Datasets/Waveform_withoutdupl_norm_v01.arff"

#seed = 777
#torch.manual_seed(seed)
#random.seed(seed)

#num_workers = 2
#batch_size = 128
#number of used GPUs
#gpu = 0 
#usedDevice = torch.device("cpu" if gpu == 0 else "cuda")

dataset = CustomDataset(arrythmia_path)

data_no_label = dataset.data_numpy[:,:-2]
data_label = dataset.data_numpy[:, -1]

#train_set, eval_set, test_set = torch.utils.data.random_split(dataset.data_numpy[:,:-1], [0.6,0.2,0.2]) #PFUSCH WEGEN NUMPY?
#maybe data loader for each category?
#dataloader = DataLoader(dataset=dataset.data_tensor, batch_size = batch_size, shuffle=True, num_workers=num_workers)

In [37]:
def check_accuracy(decision_values, labels):
    # positive: anomaly
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    
    for i in range(len(data_no_label)):
        if data_label[i] == labels[i]: # correct label
            if labels[i] == 1:
                tp += 1
            else:
                tn += 1
        else:
            if labels[i] == 1: # wrong label
                fp += 1
            else: 
                fn += 1
                
    print("------------------------------------------------------------")
    print("TP: " + str(tp))
    print("FP: " + str(fp))
    print("TN: " + str(tn))
    print("FN: " + str(fn))
    print("------------------------------------------------------------")
    print("Precision: " + str(tp/(tp+fp))) # When we declare a positive, how certain are we?
    print("Recall: " + str(tp/(tp+fn))) # How good are we at detecting the positives?
    print("Accuracy: " + str((tp+tn)/(tp+tn+fp+fn)))
    print("AUC: " + str(sk.roc_auc_score(data_label, decision_values)))
    print("F1 sklearn: " + str(sk.f1_score(y_true = data_label,y_pred = labels)))
    print("F1 calculated: " + str((tp/(tp+fp))*(tp/(tp+fn))*2/((tp/(tp+fp))+(tp/(tp+fn)))))

In [38]:
lof_model = LOF(n_neighbors=5)
lof_model.fit(data_no_label)

LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=5, novelty=True, p=2)

In [39]:
decision_values = lof_model.decision_function(data_no_label)
check_accuracy(decision_values, lof_model.labels_)

------------------------------------------------------------
TP: 1
FP: 24
TN: 220
FN: 3
------------------------------------------------------------
Precision: 0.04
Recall: 0.25
Accuracy: 0.8911290322580645
AUC: 0.7120901639344263
F1 sklearn: 0.06896551724137932
F1 calculated: 0.06896551724137932


In [148]:
from pyod.models.knn import KNN

knn_model = KNN()
knn_model.fit(data_no_label)
predicted_values = knn_model.decision_function(data_no_label)
predicted_labels = knn_model.labels_
check_accuracy(predicted_values, predicted_labels)

AUC: 0.7264344262295082
TP: 1
FP: 24
TN: 220
FN: 3


In [91]:
mogaal_model = MO_GAAL(contamination=0.05)
mogaal_model.fit(train_set)

NameError: name 'train_set' is not defined

In [None]:
test_mogaal_pred = mogaal_model.predict(test_set)

In [None]:
check_accuracy(test_mogaal_pred)

In [None]:
anogan_model = AnoGAN()
anogan_model.fit(train_set)

In [14]:
test_anogan_pred = anogan_model.predict(test_set)

In [None]:
check_accuracy(test_anogan_pred)

In [19]:
gmm_model = GMM(n_components=5)
gmm_model.fit(train_set)

GMM(contamination=0.1, covariance_type='full', init_params='kmeans',
  max_iter=100, means_init=None, n_components=5, n_init=1,
  precisions_init=None, random_state=None, reg_covar=1e-06, tol=0.001,
  warm_start=False, weights_init=None)

In [20]:
test_gmm_pred = gmm_model.predict(test_set)

In [26]:
check_accuracy(test_gmm_pred)

688
TP: 13
FP: 116
TN: 542
FN: 17
Accuracy: 0.8066860465116279
