In [63]:
import warnings #suppress warnings
import torch
import random
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from scipy.io import arff
from pyod.models.anogan import AnoGAN
from pyod.models.mo_gaal import MO_GAAL
from pyod.models.lof import LOF
from pyod.models.gmm import GMM
import sklearn.metrics as sk

warnings.simplefilter("ignore")

In [64]:
class CustomDataset(Dataset):
    def __init__(self, path):
        # start preprocessing 
        self.arff_data = arff.loadarff(path)
        self.df = pd.DataFrame(self.arff_data[0])
        # 1 is outlier, 0 is normal data
        self.df["outlier"] = pd.factorize(self.df["outlier"], sort=True)[0]
        self.df["outlier"] = self.df["outlier"].abs()
        #end preprocessing
        
        self.data_tensor = torch.tensor(self.df.to_numpy()).float()
        self.data_numpy = self.df.to_numpy().astype("int32")
        
    def __len__(self):
        return len(self.data_tensor)
    
    def __getitem__(self, i):
        return self.data_tensor[i]

In [65]:
internet_ads_path = "./Resources/Datasets/InternetAds_withoutdupl_norm_02_v01.arff" #invert outlier-normal-labels
arrythmia_path = "./Resources/Datasets/Arrhythmia_withoutdupl_norm_02_v01.arff"
wave_path = "./Resources/Datasets/Waveform_withoutdupl_norm_v01.arff"

#seed = 777
#torch.manual_seed(seed)
#random.seed(seed)

#num_workers = 2
#batch_size = 128
#number of used GPUs
#gpu = 0 
#usedDevice = torch.device("cpu" if gpu == 0 else "cuda")

dataset = CustomDataset(arrythmia_path)

data_no_label = dataset.data_numpy[:,:-2]
data_label = dataset.data_numpy[:, -1]

#train_set, eval_set, test_set = torch.utils.data.random_split(dataset.data_numpy[:,:-1], [0.6,0.2,0.2]) #PFUSCH WEGEN NUMPY?
#maybe data loader for each category?
#dataloader = DataLoader(dataset=dataset.data_tensor, batch_size = batch_size, shuffle=True, num_workers=num_workers)

In [66]:
def check_accuracy(decision_values, labels):
    # positive: anomaly
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    
    for i in range(len(data_no_label)):
        if data_label[i] == labels[i]: # correct label
            if labels[i] == 1:
                tp += 1
            else:
                tn += 1
        else:
            if labels[i] == 1: # wrong label
                fp += 1
            else: 
                fn += 1
                
    print("------------------------------------------------------------")
    print("TP: " + str(tp))
    print("FP: " + str(fp))
    print("TN: " + str(tn))
    print("FN: " + str(fn))
    print("------------------------------------------------------------")
    print("Precision: " + str(tp/(tp+fp))) # When we declare a positive, how certain are we?
    print("Recall: " + str(tp/(tp+fn))) # How good are we at detecting the positives?
    print("Accuracy: " + str((tp+tn)/(tp+tn+fp+fn)))
    print("AUC: " + str(sk.roc_auc_score(data_label, decision_values)))
    #print("F1 sklearn: " + str(sk.f1_score(y_true = data_label,y_pred = labels)))
    #print("F1 calculated: " + str((tp/(tp+fp))*(tp/(tp+fn))*2/((tp/(tp+fp))+(tp/(tp+fn)))))

In [67]:
lof_model = LOF()
lof_model.fit(data_no_label)

LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=20, novelty=True, p=2)

In [68]:
for i in range(1,100):
    model = LOF(n_neighbors=i)
    model.fit(data_no_label)
    decision_values = model.decision_function(data_no_label)
    print("AUC: " + str(i) + " " + str(sk.roc_auc_score(data_label, decision_values)))

AUC: 1 0.5
AUC: 2 0.7745901639344263
AUC: 3 0.6183401639344263
AUC: 4 0.7213114754098361
AUC: 5 0.7161885245901639
AUC: 6 0.7704918032786885
AUC: 7 0.7643442622950819
AUC: 8 0.7674180327868853
AUC: 9 0.7448770491803279
AUC: 10 0.6977459016393444
AUC: 11 0.6997950819672132
AUC: 12 0.709016393442623
AUC: 13 0.7059426229508197
AUC: 14 0.6598360655737704
AUC: 15 0.6577868852459016
AUC: 16 0.6506147540983607
AUC: 17 0.6557377049180327
AUC: 18 0.6454918032786885
AUC: 19 0.6383196721311476
AUC: 20 0.6290983606557378
AUC: 21 0.6045081967213115
AUC: 22 0.610655737704918
AUC: 23 0.5963114754098361
AUC: 24 0.6116803278688525
AUC: 25 0.6147540983606556
AUC: 26 0.6096311475409836
AUC: 27 0.6198770491803278
AUC: 28 0.6239754098360656
AUC: 29 0.6086065573770492
AUC: 30 0.6116803278688525
AUC: 31 0.6086065573770492
AUC: 32 0.6137295081967213
AUC: 33 0.6229508196721312
AUC: 34 0.6127049180327869
AUC: 35 0.5983606557377049
AUC: 36 0.6198770491803279
AUC: 37 0.6229508196721312
AUC: 38 0.6342213114754098


In [69]:
mogaal_model = MO_GAAL(contamination = 0.02)
mogaal_model.fit(data_no_label)

Epoch 1 of 60

Testing for epoch 1 index 1:
Epoch 2 of 60

Testing for epoch 2 index 1:
Epoch 3 of 60

Testing for epoch 3 index 1:
Epoch 4 of 60

Testing for epoch 4 index 1:
Epoch 5 of 60

Testing for epoch 5 index 1:
Epoch 6 of 60

Testing for epoch 6 index 1:


KeyboardInterrupt: 

In [35]:
decision_values = mogaal_model.decision_function(data_no_label)
check_accuracy(decision_values, mogaal_model.labels_)

------------------------------------------------------------
TP: 0
FP: 5
TN: 239
FN: 4
------------------------------------------------------------
Precision: 0.0
Recall: 0.0
Accuracy: 0.9637096774193549
AUC: 0.39754098360655743
F1 sklearn: 0.0


ZeroDivisionError: float division by zero

In [None]:
anogan_model = AnoGAN()
anogan_model.fit(train_set)

In [14]:
test_anogan_pred = anogan_model.predict(test_set)

In [None]:
check_accuracy(test_anogan_pred)

In [19]:
gmm_model = GMM(n_components=5)
gmm_model.fit(train_set)

GMM(contamination=0.1, covariance_type='full', init_params='kmeans',
  max_iter=100, means_init=None, n_components=5, n_init=1,
  precisions_init=None, random_state=None, reg_covar=1e-06, tol=0.001,
  warm_start=False, weights_init=None)

In [20]:
test_gmm_pred = gmm_model.predict(test_set)

In [26]:
check_accuracy(test_gmm_pred)

688
TP: 13
FP: 116
TN: 542
FN: 17
Accuracy: 0.8066860465116279
