In [None]:
import pandas as pd
import csv
import time
from sklearn.ensemble import IsolationForest

In [None]:
# Load the full dataset and split it into the reference (training) rows and
# the remaining rows that will be scored by the model.
t = time.time()

# Row positions of the reference samples. The first CSV row is a header and is
# skipped; blank rows are ignored. The `with` block closes the file handle.
with open('Final/[KNN]reference_indexes.csv', newline='') as index_file:
    index_rows = csv.reader(index_file)
    next(index_rows, None)
    ls_learn = [int(row[0]) for row in index_rows if row]

# 'Unnamed: 0' is dropped before modelling so it is not used as a feature.
df = pd.read_csv("final_dataset.csv").drop(columns=['Unnamed: 0'])

train_data = df.iloc[ls_learn, :]
# Complement of the training rows, selected by index label. Compared with
# masking via `df[~df.isin(train_data)].dropna(how='all')`, this keeps the
# original column dtypes, does not discard rows that are genuinely all-NaN,
# and avoids a cell-by-cell comparison of the whole frame.
test_data = df.drop(index=train_data.index)
del df

print("Data loaded: %0.2f s." % (time.time() - t))

Data loaded: 56.45 s.


In [None]:
# Train the Isolation Forest on the reference rows and report how long the
# fit took (wall-clock seconds).
t = time.time()
clf = IsolationForest(contamination=0.005, random_state=0)
clf.fit(train_data)
print("Fitting done: %0.2f s." % (time.time() - t))

Fitting done: 16.57 s.


In [None]:
# Score the held-out rows and report the average prediction time per sample.

# If a later cell has already stored labels in a 'Prediction' column, leave that
# column out so this cell can be re-run without feeding the labels back into
# the model. On a fresh kernel the frame is used as-is (no copy is made).
features = (
    test_data.drop(columns=['Prediction'])
    if 'Prediction' in test_data.columns
    else test_data
)

t = time.time()
res_ = clf.predict(features)
elapsed = time.time() - t

# Seconds * 1_000_000 gives microseconds (not nanoseconds), averaged over the
# number of scored rows.
ahh = elapsed * 1_000_000 / features.shape[0]
print("Prediction done: %0.2f µs per sample." % ahh)

Fitting done: 61.84 ns.


In [None]:
# Store the model output next to the rows it was computed for.
# NOTE: this mutates `test_data` in place; if the prediction cell is re-run
# afterwards, the frame it passes to `clf.predict` will include this column.
test_data['Prediction'] = res_

In [None]:
# Index labels of the rows whose 'Prediction' is -1 (the label scikit-learn's
# IsolationForest.predict uses for outliers).
anomalous_indexes_if = test_data.index[test_data['Prediction'].eq(-1).to_numpy()]
anomalous_indexes_if

Int64Index([   5974,    8204,    9329,   10366,   11143,   11561,   11915,
              11916,   13337,   13338,
            ...
            3574064, 3574162, 3574507, 3574508, 3574509, 3574510, 3574713,
            3575280, 3575282, 3575290],
           dtype='int64', length=16574)

In [None]:
import numpy as np


def _read_index_column(path):
    """Return the first column of a CSV file as a list of ints.

    The first row is treated as a header and skipped; blank rows are ignored.
    The file is opened in a ``with`` block so the handle is closed once the
    rows have been read.
    """
    with open(path, newline='') as index_file:
        rows = csv.reader(index_file)
        next(rows, None)
        return [int(row[0]) for row in rows if row]


# Anomalous row indexes reported by each of the three models.
knn_anom_indx = _read_index_column('Final/[KNN]anonmalous_indexes.csv')
autoencoders_anom_indx = _read_index_column('Final/[Autoencoder]anonmalous_indexes.csv')
iso_forest_anom_indx = anomalous_indexes_if

print(f"KNN detected {len(knn_anom_indx)} anomalies.")
print(f"Autoencoder detected {len(autoencoders_anom_indx)} anomalies.")
print(f"Isolation Forest detected {len(iso_forest_anom_indx)} anomalies.")

KNN detected 21310 anomalies.
Autoencoder detected 16637 anomalies.
Isolation Forest detected 16574 anomalies.


# Comparison of the three approaches: KNN, Autoencoders and Isolation Forest
All three algorithms were trained on the same subset of the data. \
In total, the three models flagged `39604` anomalous samples, about `1.5%` of the dataset. \
Refining this further to the samples that all three models agree on leaves `516` samples, about `0.02%`.


In [None]:
def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is 0."""
    return part * 100 / whole if whole else 0.0


# Sets make each membership test a hash lookup, so every comparison below is a
# single pass over one list instead of a scan of the other list per element.
knn_set = set(knn_anom_indx)
autoencoder_set = set(autoencoders_anom_indx)
iso_forest_set = set(iso_forest_anom_indx)

# Overlaps keep the order (and element type) of the sequence being iterated.
knn_autoencoder = [x for x in knn_anom_indx if x in autoencoder_set]
knn_iso_forest = [x for x in iso_forest_anom_indx if x in knn_set]
autoencoder_iso_forest = [x for x in iso_forest_anom_indx if x in autoencoder_set]
common = [x for x in knn_anom_indx if x in autoencoder_set and x in iso_forest_set]

n_knn = len(knn_anom_indx)
n_autoencoder = len(autoencoders_anom_indx)
n_iso_forest = len(iso_forest_anom_indx)

# Each percentage is taken relative to the smaller of the sets that are actually
# being compared, rather than always relative to the Autoencoder count (which
# is unrelated to the "Iso.Forest & KNN" overlap).
print(f"KNN & Autoencoders: {len(knn_autoencoder)} ~ {_pct(len(knn_autoencoder), min(n_knn, n_autoencoder)):3.3}% common anomalies.")
print(f"Autoencoder & Iso.Forest: {len(autoencoder_iso_forest)} ~ {_pct(len(autoencoder_iso_forest), min(n_autoencoder, n_iso_forest)):3.3}% common anomalies.")
print(f"Iso.Forest & KNN: {len(knn_iso_forest)} ~ {_pct(len(knn_iso_forest), min(n_iso_forest, n_knn)):3.3}% common anomalies.")
print(f"Iso.Forest & KNN & Autoencoders: {len(common)} ~ {_pct(len(common), min(n_knn, n_autoencoder, n_iso_forest)):3.3}% common anomalies.")

KNN & Autoencoders: 3228 ~ 19.4% common anomalies.
Autoencoder & Iso.Forest: 936 ~ 5.63% common anomalies.
Iso.Forest & KNN: 2719 ~ 16.3% common anomalies.
Iso.Forest & KNN & Autoencoders: 516 ~ 3.1% common anomalies.
