<a href="https://colab.research.google.com/github/Thanasiss23/ekpa/blob/main/%CE%91%CE%BD%CF%84%CE%AF%CE%B3%CF%81%CE%B1%CF%86%CE%BF_DarkNet_DetectionAnomaly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# -*- coding: utf-8 -*-
# DarkNet One-Class Anomaly Detection

# 1. Εγκατάσταση / εισαγωγή βιβθηκών
!pip install pycaret[full] --quiet

import pandas as pd
import numpy as np
from pycaret.anomaly import *
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Φόρτωση δεδομένων
url = "https://raw.githubusercontent.com/kdemertzis/EKPA/main/Data/DarkNet.csv"
df = pd.read_csv(url)

# Επισκόπηση των δεδομένων
print("Σχήμα δεδομένων:", df.shape)
print(df.head())

# 3. Προεπεξεργασία δεδομένων (handled by PyCaret setup)
# PyCaret setup will handle numerical feature imputation, scaling, etc.

# 4. PyCaret Anomaly Detection Setup
# Setting up the PyCaret environment for anomaly detection
# We specify session_id for reproducibility and normalize the data.
# We use 0.05 for fraction as in the original code.
setup(data = df,  session_id = 42, normalize = True, fraction = 0.05, silent=True)

# 5. Μοντέλα ανίχνευσης ανωμαλιών με PyCaret
# Isolation Forest
iforest_model = create_model('iforest')
iforest_results = assign_model(iforest_model)

# One-Class SVM
svm_model = create_model('svm')
svm_results = assign_model(svm_model)

# Autoencoder
ae_model = create_model('knn') # PyCaret uses 'knn' for Autoencoder
ae_results = assign_model(ae_model)


# 6. Αξιολόγηση μοντέλων
# PyCaret automatically adds 'Anomaly' and 'Anomaly_Score' columns
# We can count the detected anomalies for each model

print("Isolation Forest Outliers:", iforest_results['Anomaly'].sum())
print("One-Class SVM Outliers:", svm_results['Anomaly'].sum())
print("Autoencoder Outliers:", ae_results['Anomaly'].sum())


# 7. Επιλογή καλύτερου μοντέλου
# Based on the assumption of having mostly 'normal' data (Tor) in the original dataset
# The model with the fewest detected anomalies might be considered 'better'
outliers_count = {
    "Isolation Forest": iforest_results['Anomaly'].sum(),
    "One-Class SVM": svm_results['Anomaly'].sum(),
    "Autoencoder": ae_results['Anomaly'].sum()
}
best_model = min(outliers_count, key=outliers_count.get)
print("Καλύτερο μοντέλο (λιγότερα false positives σε Tor data):", best_model)

# You can also explore other evaluation metrics provided by PyCaret
# evaluate_model(iforest_model)
# evaluate_model(svm_model)
# evaluate_model(ae_model)

# Plotting reconstruction error for Autoencoder (if needed, requires manual extraction from AE model)
# PyCaret's 'knn' model is not a standard Autoencoder, so this plot is not directly applicable.
# If a true Autoencoder is needed, it would require implementing it outside of PyCaret's 'create_model'

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

# DarkNet One-Class Anomaly Detection

This notebook performs one-class anomaly detection on the DarkNet dataset using different models.

## 1. Εγκατάσταση / εισαγωγή βιβλιοθηκών

In [2]:
# -*- coding: utf-8 -*-
# DarkNet One-Class Anomaly Detection

# 1. Εγκατάσταση / εισαγωγή βιβθηκών
# Uninstall conflicting libraries first
!pip uninstall pandas numpy scikit-learn -y --quiet
# Reinstall libraries with compatible versions
!pip install pandas==2.2.2 numpy==1.26.4 scikit-learn==1.4.2 pycaret[full] --quiet

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import matplotlib.pyplot as plt
import seaborn as sns

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.7/35.7 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.5/46.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m104.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m248.5/248.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m67.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.0/103.0 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m82.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m412.9/412.9 kB[0m [31m17.4 MB/s[0m eta 

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## 2. Φόρτωση δεδομένων

In [None]:
url = "https://raw.githubusercontent.com/kdemertzis/EKPA/main/Data/DarkNet.csv"
df = pd.read_csv(url)

# Επισκόπηση των δεδομένων
print("Σχήμα δεδομένων:", df.shape)
print(df.head())

NameError: name 'pd' is not defined

## 3. Προεπεξεργασία δεδομένων

- Αφαίρεση μη αριθμητικών στηλών (αν υπάρχουν)
- Κανονικοποίηση
- Διάσπαση σε training και testing (μόνο Tor samples, 80/20)

In [None]:
# Αφαίρεση μη αριθμητικών στηλών (αν υπάρχουν)
df_numeric = df.select_dtypes(include=np.number)

# Κανονικοποίηση
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_numeric)

# Διάσπαση σε training και testing (μόνο Tor samples, 80/20)
X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)

NameError: name 'df' is not defined

## 4. Μοντέλο 1: Isolation Forest

In [None]:
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X_train)
y_pred_if = iso_forest.predict(X_test)
# Μετατροπή σε 0=normal, 1=outlier
y_pred_if = np.where(y_pred_if == -1, 1, 0)

NameError: name 'IsolationForest' is not defined

## 5. Μοντέλο 2: One-Class SVM

In [None]:
oc_svm = OneClassSVM(nu=0.05, kernel="rbf", gamma='scale')
oc_svm.fit(X_train)
y_pred_svm = oc_svm.predict(X_test)
y_pred_svm = np.where(y_pred_svm == -1, 1, 0)

NameError: name 'OneClassSVM' is not defined

## 6. Μοντέλο 3: Autoencoder

In [None]:
input_dim = X_train.shape[1]
encoding_dim = int(input_dim / 2)

autoencoder = Sequential([
    Dense(encoding_dim, activation='relu', input_shape=(input_dim,),
          activity_regularizer=regularizers.l1(1e-5)),
    Dense(input_dim, activation='linear')
])
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
history = autoencoder.fit(X_train, X_train,
                          epochs=50,
                          batch_size=32,
                          validation_split=0.1,
                          verbose=0)

# Υπολογισμός reconstruction error
X_test_pred = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - X_test_pred, 2), axis=1)
threshold = np.percentile(mse, 95)
y_pred_ae = (mse > threshold).astype(int)

# Σχεδίαση histogram reconstruction errors Autoencoder
plt.hist(mse, bins=50)
plt.axvline(threshold, color='r', linestyle='--', label='Threshold')
plt.title("Autoencoder Reconstruction Error Distribution")
plt.xlabel("MSE")
plt.ylabel("Count")
plt.legend()
plt.show()

NameError: name 'X_train' is not defined

## 7. Αξιολόγηση μοντέλων

Στην περίπτωση αυτή, επειδή έχουμε μόνο Tor στο test set, οι outliers είναι ψευδώς ανιχνευμένα.

In [None]:
print("Isolation Forest Outliers:", np.sum(y_pred_if))
print("One-Class SVM Outliers:", np.sum(y_pred_svm))
print("Autoencoder Outliers:", np.sum(y_pred_ae))

NameError: name 'np' is not defined

## 8. Επιλογή καλύτερου μοντέλου

Το καλύτερο μοντέλο είναι αυτό με τα λιγότερα false positives στα Tor δεδομένα.

In [None]:
outliers_count = {
    "Isolation Forest": np.sum(y_pred_if),
    "One-Class SVM": np.sum(y_pred_svm),
    "Autoencoder": np.sum(y_pred_ae)
}
best_model = min(outliers_count, key=outliers_count.get)
print("Καλύτερο μοντέλο (λιγότερα false positives σε Tor data):", best_model)

NameError: name 'np' is not defined