In [None]:
# @title
# Setup
# https://pyod.readthedocs.io/en/latest/
# pyod for libraries in anomaly detection
#

# === CONFIGURATION ===
# Dataset selector read by the loading cell. Modes handled there:
#   "RT-IOT2022", "TON-IoT-Fridge", "KKDCup-10",
#   "cic-ids2018-fri02", "cic-ids2018-fri02-sub", "cic-ids2018-fri23",
#   "cic-ids2018-thu01", "cic-ids2018-thu15", "cic-ids2018-thu22",
#   "cic-ids2018-wed14", "cic-ids2018-wed21", "cic-ids2018-wed28"
DATASET_MODE = "cic-ids2018-wed14"

# Memory controls.
# LIGHT_MODE:   when True the fitted preprocessor transforms the data in
#               CHUNK_SIZE-row batches instead of in a single call.
# CHUNK_SIZE:   rows per batch in LIGHT_MODE.
# SAMPLE_LIMIT: maximum number of rows sampled to fit the preprocessor.
LIGHT_MODE = False
CHUNK_SIZE = 20_000
SAMPLE_LIMIT = 40_000



In [None]:
# Setup
!pip3 install -U tensorflow
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report,
    roc_curve, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay
)
# from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import layers, losses
from tensorflow.keras.models import Model


In [None]:

if DATASET_MODE == "RT-IOT2022":
    !pip3 install -U ucimlrepo
    from ucimlrepo import fetch_ucirepo

    dataset = fetch_ucirepo(id=942)
    X = dataset.data.features
    y = dataset.data.targets
    benign_labels = ['Thing_Speak', 'MQTT_Publish', 'NMAP_XMAS_TREE_SCAN', 'NMAP_TCP_scan',
                 'NMAP_OS_DETECTION', 'NMAP_UDP_SCAN', 'Wipro_bulb', 'NMAP_FIN_SCAN',
                 'DOS_SYN_Hping']
    malicious_column = "Attack_type"
    EXCLUDE_FROM_SCALING = []
    #benign_labels = ['Thing_Speak', 'MQTT_Publish', 'Wipro_bulb',
    #                 'NMAP_FIN_SCAN', 'NMAP_OS_DETECTION', 'NMAP_TCP_scan',
    #                 'NMAP_UDP_SCAN', 'NMAP_XMAS_TREE_SCAN']


elif DATASET_MODE.startswith("cic-ids2018-"):
    benign_labels = ['Benign']
    malicious_column = "Label"
    EXCLUDE_FROM_SCALING = ['time_sin', 'time_cos']
    if DATASET_MODE == "cic-ids2018-fri02":
      filename = "/content/drive/MyDrive/datasets-anomaly-detection/CIC-IDS2018/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv"
    elif DATASET_MODE == "cic-ids2018-fri02-sub":
      filename = "/content/drive/MyDrive/datasets-anomaly-detection/CIC-IDS2018/Friday-02-03-2018_TrafficForML_CICFlowMeter.subset.csv"
    elif DATASET_MODE == "cic-ids2018-fri23":
      filename = "/content/drive/MyDrive/datasets-anomaly-detection/CIC-IDS2018/Friday-23-02-2018_TrafficForML_CICFlowMeter.csv"
    elif DATASET_MODE == "cic-ids2018-thu01":
      filename = "/content/drive/MyDrive/datasets-anomaly-detection/CIC-IDS2018/Thursday-01-03-2018_TrafficForML_CICFlowMeter.csv"
    elif DATASET_MODE == "cic-ids2018-thu15":
      filename = "/content/drive/MyDrive/datasets-anomaly-detection/CIC-IDS2018/Thursday-15-02-2018_TrafficForML_CICFlowMeter.csv"
    elif DATASET_MODE == "cic-ids2018-thu22":
      filename = "/content/drive/MyDrive/datasets-anomaly-detection/CIC-IDS2018/Thursday-22-02-2018_TrafficForML_CICFlowMeter.csv"
    elif DATASET_MODE == "cic-ids2018-wed14":
      filename = "/content/drive/MyDrive/datasets-anomaly-detection/CIC-IDS2018/Wednesday-14-02-2018_TrafficForML_CICFlowMeter.csv"
    elif DATASET_MODE == "cic-ids2018-wed21":
      filename = "/content/drive/MyDrive/datasets-anomaly-detection/CIC-IDS2018/Wednesday-21-02-2018_TrafficForML_CICFlowMeter.csv"
    elif DATASET_MODE == "cic-ids2018-wed28":
      filename = "/content/drive/MyDrive/datasets-anomaly-detection/CIC-IDS2018/Wednesday-28-02-2018_TrafficForML_CICFlowMeter.csv"


    df =  pd.read_csv(filename, low_memory=True)
    # Attempt to parse timestamp automatically
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")

    # Extract hour, minute, and second
    df["hour"] = df["Timestamp"].dt.hour.fillna(0)
    df["minute"] = df["Timestamp"].dt.minute.fillna(0)
    df["second"] = df["Timestamp"].dt.second.fillna(0)

    # Compute cyclical time features
    df["time_sin"] = np.sin(2 * np.pi * ((df["hour"] + (df["minute"] + df["second"] / 60) / 60) / 24))
    df["time_cos"] = np.cos(2 * np.pi * ((df["hour"] + (df["minute"] + df["second"] / 60) / 60) / 24))

    # Drop timestamp and raw time components (keep only sin/cos)
    df = df.drop(columns=["Timestamp", "hour", "minute", "second"])

    X = df.drop(columns=[malicious_column])
    y = df[[malicious_column]] if malicious_column in df.columns else None
    print(f"Loaded chunk shape: {df.shape}")

elif DATASET_MODE == "TON-IoT-Fridge":
  benign_labels = False
  malicious_column = "label"
  EXCLUDE_FROM_SCALING = ['time_sin', 'time_cos']

  filename = "/content/drive/MyDrive/datasets-anomaly-detection/TON-IoT/Train_Test_IoT_Fridge.csv"
  df = pd.read_csv(filename, low_memory=True)
  df["Timestamp"] = pd.to_datetime(df["time"], errors="coerce")

  # Extract hour, minute, and second
  df["hour"] = df["Timestamp"].dt.hour.fillna(0)
  df["minute"] = df["Timestamp"].dt.minute.fillna(0)
  df["second"] = df["Timestamp"].dt.second.fillna(0)

  # Compute cyclical time features
  df["time_sin"] = np.sin(2 * np.pi * ((df["hour"] + (df["minute"] + df["second"] / 60) / 60) / 24))
  df["time_cos"] = np.cos(2 * np.pi * ((df["hour"] + (df["minute"] + df["second"] / 60) / 60) / 24))

  # Drop timestamp and raw time components (keep only sin/cos)
  df = df.drop(columns=["Timestamp", "hour", "minute", "second", "time", 'date', 'type'])

  df['temp_condition'] = df['temp_condition'].astype(str).str.strip()

  X = df.drop(columns=[malicious_column])
  y = df[[malicious_column]] if malicious_column in df.columns else None
  print(f"Loaded chunk shape: {df.shape}")

elif DATASET_MODE == "KKDCup-10":
  benign_labels = "normal."
  malicious_column = "label"
  EXCLUDE_FROM_SCALING = ['label', 'time_sin', 'time_cos']
  filename = "/content/drive/MyDrive/datasets-anomaly-detection/kddcup/kddcup.data_10_percent_corrected.csv"
  df = pd.read_csv(filename, low_memory=True)
  X = df.drop(columns=[malicious_column])
  y = df[[malicious_column]] if malicious_column in df.columns else None
  print(f"Loaded chunk shape: {df.shape}")

print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Initial column names: {X.columns.values}")


In [None]:

# Preprocess features: drop non-finite rows, scale numeric columns, one-hot
# encode categorical columns, and rebuild X as a fully numeric DataFrame.
# NOTE: this cell overwrites X in place, so re-running it without reloading
# the data applies the transformation a second time.


def transform_in_chunks(preprocessor, X, chunk_size=CHUNK_SIZE):
    """Apply an already-fitted `preprocessor` to `X` in row batches.

    Each batch of at most `chunk_size` rows is transformed separately; sparse
    results are densified and all batches are stacked into one 2-D array.
    """
    chunks = []
    for start in range(0, len(X), chunk_size):
        chunks.append(preprocessor.transform(X.iloc[start:start + chunk_size]))
    return np.vstack([c.toarray() if hasattr(c, "toarray") else c for c in chunks])


# Drop rows containing +/-inf or NaN in any column and keep y aligned.
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]

num_cols = X.select_dtypes(include=[np.number]).columns
cat_cols = X.select_dtypes(exclude=[np.number]).columns

# Numeric columns listed in EXCLUDE_FROM_SCALING (when present) are passed
# through untouched; every other numeric column is scaled.
excluded_features = [col for col in EXCLUDE_FROM_SCALING if col in num_cols]
num_cols_to_scale = [col for col in num_cols if col not in excluded_features]

if len(X) < 5e5:
    log_transformer = FunctionTransformer(lambda arr: np.log1p(np.abs(arr)))
else:
    log_transformer = FunctionTransformer(lambda arr: arr)  # passthrough for big data

numeric_transformer = Pipeline([
    ('log', log_transformer),
    ('scaler', RobustScaler(quantile_range=(5, 95)))
])

categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Output column order of the ColumnTransformer follows this list:
# scaled numerics, then passthrough columns, then one-hot columns.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols_to_scale),
    ("passthrough", "passthrough", excluded_features),
    ("cat", categorical_transformer, cat_cols)
])

# Fit on a sample to reduce memory if data huge
sample_rows = min(SAMPLE_LIMIT, len(X))
preprocessor.fit(X.sample(sample_rows, random_state=42))

if LIGHT_MODE:
    print("Transform")
    X_scaled = transform_in_chunks(preprocessor, X)
else:
    X_scaled = preprocessor.transform(X)

# Build column names in the SAME order the ColumnTransformer emits them.
# Using `num_cols` directly would mislabel columns whenever an excluded
# feature is not already last among the numeric columns.
cat_features = np.array([])
if len(cat_cols) > 0:
    cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
feature_names = list(num_cols_to_scale) + list(excluded_features) + list(cat_features)
assert X_scaled.shape[1] == len(feature_names), "feature names do not match transformed columns"

# Construct final DataFrame
X = pd.DataFrame(X_scaled, columns=feature_names, index=X.index)

print(f"Numeric cols: {len(num_cols)}, Categorical cols: {len(cat_cols)}")
print(f"Final feature count: {X.shape[1]}")
print(X.head())

print(cat_cols)
print(X.shape)
print(y.shape)




In [None]:

# Alternative preprocessing cell: scales every numeric column (no passthrough
# list) and one-hot encodes the rest.
# NOTE(review): this repeats the work of the preceding preprocessing cell. If
# both cells are run, X is scaled twice - run only one of them.


def transform_in_chunks(preprocessor, X, chunk_size=CHUNK_SIZE):
    """Apply an already-fitted `preprocessor` to `X` in row batches.

    Each batch of at most `chunk_size` rows is transformed separately; sparse
    results are densified and all batches are stacked into one 2-D array.
    """
    chunks = []
    for start in range(0, len(X), chunk_size):
        chunks.append(preprocessor.transform(X.iloc[start:start + chunk_size]))
    return np.vstack([c.toarray() if hasattr(c, "toarray") else c for c in chunks])


# Drop rows containing +/-inf or NaN in any column and keep y aligned.
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]

num_cols = X.select_dtypes(include=[np.number]).columns
cat_cols = X.select_dtypes(exclude=[np.number]).columns

# The original one-liner nested a FunctionTransformer inside another one on
# the big-data branch, which is not callable and failed at fit time.
if len(X) < 5e5:
    log_transformer = FunctionTransformer(lambda arr: np.log1p(np.abs(arr)))
else:
    log_transformer = FunctionTransformer(lambda arr: arr)  # skip log on big data

numeric_transformer = Pipeline([
    ('log', log_transformer),
    ('scaler', RobustScaler(quantile_range=(5, 95)))
    # ('scaler', StandardScaler())
])

categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=True)

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
])

# Fit on a sample to reduce memory if data huge
sample_rows = min(SAMPLE_LIMIT, len(X))
X_sample = X.sample(sample_rows, random_state=42)
preprocessor.fit(X_sample)

if LIGHT_MODE:
    print("Transform")
    X_scaled = transform_in_chunks(preprocessor, X)
else:
    # transform(), not fit_transform(): keep the parameters fitted on the sample.
    X_scaled = preprocessor.transform(X)
    if hasattr(X_scaled, "toarray"):
        # The encoder may yield a sparse matrix; DataFrame construction needs dense.
        X_scaled = X_scaled.toarray()

# Column names in ColumnTransformer output order: numerics, then one-hot columns.
cat_features = np.array([])
if len(cat_cols) > 0:
    # Only query the encoder when it was actually fitted on some columns.
    cat_features = preprocessor.named_transformers_['cat'].get_feature_names_out(cat_cols)
feature_names = list(num_cols) + list(cat_features)

# Construct final DataFrame
X = pd.DataFrame(X_scaled, columns=feature_names, index=X.index)

print(f"Numeric cols: {len(num_cols)}, Categorical cols: {len(cat_cols)}")
print(f"Final feature count: {X.shape[1]}")
print(X.head())

print(cat_cols)
print(X.shape)
print(y.shape)



In [None]:
# Split the data for one-class training: the autoencoder is trained and
# validated on benign rows only; the test set is the held-out benign rows
# plus every malicious row.
is_benign = y[malicious_column].isin(benign_labels)
y_benign = y.loc[is_benign]
y_malicious = y.loc[~is_benign]

# Print ratio benign and malicious
print(f"Benign ratio: {len(y_benign) / len(y):.2f}")
print(f"Malicious ratio: {len(y_malicious) / len(y):.2f}")


# Fractions of the BENIGN rows assigned to each partition.
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Step 1: 75% of benign rows for training, 25% held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X.loc[y_benign.index], y_benign, test_size=1 - train_ratio, random_state=4
)
# Step 2: divide the HELD-OUT rows (not the training rows) into validation and
# test, so the final proportions are 75 / 15 / 10. Splitting the training part
# again, as before, produced roughly 45 / 30 / 25 instead.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout,
    test_size=test_ratio / (test_ratio + validation_ratio), random_state=4
)

# Combine benign test data and malicious data for the overall test set
X_test = pd.concat([X_test, X.loc[y_malicious.index]])
y_test = pd.concat([y_test, y_malicious])

# X_train, y_train -> benign training rows
# X_val,   y_val   -> benign validation rows
# X_test,  y_test  -> benign test rows + all malicious rows


In [None]:
# Number of entries in y whose label is one of the two listed attack types.
selected_attack_mask = y[malicious_column].isin(['ARP_poisioning', 'DOS_SYN_Hping'])
print(y.loc[selected_attack_mask].size)

In [None]:
# Sanity-check the partitions: labels present in each one and its shape.
# The training and validation sets are built from benign rows only; the test
# set holds the remaining benign rows plus all malicious rows.
for split_name, labels, features in (
    ("Train (benign only)", y_train, X_train),
    ("Validation (benign only)", y_val, X_val),
    ("Test (benign + malicious)", y_test, X_test),
):
    print(f"{split_name} labels:", labels[malicious_column].unique())
    print(features.shape)

In [None]:
class AnomalyDetector(Model):
  """Fully connected autoencoder.

  The encoder stacks Dense layers of 48, 16 and 8 units; the decoder stacks
  Dense layers of 16 and 48 units followed by a linear Dense layer with
  `input_dim` units. `activation` is used by every layer except the last.
  """

  def __init__(self, input_dim, activation):
    super().__init__()

    # Encoder layers are created before decoder layers, narrowest last.
    encoder_layers = [
      layers.Dense(units, activation=activation) for units in (48, 16, 8)
    ]
    self.encoder = tf.keras.Sequential(encoder_layers)

    decoder_layers = [
      layers.Dense(units, activation=activation) for units in (16, 48)
    ]
    # Linear output layer restores the original feature dimension.
    decoder_layers.append(layers.Dense(input_dim, activation="linear"))
    self.decoder = tf.keras.Sequential(decoder_layers)

  def call(self, x):
    """Encode `x` and return its reconstruction."""
    return self.decoder(self.encoder(x))


# Build the autoencoder sized to the feature count and train it to
# reconstruct its own input, monitoring reconstruction loss on X_val.
n_features = X_train.shape[1]
autoencoder = AnomalyDetector(input_dim=n_features, activation="relu")
autoencoder.compile(optimizer='adam', loss='mse')

# Stop once val_loss has not improved for 6 epochs and roll back to the best weights.
es = EarlyStopping(monitor='val_loss', patience=6, restore_best_weights=True)

history = autoencoder.fit(
    X_train, X_train,
    validation_data=(X_val, X_val),
    epochs=500,
    batch_size=512,
    shuffle=True,
    callbacks=[es],
)

In [None]:
# Per-row reconstruction error (mean squared error) on the test set.
reconstructions = autoencoder.predict(X_test)
loss = np.mean((X_test.to_numpy() - reconstructions) ** 2, axis=1)  # ensure numpy arrays

# Ground truth for evaluation: 0 = benign, 1 = malicious.
benign_mask = y_test[malicious_column].isin(benign_labels).values
y_true = (~benign_mask).astype(int)

# Validation errors feed the threshold, so they must use the SAME metric as
# `loss`. They were previously mean absolute error, which put the threshold
# on a different scale from the test scores it is compared against.
val_recon = autoencoder.predict(X_val)
val_loss = np.mean((X_val.to_numpy() - val_recon) ** 2, axis=1)

def best_threshold(y_true, loss):
    """Return the cut-off on `loss` that gives the highest F1 against `y_true`.

    2000 evenly spaced candidates between the minimum and maximum of `loss`
    are scored; a row is predicted positive when its loss exceeds the
    candidate. The winning threshold and its F1 are printed.
    """
    candidates = np.linspace(np.min(loss), np.max(loss), 2000)
    scores = []
    for cutoff in candidates:
        predicted = (loss > cutoff).astype(int)
        scores.append(f1_score(y_true, predicted))
    top = np.argmax(scores)
    chosen = candidates[top]
    print(f"Best threshold: {chosen:.5f}, Best F1: {scores[top]:.4f}")
    return chosen

def mad_threshold(val_loss, k):
    """Return median(val_loss) + k * MAD(val_loss).

    MAD is the median of the absolute deviations from the median.
    """
    center = np.median(val_loss)
    deviations = np.abs(val_loss - center)
    return center + k * np.median(deviations)

# Decision threshold: median + 6 * MAD of the validation reconstruction errors.
threshold = mad_threshold(val_loss, k=6.0)
# Alternatives kept for reference:
#   threshold = best_threshold(y_true, loss)            # tuned against y_true
#   threshold = np.mean(val_loss) + 1 * np.std(val_loss)
#   print(f"Threshold: {threshold:.5f}")

# A row is flagged as an anomaly (1) when its error exceeds the threshold.
y_pred = (loss > threshold).astype(int)

# Threshold-free ranking quality first, then the thresholded metrics.
auc  = roc_auc_score(y_true, loss)
f1   = f1_score(y_true, y_pred)
rec  = recall_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, zero_division=0)
acc  = accuracy_score(y_true, y_pred)

print(classification_report(y_true, y_pred, target_names=["Benign", "Anomaly"]))


print(f"\nAccuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"F1-score:  {f1:.4f}")
print(f"AUC:       {auc:.4f}")

# --- Confusion matrix ---
cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Benign", "Anomaly"])
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Confusion Matrix')
plt.show()

# --- Reconstruction-error histograms for benign vs. malicious test rows ---
fig_hist, ax_hist = plt.subplots(figsize=(6, 4))
ax_hist.hist(loss[benign_mask], bins=50, alpha=0.6, label='Benign', color="green")
ax_hist.hist(loss[~benign_mask], bins=50, alpha=0.6, label='Malicious', color="red")
# Dashed vertical line marks the decision threshold.
ax_hist.axvline(x=threshold, color='black', linestyle='--', label='Threshold')
ax_hist.legend()
ax_hist.set_xlabel("Reconstruction Error (MSE)")
ax_hist.set_ylabel("Frequency")
ax_hist.set_title("Reconstruction Error Distribution")
plt.show()

# --- ROC curve using the reconstruction error as the anomaly score ---
fpr, tpr, _ = roc_curve(y_true, loss)
fig_roc, ax_roc = plt.subplots(figsize=(6, 5))
ax_roc.plot(fpr, tpr, label=f"AUC = {auc:.3f}")
ax_roc.plot([0, 1], [0, 1], 'k--')  # chance diagonal
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("ROC Curve – Autoencoder Anomaly Detection")
ax_roc.legend(loc="lower right")
ax_roc.grid(True)
plt.show()