In [None]:
# !pip install --upgrade scikit-learn==1.2.2 imbalanced-learn==0.10.1

import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
from tqdm import tqdm
import joblib

# --- Data Loading and Cleaning ---
data_dir = '/kaggle/input/cicids2017'
files = [f for f in os.listdir(data_dir) if f.endswith('.parquet')]
print(f"Found {len(files)} Parquet files: {files}")

df_list = []
for file in tqdm(files, desc="Loading data"):
    df_list.append(pd.read_parquet(os.path.join(data_dir, file)))
data = pd.concat(df_list, ignore_index=True)

for col in ['Flow ID', 'Timestamp']:
    if col in data.columns:
        data.drop(columns=[col], inplace=True)

data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.dropna(inplace=True)
print(f"Data shape after cleaning: {data.shape}")

data['Label'] = data['Label'].str.replace('�', '-', regex=False)
X = data.drop(columns=['Label'])
y = data['Label']

# --- Feature Encoding and Scaling ---
cat_cols = X.select_dtypes(include=['object']).columns
for col in tqdm(cat_cols, desc="Encoding categorical features"):
    X[col] = LabelEncoder().fit_transform(X[col])

le_label = LabelEncoder()
y_encoded = le_label.fit_transform(y)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

# --- Handle Imbalance with SMOTE ---
rare_thresh = 2000
unique, counts = np.unique(y_train, return_counts=True)
rare_classes = [cls for cls, count in zip(unique, counts) if count < rare_thresh]
rare_mask = np.isin(y_train, rare_classes)

X_train_rare, y_train_rare = X_train[rare_mask], y_train[rare_mask]
X_train_majority, y_train_majority = X_train[~rare_mask], y_train[~rare_mask]

smote = SMOTE(random_state=42)
X_rare_resampled, y_rare_resampled = smote.fit_resample(X_train_rare, y_train_rare)

X_train_combined = np.vstack([X_train_majority, X_rare_resampled])
y_train_combined = np.concatenate([y_train_majority, y_rare_resampled])

# --- Class Weights ---
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_combined), y=y_train_combined)
weights_dict = {i: w for i, w in enumerate(class_weights)}
sample_weights = np.array([weights_dict[label] for label in y_train_combined])

# --- Supervised Model: LightGBM Multiclass ---
train_data = lgb.Dataset(X_train_combined, label=y_train_combined, weight=sample_weights)
valid_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

class TqdmCallback:
    def __init__(self, total):
        self.pbar = tqdm(total=total, desc="LightGBM Training")
    def __call__(self, env):
        self.pbar.update()
        if env.iteration + 1 == env.end_iteration:
            self.pbar.close()

params = {
    'objective': 'multiclass',
    'num_class': len(le_label.classes_),
    'metric': 'multi_logloss',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'verbosity': -1,
}
tqdm_callback = TqdmCallback(total=200)

bst = lgb.train(
    params=params,
    train_set=train_data,
    valid_sets=[valid_data],
    num_boost_round=200,
    callbacks=[lgb.early_stopping(stopping_rounds=20), tqdm_callback]
)

y_pred_prob = bst.predict(X_test)
y_pred_labels = np.argmax(y_pred_prob, axis=1)
print("Main classifier report:")
print(classification_report(y_test, y_pred_labels, target_names=le_label.classes_))

# --- Specialist Classifier for Rare Class: Web Attack - XSS ---
target_class = 'Web Attack - XSS'
target_label = list(le_label.classes_).index(target_class)

y_train_spec = (y_train == target_label).astype(int)
y_test_spec = (y_test == target_label).astype(int)

smote_spec = SMOTE(random_state=42)
X_train_spec_res, y_train_spec_res = smote_spec.fit_resample(X_train, y_train_spec)
class_wt_spec = compute_class_weight('balanced', classes=np.unique(y_train_spec_res), y=y_train_spec_res)
weights_spec_dict = {i: w for i, w in enumerate(class_wt_spec)}
sample_weights_spec = np.array([weights_spec_dict[label] for label in y_train_spec_res])

train_data_spec = lgb.Dataset(X_train_spec_res, label=y_train_spec_res, weight=sample_weights_spec)
valid_data_spec = lgb.Dataset(X_test, label=y_test_spec, reference=train_data_spec)

params_spec = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.1,
    'verbosity': -1,
}
tqdm_callback_spec = TqdmCallback(total=200)
bst_spec = lgb.train(
    params=params_spec,
    train_set=train_data_spec,
    valid_sets=[valid_data_spec],
    num_boost_round=200,
    callbacks=[lgb.early_stopping(stopping_rounds=20), tqdm_callback_spec]
)

y_pred_spec = bst_spec.predict(X_test)
y_pred_spec_labels = (y_pred_spec > 0.5).astype(int)
print(f"Specialist classifier report for {target_class}:")
print(classification_report(y_test_spec, y_pred_spec_labels))

print("Supervised training complete. Starting unsupervised training...")



Collecting imbalanced-learn==0.10.1
  Downloading imbalanced_learn-0.10.1-py3-none-any.whl.metadata (8.2 kB)
Downloading imbalanced_learn-0.10.1-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.0/226.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.13.0
    Uninstalling imbalanced-learn-0.13.0:
      Successfully uninstalled imbalanced-learn-0.13.0
Successfully installed imbalanced-learn-0.10.1
Found 8 Parquet files: ['Benign-Monday-no-metadata.parquet', 'Bruteforce-Tuesday-no-metadata.parquet', 'Portscan-Friday-no-metadata.parquet', 'WebAttacks-Thursday-no-metadata.parquet', 'DoS-Wednesday-no-metadata.parquet', 'DDoS-Friday-no-metadata.parquet', 'Infiltration-Thursday-no-metadata.parquet', 'Botnet-Friday-no-metadata.parquet']


Loading data: 100%|██████████| 8/8 [00:03<00:00,  2.34it/s]


Data shape after cleaning: (2313810, 78)


Encoding categorical features: 0it [00:00, ?it/s]
LightGBM Training:   0%|          | 1/200 [00:07<25:16,  7.62s/it]

Training until validation scores don't improve for 20 rounds


LightGBM Training:  47%|████▋     | 94/200 [03:38<04:06,  2.33s/it]

Early stopping, best iteration is:
[74]	valid_0's multi_logloss: 0.00850718
Main classifier report:
                            precision    recall  f1-score   support

                    Benign       1.00      1.00      1.00    395464
                       Bot       0.51      0.99      0.68       288
                      DDoS       1.00      1.00      1.00     25603
             DoS GoldenEye       0.99      1.00      1.00      2057
                  DoS Hulk       1.00      1.00      1.00     34569
          DoS Slowhttptest       0.93      0.99      0.96      1046
             DoS slowloris       0.99      0.99      0.99      1077
               FTP-Patator       1.00      1.00      1.00      1186
                Heartbleed       1.00      1.00      1.00         2
              Infiltration       0.71      0.71      0.71         7
                  PortScan       0.90      0.98      0.94       391
               SSH-Patator       1.00      1.00      1.00       644
  Web Attack - 


LightGBM Training:   0%|          | 0/200 [00:00<?, ?it/s][A
LightGBM Training:   0%|          | 1/200 [00:08<29:27,  8.88s/it][A

Training until validation scores don't improve for 20 rounds



LightGBM Training:   1%|          | 2/200 [00:09<12:31,  3.80s/it][A
LightGBM Training:   2%|▏         | 3/200 [00:09<07:06,  2.17s/it][A
LightGBM Training:   2%|▏         | 4/200 [00:09<04:35,  1.40s/it][A
LightGBM Training:   2%|▎         | 5/200 [00:09<03:11,  1.02it/s][A
LightGBM Training:   3%|▎         | 6/200 [00:10<02:21,  1.37it/s][A
LightGBM Training:   4%|▎         | 7/200 [00:10<01:49,  1.76it/s][A
LightGBM Training:   4%|▍         | 8/200 [00:10<01:29,  2.16it/s][A
LightGBM Training:   4%|▍         | 9/200 [00:10<01:15,  2.53it/s][A
LightGBM Training:   5%|▌         | 10/200 [00:11<01:05,  2.88it/s][A
LightGBM Training:   6%|▌         | 11/200 [00:11<01:00,  3.13it/s][A
LightGBM Training:   6%|▌         | 12/200 [00:11<00:56,  3.35it/s][A
LightGBM Training:   6%|▋         | 13/200 [00:11<00:52,  3.54it/s][A
LightGBM Training:   7%|▋         | 14/200 [00:12<00:50,  3.69it/s][A
LightGBM Training:   8%|▊         | 15/200 [00:12<00:48,  3.78it/s][A
LightGBM Trai

Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.00454672
Specialist classifier report for Web Attack - XSS:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    462632
           1       0.30      0.92      0.45       130

    accuracy                           1.00    462762
   macro avg       0.65      0.96      0.73    462762
weighted avg       1.00      1.00      1.00    462762

Supervised training complete. Starting unsupervised training...


In [2]:
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

print("Starting unsupervised training...")

pbar = tqdm(total=3, desc="Unsupervised Training")

# Isolation Forest trained on full benign data
benign_label = le_label.transform(['Benign'])[0]
X_benign = X_train[y_train == benign_label]

iso_forest = IsolationForest(contamination=0.01, random_state=42)
iso_forest.fit(X_benign)
pbar.update(1)

subset_size = 50000  # or smaller if needed
np.random.seed(42)
if len(X_benign) > subset_size:
    sample_indices = np.random.choice(len(X_benign), subset_size, replace=False)
    X_benign_svm = X_benign[sample_indices]
else:
    X_benign_svm = X_benign

one_class_svm = OneClassSVM(kernel='rbf', gamma='scale', nu=0.01)
one_class_svm.fit(X_benign_svm)
pbar.update(1)


# Autoencoder trained on full benign data
X_auto_train = X_benign
input_dim = X_auto_train.shape[1]
encoding_dim = 32

autoencoder = keras.Sequential([
    layers.Dense(encoding_dim, activation='relu', input_shape=(input_dim,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(encoding_dim, activation='relu'),
    layers.Dense(input_dim, activation='linear')
])
autoencoder.compile(optimizer='adam', loss='mse')

history = autoencoder.fit(
    X_auto_train, X_auto_train,
    epochs=20,
    batch_size=256,
    shuffle=True,
    validation_split=0.1,
    verbose=1
)
pbar.update(1)

pbar.close()
print("Unsupervised training complete.")

# Predict anomalies using Isolation Forest and One-Class SVM
iso_pred = iso_forest.predict(X_test)
svm_pred = one_class_svm.predict(X_test)
combined_anomaly = np.where((iso_pred == -1) | (svm_pred == -1), 1, 0)
true_anomaly = (y_test != benign_label).astype(int)

print("Combined anomaly detection report:")
print(classification_report(true_anomaly, combined_anomaly))

# Calculate autoencoder anomaly scores and threshold
reconstructions = autoencoder.predict(X_auto_train)
mse = np.mean(np.power(X_auto_train - reconstructions, 2), axis=1)
threshold = np.mean(mse) + 2 * np.std(mse)

test_reconstructions = autoencoder.predict(X_test)
test_mse = np.mean(np.power(X_test - test_reconstructions, 2), axis=1)
autoencoder_anomaly = (test_mse > threshold).astype(int)

print("Autoencoder anomaly count (test set):", np.sum(autoencoder_anomaly))

# Merge all anomaly and supervised detections
final_alert = (
    (y_pred_labels != benign_label) |         # Supervised model detection
    (combined_anomaly == 1) |                  # IsolationForest or One-Class SVM anomaly
    (autoencoder_anomaly == 1)                 # Autoencoder anomaly
)

print("Total samples flagged as malicious or anomalous (any model):", np.sum(final_alert))

# Save all models and preprocessors
joblib.dump(bst, 'lgb_main_smote_weighted.pkl')
joblib.dump(bst_spec, f'lgb_specialist_{target_class.replace(" ", "_")}.pkl')
joblib.dump(iso_forest, 'isolation_forest.pkl')
joblib.dump(one_class_svm, 'one_class_svm.pkl')
autoencoder.save('autoencoder_anomaly_model.h5')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(le_label, 'label_encoder.pkl')

print("All models and preprocessors saved successfully.")


2025-09-09 10:43:56.533907: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757414636.709893      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757414636.762665      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Starting unsupervised training...




Unsupervised Training:   0%|          | 0/3 [00:00<?, ?it/s][A[A

Unsupervised Training:  33%|███▎      | 1/3 [00:49<01:39, 49.82s/it][A[A

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1757414702.386641      36 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/20


I0000 00:00:1757414708.075724     125 service.cc:148] XLA service 0x789a1006b4e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1757414708.076303     125 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1757414708.318348     125 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m  75/5562[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m11s[0m 2ms/step - loss: 0.0525

I0000 00:00:1757414709.062229     125 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m5562/5562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 2ms/step - loss: 0.0040 - val_loss: 9.3829e-05
Epoch 2/20
[1m5562/5562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 8.0595e-05 - val_loss: 6.6336e-05
Epoch 3/20
[1m5562/5562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - loss: 5.8755e-05 - val_loss: 5.3504e-05
Epoch 4/20
[1m5562/5562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - loss: 5.0734e-05 - val_loss: 4.8205e-05
Epoch 5/20
[1m5562/5562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - loss: 4.6347e-05 - val_loss: 4.2751e-05
Epoch 6/20
[1m5562/5562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 4.1724e-05 - val_loss: 4.1308e-05
Epoch 7/20
[1m5562/5562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 3.9557e-05 - val_loss: 3.6786e-05
Epoch 8/20
[1m5562/5562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - loss: 3



Unsupervised Training: 100%|██████████| 3/3 [05:12<00:00, 104.32s/it][A[A

Unsupervised training complete.





Combined anomaly detection report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95    395464
           1       0.83      0.50      0.62     67298

    accuracy                           0.91    462762
   macro avg       0.87      0.74      0.79    462762
weighted avg       0.91      0.91      0.90    462762

[1m49433/49433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 1ms/step
[1m14462/14462[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1ms/step
Autoencoder anomaly count (test set): 27612
Total samples flagged as malicious or anomalous (any model): 75387
All models and preprocessors saved successfully.
