# CIC-IDS-2017 IID vs OOD

In [1]:
import numpy as np

from skexplain.utils import dataset, log, persist
from skexplain.utils.const import CIC_IDS_2017_DATASET_META

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Location passed to dataset.read() for the CIC-IDS-2017 train/test data.
# NOTE(review): the trailing slash suggests a directory of CSV files — confirm.
df_train_test_path = "res/dataset/CIC-IDS-2017/"
# CSV file passed to dataset.read() for the Heartbleed out-of-distribution
# (OOD) validation set. Both paths are relative, so they resolve against the
# kernel's working directory.
df_validate_path = "res/dataset/validation/heartbleed.csv"

# CIC-IDS-2017 RandomForestClassifier Classification Accuracy with IID

In [3]:
# Fixed seed so the train/test split (and any retraining) is reproducible.
# Without it, every kernel run draws a different split while the classifier
# is loaded from disk, so rows of the "test" set may have been seen during
# the cached model's training, which inflates the IID scores.
# NOTE: a model file persisted before this seed was introduced was fitted on
# an unknown split; delete it once so the model is retrained on this split.
RANDOM_STATE = 42

print("Reading CIC-IDS-2017 dataset...")
# Step 1: Parse the train/test dataset
X, y, _, _, _ = dataset.read(
    df_train_test_path, metadata=CIC_IDS_2017_DATASET_META, as_df=True
)
print("Done!")

print("Splitting dataset into training and test...")
# Split row positions instead of the frame itself, then select the rows with
# .iloc; the split is stratified on y so both parts keep the class ratios.
row_positions = np.arange(X.shape[0])
train_rows, test_rows, y_train, y_test = train_test_split(
    row_positions,
    y,
    train_size=0.7,
    stratify=y,
    random_state=RANDOM_STATE,
)
X_train = X.iloc[train_rows]
X_test = X.iloc[test_rows]
print("Done!")

# Step 2: Train black-box model with loaded dataset
print("#" * 10, "Model init", "#" * 10)
model_path = "RandomForestClassifier_cic_ids_2017.joblib.zip"
print("Looking for pre-trained model: {}...".format(model_path))
blackbox = persist.load_model(model_path)
# NOTE(review): assumes persist.load_model returns a falsy value when no
# saved model exists — confirm against skexplain.utils.persist.
if not blackbox:
    print("Training model: RandomForestClassifier...")
    blackbox = RandomForestClassifier(n_jobs=4, random_state=RANDOM_STATE)
    blackbox.fit(X_train, y_train)
    persist.save_model(blackbox, model_path)

print("Done!")

# Step 3: Evaluate the black-box model on the held-out (IID) test rows
y_pred = blackbox.predict(X_test)

print("Blackbox model classification report with IID:")
print(
    "\n{}".format(
        classification_report(
            y_test,
            y_pred,
            digits=3,
            target_names=CIC_IDS_2017_DATASET_META["classes"],
        )
    )
)

Reading CIC-IDS-2017 dataset...


  converters=metadata["converters"] if "converters" in metadata else None,


Done!
Splitting dataset into training and test...
Done!
########## Model init ##########
Looking for pre-trained model: RandomForestClassifier_cic_ids_2017.joblib.zip...
Done!
Blackbox model classification report with IID:

                          precision    recall  f1-score   support

                  BENIGN      1.000     0.999     1.000    681929
                     Bot      0.934     0.917     0.926       590
                    DDoS      1.000     1.000     1.000     38408
           DoS GoldenEye      1.000     0.998     0.999      3088
                DoS Hulk      0.999     0.999     0.999     69322
        DoS Slowhttptest      0.987     0.995     0.991      1650
           DoS slowloris      0.998     0.999     0.998      1739
             FTP-Patator      1.000     1.000     1.000      2381
              Heartbleed      1.000     1.000     1.000         3
            Infiltration      1.000     0.909     0.952        11
                PortScan      0.994     1.000    

# CIC-IDS-2017 RandomForestClassifier Classification Accuracy with OOD

In [5]:
print("Reading Heartbleed OOD dataset...")
# Build a *copy* of the shared metadata with is_dir overridden. Assigning the
# constant to a new name only aliases it, so setting the key in place would
# also change CIC_IDS_2017_DATASET_META for every other cell (e.g. re-running
# the IID cell would then read the dataset directory with is_dir=False).
df_meta = {**CIC_IDS_2017_DATASET_META, "is_dir": False}
X_validate, y_validate, _, _, _ = dataset.read(
    df_validate_path, metadata=df_meta, as_df=True
)
print("Done!")

# Score the model trained on CIC-IDS-2017 against the OOD Heartbleed capture
y_val_pred = blackbox.predict(X_validate)

print("Blackbox model classification report with OOD:")
# NOTE(review): target_names assumes exactly two distinct labels appear across
# y_validate and y_val_pred, and that the validation label encoding matches
# the one the model was trained with — confirm; classification_report raises
# if the number of labels and names disagree.
print(
    "\n{}".format(
        classification_report(
            y_validate,
            y_val_pred,
            digits=3,
            target_names=["BENIGN", "Heartbleed"],
        )
    )
)


2021-04-21 21:04:09,385 - INFO - Names: ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet

2021-04-21 21:04:09,460 - INFO - Targets shape: (1041, 1) Index(['Label'], dtype='object')
2021-04-21 21:04:09,465 - INFO - /Users/asjacobs/workspace/explainability/scikit-explain/res/dataset/heartbleed-large/heartbleed-large.csv
2021-04-21 21:04:09,466 - INFO - [[1.4780e+03 6.0000e+00 6.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]
 [1.2842e+04 9.0000e+00 8.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]
 [1.0592e+04 9.0000e+00 8.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]
 ...
 [1.1570e+03 6.0000e+00 6.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]
 [9.6500e+02 6.0000e+00 6.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]
 [1.3380e+03 6.0000e+00 6.0000e+00 ... 0.0000e+00 1.0000e+00 0.0000e+00]]
2021-04-21 21:04:09,467 - INFO - [[0]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]
2021-04-21 21:04:09,577 - INFO - [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0