In [5]:
import copy
import io

import numpy as np

import graphviz
import matplotlib.pyplot as plt
import pandas as pd
import rootpath
import shap
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from interpret import show
from interpret.blackbox import LimeTabular
from skexplain.enums.feature_type import FeatureType
from skexplain.imitation import (ClassificationDagger, RegressionDagger)
from skexplain.utils import dataset, log, persist
from skexplain.utils.const import (BOSTON_DATASET_META,
                                   CIC_IDS_2017_DATASET_META,
                                   HEARTBLEED_DATASET_META,
                                   HEARTBLEED_LARGE_DATASET_META,
                                   DIABETES_DATASET_META,
                                   DOWNLOAD_DATASET_META, IOT_DATASET_META,
                                   WINE_DATASET_META)
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, f1_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import LinearSVC

df_test_meta=HEARTBLEED_LARGE_DATASET_META
df_train_meta=CIC_IDS_2017_DATASET_META 
model=RandomForestClassifier
as_df=False


""" Test using Reinforcement Learning to extract Decision Tree from a generic Blackbox model """
logger = log.Logger(
    "{}/res/log/{}/notebook_{}_{}.log".format(rootpath.detect(), df_train_meta['name'],  model.__name__, "Raw")
)
logger.log('Init done.')

2021-06-29 20:17:30,159 - INFO - Init done.


In [6]:
df_train_test_path = "{}/res/dataset/CIC-IDS-2017-TEST/".format(rootpath.detect())

# Step 1: Parse test def
X_train, y_train, feature_names, _, _ = dataset.read(df_train_test_path, metadata=df_train_meta,
                                         verbose=True, logger=logger, as_df=as_df)

2021-06-29 20:17:30,167 - INFO - Names: ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet

['/Users/asjacobs/workspace/explainability/scikit-explain/res/dataset/CIC-IDS-2017-TEST/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv.zip', '/Users/asjacobs/workspace/explainability/scikit-explain/res/dataset/CIC-IDS-2017-TEST/Monday-WorkingHours.pcap_ISCX.csv.zip', '/Users/asjacobs/workspace/explainability/scikit-explain/res/dataset/CIC-IDS-2017-TEST/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv.zip', '/Users/asjacobs/workspace/explainability/scikit-explain/res/dataset/CIC-IDS-2017-TEST/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv.zip', '/Users/asjacobs/workspace/explainability/scikit-explain/res/dataset/CIC-IDS-2017-TEST/Wednesday-workingHours.pcap_ISCX.csv.zip', '/Users/asjacobs/workspace/explainability/scikit-explain/res/dataset/CIC-IDS-2017-TEST/Friday-WorkingHours-Morning.pcap_ISCX.csv.zip', '/Users/asjacobs/workspace/explainability/scikit-explain/res/dataset/CIC-IDS-2017-TEST/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv.zip', '/Users/asjacobs/w

2021-06-29 20:17:49,054 - INFO - Pandas read_csv complete.
2021-06-29 20:17:49,055 - INFO - Column/Categories Fwd PSH Flags [0, 1]
2021-06-29 20:17:49,162 - INFO - Column/Categories Fwd URG Flags [0, 1]
2021-06-29 20:17:49,248 - INFO - CSV dataset read:
2021-06-29 20:17:49,599 - INFO -          Flow Duration  Total Fwd Packets  Total Backward Packets  \
0                    3                  2                       0   
1                  109                  1                       1   
2                   52                  1                       1   
3                   34                  1                       1   
4                    3                  2                       0   
...                ...                ...                     ...   
2830738            155                  2                       2   
2830739            110                  1                       1   
2830740            166                  2                       2   
2830741             81 

In [7]:
# Step 2: Train black-box model with loaded dataset
logger.log("#" * 10, "Model init", "#" * 10)
model_path = "../res/{}_{}_{}_{}.joblib.zip".format(model.__name__,
                                                        "Raw", df_train_meta['name'], X_train.shape[1])
logger.log("Looking for pre-trained model: {}...".format(model_path))
blackbox = persist.load_model(model_path)
if not blackbox:
    raise ValueError("Trained model not found. Please train model before unit testing it.")
logger.log("#" * 10, "Done", "#" * 10)

2021-06-29 20:17:58,467 - INFO - ########## Model init ##########
2021-06-29 20:17:58,469 - INFO - Looking for pre-trained model: ../res/RandomForestClassifier_Raw_cic_ids_2017_70.joblib.zip...


ValueError: Trained model not found. Please train model before unit testing it.

In [None]:
X_test, y_test, feature_names, _, _ = dataset.read(df_test_meta['path'], metadata=df_test_meta,
                                         verbose=True, logger=logger, as_df=as_df)                                                   


logger.log(df_test_meta['path'])
logger.log(X_test)
logger.log(y_test)

y_pred = blackbox.predict(X_test)
logger.log(list(y_pred))

logger.log("Blackbox model training classification report:")
logger.log("\n{}".format(classification_report(y_test, y_pred, digits=3)))