In [2]:
import copy
import io

import numpy as np

import graphviz
import matplotlib.pyplot as plt
import pandas as pd
import rootpath
import shap
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from interpret import show
from interpret.blackbox import LimeTabular
from skexplain.enums.feature_type import FeatureType
from skexplain.imitation import (ClassificationDagger,
                                 IncrementalClassificationDagger,
                                 RegressionDagger)
from skexplain.utils import dataset, log, persist
from skexplain.utils.const import (BOSTON_DATASET_META,
                                   CIC_IDS_2017_DATASET_META,
                                   DIABETES_DATASET_META,
                                   HEARTBLEED_DATASET_META,
                                   DOWNLOAD_DATASET_META, IOT_DATASET_META,
                                   WINE_DATASET_META)
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, f1_score, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import LinearSVC

# Experiment configuration.
# Dataset metadata: the train set feeds the model, the test set is read by a later cell.
df_train_meta = CIC_IDS_2017_DATASET_META
df_test_meta = HEARTBLEED_DATASET_META
# In this cell only `model.__name__` is read, to build the log file name.
model = RandomForestClassifier
# Forwarded as `as_df=` to dataset.read(...) by the data-loading cells.
as_df = False


""" Test using Reinforcement Learning to extract Decision Tree from a generic Blackbox model """
# Log file lives under <project root>/res/log/<train dataset name>/.
logger = log.Logger(
    "{}/res/log/{}/notebook_{}_{}.log".format(
        rootpath.detect(),
        df_train_meta['name'],
        model.__name__,
        "Raw",
    )
)
logger.log('Init done.')

2021-04-21 20:37:16,186 - INFO - Init done.


In [5]:
# Step 1: Read the training dataset described by df_train_meta.
# The last two values returned by dataset.read are discarded here.
X_train, y_train, feature_names, _, _ = dataset.read(
    df_train_meta['path'],
    metadata=df_train_meta,
    as_df=as_df,
    logger=logger,
    verbose=True,
)

2021-04-21 20:37:46,900 - INFO - Names: ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet

2021-04-21 20:38:08,352 - INFO - Targets shape: (2275074, 1) Index(['Label'], dtype='object')


In [4]:
from interpret.glassbox import (ClassificationTree, DecisionListClassifier,
                                ExplainableBoostingClassifier,
                                LogisticRegression)

# Step 2: Obtain the black-box model (an Explainable Boosting Classifier).
# A previously saved model is reused when persist.load_model returns something
# truthy; otherwise a new model is fitted on the training data and saved.
logger.log("#" * 10, "Model init", "#" * 10)

# The weights file name encodes the model tag, the dataset name and the
# number of feature columns in X_train.
model_path = "../res/weights/{}_{}_{}_{}.joblib".format(
    'ebm', "Raw", df_train_meta['name'], X_train.shape[1]
)
logger.log("Looking for pre-trained model: {}...".format(model_path))

ebm = persist.load_model(model_path)
if not ebm:
    ebm = ExplainableBoostingClassifier(
        n_jobs=-1,
        feature_names=list(feature_names),
    )
    ebm.fit(X_train, y_train)  # Works on dataframes and numpy arrays
    persist.save_model(ebm, model_path)

logger.log("#" * 10, "Done", "#" * 10)

2021-04-21 20:37:35,248 - INFO - ########## Model init ##########


NameError: name 'X_train' is not defined

In [7]:
# Score the black-box model on the same data it was fitted on and log a
# per-class precision / recall / F1 report (three decimal digits).
logger.log("#" * 10, "Model test", "#" * 10)

y_pred = ebm.predict(X_train)

logger.log("Blackbox model training classification report:")
logger.log(
    "\n{}".format(
        classification_report(y_train, y_pred, digits=3)
    )
)

logger.log("#" * 10, "Done", "#" * 10)

2021-04-19 11:56:37,515 - INFO - ########## Model test ##########
2021-04-19 11:57:17,845 - INFO - Blackbox model training classification report:
2021-04-19 11:57:20,325 - INFO - 
              precision    recall  f1-score   support

           0      1.000     1.000     1.000   2273097
           1      0.999     0.993     0.996    130004

    accuracy                          1.000   2403101
   macro avg      0.999     0.997     0.998   2403101
weighted avg      1.000     1.000     1.000   2403101

2021-04-19 11:57:20,326 - INFO - ########## Done ##########


In [3]:
# Read the test dataset described by df_test_meta and run the fitted EBM on it.
# Note: this cell rebinds `feature_names` and `y_pred`, which earlier cells
# also assign.
X_test, y_test, feature_names, _, _ = dataset.read(
    df_test_meta['path'],
    metadata=df_test_meta,
    as_df=as_df,
    logger=logger,
    verbose=True,
)

# Log the loaded feature matrix and label array.
logger.log(X_test)
logger.log(y_test)

# Log the flattened ground-truth labels next to the model's predictions.
y_pred = ebm.predict(X_test)
logger.log(y_test.ravel(), y_pred)

2021-04-21 20:37:20,577 - INFO - Names: ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Fwd URG Flags', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count', 'Down/Up Ratio', 'Average Packet

2021-04-21 20:37:20,656 - INFO - Targets shape: (1008, 1) Index(['Label'], dtype='object')
2021-04-21 20:37:20,663 - INFO - [[7.660e+02 6.000e+00 6.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [7.126e+03 8.000e+00 7.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [5.284e+03 8.000e+00 7.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 ...
 [3.100e+02 6.000e+00 6.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [6.530e+02 6.000e+00 6.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]
 [7.860e+02 6.000e+00 6.000e+00 ... 0.000e+00 1.000e+00 0.000e+00]]
2021-04-21 20:37:20,664 - INFO - [[0]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]


NameError: name 'ebm' is not defined

In [9]:
# Build the model-wide (global) explanation of the fitted EBM, labelled
# "EBM", and render it with interpret's `show`.
ebm_global = ebm.explain_global(name="EBM")
show(ebm_global)

In [19]:
# Per-sample (local) explanations for the test set.
# The explanation is computed on `X_test`, the same matrix that is predicted
# on below and that pairs with `y_test`, so explanations, predictions and
# labels all refer to the same rows. `X_test` is the matrix produced by the
# test-set loading cell, so this cell runs on a fresh kernel.
ebm_local = ebm.explain_local(X_test, y_test)

# Predictions for the same rows, printed next to the ground-truth labels.
ebm_res = ebm.predict(X_test)

print(ebm_res, y_test)

# Render the local explanations with interpret's `show`.
show(ebm_local)

[0 1] [[0]
 [1]]
