# Modeling IoT Malware Classification using XGBoost

In [1]:
# Setup cell: record the start time, import dependencies, make the custom
# helper module importable, and apply common seed / display settings.
# %pip install --upgrade pyarrow

# built in modules
from datetime import datetime
#tml.get_current_timestamp()
# Taken before the remaining imports so the elapsed-time report in the final
# cell (time_end_notebook - time_start_notebook) covers the whole notebook run.
time_start_notebook =  datetime.now()
import os
import sys
import importlib

# PIP modules
import pandas as pd
import numpy as np
from   sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import sklearn.metrics as skm
from   xgboost import XGBClassifier

# My custom Python module
######################### DON'T TOUCH ANYTHING BELOW #########################
# Put the parent directory on the module search path (once) so that
# `tim_ml_lib` can be imported from there.
# NOTE(review): presumably tim_ml_lib.py lives one directory above this notebook - confirm.
newpath = '..'
if not newpath in sys.path:
    sys.path.insert(1, newpath)
del newpath
import tim_ml_lib as tml    # works
importlib.reload(tml)     # reload it since I'm frequently making changes.
######################### DON'T TOUCH ANYTHING ABOVE #########################

# some common, basic setup
# Both helpers come from tim_ml_lib; going by their names they seed the random
# number generators and configure display options - see that module for details.
tml.initialize_random_seeds()
tml.initialize_display_options()

# Model Options

In [2]:
# ---- Input data -------------------------------------------------------------
# Gzipped CSV that is read chunk-by-chunk further down.
# (Template placeholder for other datasets: os.path.join('data', 'REPLACEME.csv'))
INPUT_FILEPATH = './data/CTU-IoT-Malware-Capture_ready2train.csv.gz'

# ---- Grid-search / cross-validation settings --------------------------------
# Key from the 'scoring' dict; GridSearchCV refits on this metric.
PRIMARY_METRIC = 'F1'
# Number of CPU cores to use.
N_JOBS = tml.get_ideal_num_jobs()
# Cross validation - number of folds.
CV = 7
# Disabled option: N_PCA_COMPONENTS = 5
#   (number of dimensions to keep after principal component analysis transformation)
# Verbose level for GridSearchCV: 0-4.
VERBOSITY_LEVEL = 0
# Passed as `zero_division` to the F1 / recall / precision scorers.
ZERO_DIVISION_VALUE = 0
# Name of the target column in the training data.
LABEL_COLUMN_NAME = 'is_IoT_malware'
# Shared seed taken from the tim_ml_lib settings.
RANDOM_STATE = tml.settings.RANDOM_STATE

# ---- Hyper-parameter grid ---------------------------------------------------
# Candidate values tried by GridSearchCV for each XGBClassifier parameter.
# Larger n_estimators candidates considered earlier: 250, 500, 750, 1000.
param_grid = dict(
    max_depth=[2],
    n_estimators=[10, 100],
    learning_rate=[0.9],
)

# Metrics evaluated by GridSearchCV for every parameter combination.
# Keys are display names (PRIMARY_METRIC must be one of them); values are either
# built-in scikit-learn scorer names or scorers built with make_scorer.
# Every active entry was checked and is not the source of the earlier scorer
# failure; only the disabled `skm.auc` entry was the problem.
scoring = {
    # Label-based classification metrics (scored on predict() output).
    "F1":                 skm.make_scorer(skm.f1_score, zero_division=ZERO_DIVISION_VALUE),
    "Accuracy":           'accuracy',
    "Recall":             skm.make_scorer(skm.recall_score, zero_division=ZERO_DIVISION_VALUE),
    "Precision":          skm.make_scorer(skm.precision_score, zero_division=ZERO_DIVISION_VALUE),

    # Built-in scorer: ranks with predicted scores / probabilities.
    "roc_auc":            "roc_auc",
    # NOTE(review): make_scorer with default arguments feeds hard class predictions
    # to roc_auc_score, so this value differs from "roc_auc" above. Kept as-is.
    "roc_auc_score":      skm.make_scorer(skm.roc_auc_score),
    # Disabled: skm.auc takes curve coordinates (x, y), not (y_true, y_pred).
    #"auc":                skm.make_scorer(skm.auc), # PROBLEM!!!!

    # Probability-based losses, negated so that greater is better.
    "neg_log_loss":       "neg_log_loss",
    # Fix: make_scorer(brier_score_loss) with defaults scored hard labels and
    # treated the loss as greater-is-better. The built-in scorer uses
    # predict_proba and reports the NEGATED Brier score (closer to 0 is better).
    "Brier Score Loss":   "neg_brier_score",
    }

# Floating-point dtype shared by all continuous feature columns.
# Results of earlier experiments with the alternatives:
#   'float64[pyarrow]' - works; with a 10% sample the df is 115 MB. No errors.
#   'float32[pyarrow]' - works; with a 10% sample the df is 75 MB. No errors.
#                        max/min +- 3.4 x 10^38; consumes 131 MB of RAM with
#                        float32 and chunksize=6252750 (25%).
#   'float16[pyarrow]' - ERROR, cannot run.
FLOAT_DATATYPE = 'float32[pyarrow]'

# Column name -> dtype mapping handed to pd.read_csv(dtype=...).
TRAINING_DATATYPES = dict(
    is_IoT_malware='uint8[pyarrow]',        # label
    duration=FLOAT_DATATYPE,                # float b/c of standardization
    orig_pkts=FLOAT_DATATYPE,               # float b/c of standardization
    hour_of_day=FLOAT_DATATYPE,             # added feature - values range over [0,23]
    src_port_range_name='uint8[pyarrow]',   # added feature - values range over [0,4]
    orig_ip_bytes=FLOAT_DATATYPE,
    proto_icmp='uint8[pyarrow]',
    proto_tcp='uint8[pyarrow]',
    proto_udp='uint8[pyarrow]',
)


# Load data in Chunks:
https://www.vantage-ai.com/en/blog/4-strategies-how-to-deal-with-large-datasets-in-pandas

FAILED when using chunksize = 6000000, CV=7
 * BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.


Currently using chunksize = 3000000, CV=7, which runs without the error above.

In [3]:
# Train one independent GridSearchCV per CSV chunk.
#
# The input file is streamed in fixed-size chunks so that only one chunk is held
# in memory at a time. Each chunk gets a fresh XGBClassifier and grid search;
# the fitted searches and their stats are collected in `models` and
# `history_stats` (one entry per chunk, same order) and are not merged.
#
# original file has 25,011,003 rows. I often ran into problems when I went above 25% sample size, but that was with float64
# so try 6,252,750
chunksize     = 3000000
engine        = 'c'          # pyarrow does not support chunking
models        = []
history_stats = []
# loop_max    = 3

# The chunked reader is used as a context manager so the underlying file handle
# is closed even if a fit raises part-way through (e.g. BrokenProcessPool).
with pd.read_csv(INPUT_FILEPATH, chunksize=chunksize, delimiter=',', engine=engine,
                 dtype=TRAINING_DATATYPES, header=0) as chunk_reader:
    for loop_counter, df in enumerate(chunk_reader, start=1):
        print(f'========== Start Chunk # {loop_counter}, chunksize={chunksize} ==========')
        # Re-seed for every chunk so each fit starts from the same random state.
        tml.initialize_random_seeds()

        model_generic = XGBClassifier(
            objective='binary:logistic',
            random_state=RANDOM_STATE,
            )

        # Split the chunk into the label vector and the feature matrix.
        y  = df[LABEL_COLUMN_NAME]
        X  = df.drop(columns=LABEL_COLUMN_NAME)

        print(f'Dataframe chunk shape: {df.shape}')
        print(f'Dataframe size in memory: {tml.get_total_dataframe_memory_usage(df)}')
        print(f'X shape: {X.shape}')
        print(f'y shape: {y.shape}')

        # Multi-metric search; the best estimator is refit on PRIMARY_METRIC.
        clf = GridSearchCV(
                            model_generic,
                            param_grid,
                            verbose=VERBOSITY_LEVEL,
                            n_jobs=N_JOBS,
                            cv=CV,
                            scoring=scoring,
                            refit=PRIMARY_METRIC,
                            return_train_score=True)

        clf.fit(X, y)
        print(f'Fitting GridSearch chunk # {loop_counter} was successful.')
        # Helper from tim_ml_lib; going by its name it writes the search results to
        # CSV. NOTE(review): the returned dict is assumed to carry run metadata
        # (the disabled plotting cell reads 'Timestamp' / 'Experiment GUID') - confirm.
        stats_dict = tml.write_GridSearchCV_output_to_csv(PRIMARY_METRIC, clf, X, y)

        print(f'Best {PRIMARY_METRIC} score: {str(clf.best_score_)}')
        print(f'Parameters associated with best model: {str(clf.best_params_)}')

        models.append(clf)
        history_stats.append(stats_dict)
        print(f'========== End loop for chunk # {loop_counter} ==========\n')


Dataframe chunk shape: (3000000, 9)
Dataframe size in memory: 63.0 MB
X shape: (3000000, 8)
y shape: (3000000,)
Fitting GridSearch chunk # 1 was successful.
Best F1 score: 0.9856652257064976
Parameters associated with best model: {'learning_rate': 0.9, 'max_depth': 2, 'n_estimators': 100}

Dataframe chunk shape: (3000000, 9)
Dataframe size in memory: 63.0 MB
X shape: (3000000, 8)
y shape: (3000000,)
Fitting GridSearch chunk # 2 was successful.
Best F1 score: 0.9846174325534693
Parameters associated with best model: {'learning_rate': 0.9, 'max_depth': 2, 'n_estimators': 10}

Dataframe chunk shape: (3000000, 9)
Dataframe size in memory: 63.0 MB
X shape: (3000000, 8)
y shape: (3000000,)
Fitting GridSearch chunk # 3 was successful.
Best F1 score: 0.9846761015973949
Parameters associated with best model: {'learning_rate': 0.9, 'max_depth': 2, 'n_estimators': 10}

Dataframe chunk shape: (3000000, 9)
Dataframe size in memory: 63.0 MB
X shape: (3000000, 8)
y shape: (3000000,)
Fitting GridSearc

# Plot model performance

In [4]:
# TODO implement index based loop to iterate thru both models and history_stats
# or put them into the same list as a tuple
# for clf in models:
#     for plot_x_axis in param_grid.keys():
#         fig = tml.plot_grid_search_scores(plot_x_axis, clf, scoring, PRIMARY_METRIC)
#         ts = stats_dict['Timestamp']
#         guid = stats_dict['Experiment GUID']
#         figure_filename = f'experiment_{ts}_{plot_x_axis}_{guid}.png'
#         fig.savefig(f'./metrics_history/{figure_filename}')
#         print(f'Saved figure as {figure_filename}')
#         plt.clf()
#         #del fig

In [5]:
# Report the wall-clock time elapsed since `time_start_notebook` was recorded
# in the first cell.
time_end_notebook = datetime.now()
notebook_runtime = time_end_notebook - time_start_notebook
print(notebook_runtime)

0:12:41.534084
