# Developing a Machine Learning model for detecting Internet of Things Malware

TODO:
* Change export format from CSV to Parquet

In [1]:
# Install dependencies
# %pip install --upgrade --quiet pip geoip2 humanize pyarrow
# print('Finished installing dependencies')

#### Import dependencies

In [2]:
# built in modules
from datetime import datetime
time_start_notebook =  datetime.now()
import os
import importlib
import ipaddress
import glob
import sys
import csv

# PIP modules
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import geoip2.database
import humanize

In [3]:
######################### DON'T TOUCH ANYTHING BELOW #########################
# Make the parent directory importable, then import the project helper library.
# NOTE(review): presumably tim_ml_lib lives in the parent directory ('..') — confirm.
newpath = '..'
if not newpath in sys.path:
    sys.path.insert(1, newpath)
import tim_ml_lib as tml    # works
importlib.reload(tml)     # reload it since I'm frequently making changes.
######################### DON'T TOUCH ANYTHING ABOVE #########################
# Seed value shared with the helper library; used below when shuffling rows before export.
RANDOM_STATE        = tml.settings.RANDOM_STATE

# some common, basic setup
# NOTE(review): both helpers are defined in tim_ml_lib; by their names they seed the
# random number generators and configure pandas/plot display options — confirm there.
tml.initialize_random_seeds()
tml.initialize_display_options()

In [4]:
# Destination of the prepared, ready-to-train dataset written at the end of the notebook.
OUTPUT_FILEPATH   = './data/CTU-IoT-Malware-Capture_ready2train.csv.gz'
# Every labeled capture under ./data that matches this pattern is loaded and concatenated.
# Note: glob.glob() returns an empty list (no error) when nothing matches.
CSV_FILES_TO_LOAD = glob.glob('./data/CTU-IoT-Malware-Capture*.labeled.csv.gz')

# Name of the label column in the raw files, and of the boolean label derived from it.
ORIGINAL_LABEL_COLUMN_NAME = 'label'
LABEL_COLUMN_NAME          = 'is_IoT_malware'

# Three alternative sets of column datatypes follow; only the Numpy set is active.

# Option 1 (disabled): string dtype aliases, including the pandas nullable
# integer types ('Int8', 'Int64', ...)
# highest compatibility, poorest speed performance
# SMALL_INT_DATATYPE  = 'Int8'
# BIG_INT_DATATYPE    = 'Int64'     # 'uint64[pyarrow]'
# PORT_DATATYPE       = 'Int32'     # 'uint32[pyarrow]'
# FLOAT_DATATYPE      = 'float32'   # 'float32[pyarrow]'
# ENUM_DATATYPE       = 'category'
# BOOL_DATATYPE       = 'bool'      # 'bool[pyarrow]'

# Option 2 (disabled): pyarrow-backed datatypes
# SMALL_INT_DATATYPE  = 'uint8[pyarrow]'
# BIG_INT_DATATYPE    = 'uint64[pyarrow]'
# PORT_DATATYPE       = 'uint32[pyarrow]'
# FLOAT_DATATYPE      = 'float32[pyarrow]'
# ENUM_DATATYPE       = 'category'
# BOOL_DATATYPE       = 'bool[pyarrow]'

# Option 3 (active): Numpy datatypes
# compatible with more functions, like correlation
# The unsigned integer types cannot hold NaN, so columns with missing values
# are imported with FLOAT_DATATYPE instead.
SMALL_INT_DATATYPE  = np.uint8
BIG_INT_DATATYPE    = np.uint64
PORT_DATATYPE       = np.uint32
FLOAT_DATATYPE      = np.float32
ENUM_DATATYPE       = 'category'
BOOL_DATATYPE       = bool

# https://stackoverflow.com/questions/29245848/what-are-all-the-dtypes-that-pandas-recognizes
# Column name -> datatype for every raw column that is imported.
# The keys double as the list of columns to read (see raw_cols_to_import below);
# commented-out entries are columns that are deliberately not loaded.
IMPORT_DATATYPES = {
    # 'uid':          'string',           # unique identifier
    # Unix epoch seconds, used later to derive the hour_of_day feature.
    # Must be 64-bit: float32 only has a 24-bit significand, so epoch values
    # around 1.5e9 get rounded to multiples of 128 seconds, which corrupts the
    # minute component of the timestamp.
    'ts':           np.float64,
    #'id.resp_p':    PORT_DATATYPE,         # The destination port
    'id.orig_p':    PORT_DATATYPE,         # The source      port
    # 'id.orig_h':    'string',         # The source      IP address
    # 'id.resp_h':    'string',         # The destination IP address
    # 'local_orig':   BOOL_DATATYPE,    # ALL ARE NULL - Indicates whether the connection is considered local or not - Bool
    # 'local_resp':   BOOL_DATATYPE,    # ALL ARE NULL - Indicates whether the connection is considered local or not - Bool
    # 'history':      'string',           # A history of connection states - String
    'label':          ENUM_DATATYPE,         # A label associated with the connection (e.g., 'Malicious' or 'Benign'). string
    # 'conn_state':     ENUM_DATATYPE,       # The state of the connection.|string|
    'proto':          ENUM_DATATYPE,         # The network protocol used (e.g., 'tcp').
    'service':        ENUM_DATATYPE,         # The service associated with the connection.|string
    # 'tunnel_parents': ENUM_DATATYPE,        # ALL ARE NULL - Indicates if this connection is part of a tunnel.|set[string]|
    # 'detailed-label': ENUM_DATATYPE,       # A more detailed description or label for the connection.|string|
    'duration':       FLOAT_DATATYPE,     # has NaN values, so have to use float; The duration of the connection. DECIMAL number in seconds?
    'orig_bytes':     FLOAT_DATATYPE,     # has NaN values, so have to use float; The number of bytes sent from the source to the destination.
    'resp_bytes':     FLOAT_DATATYPE,     # has NaN values, so have to use float; The number of bytes sent from the destination to the source
    'missed_bytes':   BIG_INT_DATATYPE,       # The number of missed bytes in the connection.
    'resp_ip_bytes':  FLOAT_DATATYPE,     # The number of IP bytes sent from the destination to the source.
    'resp_pkts':      FLOAT_DATATYPE,     # The number of packets sent from the destination to the source
    'orig_pkts':      BIG_INT_DATATYPE,       # cannot use 32-bit types; The number of packets sent from the source to the destination.
    'orig_ip_bytes':  BIG_INT_DATATYPE, # cannot use 32-bit types; The number of IP bytes sent from the source to the destination.|count|
}

# Only the columns listed above are read from the CSV files.
# Materialised as a list so it is a plain list-like for read_csv(usecols=...).
raw_cols_to_import = list(IMPORT_DATATYPES)

# COLUMNS_TO_OHE = ['proto', 'service', 'conn_state', 
#                   'history', 'ip_dest_country'] 
#                     #'id.resp_h', 'id.orig_h']
                    
# SERVICE_TO_PROTOCOL_AND_PORT_MAPPINGS = {
#   'ssh': {'protocol': 'tcp', 'port': 22},
#   'dns': {'protocol': 'udp', 'port': 53},
# }

# Load the data into a Pandas dataframe
Load a directory of CSV files into a single dataframe.

currently takes 82 sec on M3 laptop

cut down to 36 sec when I only imported the columns that I needed

cut from 36 to 26 by specifying uint[pyarrow] instead of Int64, not changing engine

* https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-chunking
* https://stackoverflow.com/questions/66346343/can-i-load-multiple-csv-files-using-pyarrow
* https://arrow.apache.org/docs/2.0/python/generated/pyarrow.csv.ReadOptions.html#pyarrow.csv.ReadOptions

In [5]:
# Fail early with a clear message: glob.glob() silently returns an empty list
# when the pattern matches nothing, and pd.concat([]) would then raise an
# unhelpful "No objects to concatenate" error further down.
if not CSV_FILES_TO_LOAD:
    raise FileNotFoundError(
        'No input files found matching ./data/CTU-IoT-Malware-Capture*.labeled.csv.gz'
    )

# Read every capture file into its own dataframe, importing only the needed columns.
dfs = [
    pd.read_csv(
        iter_csv_file,
        # index_col='uid',
        # chunksize=?,
        # converters={},                 # to investigate
        engine='c',                      # engine='pyarrow' causes issues with delimiters sometimes
        dtype=IMPORT_DATATYPES,
        usecols=raw_cols_to_import,      # only load certain columns
        delimiter='|',
        na_values='-',                   # '-' marks a missing value in the raw files
    )
    for iter_csv_file in CSV_FILES_TO_LOAD
]

df_orig = pd.concat(dfs, ignore_index=True)
# force it again, necessary
# NOTE(review): presumably because concatenating 'category' columns whose
# category sets differ between files falls back to object dtype — confirm.
df_orig = df_orig.astype(IMPORT_DATATYPES)
df_orig = tml.drop_totally_empty_columns(df_orig)
del dfs   # release the per-file frames; only the combined frame is needed

df_orig.shape

# Initial EDA on raw data

In [None]:
# Inspect missing values in the raw data before any cleaning is applied.
# NOTE(review): helper is defined in tim_ml_lib; by its name it reports the share
# of rows with missing values per column — confirm there.
tml.list_percent_of_rows_missing_values(df_orig)

In [None]:
# tml.get_total_dataframe_memory_usage(df_orig)

In [None]:
#tml.get_dataframe_memory_usage_by_col(df_orig)

In [None]:
# Benign                                   .3510518     # 35% of whole entire dataset is benign
# tml.get_class_balance(df_orig, label_col_name=ORIGINAL_LABEL_COLUMN_NAME).round(4)

### Create a copy for easy retesting
Easy restart point during testing

In [None]:
# Work on a copy so the raw frame (df_orig) is never modified; re-running from
# this cell restarts the pipeline without reloading the CSV files.
df = df_orig.copy()

importlib.reload(tml)     # reload it since I'm frequently making changes.
# NOTE(review): presumably re-seeds the random number generators so a rerun from
# this cell is repeatable — confirm in tim_ml_lib.
tml.initialize_random_seeds()

## Data prep - locating and replacing missing values

```
service       0.999280
duration      0.610614
orig_bytes    0.610614
resp_bytes    0.610614
```

In [None]:
# View missing values before replacement
# tml.list_percent_of_rows_missing_values(df)

In [None]:
# Numeric columns in which a missing value is treated as zero.
zero_filled_columns = [
    'duration',
    'orig_bytes',
    'resp_bytes',
    'missed_bytes',
    'resp_ip_bytes',
    'resp_pkts',
    'orig_pkts',
    'orig_ip_bytes',
]
# Mapping of column name -> replacement value, as expected by DataFrame.fillna().
fill_na_with = dict.fromkeys(zero_filled_columns, 0)

df = df.fillna(fill_na_with)

# View missing values after replacement
tml.list_percent_of_rows_missing_values(df)

# Feature Enhancements

# Setting the label and determining class balance

In [None]:
# Derive the boolean target: True for every row whose original label is not 'Benign'.
# Note: isin() is False for a missing label, so such rows are also marked True.
is_benign = df[ORIGINAL_LABEL_COLUMN_NAME].isin(['Benign'])
df[LABEL_COLUMN_NAME] = ~is_benign

# The original text label is no longer needed once the boolean target exists.
df = df.drop(columns=[ORIGINAL_LABEL_COLUMN_NAME])
tml.get_class_balance(df, label_col_name=LABEL_COLUMN_NAME)

# Feature addition - account for the time of day that attackers tend to attack 
Increases the correlation for the time and label

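1) Extracts the hour of day from the timestamp, including the minutes as a fraction, giving a value in [0, 24)
2) Applies an offset to account for when attackers work, wrapping around midnight so the value stays in [0, 24)
3) Normalizes the hour of day to [0,1]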

In [None]:
def shift_hour_of_day(orig_hour_value, shift_amount=0):
    """Shift an hour-of-day value by a number of hours, wrapping around midnight.

    Used to re-centre the day (e.g. around an assumed working day) so that
    events which tend to happen after hours end up more linearly related.

    Args:
        orig_hour_value: hour of the day; fractional hours are allowed.
        shift_amount: hours to add (negative values shift backwards).

    Returns:
        The shifted hour, wrapped into the range [0, 24).
    """
    shifted = orig_hour_value + shift_amount
    wrapped = shifted % 24
    return abs(wrapped)


def adjust_for_attacker_workday(df1, shift_amount=-18):
    """Return a copy of the dataframe with 'hour_of_day' shifted and wrapped.

    The shift is applied to the whole column at once (shift_hour_of_day only
    uses arithmetic operators, so it works on a pandas Series) rather than
    calling a Python lambda once per row.

    Args:
        df1: dataframe containing a numeric 'hour_of_day' column. It is not
            modified; a copy is returned.
        shift_amount: hours added to every value before wrapping around
            midnight. Defaults to -18, the offset this notebook has always used.

    Returns:
        A new dataframe whose 'hour_of_day' values are shifted and wrapped
        into [0, 24).
    """
    df = df1.copy()
    df['hour_of_day'] = shift_hour_of_day(df['hour_of_day'], shift_amount=shift_amount)
    return df


# Interpret the raw 'ts' values as Unix time in seconds (hence unit='s').
event_time = pd.to_datetime(df['ts'], errors="raise", unit='s')

# Add feature - hour of day, with the minutes expressed as a fraction of an hour
fractional_hour = event_time.dt.minute / 60.0
df['hour_of_day'] = event_time.dt.hour + fractional_hour

# The raw timestamp is not kept as a feature.
df = df.drop(columns=['ts'])

# Shift the clock for the assumed attacker workday, then rescale the column.
df = adjust_for_attacker_workday(df)
df = tml.normalize_to_specific_range(df, 'hour_of_day', 0, 23)

# p0 = sns.histplot(data=df, x='hour_of_day', hue=LABEL_COLUMN_NAME, binwidth=0.25)
# plt.ylim(0,250000)

## Feature addition: mapping port numbers to ranges
While the port number itself is important and can give clues about the traffic, the port range it falls into also gives high-level information.

* Well-known ports: 0–1023, also known as system ports, are used by common network services and official ports for notable system administrations
* Registered ports: 1024–49151, also known as user ports, are used for a variety of network services and functions
* Dynamic and private ports: 49152–65535, also known as ephemeral ports

In [None]:
# takes 16 sec before vectorization
def map_port_number_to_range(port_number):
    """Map a port number to a small integer code for its port range.

    Returns:
        0 for ports 1-1023 (reserved / well-known),
        1 for ports 1024-49151 (registered),
        2 for ports above 49151 up to 65535 (ephemeral),
        3 for anything else, including port 0 and out-of-range values.
    """
    if 1 <= port_number <= 1023:
        return 0    # '0-reserved'
    if 1024 <= port_number <= 49151:
        return 1    # '1-registered'
    # 49151 itself is already handled above, so this band starts just after it.
    if 49151 < port_number <= 65535:
        return 2    # '2-ephemeral'
    return 3

# Translate each source port into its range code, stored in the small integer type.
port_range_codes = df['id.orig_p'].apply(map_port_number_to_range)
df['src_port_range_name'] = port_range_codes.astype(SMALL_INT_DATATYPE)

# The raw source port is replaced by the range code and not kept as a feature.
df = df.drop(columns=['id.orig_p'])

df['src_port_range_name'].unique()

In [None]:
# df.set_index('uid', inplace=True)     # causes issues sometimes
# IP_ADDRESS_COLUMN_NAMES = ['id.orig_h', 'id.resp_h']
# for iter_colname in IP_ADDRESS_COLUMN_NAMES:
#     df[iter_colname] = df[iter_colname].apply(ipaddress.ip_address)
# df.dtypes

In [None]:
# Show some sample data after the transforms
df.head(3)

# Feature enhancements

## Feature addition / correction - account for the time of day that attackers tend to attack 

In [None]:
#df.columns.sort_values()

## Feature addition: ratio of incoming vs outgoing bandwidth, packets, etc

In [None]:
# df['orig_bytes_clipped']    = df_orig['orig_bytes'].clip(upper=50000)
# df['resp_bytes_clipped']    = df_orig['resp_bytes'].clip(upper=50000)

In [None]:
# sns.scatterplot(data=df, x='orig_bytes_clipped', y='resp_bytes_clipped', hue=LABEL_COLUMN_NAME)
# plt.loglog()

In [None]:
# # Correlation with label was "NaN"
# df['test1']                 =  df_orig['resp_bytes'].apply(np.log) / df_orig['orig_bytes'].apply(np.log)
# df['test1_std'] = stats.zscore(df['test1'])
# sns.histplot(data=df, x='test1', hue=LABEL_COLUMN_NAME)
# plt.ylim((0,2500))
# plt.xlim((0.5,2.2))

In [None]:
# # Correlation with label was "NaN"
# df['test2']                 =  (df_orig['resp_bytes'] / df_orig['orig_bytes']).apply(np.log)
# sns.histplot(data=df, x='test2', hue=LABEL_COLUMN_NAME)
# plt.ylim((0,2000))
# plt.xlim((-2.5,4.5))
# df['test2_std'] = stats.zscore(df['test2'])

In [None]:
# # Test 0 - no log stuff
# df['ratio_bytes']   = df_orig['resp_bytes'] / df_orig['orig_bytes']
# df['ratio_bytes'].describe()
# sns.histplot(data=df, x='ratio_bytes', hue=LABEL_COLUMN_NAME, log_scale=(False, True), binwidth=50)

In [None]:
# df['ratio_packets'] = df['orig_pkts']  / df['resp_pkts']

## Feature addition: Geo-IP and ASN features
This step takes the longest - about 8 minutes on a MacBook Air without vectorizing (e.g. via numba) or specifying the number of threads.

In [None]:
# # configure and load the GeoIP databases
# # https://dev.maxmind.com/geoip/geolite2-free-geolocation-data?lang=en  
# # https://www.maxmind.com/en/accounts/985797/geoip/downloads
# # https://github.com/maxmind/GeoIP2-python?tab=readme-ov-file#database-usage

# try:
#     # open the readers, they must be closed below
#     geoip_country = geoip2.database.Reader('./geoip/GeoLite2-Country_20240308/GeoLite2-Country.mmdb')
#     geoip_asn     = geoip2.database.Reader('./geoip/GeoLite2-ASN_20240308/GeoLite2-ASN.mmdb')

#     # @numba.vectorize
#     def ip_to_country(ip_as_str):
#         try:
#             ip = ipaddress.ip_address(ip_as_str)
#             if ip.is_global:
#                 return geoip_country.country(ip).country.name
#         finally:
#             return None

#     # @numba.vectorize
#     def ip_to_asn(ip_as_str):
#         try:
#             ip = ipaddress.ip_address(ip_as_str)
#             if ip.is_global:
#                 return geoip_asn.asn(ip).autonomous_system_number
#         finally:
#             return None


#     # GeoIP
#     df['ip_dest_country'] = df['id.resp_h'].apply(ip_to_country)
#     df['ip_asn']          = df['id.resp_h'].apply(ip_to_asn)

#     print(df['ip_dest_country'].unique().tolist())
#     print(df['ip_asn'].unique().tolist())
# finally:
#     # close the readers
#     geoip_country.close()
#     geoip_asn.close()

# df.dtypes

In [None]:
# counting unique values
# tml.get_unique_value_counts(df)

## Converting strings to one-hot encoded columns

In [None]:
# tml.list_unique_string_values(df)
# df['service'].unique()
# One-hot encode the remaining categorical columns, one at a time, in this order.
for categorical_column in ('service', 'proto'):
    df = tml.onehotencode_single_column(df, categorical_column)

# Second phase of EDA - after feature additions 
but before normalizing and standardizing

In [None]:
def quick_eda(df1, colname, lab, log_scale=True):
    """Print summary statistics for one column and draw its histogram by label.

    Args:
        df1: dataframe to inspect.
        colname: name of the column to summarise and plot on the x axis.
        lab: name of the column used to colour (hue) the histogram.
        log_scale: forwarded unchanged to seaborn.histplot.
    """
    # percentiles=[] asks describe() for no percentiles beyond its default median.
    column_summary = df1[colname].describe(percentiles=[])
    print(column_summary)

    sns.histplot(data=df1, x=colname, hue=lab, log_scale=log_scale)
    plt.title(f'Histogram for {colname}')

In [None]:
# quick_eda(df, colname='orig_bytes', lab=LABEL_COLUMN_NAME)
# plt.xlim(right=10000)

In [None]:
# quick_eda(df, colname='resp_bytes', lab=LABEL_COLUMN_NAME)
# plt.xlim(left=10, right=1000)

In [None]:
# quick_eda(df, colname='missed_bytes', lab=LABEL_COLUMN_NAME)
# # plt.xlim(right=25000)

In [None]:
# sns.histplot(data=df, x='resp_pkts', hue=LABEL_COLUMN_NAME, log_scale=(False, True)) #, binwidth=5)
# #plt.xlim(left=0, right=400)
# #plt.ylim((0,5000))

In [None]:
# quick_eda(df, colname='resp_ip_bytes', lab=LABEL_COLUMN_NAME)
# # plt.xlim(right=7500)

## Final Data Prep - Winsorizing and Normalizing

In [None]:
cols_to_winsor_then_standardize = ['duration', 'orig_pkts', 'orig_ip_bytes']

# Fraction of values to winsorize at the [lower, upper] end of each column.
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mstats.winsorize.html
winsor_limits = [0.05, 0.05]

for col_name in cols_to_winsor_then_standardize:
    print(f'Winsorizing {col_name}...')
    df[col_name] = stats.mstats.winsorize(df[col_name], limits=winsor_limits)

    print(f'Standardizing {col_name}...')
    # df = tml.standardize_column_using_zscore(df, col_name) # causes issues
    df[col_name] = stats.zscore(df[col_name])

print('Finished prep.')

In [None]:
# Clipping then standardizing

# Upper bound applied to each clipped column before it is standardized.
upper_clip_limits = {
    'missed_bytes':  25000,
    'resp_pkts':     100,
    'resp_ip_bytes': 7500,
}
for col_name, upper_limit in upper_clip_limits.items():
    df[col_name] = df[col_name].clip(upper=upper_limit)

# orig_bytes and resp_bytes are standardized without being clipped first.
cols_to_standardize = ['orig_bytes', 'resp_bytes', 'missed_bytes', 'resp_pkts', 'resp_ip_bytes']
for col_name in cols_to_standardize:
    df[col_name] = stats.zscore(df[col_name])

# TODO: feature idea: ratio of packets or size to/from source/dest

In [None]:
# df.describe()

In [None]:
# Summary of the prepared dataframe after all feature work.
# NOTE(review): the content of this report is defined in tim_ml_lib — confirm what it checks.
tml.trainable_data_report(df)

## Final Data Prep - Converting Bools to Ints

In [None]:
df = tml.change_all_bools_to_ints(df)   # must be done before correlations
# Correlation of every feature against the boolean label column.
tml.feature_target_correlation(df, label_column_name=LABEL_COLUMN_NAME)

In [None]:
# tml.get_total_dataframe_memory_usage(df_orig)

# Final tests before export to CSV

In [None]:
# id.orig_p            102304
# id.resp_p             75872
# tml.get_unique_value_counts(df)

In [None]:
# Final gate before export.
# NOTE(review): by its name this helper asserts that every column has a datatype
# suitable for training; the exact checks live in tim_ml_lib — confirm there.
tml.assert_datatypes_ready_for_training(df)

In [None]:
# df.describe()

In [None]:
# tml.get_total_dataframe_memory_usage(df)

# Final EDA

In [None]:
df.head()

In [None]:
importlib.reload(tml)
# Use the shared LABEL_COLUMN_NAME constant rather than repeating the literal
# column name, so renaming the label cannot silently break this cell.
tml.feature_target_correlation(df, label_column_name=LABEL_COLUMN_NAME)

In [None]:
# Drop features with poor label correlation:
importlib.reload(tml)

# Threshold passed to the drop helper.
# NOTE(review): whether it is compared against the absolute or the signed
# correlation is decided inside tim_ml_lib — confirm there.
CORRELATION_THRESHOLD = 0.04

# Use the shared LABEL_COLUMN_NAME constant rather than repeating the literal
# column name, so renaming the label cannot silently break this cell.
df = tml.drop_feature_below_corr_threshold(df, CORRELATION_THRESHOLD, label_column_name=LABEL_COLUMN_NAME)
tml.feature_target_correlation(df, label_column_name=LABEL_COLUMN_NAME)

# Exporting ready to train data to CSV files

In [None]:
# Runtime on Macbook air with full dataset to uncompressed file: at least 8 min
# M3 runtime to GZ with several other changes like datatypes, but no multicore stuff: 1 min 22 sec, 37.6 MB
# M3 runtime to CSV with several other changes like datatypes, but no multicore stuff: 50 sec, 1.4 GB
#   CSV is marginally faster than XZ, but takes up way more space
#   https://dask.pydata.org/en/latest/diagnostics-local.html
#   increase # of rows/block size
#   https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html

# Shuffle the rows (reproducibly, via RANDOM_STATE) and renumber the index.
shuffled_rows = df.sample(frac=1, random_state=RANDOM_STATE)
df = shuffled_rows.reset_index(drop=True)

# Create a training/test dataset and output to CSV
csv_export_options = dict(
    sep=',',                # changing delimiter to , so that pyarrow engine can be used by model
    header=True,
    index=False,
    chunksize=1000000,
    compression='gzip',
    quoting=csv.QUOTE_ALL,
    encoding='utf-8',
)
df.to_csv(OUTPUT_FILEPATH, **csv_export_options)

print(f"Training data saved to new file:\n{OUTPUT_FILEPATH}")

# Report the size of the written file in human-readable form.
output_size_in_bytes = os.path.getsize(OUTPUT_FILEPATH)
output_filesize = humanize.naturalsize(output_size_in_bytes)
print(f'output file size: {output_filesize}')

In [None]:
# Total wall-clock time since the first code cell started.
time_end_notebook = datetime.now()
notebook_runtime = time_end_notebook - time_start_notebook
print(notebook_runtime)