# Data Preparation


#### Import dependencies

In [53]:
import os
import re
import ipaddress
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
import geoip2.database


sns.set_theme()

### Load the data into a Pandas dataframe
Define the path to the dataset file
Define the name of the label column

In [54]:
rootdir = os.getcwd()
infile = os.path.join(rootdir, 'data',
                      'CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv')

df = pd.read_csv(infile, delimiter='|', na_values='-')

#### Customized variables for this dataset

In [55]:
ORIGINAL_LABEL_COLUMN_NAME = 'label'
LABEL_COLUMN_NAME = 'label_bool'

NORMALIZE_METHOD = "min_max"

def get_stat(col_name, stat_name):
    """docstring TBD"""
    return df.describe(include="all").loc[stat_name].loc[col_name]


# Finding the percentiles:
def find_nearest_index(array, value):
    """docstring TBD"""
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx

df.head(10)

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1538479000.0,CSQG794riQ4XnzTxP2,192.168.100.103,37082,192.168.100.1,53,udp,dns,5.005151,78.0,...,,0,D,2,134,0,0,,Benign,
1,1538479000.0,COTbdG2BhtGBlmf6r,192.168.100.103,34486,192.168.100.1,53,udp,dns,0.007243,90.0,...,,0,Dd,2,146,2,146,,Benign,
2,1538479000.0,CP48WJ2HOnLuGtr5kb,192.168.100.103,37601,192.168.100.1,53,udp,dns,0.00225,90.0,...,,0,Dd,2,146,2,146,,Benign,
3,1538479000.0,CeTMJi2TydRSaVdsG4,192.168.100.103,46439,192.168.100.1,53,udp,dns,5.005154,78.0,...,,0,D,2,134,0,0,,Benign,
4,1538479000.0,CZ6ne24AN9WAg9XA9d,192.168.100.103,55174,192.168.100.1,53,udp,dns,0.002246,90.0,...,,0,Dd,2,146,2,146,,Benign,
5,1538479000.0,CmOsCc16kkxJiJ3zF6,192.168.100.103,40788,192.168.100.1,53,udp,dns,5.005157,78.0,...,,0,D,2,134,0,0,,Benign,
6,1538479000.0,Ca9JNa4eTKtloY8z5h,192.168.100.103,56655,192.168.100.1,53,udp,dns,0.001999,90.0,...,,0,Dd,2,146,2,146,,Benign,
7,1538479000.0,CX4Jtj23yoifdPjkah,192.168.100.103,52983,192.168.100.1,53,udp,dns,5.001404,78.0,...,,0,D,2,134,0,0,,Benign,
8,1538479000.0,CaZS0B31T39XBqk8Nd,192.168.100.103,37651,192.168.100.1,53,udp,dns,5.001156,78.0,...,,0,D,2,134,0,0,,Benign,
9,1538479000.0,C458Uc2OYB9oRtxaAl,192.168.100.103,59986,192.168.100.1,53,udp,dns,0.028733,90.0,...,,0,Dd,2,146,2,146,,Benign,


In [56]:
df.tail(10)

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
3199,1538565000.0,COemgP2o76UNoxFy7a,192.168.100.103,123,217.30.75.147,123,udp,,0.072964,48.0,...,,0,Dd,1,76,1,76,,Benign,
3200,1538565000.0,CivARQ2xD0oSPfVate,192.168.100.103,123,217.30.75.147,123,udp,,0.003242,48.0,...,,0,Dd,1,76,1,76,,Benign,
3201,1538565000.0,CCBQlL2ZtnDcjijuX4,192.168.100.103,123,217.30.75.147,123,udp,,0.002746,48.0,...,,0,Dd,1,76,1,76,,Benign,
3202,1538565000.0,Ctz25L1cqDMX45Lky9,192.168.100.103,123,217.30.75.147,123,udp,,0.002747,48.0,...,,0,Dd,1,76,1,76,,Benign,
3203,1538565000.0,Cb4JIc3hbDka7tLZth,192.168.100.103,123,217.30.75.147,123,udp,,0.002732,48.0,...,,0,Dd,1,76,1,76,,Benign,
3204,1538565000.0,CgenZ5oaUlAs8oOP8,192.168.100.103,123,217.30.75.147,123,udp,,0.058227,48.0,...,,0,Dd,1,76,1,76,,Benign,
3205,1538565000.0,CuyoPRuDeCX9EleYd,192.168.100.103,123,217.30.75.147,123,udp,,0.002738,48.0,...,,0,Dd,1,76,1,76,,Benign,
3206,1538565000.0,CXya0e36iFhSxS3Mf9,192.168.100.103,123,217.30.75.147,123,udp,,0.033229,48.0,...,,0,Dd,1,76,1,76,,Benign,
3207,1538521000.0,CGLDvm3X8xOnv1Rfwf,192.168.100.103,38098,66.85.157.90,443,tcp,,44325.615625,19431.0,...,,4500,ShADCaGcgd,8872,372893,9307,388975,,Malicious C&C,Torii
3208,1538565000.0,Cr70I22hK5v40Wu9mh,192.168.100.103,123,217.30.75.147,123,udp,,0.042471,48.0,...,,0,Dd,1,76,1,76,,Benign,


# Converting existing datatypes

In [57]:
# Setting the label as boolean
df[LABEL_COLUMN_NAME] = df[ORIGINAL_LABEL_COLUMN_NAME].isin(['Malicious   C&C']).astype(int).astype(bool)

# converting the date to timestamp,
# need the unit='s' to convert Unix time
df["ts_converted"] = pd.to_datetime(
    df["ts"], errors="raise",
    unit='s'
)

# IP_ADDRESS_COLUMN_NAMES = ['id.orig_h', 'id.resp_h']
# for iter_colname in IP_ADDRESS_COLUMN_NAMES:
#     df[iter_colname] = df[iter_colname].apply(ipaddress.ip_address)

# df['orig_bytes'] = df['orig_bytes'].astype(int) # bytes are integers, not float

df.tail(5)

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label,label_bool,ts_converted
3204,1538565000.0,CgenZ5oaUlAs8oOP8,192.168.100.103,123,217.30.75.147,123,udp,,0.058227,48.0,...,Dd,1,76,1,76,,Benign,,False,2018-10-03 11:08:31.550179072
3205,1538565000.0,CuyoPRuDeCX9EleYd,192.168.100.103,123,217.30.75.147,123,udp,,0.002738,48.0,...,Dd,1,76,1,76,,Benign,,False,2018-10-03 11:09:36.550151936
3206,1538565000.0,CXya0e36iFhSxS3Mf9,192.168.100.103,123,217.30.75.147,123,udp,,0.033229,48.0,...,Dd,1,76,1,76,,Benign,,False,2018-10-03 11:10:43.549967104
3207,1538521000.0,CGLDvm3X8xOnv1Rfwf,192.168.100.103,38098,66.85.157.90,443,tcp,,44325.615625,19431.0,...,ShADCaGcgd,8872,372893,9307,388975,,Malicious C&C,Torii,True,2018-10-02 22:53:41.627996928
3208,1538565000.0,Cr70I22hK5v40Wu9mh,192.168.100.103,123,217.30.75.147,123,udp,,0.042471,48.0,...,Dd,1,76,1,76,,Benign,,False,2018-10-03 11:11:49.550106880


# Locating missing values

In [58]:
# Locating missing values:
nan_count = np.sum(df.isnull(), axis=0).sort_values(ascending=False)
nan_count

local_orig        3209
tunnel_parents    3209
local_resp        3209
detailed-label    3193
service           2617
duration           885
orig_bytes         885
resp_bytes         885
history              0
label_bool           0
label                0
resp_ip_bytes        0
resp_pkts            0
orig_ip_bytes        0
orig_pkts            0
ts                   0
missed_bytes         0
uid                  0
conn_state           0
proto                0
id.resp_p            0
id.resp_h            0
id.orig_p            0
id.orig_h            0
ts_converted         0
dtype: int64

## Removing columns that the model doesn't use
TBD

In [59]:
df.drop(
    columns=[
        ORIGINAL_LABEL_COLUMN_NAME,     # was replaced
        "detailed-label",               # will be used in future version of this Notebook
        "ts",                           # was converted to a new column
        "uid"                           # unique identifier, not used by model
    ],
    inplace=True,
)

df.dtypes

id.orig_h                 object
id.orig_p                  int64
id.resp_h                 object
id.resp_p                  int64
proto                     object
service                   object
duration                 float64
orig_bytes               float64
resp_bytes               float64
conn_state                object
local_orig               float64
local_resp               float64
missed_bytes               int64
history                   object
orig_pkts                  int64
orig_ip_bytes              int64
resp_pkts                  int64
resp_ip_bytes              int64
tunnel_parents           float64
label_bool                  bool
ts_converted      datetime64[ns]
dtype: object

## Winsorizing numerical outliers
Description TBD

In [60]:
# # Winsorize the top 1% and bottom 1%
# percentile = 0.01

# COLUMNS_TO_WINSORIZE = ['duration_in_minutes']

# for iter_column_name in COLUMNS_TO_WINSORIZE:
#     new_column_name = iter_column_name + "_winsorized"

#     winsorized_data = stats.mstats.winsorize(
#         df[iter_column_name], limits=[percentile, percentile], inplace=False
#     )

#     df[new_column_name] = winsorized_data
#     df.drop(columns=iter_column_name, inplace=True)
#     print(
#         f"Winsorized column {iter_column_name} to {new_column_name} and removed original column."
#     )

## Replacing missing numerical values w/ their mean
Description TBD

In [61]:
# COLUMNS_TO_REPLACE_MISSING = ['duration_in_minutes_winsorized']

# for iter_column_name in df.select_dtypes(include=np.number).columns.tolist():
#     num_missing = np.sum(df[iter_column_name].isnull(), axis=0)

#     if num_missing > 0:
#         new_column_name = iter_column_name + "_replacedMissing"
#         mean = get_stat(iter_column_name, "mean")

#         df[new_column_name] = df[iter_column_name].fillna(value=mean, inplace=False)
#         df.drop(columns=iter_column_name, inplace=True)

#         # TODO: get a count of the number changed.
#         print(
#             f"Replaced missing values in column {iter_column_name} with the mean and created new column {new_column_name}. Removed original column"
#         )
#     else:
#         print(
#             f"No missing values detected in column {iter_column_name}, no changes made. Original column left intact."
#         )

## Normalizing numerical ranges
Description TBD

In [62]:
# def normalize(df_local, column_name, normalize_method_name):
#     """docstring TBD"""
#     df_temp = df_local.copy()
#     new_columnname = column_name + "_normalized"

#     if normalize_method_name == "absolute_range":
#         df_temp[new_columnname] = (
#             df_temp[column_name] / df_temp[column_name].abs().max()
#         )

#     elif normalize_method_name == "min_max":
#         # rescales a features to be in the range [0,1]
#         df_temp[new_columnname] = (
#             df_temp[column_name] - df_temp[column_name].min()
#         ) / (df_temp[column_name].max() - df_temp[column_name].min())

#     elif normalize_method_name == "z_score":
#         df_temp[new_columnname] = (
#             df_temp[column_name] - df_temp[column_name].mean()
#         ) / df_temp[column_name].std()

#     else:
#         raise NameError("Unrecogized normalization method")

#     df_temp.drop(columns=column_name, inplace=True)
#     print(
#         f"Normalized column {column_name} into {new_columnname} using {normalize_method_name}. Removed original."
#     )
#     return df_temp

# df.dtypes

# # iterate through the list of current numeric columns
# # for iter_column_name in df.select_dtypes(include=np.number).columns.tolist():
# for iter_column_name in ['duration_in_minutes_winsorized', 'month_published_int']:
#     df = normalize(df, iter_column_name, NORMALIZE_METHOD)

# Adding Features

In [63]:
# configure and load the GeoIP databases
# %pip install geoip2
# restart the kernel

# https://dev.maxmind.com/geoip/geolite2-free-geolocation-data?lang=en  
# https://www.maxmind.com/en/accounts/985797/geoip/downloads
# https://github.com/maxmind/GeoIP2-python?tab=readme-ov-file#database-usage


# TODO: close these readers
geoip_country = geoip2.database.Reader('./geoip/GeoLite2-Country_20240308/GeoLite2-Country.mmdb')
geoip_asn     = geoip2.database.Reader('./geoip/GeoLite2-ASN_20240308/GeoLite2-ASN.mmdb')

def ip_to_country(ip_as_str):
    ip = ipaddress.ip_address(ip_as_str)
    if ip.is_global:
        return geoip_country.country(ip).country.name
    return None

def ip_to_asn(ip_as_str):
    ip = ipaddress.ip_address(ip_as_str)
    if ip.is_global:
        return geoip_asn.asn(ip).autonomous_system_number
    return None

# GeoIP
df['ip_dest_country'] = df['id.resp_h'].apply(ip_to_country)
df['ip_asn']          = df['id.resp_h'].apply(ip_to_asn)

# id.orig_h|id.orig_p|id.resp_h|id.resp_p|proto|service
COLUMN_NAMES_CATEGORICAL = ['ip_asn', 'ip_dest_country',
                            'id.resp_p', 'id.orig_p',
                            'id.orig_h', 'id.resp_h',
                            'proto', 'service', 'conn_state']

for iter_colname in COLUMN_NAMES_CATEGORICAL:
    df[iter_colname] = df[iter_colname].astype('category')

print(df['ip_dest_country'].unique().tolist())
print(df['ip_asn'].unique().tolist())

df.dtypes

[nan, 'Czechia', 'United States', 'France', 'Belgium']
[nan, 24806.0, 197019.0, 198161.0, 15169.0, 20454.0, 24971.0, 48574.0, 16276.0, 42000.0, 16246.0, 41046.0, 2852.0, 197197.0, 9009.0, 8251.0, 15685.0, 12570.0, 2611.0, 51134.0, 29208.0]


id.orig_h                category
id.orig_p                category
id.resp_h                category
id.resp_p                category
proto                    category
service                  category
duration                  float64
orig_bytes                float64
resp_bytes                float64
conn_state               category
local_orig                float64
local_resp                float64
missed_bytes                int64
history                    object
orig_pkts                   int64
orig_ip_bytes               int64
resp_pkts                   int64
resp_ip_bytes               int64
tunnel_parents            float64
label_bool                   bool
ts_converted       datetime64[ns]
ip_dest_country          category
ip_asn                   category
dtype: object

## Converting strings to one-hot encoded columns
Locate string columns that have a small number of unique values and replace them with one-hot encoded versions, then remove the original column.

In [64]:
#TODO: support the IP address in a numerical fashion, add features for APN, country, etc.
columns_to_OHE = ['proto', 'service', 'conn_state', 'history'] #'id.resp_h', 'id.orig_h']

for iter_column_name in columns_to_OHE:
    # define a new column name
    new_column_prefix = iter_column_name # + '_onehot_'
    
    # create a one-hot encoded version in a new dataframe
    temp_df = pd.get_dummies(df[iter_column_name], prefix=new_column_prefix)

    # merge the new dataframe into the existing one
    df = df.join(temp_df)

    # remove the original column now that it has been encoded 
    # into the existing dataframe
    df.drop(columns=iter_column_name, inplace=True)
    
    # TODO: get count of # of new columns
    # TODO: make sure it is not one-hot encoding Booleans
    print(f'One-hot encoded: {iter_column_name} into {new_column_prefix}*')

One-hot encoded: proto into proto*
One-hot encoded: service into service*
One-hot encoded: conn_state into conn_state*
One-hot encoded: history into history*


In [65]:
# Everything should be reduced to numbers at this point

list_of_string_columns = df.select_dtypes(include=object).columns.tolist()

# create a Pandas Series that lists the string columns by ascending counts
df_unique_string_vals = df[list_of_string_columns].nunique().sort_values(ascending=True)
df_unique_string_vals

Series([], dtype: float64)

# Re-order the columns
Sort the column names alphabetically, but make sure the 'label' column is always last.

In [66]:
# alphabetically sort the column names, but leave the label as the last column
column_order = sorted(df.columns)
column_order.remove(LABEL_COLUMN_NAME)
column_order.append(LABEL_COLUMN_NAME)
df = df.reindex(column_order, axis=1)

# Final tests

In [67]:
# check for missing values
# check for any remaining strings
df.describe(include="all")

Unnamed: 0,conn_state_OTH,conn_state_RSTR,conn_state_S0,conn_state_S1,conn_state_SF,duration,history_D,history_D^d,history_Dd,history_S,...,orig_pkts,proto_tcp,proto_udp,resp_bytes,resp_ip_bytes,resp_pkts,service_dns,ts_converted,tunnel_parents,label_bool
count,3209,3209,3209,3209,3209,2324.0,3209,3209,3209,3209,...,3209.0,3209,3209,2324.0,3209.0,3209.0,3209,3209,0.0,3209
unique,2,2,2,2,2,,2,2,2,2,...,,2,2,,,,2,,,2
top,False,False,False,False,True,,False,False,True,False,...,,False,True,,,,False,,,False
freq,3208,3208,2135,3208,2132,,2148,3208,2130,3196,...,,3192,3192,,,,2617,,,3193
mean,,,,,,37.898508,,,,,...,6.490807,,,65.699225,291.973824,6.349018,,2018-10-02 22:01:36.445796864,,
min,,,,,,0.00048,,,,,...,0.0,,,0.0,0.0,0.0,,2018-10-02 11:12:49.600292864,,
25%,,,,,,0.002744,,,,,...,1.0,,,48.0,0.0,0.0,,2018-10-02 15:22:05.549019136,,
50%,,,,,,0.005746,,,,,...,1.0,,,48.0,76.0,1.0,,2018-10-02 20:38:09.549634048,,
75%,,,,,,0.056341,,,,,...,1.0,,,48.0,76.0,1.0,,2018-10-03 04:50:03.549690880,,
max,,,,,,44325.615625,,,,,...,8872.0,,,19431.0,388975.0,9307.0,,2018-10-03 11:11:49.550106880,,


In [68]:
# show the final datatypes before exporting to CSV
df.dtypes

conn_state_OTH                   bool
conn_state_RSTR                  bool
conn_state_S0                    bool
conn_state_S1                    bool
conn_state_SF                    bool
duration                      float64
history_D                        bool
history_D^d                      bool
history_Dd                       bool
history_S                        bool
history_ShADCaGcgd               bool
history_ShADaCGdtfF              bool
history_ShADaCGr                 bool
history_^d                       bool
id.orig_h                    category
id.orig_p                    category
id.resp_h                    category
id.resp_p                    category
ip_asn                       category
ip_dest_country              category
local_orig                    float64
local_resp                    float64
missed_bytes                    int64
orig_bytes                    float64
orig_ip_bytes                   int64
orig_pkts                       int64
proto_tcp   

In [70]:
df.head()

Unnamed: 0,conn_state_OTH,conn_state_RSTR,conn_state_S0,conn_state_S1,conn_state_SF,duration,history_D,history_D^d,history_Dd,history_S,...,orig_pkts,proto_tcp,proto_udp,resp_bytes,resp_ip_bytes,resp_pkts,service_dns,ts_converted,tunnel_parents,label_bool
0,False,False,True,False,False,5.005151,True,False,False,False,...,2,False,True,0.0,0,0,True,2018-10-02 11:12:49.600292864,,False
1,False,False,False,False,True,0.007243,False,False,True,False,...,2,False,True,90.0,146,2,True,2018-10-02 11:12:59.610846976,,False
2,False,False,False,False,True,0.00225,False,False,True,False,...,2,False,True,90.0,146,2,True,2018-10-02 11:13:09.630641920,,False
3,False,False,True,False,False,5.005154,True,False,False,False,...,2,False,True,0.0,0,0,True,2018-10-02 11:12:59.620088064,,False
4,False,False,False,False,True,0.002246,False,False,True,False,...,2,False,True,90.0,146,2,True,2018-10-02 11:13:19.645443840,,False


# Storing training and prediction data into CSV files

In [69]:
output_file_prefix = os.path.splitext(infile)[0]

# Create a training/test dataset and output to CSV
df_training = df.copy()
training_outfile = output_file_prefix + "_train.csv"
df_training.to_csv(training_outfile)
print(f"Training data saved to new CSV file:\n{training_outfile}")

Training data saved to new CSV file:
/Users/the-molecular-man/source_code/machine-learning/IoT_malware/data/CTU-IoT-Malware-Capture-20-1conn.log.labeled_train.csv
