# Data Preparation


#### Import dependencies

In [1]:
import os
import re
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats

sns.set_theme()

### Load the data into a Pandas dataframe
Define the path to the dataset file
Define the name of the label column

In [2]:
rootdir = os.getcwd()
infile = os.path.join(rootdir, 'data',
                      'CTU-IoT-Malware-Capture-20-1conn.log.labeled.csv')

df = pd.read_csv(infile, delimiter='|', na_values='-')

#### Customized variables for this dataset

In [3]:
ORIGINAL_LABEL_COLUMN_NAME = 'label'
LABEL_COLUMN_NAME = 'label_bool'

NORMALIZE_METHOD = "min_max"

def get_stat(col_name, stat_name):
    """docstring TBD"""
    return df.describe(include="all").loc[stat_name].loc[col_name]


# Finding the percentiles:
def find_nearest_index(array, value):
    """docstring TBD"""
    array = np.asarray(array)
    idx = (np.abs(array - value)).argmin()
    return idx

df.head(10)

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
0,1538479000.0,CSQG794riQ4XnzTxP2,192.168.100.103,37082,192.168.100.1,53,udp,dns,5.005151,78.0,...,,0,D,2,134,0,0,,Benign,
1,1538479000.0,COTbdG2BhtGBlmf6r,192.168.100.103,34486,192.168.100.1,53,udp,dns,0.007243,90.0,...,,0,Dd,2,146,2,146,,Benign,
2,1538479000.0,CP48WJ2HOnLuGtr5kb,192.168.100.103,37601,192.168.100.1,53,udp,dns,0.00225,90.0,...,,0,Dd,2,146,2,146,,Benign,
3,1538479000.0,CeTMJi2TydRSaVdsG4,192.168.100.103,46439,192.168.100.1,53,udp,dns,5.005154,78.0,...,,0,D,2,134,0,0,,Benign,
4,1538479000.0,CZ6ne24AN9WAg9XA9d,192.168.100.103,55174,192.168.100.1,53,udp,dns,0.002246,90.0,...,,0,Dd,2,146,2,146,,Benign,
5,1538479000.0,CmOsCc16kkxJiJ3zF6,192.168.100.103,40788,192.168.100.1,53,udp,dns,5.005157,78.0,...,,0,D,2,134,0,0,,Benign,
6,1538479000.0,Ca9JNa4eTKtloY8z5h,192.168.100.103,56655,192.168.100.1,53,udp,dns,0.001999,90.0,...,,0,Dd,2,146,2,146,,Benign,
7,1538479000.0,CX4Jtj23yoifdPjkah,192.168.100.103,52983,192.168.100.1,53,udp,dns,5.001404,78.0,...,,0,D,2,134,0,0,,Benign,
8,1538479000.0,CaZS0B31T39XBqk8Nd,192.168.100.103,37651,192.168.100.1,53,udp,dns,5.001156,78.0,...,,0,D,2,134,0,0,,Benign,
9,1538479000.0,C458Uc2OYB9oRtxaAl,192.168.100.103,59986,192.168.100.1,53,udp,dns,0.028733,90.0,...,,0,Dd,2,146,2,146,,Benign,


In [4]:
df.tail(10)

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,label,detailed-label
3199,1538565000.0,COemgP2o76UNoxFy7a,192.168.100.103,123,217.30.75.147,123,udp,,0.072964,48.0,...,,0,Dd,1,76,1,76,,Benign,
3200,1538565000.0,CivARQ2xD0oSPfVate,192.168.100.103,123,217.30.75.147,123,udp,,0.003242,48.0,...,,0,Dd,1,76,1,76,,Benign,
3201,1538565000.0,CCBQlL2ZtnDcjijuX4,192.168.100.103,123,217.30.75.147,123,udp,,0.002746,48.0,...,,0,Dd,1,76,1,76,,Benign,
3202,1538565000.0,Ctz25L1cqDMX45Lky9,192.168.100.103,123,217.30.75.147,123,udp,,0.002747,48.0,...,,0,Dd,1,76,1,76,,Benign,
3203,1538565000.0,Cb4JIc3hbDka7tLZth,192.168.100.103,123,217.30.75.147,123,udp,,0.002732,48.0,...,,0,Dd,1,76,1,76,,Benign,
3204,1538565000.0,CgenZ5oaUlAs8oOP8,192.168.100.103,123,217.30.75.147,123,udp,,0.058227,48.0,...,,0,Dd,1,76,1,76,,Benign,
3205,1538565000.0,CuyoPRuDeCX9EleYd,192.168.100.103,123,217.30.75.147,123,udp,,0.002738,48.0,...,,0,Dd,1,76,1,76,,Benign,
3206,1538565000.0,CXya0e36iFhSxS3Mf9,192.168.100.103,123,217.30.75.147,123,udp,,0.033229,48.0,...,,0,Dd,1,76,1,76,,Benign,
3207,1538521000.0,CGLDvm3X8xOnv1Rfwf,192.168.100.103,38098,66.85.157.90,443,tcp,,44325.615625,19431.0,...,,4500,ShADCaGcgd,8872,372893,9307,388975,,Malicious C&C,Torii
3208,1538565000.0,Cr70I22hK5v40Wu9mh,192.168.100.103,123,217.30.75.147,123,udp,,0.042471,48.0,...,,0,Dd,1,76,1,76,,Benign,


In [5]:
df[LABEL_COLUMN_NAME] = df[ORIGINAL_LABEL_COLUMN_NAME].isin(['Malicious   C&C']).astype(int).astype(bool)

# converting the date
df["ts_converted"] = pd.to_datetime(
    df["ts"], errors="raise"
)

## Removing columns that the model doesn't use
TBD

In [6]:
df.drop(
    columns=[
        ORIGINAL_LABEL_COLUMN_NAME,     # was replaced
        "detailed-label",               # will be used in future version of this Notebook
        "ts",                           # was converted to a new column
        "uid"                           # unique identifier, not used by model
    ],
    inplace=True,
)

df.dtypes

id.orig_h                 object
id.orig_p                  int64
id.resp_h                 object
id.resp_p                  int64
proto                     object
service                   object
duration                 float64
orig_bytes               float64
resp_bytes               float64
conn_state                object
local_orig               float64
local_resp               float64
missed_bytes               int64
history                   object
orig_pkts                  int64
orig_ip_bytes              int64
resp_pkts                  int64
resp_ip_bytes              int64
tunnel_parents           float64
label_bool                  bool
ts_converted      datetime64[ns]
dtype: object

## Winsorizing numerical outliers
Description TBD

In [7]:
# # Winsorize the top 1% and bottom 1%
# percentile = 0.01

# COLUMNS_TO_WINSORIZE = ['duration_in_minutes']

# for iter_column_name in COLUMNS_TO_WINSORIZE:
#     new_column_name = iter_column_name + "_winsorized"

#     winsorized_data = stats.mstats.winsorize(
#         df[iter_column_name], limits=[percentile, percentile], inplace=False
#     )

#     df[new_column_name] = winsorized_data
#     df.drop(columns=iter_column_name, inplace=True)
#     print(
#         f"Winsorized column {iter_column_name} to {new_column_name} and removed original column."
#     )

## Replacing missing numerical values w/ their mean
Description TBD

In [8]:
# COLUMNS_TO_REPLACE_MISSING = ['duration_in_minutes_winsorized']

# for iter_column_name in df.select_dtypes(include=np.number).columns.tolist():
#     num_missing = np.sum(df[iter_column_name].isnull(), axis=0)

#     if num_missing > 0:
#         new_column_name = iter_column_name + "_replacedMissing"
#         mean = get_stat(iter_column_name, "mean")

#         df[new_column_name] = df[iter_column_name].fillna(value=mean, inplace=False)
#         df.drop(columns=iter_column_name, inplace=True)

#         # TODO: get a count of the number changed.
#         print(
#             f"Replaced missing values in column {iter_column_name} with the mean and created new column {new_column_name}. Removed original column"
#         )
#     else:
#         print(
#             f"No missing values detected in column {iter_column_name}, no changes made. Original column left intact."
#         )

## Normalizing numerical ranges
Description TBD

In [9]:
# def normalize(df_local, column_name, normalize_method_name):
#     """docstring TBD"""
#     df_temp = df_local.copy()
#     new_columnname = column_name + "_normalized"

#     if normalize_method_name == "absolute_range":
#         df_temp[new_columnname] = (
#             df_temp[column_name] / df_temp[column_name].abs().max()
#         )

#     elif normalize_method_name == "min_max":
#         # rescales a features to be in the range [0,1]
#         df_temp[new_columnname] = (
#             df_temp[column_name] - df_temp[column_name].min()
#         ) / (df_temp[column_name].max() - df_temp[column_name].min())

#     elif normalize_method_name == "z_score":
#         df_temp[new_columnname] = (
#             df_temp[column_name] - df_temp[column_name].mean()
#         ) / df_temp[column_name].std()

#     else:
#         raise NameError("Unrecogized normalization method")

#     df_temp.drop(columns=column_name, inplace=True)
#     print(
#         f"Normalized column {column_name} into {new_columnname} using {normalize_method_name}. Removed original."
#     )
#     return df_temp

# df.dtypes

# # iterate through the list of current numeric columns
# # for iter_column_name in df.select_dtypes(include=np.number).columns.tolist():
# for iter_column_name in ['duration_in_minutes_winsorized', 'month_published_int']:
#     df = normalize(df, iter_column_name, NORMALIZE_METHOD)

## Converting strings to one-hot encoded columns
Locate string columns that have a small number of unique values and replace them with one-hot encoded versions, then remove the original column.

In [10]:
# columns_to_OHE= ['proto', 'service', ]

# # create a one-hot encoded version in a new dataframe
# temp_df = pd.get_dummies(df["episode_type"], prefix="episode_type_")

# # merge the new dataframe into the existing one
# df.join(temp_df)

# # remove the original column now that it has been encoded

# # into the existing dataframe
# df.drop(columns="episode_type", inplace=True)

# Re-order the columns
Sort the column names alphabetically, but make sure the 'label' column is always last.

In [11]:
# alphabetically sort the column names, but leave the label as the last column
column_order = sorted(df.columns)
column_order.remove(LABEL_COLUMN_NAME)
column_order.append(LABEL_COLUMN_NAME)
df = df.reindex(column_order, axis=1)

# Final tests

In [12]:
# check for missing values
# check for any remaining strings
df.describe(include="all")

Unnamed: 0,conn_state,duration,history,id.orig_h,id.orig_p,id.resp_h,id.resp_p,local_orig,local_resp,missed_bytes,...,orig_ip_bytes,orig_pkts,proto,resp_bytes,resp_ip_bytes,resp_pkts,service,ts_converted,tunnel_parents,label_bool
count,3209,2324.0,3209,3209,3209.0,3209,3209.0,0.0,0.0,3209.0,...,3209.0,3209.0,3209,2324.0,3209.0,3209.0,592,3209,0.0,3209
unique,5,,8,3,,40,,,,,...,,,2,,,,1,,,2
top,SF,,Dd,192.168.100.103,,217.30.75.147,,,,,...,,,udp,,,,dns,,,False
freq,2132,,2130,3207,,1301,,,,,...,,,3192,,,,592,,,3193
mean,,37.898508,,,8939.868806,,113.302586,,,2.387348,...,310.303521,6.490807,,65.699225,291.973824,6.349018,,1970-01-01 00:00:01.538517695,,
min,,0.00048,,,123.0,,22.0,,,0.0,...,0.0,0.0,,0.0,0.0,0.0,,1970-01-01 00:00:01.538478769,,
25%,,0.002744,,,123.0,,123.0,,,0.0,...,76.0,1.0,,48.0,0.0,0.0,,1970-01-01 00:00:01.538493725,,
50%,,0.005746,,,123.0,,123.0,,,0.0,...,76.0,1.0,,48.0,76.0,1.0,,1970-01-01 00:00:01.538512689,,
75%,,0.056341,,,123.0,,123.0,,,0.0,...,76.0,1.0,,48.0,76.0,1.0,,1970-01-01 00:00:01.538542203,,
max,,44325.615625,,,60974.0,,5355.0,,,4500.0,...,372893.0,8872.0,,19431.0,388975.0,9307.0,,1970-01-01 00:00:01.538565109,,


In [13]:
# show the final datatypes before exporting to CSV
df.dtypes

conn_state                object
duration                 float64
history                   object
id.orig_h                 object
id.orig_p                  int64
id.resp_h                 object
id.resp_p                  int64
local_orig               float64
local_resp               float64
missed_bytes               int64
orig_bytes               float64
orig_ip_bytes              int64
orig_pkts                  int64
proto                     object
resp_bytes               float64
resp_ip_bytes              int64
resp_pkts                  int64
service                   object
ts_converted      datetime64[ns]
tunnel_parents           float64
label_bool                  bool
dtype: object

# Storing training and prediction data into CSV files

In [14]:
output_file_prefix = os.path.splitext(infile)[0]

# Create a training/test dataset and output to CSV
df_training = df.copy()
training_outfile = output_file_prefix + "_train.csv"
df_training.to_csv(training_outfile)
print(f"Training data saved to new CSV file:\n{training_outfile}")

Training data saved to new CSV file:
/Users/the-molecular-man/source_code/machine-learning/IoT_malware/data/CTU-IoT-Malware-Capture-20-1conn.log.labeled_train.csv
