In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, accuracy_score, f1_score, fbeta_score, make_scorer

sns.set_theme(style='whitegrid', context='paper')

In [10]:
# Read in column names from 'data/network-traffic/kddcup.names.dat'
import re

with open('data/network-traffic/kddcup.names.dat') as file:
    col_re = r'(\w+)\:.*'
    cols = [re.search(col_re, line)[1] for line in file.readlines()[1:]] + ['target']

cols[:5]

['duration', 'protocol_type', 'service', 'flag', 'src_bytes']

In [37]:
training_df = pd.read_csv('data/network-traffic/kddcup.data.csv', names=cols, index_col=False, header=None).drop_duplicates()
testing_df = pd.read_csv('data/network-traffic/corrected.csv', names=cols, index_col=False, header=None).drop_duplicates()
complete_df = pd.concat([training_df, testing_df], axis=0)

training_df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,target
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,normal
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,1.0,0.0,0.33,0.0,0.0,0.0,0.0,0.0,normal
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,1.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,normal


--------------------------------------------------------------------------------

# Preprocessing

In [12]:
complete_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1154080 entries, 0 to 310931
Data columns (total 42 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   duration                     1154080 non-null  int64  
 1   protocol_type                1154080 non-null  object 
 2   service                      1154080 non-null  object 
 3   flag                         1154080 non-null  object 
 4   src_bytes                    1154080 non-null  int64  
 5   dst_bytes                    1154080 non-null  int64  
 6   land                         1154080 non-null  int64  
 7   wrong_fragment               1154080 non-null  int64  
 8   urgent                       1154080 non-null  int64  
 9   hot                          1154080 non-null  int64  
 10  num_failed_logins            1154080 non-null  int64  
 11  logged_in                    1154080 non-null  int64  
 12  num_compromised              1154080 non-null  i

In [27]:
# Handle columns with `object`-type values
object_column_names = [complete_df[col].name for col in complete_df.columns if complete_df[col].dtype == 'object']
print(f'There are {len(object_column_names)} columns of type `object`: {object_column_names}.')

There are 4 columns of type `object`: ['protocol_type', 'service', 'flag', 'target'].


In [33]:
# Check out the 'protocol_type'
training_df.protocol_type.unique()

array(['tcp', 'udp', 'icmp'], dtype=object)

In [34]:
# Check out the `service`
training_df.service.unique()

array(['http', 'smtp', 'domain_u', 'auth', 'finger', 'telnet', 'eco_i',
       'ftp', 'ntp_u', 'ecr_i', 'other', 'urp_i', 'private', 'pop_3',
       'ftp_data', 'netstat', 'daytime', 'ssh', 'echo', 'time', 'name',
       'whois', 'domain', 'mtp', 'gopher', 'remote_job', 'rje', 'ctf',
       'supdup', 'link', 'systat', 'discard', 'X11', 'shell', 'login',
       'imap4', 'nntp', 'uucp', 'pm_dump', 'IRC', 'Z39_50', 'netbios_dgm',
       'ldap', 'sunrpc', 'courier', 'exec', 'bgp', 'csnet_ns', 'http_443',
       'klogin', 'printer', 'netbios_ssn', 'pop_2', 'nnsp', 'efs',
       'hostnames', 'uucp_path', 'sql_net', 'vmnet', 'iso_tsap',
       'netbios_ns', 'kshell', 'urh_i', 'http_2784', 'harvest', 'aol',
       'tftp_u', 'http_8001', 'tim_i', 'red_i'], dtype=object)

In [35]:
# Check out the `flag`
training_df.flag.unique()

array(['SF', 'S2', 'S1', 'S3', 'OTH', 'REJ', 'RSTO', 'S0', 'RSTR',
       'RSTOS0', 'SH'], dtype=object)

In [38]:
# check out the `target`
training_df.target.unique()     # -> one-hot

array(['normal', 'buffer_overflow', 'loadmodule', 'perl', 'neptune',
       'smurf', 'guess_passwd', 'pod', 'teardrop', 'portsweep', 'ipsweep',
       'land', 'ftp_write', 'back', 'imap', 'satan', 'phf', 'nmap',
       'multihop', 'warezmaster', 'warezclient', 'spy', 'rootkit'],
      dtype=object)

We will use a simple enumeration approach for those, each column with its own encoding and decoding function.

In [59]:
def encode(value, catalog): return catalog.index(value)
def decode(value, catalog): return catalog[value]
def make_encoding(column_name, dataframe):
    catalog = dataframe[column_name].unique().tolist()
    def encoder(value): return encode(value, catalog)
    def decoder(value): return decode(value, catalog)
    return catalog, encoder, decoder
def make_encoded_column(column_name, dataframe, encodings=None):
    catalog, encoder, decoder = encodings or make_encoding(column_name, dataframe)
    result = dataframe[column_name].map(encoder)
    result.name = f'{column_name}_enc'
    return result, catalog, encoder, decoder
def make_encoded_df(original_df, column_names, encodings=None):
    new_df = original_df.copy()
    def encode_columns(df, columns, encodings, make_new_encodings=True):
        match columns, make_new_encodings:
            case [], _:
                return df, encodings
            case [column, *rest], True:
                new_col, catalog, encoder, decoder = make_encoded_column(column, df)
                new_df = pd.concat([df.drop(column, axis=1), new_col], axis=1)
                return encode_columns(new_df, rest, encodings + [catalog, encoder, decoder])
            case [column, *rest], False:
                new_col, _, _, _ = make_encoded_column(column, df, encodings[-len(columns)])
                new_df = pd.concat([df.drop(column, axis=1), new_col], axis=1)
                return encode_columns(new_df, rest, encodings, make_new_encodings)
    return encode_columns(new_df, column_names, [])

In [60]:
# Use those functions to create fully-encrypted dataframes (copies):
testing_df_enc, encodings = make_encoded_df(testing_df, object_column_names)
training_df_enc, _ = make_encoded_df(training_df, object_column_names, encodings)

In [61]:
training_df_enc.tail()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,protocol_type_enc,service_enc,flag_enc,target_enc
4898426,0,212,2288,0,0,0,0,0,1,0,...,0.33,0.05,0.0,0.01,0.0,0.0,0,0,0,0
4898427,0,219,236,0,0,0,0,0,1,0,...,0.25,0.05,0.0,0.01,0.0,0.0,0,0,0,0
4898428,0,218,3610,0,0,0,0,0,1,0,...,0.2,0.05,0.0,0.01,0.0,0.0,0,0,0,0
4898429,0,219,1234,0,0,0,0,0,1,0,...,0.17,0.05,0.0,0.01,0.0,0.0,0,0,0,0
4898430,0,219,1098,0,0,0,0,0,1,0,...,0.14,0.05,0.0,0.01,0.0,0.0,0,0,0,0


In [62]:
training_df_enc.dtypes

duration                         int64
src_bytes                        int64
dst_bytes                        int64
land                             int64
wrong_fragment                   int64
urgent                           int64
hot                              int64
num_failed_logins                int64
logged_in                        int64
num_compromised                  int64
root_shell                       int64
su_attempted                     int64
num_root                         int64
num_file_creations               int64
num_shells                       int64
num_access_files                 int64
num_outbound_cmds                int64
is_host_login                    int64
is_guest_login                   int64
count                            int64
srv_count                        int64
serror_rate                    float64
srv_serror_rate                float64
rerror_rate                    float64
srv_rerror_rate                float64
same_srv_rate            

Moving forward, we will use the encoded data frames, since the models require numeric input.
At any point, the original values can be re-constructed by using the respective decoder within
the captured `encodings` list. The order of these encodings matches the order of the respective
column names in the `object_column_names` list.

--------------------------------------------------------------------------------

# Random Forest Classification

In [63]:
from sklearn.ensemble import RandomForestClassifier