In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import shap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import shap
from explainerdashboard import ExplainerDashboard, ClassifierExplainer
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.optim as optim
import captum.attr as c

In [33]:
# Load the dataset from the CSV in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Cleaned_full_data.csv')



In [34]:
# Replace whatever index the frame carries with a fresh 0..n-1 index.
data = data.reset_index(drop=True)
# Missing `ct_ftp_cmd` values are treated as 0.
data['ct_ftp_cmd'] = data['ct_ftp_cmd'].fillna(0)
# Normalise the category labels: strip every whitespace run, then fold the
# plural spelling 'Backdoors' into 'Backdoor' so both map to one category.
attack_labels = data['attack_cat'].str.replace(r'\s+', '', regex=True)
data['attack_cat'] = attack_labels.str.replace('Backdoors', 'Backdoor')

In [35]:
# Remove the raw categorical / address columns from the working frame.
# NOTE(review): presumably these come back in one-hot form from the
# Full_*_encoded.csv files loaded further down - confirm.
raw_categorical_cols = ['proto', 'dsport', 'service', 'state', 'srcip', 'sport', 'dstip']
data = data.drop(columns=raw_categorical_cols)

In [36]:
# Park the flag columns and the two target columns in `temp` and take them out
# of `data`; they are joined back on by the later pd.concat cell.
held_out_cols = ['is_ftp_login', 'is_sm_ips_ports', 'label', 'attack_cat']
temp = data[held_out_cols]
data = data.drop(columns=held_out_cols)

In [37]:
# Encoded feature tables, read in the same order as the variables they bind to.
encoded_files = [
    'Full_proto_encoded.csv',
    'Full_dsport_encoded.csv',
    'Full_service_encoded.csv',
    'Full_state_encoded.csv',
    # NOTE: 'scrip' is a spelling error in the file name on disk (presumably
    # meant 'srcip'); it is kept verbatim so the path still resolves.
    'Full_scrip_encoded.csv',
    'Full_sport_encoded.csv',
    'Full_dstip_encoded.csv',
]
ohe1, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7 = map(pd.read_csv, encoded_files)

In [38]:
# MinMax separates Normal data well and reduces noise. Please see Kmeans TSNE evaluation in Archive.
# Scaling is currently disabled for this run; the two lines are kept for reference:
#scaler = MinMaxScaler()
#scaled_data = scaler.fit_transform(data)

# Pieces to join side by side: remaining numeric columns, the set-aside
# flag/target columns, and the seven encoded tables.
parts = [data, temp, ohe1, ohe2, ohe3, ohe4, ohe5, ohe6, ohe7]

# pd.concat(axis=1) aligns rows on the index and pads any mismatch with NaN
# instead of raising, so verify every piece shares `data`'s index up front.
misaligned = [pos for pos, part in enumerate(parts) if not part.index.equals(data.index)]
if misaligned:
    raise ValueError(
        f'Cannot concatenate: frames at positions {misaligned} do not share the '
        f'index of `data` (row counts: {[len(part) for part in parts]})'
    )

data = pd.concat(parts, axis=1)
# Release the extra reference to the pre-concat frame so it can be freed.
del parts

# Original Run.

In [None]:
# Hold out 20% of the rows; the `label` column is the prediction target.
target = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)
# Remember the attack category of each test row for later evaluation indexing.
test_attack_cat = X_test['attack_cat']
# Neither the target nor the attack category may be fed to the model.
non_feature_cols = ['attack_cat', 'label']
X_train = X_train.drop(columns=non_feature_cols)
X_test = X_test.drop(columns=non_feature_cols)
# Show how the attack categories are distributed over the full dataset.
print(data['attack_cat'].value_counts())

In [None]:
# Random forest with library-default hyper-parameters and a fixed seed.
# NOTE (open question): not sure why we're getting a worse result here.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix.
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

In [None]:
# Random forest with 50 trees (n_estimators overridden; everything else default).
# NOTE (open question): not sure why we're getting a worse result here.
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix.
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

# Feature Importance through Random Forest.

In [47]:
# Hold out 20% of the rows; the `label` column is the prediction target.
target = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)
# Remember the attack category of each test row for later evaluation indexing.
test_attack_cat = X_test['attack_cat']
# Neither the target nor the attack category may be fed to the model.
non_feature_cols = ['attack_cat', 'label']
X_train = X_train.drop(columns=non_feature_cols)
X_test = X_test.drop(columns=non_feature_cols)
# Show how the attack categories are distributed over the full dataset.
print(data['attack_cat'].value_counts())

attack_cat
Normal            2218764
Generic            215481
Exploits            44525
Fuzzers             24246
DoS                 16353
Reconnaissance      13987
Analysis             2677
Backdoor             2329
Shellcode            1511
Worms                 174
Name: count, dtype: int64


In [48]:
# Random forest with library-default hyper-parameters and a fixed seed,
# trained on every feature column.
# NOTE (open question): not sure why we're getting a worse result here.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix.
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    443831
           1       0.99      0.98      0.99     64179

    accuracy                           1.00    508010
   macro avg       0.99      0.99      0.99    508010
weighted avg       1.00      1.00      1.00    508010

[[443003    828]
 [  1054  63125]]


In [49]:
# Keep only the columns to which the fitted forest assigned a non-zero
# importance; `important_features` holds their positional indices.
importances = rf.feature_importances_
important_features = np.flatnonzero(importances > 0)
X_train, X_test = (
    frame.iloc[:, important_features] for frame in (X_train, X_test)
)

In [50]:
# Random forest with library-default hyper-parameters and a fixed seed,
# refitted on the importance-filtered columns.
# NOTE (open question): not sure why we're getting a worse result here.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix.
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    443831
           1       0.99      0.98      0.99     64179

    accuracy                           1.00    508010
   macro avg       0.99      0.99      0.99    508010
weighted avg       1.00      1.00      1.00    508010

[[443009    822]
 [  1043  63136]]


In [None]:
# Random forest with 50 trees (n_estimators overridden; everything else default).
# NOTE (open question): not sure why we're getting a worse result here.
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix.
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

In [None]:
# Number of feature columns currently in the test matrix.
print(X_test.shape[1])

# Low-importance features filtered through SHAP.
Initially this caused much worse predictions, but as I spent more time refining the filtering, the results got closer to the original prediction with all variables. This is still a lazy approach and can be improved. I just need to finish my other assignment, then I will revisit this.

In [None]:
# First batch of columns to remove from `data` (dropped in a later cell).
# Each name appears exactly once; the original list repeated 'proto_gmtp'.
correct1 = [
    'proto_vines', 'proto_ib', 'proto_aes-sp3-d', 'proto_gmtp',
    'proto_stp', 'proto_fc', 'proto_ipv6-opts', 'proto_larp', 'proto_pnni', 'proto_ipv6-route',
    'proto_sat-expak', 'state_URH', 'proto_secure-vmtp', 'proto_pvp', 'proto_idrp',
    'proto_encap', 'proto_fire', 'proto_iatp', 'proto_leaf-1', 'proto_emcon', 'proto_xtp',
    'proto_tcf', 'proto_ipip', 'proto_micp', 'proto_mux', 'dstip_59.166.0.9', 'proto_scps',
    'proto_mtp', 'proto_igp', 'proto_narp', 'proto_kryptolan', 'proto_cphb', 'proto_mfe-nsp',
    'proto_cpnx', 'proto_uti', 'proto_zero', 'proto_tp++', 'proto_iplt', 'proto_ipx-n-ip',
    'proto_ptp', 'srcip_149.171.126.0', 'proto_ax.25', 'proto_sccopmce', 'proto_sm',
    'proto_ipv6-no', 'proto_xns-idp', 'proto_a/n', 'proto_vmtp', 'proto_crtp', 'proto_snp',
    'proto_leaf-2', 'proto_pgm', 'proto_ipcv', 'proto_idpr-cmtp', 'proto_sprite-rpc',
    'proto_compaq-peer', 'proto_ifmp',
]

In [None]:
# Second batch of columns to remove from `data` (dropped in a later cell).
correct2 = [
    'proto_trunk-1', 'proto_dcn', 'proto_qnx', 'proto_wsn',
    'proto_nsfnet-igp', 'proto_tlsp', 'proto_ipnip', 'proto_eigrp',
    'proto_vrrp', 'proto_xnet', 'proto_iso-tp4', 'proto_mhrp',
    'proto_isis', 'proto_irtp', 'proto_wb-mon', 'proto_visa',
    'proto_il', 'proto_bbn-rcc', 'proto_cftp', 'proto_etherip',
    'proto_iso-ip', 'proto_ddx', 'proto_wb-expak', 'proto_netblt',
    'dstip_59.166.0.6', 'proto_idpr', 'proto_merit-inp', 'proto_hmp',
    'proto_ipcomp', 'proto_ttp', 'proto_crudp', 'proto_skip',
    'proto_srp', 'service_irc', 'proto_smp',
]

In [None]:
# Third batch of columns to remove from `data` (dropped in a later cell).
# Each name appears exactly once; the original list repeated
# 'dstip_59.166.0.0' and 'proto_dgp'.
correct3 = [
    'srcip_149.171.126.5', 'state_CLO', 'dsport_631', 'proto_prm', 'sport_800',
    'dstip_59.166.0.8', 'proto_3pc', 'proto_pri-enc',
    'proto_pipe', 'proto_l2tp', 'srcip_149.171.126.2', 'dsport_6667',
    'dstip_59.166.0.0', 'proto_dgp',
    'service_ssl', 'proto_pup', 'dstip_59.166.0.5', 'dstip_59.166.0.4',
    'proto_rsvp', 'srcip_149.171.126.3', 'proto_ddp', 'dstip_59.166.0.1',
    'dstip_59.166.0.2', 'dstip_59.166.0.7', 'srcip_149.171.126.1',
    'dstip_59.166.0.3', 'srcip_149.171.126.13',
]

In [21]:
# Remove the first two batches of columns, one batch after the other.
data = data.drop(columns=correct1).drop(columns=correct2)

In [27]:
# Remove the third batch of columns.
data = data.drop(correct3, axis='columns')

In [28]:
# Hold out 20% of the rows; the `label` column is the prediction target.
target = data['label']
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)
# Remember the attack category of each test row for later evaluation indexing.
test_attack_cat = X_test['attack_cat']
# Neither the target nor the attack category may be fed to the model.
non_feature_cols = ['attack_cat', 'label']
X_train = X_train.drop(columns=non_feature_cols)
X_test = X_test.drop(columns=non_feature_cols)
# Show how the attack categories are distributed over the full dataset.
print(data['attack_cat'].value_counts())

attack_cat
Normal            2218764
Generic            215481
Exploits            44525
Fuzzers             24246
DoS                 16353
Reconnaissance      13987
Analysis             2677
Backdoor             2329
Shellcode            1511
Worms                 174
Name: count, dtype: int64


In [29]:
# Random forest with library-default hyper-parameters and a fixed seed.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix.
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    443831
           1       0.99      0.98      0.99     64179

    accuracy                           1.00    508010
   macro avg       0.99      0.99      0.99    508010
weighted avg       1.00      1.00      1.00    508010

[[443027    804]
 [  1038  63141]]


In [32]:
# Random forest with 50 trees (n_estimators overridden; everything else default).
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Per-class precision/recall/F1 first, then the confusion matrix.
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep='\n')

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    443831
           1       0.99      0.98      0.99     64179

    accuracy                           1.00    508010
   macro avg       0.99      0.99      0.99    508010
weighted avg       1.00      1.00      1.00    508010

[[443012    819]
 [  1064  63115]]
