In [1]:
import sys
sys.path.insert(0, '../')

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.dummy import DummyClassifier

from datasets.iot_intrusion_dataset import IotIntrusionDataset

In [3]:
train_dataset = IotIntrusionDataset("../raw_data/IoT Network Intrusion Dataset.csv", is_train=True, include_malicious_traffic=False, lbl_is_src_ip=True)
test_dataset = IotIntrusionDataset("../raw_data/IoT Network Intrusion Dataset.csv", is_train=False, include_malicious_traffic=False, lbl_is_src_ip=True)

In [4]:
X_train, y_train = train_dataset.X, train_dataset.y
X_test, y_test = test_dataset.X, test_dataset.y

In [13]:
clf = RandomForestClassifier(n_jobs=80, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9992512167727443


In [7]:
sorted(clf.feature_importances_.tolist(), reverse=True)[:5]

[0.16704858280291304,
 0.1548757586585955,
 0.15243689619242062,
 0.06317504497741692,
 0.06264065098382395]

In [8]:
np.where(clf.feature_importances_ > 0.1)[0]

array([3, 4, 5])

In [10]:
train_dataset.clms[3:6]

Index(['dst_ip_4', 'Src_Port', 'Dst_Port'], dtype='object')

In [12]:
X_train_no_leak = np.hstack([X_train[:, :3], X_train[:, 6:]])
X_test_no_leak = np.hstack([X_test[:, :3], X_test[:, 6:]])

X_train_no_leak.shape, X_test_no_leak.shape

((32049, 79), (8013, 79))

In [15]:
clf_no_leak = RandomForestClassifier(n_jobs=80, random_state=42)
clf_no_leak.fit(X_train_no_leak, y_train)
y_pred = clf_no_leak.predict(X_test_no_leak)
print(accuracy_score(y_test, y_pred))

0.9609384749781605


In [17]:
sorted(clf_no_leak.feature_importances_.tolist(), reverse=True)[:5]

[0.10691044003634549,
 0.07822149549290855,
 0.062165789074353224,
 0.05600539739843491,
 0.051035006839218605]

In [19]:
might_leak = np.where(clf_no_leak.feature_importances_ > 0.05)[0]
train_dataset.clms[might_leak]

Index(['Tot_Fwd_Pkts', 'Fwd_Pkt_Len_Min', 'Fwd_Pkts/s', 'Subflow_Fwd_Pkts',
       'Subflow_Bwd_Pkts'],
      dtype='object')

In [21]:
dummy = DummyClassifier()
dummy.fit(X_train_no_leak, y_train)
y_pred = dummy.predict(X_test_no_leak)
print(accuracy_score(y_test, y_pred))

0.7858479970048671
