In [1]:
import sys
sys.path.insert(0, '../')

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.dummy import DummyClassifier

from datasets.mqtt_dataset import MqttDataset

In [3]:
train_dataset = MqttDataset("../raw_data/mqtt_fullrows.csv", is_train=True, include_malicious_traffic=False, lbl_is_src_ip=True)
test_dataset = MqttDataset("../raw_data/mqtt_fullrows.csv", is_train=False, include_malicious_traffic=False, lbl_is_src_ip=True)

In [4]:
X_train, y_train = train_dataset.X, train_dataset.y
X_test, y_test = test_dataset.X, test_dataset.y

In [11]:
clf = RandomForestClassifier(n_jobs=80, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.02209911841791756


In [12]:
sorted(clf.feature_importances_.tolist(), reverse=True)[:5]

[0.9816594919123302, 0.009445663967266408, 0.008894844120403332, 0.0, 0.0]

In [13]:
np.where(clf.feature_importances_ > 0.1)[0]

array([8], dtype=int64)

In [10]:
train_dataset.clms[8]

'tcp_flag_res'

In [14]:
X_train_no_leak = np.hstack([X_train[:, :7], X_train[:, 10:]])
X_test_no_leak = np.hstack([X_test[:, :7], X_test[:, 10:]])

X_train_no_leak.shape, X_test_no_leak.shape

((134301, 24), (33576, 24))

In [15]:
clf_no_leak = RandomForestClassifier(n_jobs=80, random_state=42)
clf_no_leak.fit(X_train_no_leak, y_train)
y_pred = clf_no_leak.predict(X_test_no_leak)
print(accuracy_score(y_test, y_pred))

0.09634858232070527


In [16]:
sorted(clf_no_leak.feature_importances_.tolist(), reverse=True)[:5]

[0.5039918004084324, 0.49600819959156767, 0.0, 0.0, 0.0]

In [21]:
np.where(clf_no_leak.feature_importances_ > 0.1)[0]

array([ 5, 17], dtype=int64)

In [17]:
might_leak = np.where(clf_no_leak.feature_importances_ > 0.05)[0]
train_dataset.clms[might_leak]

Index(['ip_flag_rb', 'tcp_flag_fin'], dtype='object')

In [22]:
X_train_no_leak = np.hstack([X_train_no_leak[:, :5], X_train_no_leak[:, 6:]])
X_test_no_leak = np.hstack([X_test_no_leak[:, :5], X_test_no_leak[:, 6:]])
X_train_no_leak = np.hstack([X_train_no_leak[:, :17], X_train_no_leak[:, 18:]])
X_test_no_leak = np.hstack([X_test_no_leak[:, :17], X_test_no_leak[:, 18:]])

X_train_no_leak.shape, X_test_no_leak.shape

((134301, 22), (33576, 22))

In [23]:
clf_no_leak = RandomForestClassifier(n_jobs=80, random_state=42)
clf_no_leak.fit(X_train_no_leak, y_train)
y_pred = clf_no_leak.predict(X_test_no_leak)
print(accuracy_score(y_test, y_pred))

0.09634858232070527


In [24]:
sorted(clf_no_leak.feature_importances_.tolist(), reverse=True)[:5]

[1.0, 0.0, 0.0, 0.0, 0.0]

In [25]:
np.where(clf_no_leak.feature_importances_ > 0.1)[0]

array([16], dtype=int64)

In [26]:
might_leak = np.where(clf_no_leak.feature_importances_ > 0.05)[0]
train_dataset.clms[might_leak]

Index(['tcp_flag_syn'], dtype='object')

In [27]:
X_train_no_leak = np.hstack([X_train_no_leak[:, :16], X_train_no_leak[:, 17:]])
X_test_no_leak = np.hstack([X_test_no_leak[:, :16], X_test_no_leak[:, 17:]])

In [28]:
clf_no_leak = RandomForestClassifier(n_jobs=80, random_state=42)
clf_no_leak.fit(X_train_no_leak, y_train)
y_pred = clf_no_leak.predict(X_test_no_leak)
print(accuracy_score(y_test, y_pred))

0.08077197998570407


In [29]:
sorted(clf_no_leak.feature_importances_.tolist(), reverse=True)[:5]

[0.0, 0.0, 0.0, 0.0, 0.0]

In [30]:
dummy = DummyClassifier()
dummy.fit(X_train_no_leak, y_train)
y_pred = dummy.predict(X_test_no_leak)
print(accuracy_score(y_test, y_pred))

0.08077197998570407
