In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras

from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn import datasets
from sklearn import model_selection
from sklearn import linear_model
from sklearn import feature_selection
from sklearn import svm
from sklearn import multiclass
from sklearn import metrics

np.random.seed(10)

import category_encoders as ce

In [2]:
data = pd.read_csv('kddcup.csv.data_10_percent_corrected')

In [3]:
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [4]:
y = data['class']

In [5]:
y.value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: class, dtype: int64

Zelimo da objedinimo klase koje imaju jednocifren i dvocifren broj instanci u jednu novu klasu pod imenom 'other_attacks'. Pokušali smo da napravimo model i bez objedinjavanja, ali za ove klase nije dao dobro rešenje.

In [8]:
y.loc[y == 'spy.'] = 'other_attacks.'

In [9]:
y.loc[y == 'perl.'] = 'other_attacks.'

In [10]:
y.loc[y == 'phf.'] = 'other_attacks.'

In [11]:
y.loc[y == 'multihop.'] = 'other_attacks.'

In [12]:
y.loc[y == 'ftp_write.'] = 'other_attacks.'

In [13]:
y.loc[y == 'loadmodule.'] = 'other_attacks.'

In [14]:
y.loc[y == 'rootkit.'] = 'other_attacks.'

In [15]:
y.loc[y == 'imap.'] = 'other_attacks.'

In [16]:
y.loc[y == 'warezmaster.'] = 'other_attacks.'

In [17]:
y.loc[y == 'land.'] = 'other_attacks.'

In [18]:
y.loc[y == 'buffer_overflow.'] = 'other_attacks.'

In [19]:
y.loc[y == 'guess_passwd.'] = 'other_attacks.'

In [20]:
len(np.where(y == 'other_attacks.')[0])

179

In [21]:
y.value_counts()

smurf.            280790
neptune.          107201
normal.            97278
back.               2203
satan.              1589
ipsweep.            1247
portsweep.          1040
warezclient.        1020
teardrop.            979
pod.                 264
nmap.                231
other_attacks.       179
Name: class, dtype: int64

In [22]:
X = data.drop(columns = ['class'], axis = 1)

In [23]:
y = y.astype('category')

In [24]:
d = dict(enumerate(y.cat.categories))

In [25]:
d

{0: 'back.',
 1: 'ipsweep.',
 2: 'neptune.',
 3: 'nmap.',
 4: 'normal.',
 5: 'other_attacks.',
 6: 'pod.',
 7: 'portsweep.',
 8: 'satan.',
 9: 'smurf.',
 10: 'teardrop.',
 11: 'warezclient.'}

In [26]:
y = y.cat.codes

In [27]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33, stratify=y, random_state=7)

 Ranije smo uocili da atributi 'is_host_login' i 'num_outbound_cmds' imaju sve vrednosti 0, a to nam ne znaci za predvidjanje klase, stoga ih izbacujemo iz podataka.

In [28]:
X_train = X_train.drop(columns = ['is_host_login', 'num_outbound_cmds'], axis = 1)

In [29]:
X_test = X_test.drop(columns = ['is_host_login', 'num_outbound_cmds'], axis = 1)

Enkodiranje i skaliranje.

In [30]:
encoder = ce.BinaryEncoder(cols=['protocol_type', 'service', 'flag'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [31]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Napravili smo dva modela. Prvi koji radi na principu razlike jedne klase u odnosu na ostale, dok drugi radi na principu razlika izmedju dve klase.

In [50]:
ovr_classifier = multiclass.OneVsRestClassifier(svm.LinearSVC())
ovo_classifier = multiclass.OneVsOneClassifier(svm.LinearSVC())

In [83]:
ovr_classifier.fit(X_train, y_train)



OneVsRestClassifier(estimator=LinearSVC())

In [53]:
ovo_classifier.fit(X_train, y_train)



OneVsOneClassifier(estimator=LinearSVC())

Kod oba od prethodna dva učenja imamo upozorenje da uvećamo broj iteracija zbog neuspeha konvergencije, što je i urađeno. Pokušavali smo sa 5000, 10000, 20000, 50000 i 100000 kao broj iteracija. Međutim, povećan broja iteracija nije doneo mnogo bolje rešenje od trenutnog sa 1000 iteracija, a sam proces učenja je bio znatno produžen.

Evaluacija

In [52]:
y_predicted_ovr = ovr_classifier.predict(X_test)

In [54]:
print(metrics.classification_report(y_test, y_predicted_ovr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       727
           1       0.97      0.97      0.97       412
           2       1.00      1.00      1.00     35376
           3       0.95      0.83      0.89        76
           4       1.00      1.00      1.00     32102
           5       0.87      0.66      0.75        59
           6       1.00      0.97      0.98        87
           7       0.99      0.98      0.99       343
           8       1.00      0.94      0.97       524
           9       1.00      1.00      1.00     92661
          10       0.98      1.00      0.99       323
          11       0.90      0.91      0.90       337

    accuracy                           1.00    163027
   macro avg       0.97      0.94      0.95    163027
weighted avg       1.00      1.00      1.00    163027



In [55]:
y_predicted_ovo = ovo_classifier.predict(X_test)

In [56]:
print(metrics.classification_report(y_test, y_predicted_ovo))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00       727
           1       0.99      0.99      0.99       412
           2       1.00      1.00      1.00     35376
           3       0.94      0.95      0.94        76
           4       1.00      1.00      1.00     32102
           5       0.94      0.76      0.84        59
           6       0.99      1.00      0.99        87
           7       1.00      1.00      1.00       343
           8       0.99      0.98      0.98       524
           9       1.00      1.00      1.00     92661
          10       1.00      1.00      1.00       323
          11       0.97      0.66      0.79       337

    accuracy                           1.00    163027
   macro avg       0.98      0.94      0.96    163027
weighted avg       1.00      1.00      1.00    163027



In [57]:
metrics.f1_score(y_test, y_predicted_ovo, average='macro')

0.960795907932031

In [58]:
metrics.f1_score(y_test, y_predicted_ovr, average='macro')

0.9526837011037331

In [59]:
metrics.f1_score(y_test, y_predicted_ovo, average='micro')

0.9989388260840228

In [60]:
metrics.f1_score(y_test, y_predicted_ovr, average='micro')

0.9988406828316782

In [61]:
metrics.accuracy_score(y_test, y_predicted_ovo)

0.9989388260840228

In [62]:
metrics.accuracy_score(y_test, y_predicted_ovr)

0.9988406828316782

In [63]:
metrics.r2_score(y_test, y_predicted_ovo)

0.9958778574355254

In [64]:
metrics.r2_score(y_test, y_predicted_ovr)

0.9970766043351381

In [65]:
cm_ovo = metrics.confusion_matrix(y_test, y_predicted_ovo)
print(cm_ovo)

[[  727     0     0     0     0     0     0     0     0     0     0     0]
 [    0   406     0     3     2     0     0     0     1     0     0     0]
 [    0     0 35374     0     1     1     0     0     0     0     0     0]
 [    0     1     0    72     2     0     0     0     1     0     0     0]
 [    5     3     1     0 32082     2     1     0     2     1     0     5]
 [    0     0     0     0    13    45     0     0     0     0     0     1]
 [    0     0     0     0     0     0    87     0     0     0     0     0]
 [    0     0     0     0     1     0     0   342     0     0     0     0]
 [    0     0     0     2    10     0     0     0   512     0     0     0]
 [    0     0     0     0     0     0     0     0     0 92661     0     0]
 [    0     0     0     0     0     0     0     0     0     0   323     0]
 [    0     0     0     0   114     0     0     0     0     0     0   223]]


In [66]:
cm_ovr = metrics.confusion_matrix(y_test, y_predicted_ovr)
print(cm_ovr)

[[  726     0     0     0     1     0     0     0     0     0     0     0]
 [    0   401     0     3     8     0     0     0     0     0     0     0]
 [    0     0 35371     0     3     1     0     0     0     0     1     0]
 [    0     2     0    63    11     0     0     0     0     0     0     0]
 [    3    10     1     0 32040     5     0     2     2     1     6    32]
 [    0     1     1     0    16    39     0     0     0     0     0     2]
 [    0     0     0     0     3     0    84     0     0     0     0     0]
 [    0     0     0     0     6     0     0   337     0     0     0     0]
 [    0     0     0     0    33     0     0     0   491     0     0     0]
 [    0     0     0     0     4     0     0     0     0 92657     0     0]
 [    0     0     0     0     0     0     0     0     0     0   323     0]
 [    0     0     0     0    31     0     0     0     0     0     0   306]]


Najvece se kod OneVsOne gresi u sledecim klasama:

In [70]:
cm_ovo[11][4]

114

In [74]:
d[11]

'warezclient.'

In [75]:
d[4]

'normal.'

In [76]:
cm_ovo[5][4]

13

In [72]:
d[5]

'other_attacks.'

In [73]:
d[4]

'normal.'

Najvece se kod OneVsRest gresi u sledecim klasama:

In [77]:
cm_ovr[8][4]

33

In [78]:
d[8]

'satan.'

In [79]:
d[4]

'normal.'

In [80]:
cm_ovr[4][11]

32

In [81]:
d[4]

'normal.'

In [82]:
d[11]

'warezclient.'

Viseklasna klasifikacija radi sa velikom preciznoscu. Greske su slicne greskama modela neuronskih mreza. Parametar max_iter smo zbog pojavljivanja upozorenja menjale i sa njim eksperimentisale. Pokazalo se da nema velike promene u preciznosti kada se ovaj parametar poveca. Vreme ucenja  za OneVsOne traje oko dva minuta, za OneVsRest traje oko 11 minuta, dok je neuronskim mrezama bilo potrebno preko 20 minuta.