In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn import model_selection
from sklearn import feature_selection
from sklearn import linear_model
from sklearn import metrics

import category_encoders as ce

np.random.seed(10)

In [2]:
data = pd.read_csv('kddcup.csv.data_10_percent_corrected')

In [3]:
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [4]:
y = data['class']

Klase koje su normalne oznacavamo sa 0, a klase koje su napad sa 1.

In [5]:
y = [instance!='normal.' for instance in y]

In [6]:
np.bincount(y)

array([ 97278, 396743])

In [7]:
X = data.drop(columns = ['class'], axis = 1)

In [8]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33, stratify=y, random_state=7)

Kao sto smo ranije primetili, ovi atributi svuda imaju vrednost 0. Stoga, ne nose nikakvu informaciju za odredjivanje da li je konekcija napad ili ne, pa se mogu izbaciti.

In [9]:
X_train = X_train.drop(columns = ['is_host_login', 'num_outbound_cmds'], axis = 1)

In [10]:
X_test = X_test.drop(columns = ['is_host_login', 'num_outbound_cmds'], axis = 1)

Enkodiranje i skaliranje

In [11]:
encoder = ce.BinaryEncoder(X_train, cols = ['protocol_type', 'service', 'flag'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [12]:
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Formiranje modela i ucenje

In [22]:
model = linear_model.LogisticRegression(solver='lbfgs', max_iter=1000)

In [23]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000)

In [24]:
model.intercept_

array([9.23489669])

In [25]:
model.coef_

array([[ 5.56973625e-02,  0.00000000e+00,  8.83946439e-01,
         1.35011494e+00,  0.00000000e+00,  1.85179688e-03,
         3.78661790e-01, -4.82439228e-01,  2.53420807e-01,
        -6.23033672e-01, -2.27278629e+00,  3.29709357e-01,
         0.00000000e+00, -8.35225537e-03, -8.90315142e-02,
        -3.17248256e+00, -1.37564980e+00,  1.64711865e+00,
         2.27564981e-02, -3.45859853e-02,  1.68588394e+00,
        -9.06960724e-02,  4.24586335e-01,  9.48029308e-02,
         4.41661308e-01,  1.18487260e+01,  6.22049772e-02,
        -2.60979618e-01, -1.30419783e+01,  7.09271594e-03,
         8.97114781e-03, -2.45089318e-02, -3.41344041e-01,
         6.06158969e+00, -1.38545035e+00, -1.05491747e+00,
         2.06534675e+00, -5.52828325e-01,  6.28066627e-01,
        -2.86481283e+00, -2.25329733e-01,  1.77909980e-01,
         1.07878967e+00, -1.44212895e+00,  1.78438513e+00,
         3.33155373e-01,  1.96192745e+00,  4.00830004e-01,
         1.44049804e-01,  2.07819280e+00,  5.62756657e-0

Evaluacija

In [26]:
y_test_predicted = model.predict(X_test)

In [27]:
metrics.accuracy_score(y_test, y_test_predicted)

0.9983315647101401

In [28]:
metrics.precision_score(y_test, y_test_predicted)

0.9992434142650801

In [29]:
metrics.recall_score(y_test, y_test_predicted)

0.9986786328050411

In [30]:
metrics.f1_score(y_test, y_test_predicted)

0.9989609437075974

In [31]:
metrics.confusion_matrix(y_test, y_test_predicted)

array([[ 32003,     99],
       [   173, 130752]])

In [32]:
print(metrics.classification_report(y_test, y_test_predicted, digits=7))

              precision    recall  f1-score   support

       False  0.9946233 0.9969161 0.9957684     32102
        True  0.9992434 0.9986786 0.9989609    130925

    accuracy                      0.9983316    163027
   macro avg  0.9969334 0.9977974 0.9973647    163027
weighted avg  0.9983337 0.9983316 0.9983323    163027



In [35]:
metrics.roc_auc_score(y_test, y_test_predicted)

0.9977973564000285