In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss

In [2]:
raw_data = pd.read_csv('datasets/kddcup.data_10_percent.gz', header=None)
# Raw Data Backup
raw_data_backup = raw_data

In [3]:
# Conditions start to be in categories
DOS = (raw_data[41] == 'back.') | (raw_data[41] == 'land.') | (raw_data[41] == 'neptune.') | (raw_data[41] == 'pod.') | (raw_data[41] == 'smurf.') | (raw_data[41] == 'teardrop.')
U2R = (raw_data[41] == 'buffer_overflow.') | (raw_data[41] == 'loadmodule.') | (raw_data[41] == 'perl.') | (raw_data[41] == 'rootkit.')
R2L = (raw_data[41] == 'ftp_write.') | (raw_data[41] == 'guess_passwd.') | (raw_data[41] == 'imap.') | (raw_data[41] == 'multihop.') | (raw_data[41] == 'phf.') | (raw_data[41] == 'spy.') | (raw_data[41] == 'warezclient.') | (raw_data[41] == 'warezmaster.')
probe = (raw_data[41] == 'satan.') | (raw_data[41] == 'ipsweep.') | (raw_data[41] == 'portsweep.') | (raw_data[41] == 'nmap.')
# Conditions end

raw_data[42] = np.where(DOS, 'dos', np.where(U2R, 'u2r', np.where(R2L, 'r2l', np.where(probe, 'probe', raw_data[41]))))

In [4]:
raw_data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,http,SF,181,5450,0,0,0,0,...,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.,normal.
5,0,tcp,http,SF,217,2032,0,0,0,0,...,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.,normal.
6,0,tcp,http,SF,212,1940,0,0,0,0,...,1.0,0.0,1.0,0.04,0.0,0.0,0.0,0.0,normal.,normal.
7,0,tcp,http,SF,159,4087,0,0,0,0,...,1.0,0.0,0.09,0.04,0.0,0.0,0.0,0.0,normal.,normal.
8,0,tcp,http,SF,210,151,0,0,0,0,...,1.0,0.0,0.12,0.04,0.0,0.0,0.0,0.0,normal.,normal.
9,0,tcp,http,SF,212,786,0,0,0,1,...,1.0,0.0,0.12,0.05,0.0,0.0,0.0,0.0,normal.,normal.


In [5]:
raw_data.drop([41], axis=1, inplace=True)

In [6]:
# Categorize columns: "protocol", "service", "flag", "attack_type"
raw_data[1], protocols= pd.factorize(raw_data[1])
raw_data[2], services = pd.factorize(raw_data[2])
raw_data[3], flags    = pd.factorize(raw_data[3])
raw_data[42], attacks_cat = pd.factorize(raw_data[42])

In [7]:
features= raw_data.iloc[:,:raw_data.shape[1]-1]
labels= raw_data.iloc[:,raw_data.shape[1]-1:]

In [8]:
# convert them into numpy arrays
#features= numpy.array(features)
#labels= numpy.array(labels).ravel() # this becomes an 'horizontal' array
labels= labels.values.ravel() # this becomes a 'horizontal' array

In [9]:
# Separate data in train set and test set
df= pd.DataFrame(features)
# create training and testing vars
# Note: train_size + test_size < 1.0 means we are subsampling
# Use small numbers for slow classifiers, as KNN, Radius, SVC,...
X_train, X_test, y_train, y_test = train_test_split(df, labels, train_size=0.8, test_size=0.2)
print("X_train, y_train:", X_train.shape, y_train.shape)
print("X_test, y_test:", X_test.shape, y_test.shape)

X_train, y_train: (395216, 41) (395216,)
X_test, y_test: (98805, 41) (98805,)


In [10]:
# Training, choose model by commenting/uncommenting clf=
print("Training model")
#Boosting - Ada Boost

clf = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 5, learning_rate = 1)

Training model


In [11]:
%%time
model = clf.fit(X_train, y_train)

Wall time: 3.32 s


In [12]:
%%time
from sklearn.externals import joblib
# save the model to disk
filename = 'Adaboost_finalized_model.sav'
joblib.dump(model, filename)

Wall time: 8 ms


In [13]:
%%time
# load the model from disk
trained_model = joblib.load(filename)

Wall time: 2 ms


In [14]:
print("Score: ", trained_model.score(X_train, y_train))

Score:  1.0


In [15]:
%%time
# Predicting
print("Predicting")
y_pred = clf.predict(X_test)

Predicting
Wall time: 64.2 ms


In [16]:
print("Computing performance metrics")
results = confusion_matrix(y_test, y_pred)
error = zero_one_loss(y_test, y_pred)

Computing performance metrics


In [17]:
from sklearn.metrics import classification_report
labels.shape
# target_names = dict(enumerate(labels))
# np.unique(target_names)
# print(target_names)
# print(classification_report(y_true, y_pred, target_names=target_names))

(494021,)

In [18]:
target_names = dict(zip(np.unique(labels), attacks_cat))

In [19]:
reversefactor = target_names
yy_test = np.vectorize(reversefactor.get)(y_test)
yy_pred = np.vectorize(reversefactor.get)(y_pred)

# print("============================= Labels Start ========================")
# print(np.unique(labels))
# print(attacks)
# a = [1,2]
# b = np.array(a)
# print(b.shape)

# print("============================= Labels END ==========================")

print("============================= Printing Classification Report ==========================")
print(classification_report(yy_test, yy_pred))

# # Making the Confusion Matrix
print("============================= Confusion Matrix ===========================")
pd_cm = pd.crosstab(yy_test, yy_pred, rownames=['Actual'], colnames=['Predicted'])

              precision    recall  f1-score   support

         dos       1.00      1.00      1.00     78350
     normal.       1.00      1.00      1.00     19393
       probe       0.99      0.99      0.99       840
         r2l       0.93      0.98      0.95       212
         u2r       0.62      0.50      0.56        10

   micro avg       1.00      1.00      1.00     98805
   macro avg       0.91      0.89      0.90     98805
weighted avg       1.00      1.00      1.00     98805



In [20]:
# pd_cm.to_csv('output/Categorized_Adaboost_Confusion_Matricx_Output_Full.csv', index_label = 'attacks|attacks')

In [21]:
print(pd_cm)

Predicted    dos  normal.  probe  r2l  u2r
Actual                                    
dos        78344        3      3    0    0
normal.        0    19377      2   12    2
probe          1        5    833    1    0
r2l            1        3      0  207    1
u2r            0        2      0    3    5


In [22]:
from pandas_ml import ConfusionMatrix

In [23]:
pdml_cm = ConfusionMatrix(yy_test, yy_pred)

In [24]:
pdml_cm.print_stats()

  num = df[df > 1].dropna(axis=[0, 1], thresh=1).applymap(lambda n: choose(n, 2)).sum().sum() - np.float64(nis2 * njs2) / n2
  num = df[df > 1].dropna(axis=[0, 1], thresh=1).applymap(lambda n: choose(n, 2)).sum().sum() - np.float64(nis2 * njs2) / n2
  den = (np.float64(nis2 + njs2) / 2 - np.float64(nis2 * njs2) / n2)


Confusion Matrix:

Predicted    dos  normal.  probe  r2l  u2r  __all__
Actual                                             
dos        78344        3      3    0    0    78350
normal.        0    19377      2   12    2    19393
probe          1        5    833    1    0      840
r2l            1        3      0  207    1      212
u2r            0        2      0    3    5       10
__all__    78346    19390    838  223    8    98805


Overall Statistics:

Accuracy: 0.9996052831334447
95% CI: (0.9994604481376433, 0.9997193031365268)
No Information Rate: ToDo
P-Value [Acc > NIR]: 0.0
Kappa: 0.9988133312874137
Mcnemar's Test P-Value: ToDo


Class Statistics:

Classes                                        dos      normal.        probe  \
Population                                   98805        98805        98805   
P: Condition positive                        78350        19393          840   
N: Condition negative                        20455        79412        97965   
Test outcome posi

In [25]:
# pdml_cm.stats()