In [2]:
import timeit
start_time = timeit.default_timer()

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from pandas_ml import ConfusionMatrix

In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore')

In [4]:
raw_data = pd.read_csv('datasets/kddcup.data_10_percent.gz', header=None)
# Raw Data Backup
raw_data_backup = raw_data.copy()

In [5]:
# Conditions start to be in categories
DOS = (raw_data[41] == 'back.') | (raw_data[41] == 'land.') | (raw_data[41] == 'neptune.') | (raw_data[41] == 'pod.') | (raw_data[41] == 'smurf.') | (raw_data[41] == 'teardrop.')
U2R = (raw_data[41] == 'buffer_overflow.') | (raw_data[41] == 'loadmodule.') | (raw_data[41] == 'perl.') | (raw_data[41] == 'rootkit.')
R2L = (raw_data[41] == 'ftp_write.') | (raw_data[41] == 'guess_passwd.') | (raw_data[41] == 'imap.') | (raw_data[41] == 'multihop.') | (raw_data[41] == 'phf.') | (raw_data[41] == 'spy.') | (raw_data[41] == 'warezclient.') | (raw_data[41] == 'warezmaster.')
probe = (raw_data[41] == 'satan.') | (raw_data[41] == 'ipsweep.') | (raw_data[41] == 'portsweep.') | (raw_data[41] == 'nmap.')
# Conditions end

raw_data[42] = np.where(DOS, 'dos', np.where(U2R, 'u2r', np.where(R2L, 'r2l', np.where(probe, 'probe', raw_data[41]))))

In [6]:
# raw_data[43] = np.where((raw_data[41] == 'normal.'), raw_data[41], 'attack.')

In [7]:
raw_data[43] = np.where((raw_data[41] == 'normal.'), 0, 1)

In [8]:
# Dropped Column [6, 8, 14, 18, 19, 20] for Feature Selection
raw_data.drop([6, 8, 14, 18, 19, 20], axis=1, inplace=True)

In [9]:
raw_data.head()

Unnamed: 0,0,1,2,3,4,5,7,9,10,11,...,34,35,36,37,38,39,40,41,42,43
0,0,tcp,http,SF,181,5450,0,0,0,1,...,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0
1,0,tcp,http,SF,239,486,0,0,0,1,...,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0
2,0,tcp,http,SF,235,1337,0,0,0,1,...,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0
3,0,tcp,http,SF,219,1337,0,0,0,1,...,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0
4,0,tcp,http,SF,217,2032,0,0,0,1,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0


In [10]:
# Categorize columns: "protocol", "service", "flag", "attack_type"
raw_data[1], protocols= pd.factorize(raw_data[1])
raw_data[2], services = pd.factorize(raw_data[2])
raw_data[3], flags    = pd.factorize(raw_data[3])
raw_data[41], attacks = pd.factorize(raw_data[41])
raw_data[42], attacks_cat = pd.factorize(raw_data[42])

In [11]:
Attack_Training_Sub_Dataset = raw_data.loc[raw_data[43] == 1] # Attack
U2R_Training_Sub_Dataset    = raw_data.loc[raw_data[42] == 1] # u2r
DOS_Training_Sub_Dataset    = raw_data.loc[raw_data[42] == 2] # dos
R2L_Training_Sub_Dataset    = raw_data.loc[raw_data[42] == 3] # r2l
Probe_Training_Sub_Dataset  = raw_data.loc[raw_data[42] == 4] # probe

In [12]:
# Set the target names for confusion matrix
target_names_root   = dict(zip([0, 1], ['normal.', 'attack.']))
target_names_cat    = dict(zip(np.unique(raw_data[42].values.ravel()), attacks_cat))
target_names_attack = dict(zip(np.unique(raw_data[41].values.ravel()), attacks))

In [13]:
print("Using Random Forest Classifier")

# Random Forest Classifier
clf = RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)

Using Random Forest Classifier


In [14]:
def get_X(dataset):
    X_test = dataset.iloc[:,:raw_data.shape[1]-3]
    return X_test


def get_y(dataset, feature_column):
    y_test = dataset[feature_column].values.ravel() # this becomes a 'horizontal' array
    return y_test

In [15]:
def get_confusion_matrix(y, y_pred, target_names):
    y = np.vectorize(target_names.get)(y)
    y_pred = np.vectorize(target_names.get)(y_pred)
    # print(target_names)
    
    # Making the Confusion Matrix
    print("=================================== Confusion Matrix ==================================")
    cm = ConfusionMatrix(y, y_pred)
    cm.print_stats()
    
    return cm

In [16]:
def print_classification_report(y, y_pred, target_names):
    y = np.vectorize(target_names.get)(y)
    y_pred = np.vectorize(target_names.get)(y_pred)
    # print(target_names)
    
    print("============================= Printing Classification Report ==========================")
    print(classification_report(y, y_pred))

In [17]:
def test_model(X, y, CV, target_names):
    # Predicting
    print("Predicting with {}-Fold cross validation".format(CV))
    y_pred = cross_val_predict(clf, X, y, cv=CV)
    scores = cross_val_score(clf, X, y, cv=CV)
    print("Accuracy: %0.2f" % (scores.mean()))
    
    get_confusion_matrix(y, y_pred, target_names)
    print_classification_report(y, y_pred, target_names)
    
    return y_pred

In [24]:
def test_root_model():
    # Test Root Model
    print("======================================================================================")
    print("================================ Root Model Output ===================================")
    print("======================================================================================")
    
    X = get_X(raw_data)
    y = get_y(raw_data, 43)
    
    # Predicting
    print("Predicting with 10-Fold cross validation")
    # root_model_pred = test_model(X, y, 5, target_names_root)
    root_model_pred = cross_val_predict(clf, X, y, cv=10)
    
    # Making the Confusion Matrix
    print("=================================== Confusion Matrix ==================================")
    root_cm = ConfusionMatrix(y, root_model_pred)
#     root_cm.print_stats()
    print(root_cm)
    
    print_classification_report(y, root_model_pred, target_names_root)
    
    return root_model_pred

In [30]:
print(test_root_model())

Predicting with 10-Fold cross validation
Predicted  False    True  __all__
Actual                           
False      97225      53    97278
True        1280  395463   396743
__all__    98505  395516   494021
              precision    recall  f1-score   support

     attack.       1.00      1.00      1.00    396743
     normal.       0.99      1.00      0.99     97278

   micro avg       1.00      1.00      1.00    494021
   macro avg       0.99      1.00      1.00    494021
weighted avg       1.00      1.00      1.00    494021

[0 0 0 ... 0 0 0]


In [25]:
def test_category_model():
    # Test Category Model
    print("\n\n\n")
    print("======================================================================================")
    print("================================ Category Model Output ===============================")
    print("======================================================================================")
    
    X = get_X(Attack_Training_Sub_Dataset)
    y = get_y(Attack_Training_Sub_Dataset, 42)
    
    attack_category_model_pred = test_model(X, y, 10, target_names_cat)
    
    return attack_category_model_pred

In [31]:
print(test_category_model())





Predicting with 10-Fold cross validation
Accuracy: 0.95


ValueError: math domain error

In [26]:
def test_by_attack_name(attack_name_data, CV):
    
    X = get_X(attack_name_data)
    y = get_y(attack_name_data, 41)
    
    attack_model_pred = test_model(X, y, CV, target_names_attack)

In [27]:
def test_our_model():
    # Root Model Testing
    test_root_model()
    
    # Category Model
    test_category_model()
    
    # U2R Model Testing
    print("\n\n\n")
    print("======================================================================================")
    print("=================================== U2R Model Output =================================")
    print("======================================================================================")
    test_by_attack_name(U2R_Training_Sub_Dataset, 2)
    # DOS Model Testing
    print("\n\n\n")
    print("======================================================================================")
    print("=================================== DOS Model Output =================================")
    print("======================================================================================")
    test_by_attack_name(DOS_Training_Sub_Dataset, 5)
    # R2L Model Testing
    print("\n\n\n")
    print("======================================================================================")
    print("=================================== R2L Model Output =================================")
    print("======================================================================================")
    test_by_attack_name(R2L_Training_Sub_Dataset, 2)
    # Probe Model Testing
    print("\n\n\n")
    print("======================================================================================")
    print("=================================== Probe Model Output ===============================")
    print("======================================================================================")
    test_by_attack_name(Probe_Training_Sub_Dataset, 10)

In [28]:
%%time
test_our_model()

Predicting with 10-Fold cross validation
Predicted  False    True  __all__
Actual                           
False      97225      53    97278
True        1280  395463   396743
__all__    98505  395516   494021
              precision    recall  f1-score   support

     attack.       1.00      1.00      1.00    396743
     normal.       0.99      1.00      0.99     97278

   micro avg       1.00      1.00      1.00    494021
   macro avg       0.99      1.00      1.00    494021
weighted avg       1.00      1.00      1.00    494021





Predicting with 10-Fold cross validation
Accuracy: 0.95


ValueError: math domain error

In [23]:
stop_time = timeit.default_timer()
print('Execution Time: ', stop_time - start_time, 'Sec')

Execution Time:  175.19608359999995 Sec
