In [1]:
import timeit
start_time = timeit.default_timer()

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, zero_one_loss

In [2]:
import warnings

# Silence library warnings so that cell outputs stay readable.
# Two category-specific filters are registered first, followed by a blanket
# filter that ignores every warning category.
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', RuntimeWarning)
warnings.filterwarnings('ignore')

In [3]:
# Read the gzipped CSV; header=None means the columns get integer labels.
raw_data = pd.read_csv(filepath_or_buffer='../datasets/kddcup.data_10_percent.gz', header=None)
# Keep an untouched deep copy, because later cells modify raw_data in place.
raw_data_backup = raw_data.copy(deep=True)

In [4]:
# Boolean masks that group the raw attack names in column 41 into four categories
DOS = raw_data[41].isin(['back.', 'land.', 'neptune.', 'pod.', 'smurf.', 'teardrop.'])
U2R = raw_data[41].isin(['buffer_overflow.', 'loadmodule.', 'perl.', 'rootkit.'])
R2L = raw_data[41].isin(['ftp_write.', 'guess_passwd.', 'imap.', 'multihop.', 'phf.', 'spy.', 'warezclient.', 'warezmaster.'])
probe = raw_data[41].isin(['satan.', 'ipsweep.', 'portsweep.', 'nmap.'])

# Column 42 holds the category name; rows outside every mask keep their original label.
# The masks are applied from lowest to highest priority, so DOS overrides U2R,
# which overrides R2L, which overrides probe.
raw_data[42] = np.where(probe, 'probe', raw_data[41])
raw_data[42] = np.where(R2L, 'r2l', raw_data[42])
raw_data[42] = np.where(U2R, 'u2r', raw_data[42])
raw_data[42] = np.where(DOS, 'dos', raw_data[42])

In [5]:
# Binary target in column 43: 0 where the label is 'normal.', 1 for every other label
raw_data[43] = np.where(raw_data[41] != 'normal.', 1, 0)

In [6]:
# Feature selection: remove columns 6, 8, 14, 18, 19 and 20 from raw_data in place
raw_data.drop(columns=[6, 8, 14, 18, 19, 20], inplace=True)

In [7]:
# Preview the first five rows after labelling and column removal
raw_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,7,9,10,11,...,34,35,36,37,38,39,40,41,42,43
0,0,tcp,http,SF,181,5450,0,0,0,1,...,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0
1,0,tcp,http,SF,239,486,0,0,0,1,...,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0
2,0,tcp,http,SF,235,1337,0,0,0,1,...,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0
3,0,tcp,http,SF,219,1337,0,0,0,1,...,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0
4,0,tcp,http,SF,217,2032,0,0,0,1,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.,normal.,0


In [8]:
# Integer-encode "protocol", "service", "flag", the attack name (41) and the attack category (42).
# Each call also returns the unique original values, so codes can be mapped back to names later.
raw_data[1], protocols = raw_data[1].factorize()
raw_data[2], services = raw_data[2].factorize()
raw_data[3], flags = raw_data[3].factorize()
raw_data[41], attacks = raw_data[41].factorize()
raw_data[42], attacks_cat = raw_data[42].factorize()

In [9]:
# Every column except the last three is a feature
features = raw_data.iloc[:, :-3]
# The third column from the end is used as the label
labels = raw_data.iloc[:, -3]

In [10]:
# Flatten the label column into a 1-D numpy array
labels = np.ravel(labels.values)
print(labels)

[0 0 0 ... 0 0 0]


In [11]:
# Separate data in train set and test set
df = pd.DataFrame(features)
# create training and testing vars
# Note: train_size + test_size < 1.0 means we are subsampling
# Use small numbers for slow classifiers, as KNN, Radius, SVC,...
# random_state pins the shuffle so that the split, and everything derived from it
# (sub-datasets, scores, exported CSV files), is the same on every Restart & Run All.
# The seed value matches the one passed to RandomForestClassifier in make_model.
X_train, X_test, y_train, y_test = train_test_split(
    df, labels, train_size=0.8, test_size=0.2, random_state=3
)
print("X_train, y_train:", X_train.shape, y_train.shape)
print("X_test, y_test:", X_test.shape, y_test.shape)

X_train, y_train: (395216, 35) (395216,)
X_test, y_test: (98805, 35) (98805,)


In [12]:
# Rebuild the complete training rows (features plus the trailing target columns)
# by looking up the row labels of the training split in raw_data.
train_indexes = np.asarray(X_train.index)
Training_Dataset = raw_data.loc[train_indexes]
Training_Dataset.head(n=5)

Unnamed: 0,0,1,2,3,4,5,7,9,10,11,...,34,35,36,37,38,39,40,41,42,43
403850,0,2,9,0,520,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5,2,1
424171,0,2,9,0,520,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5,2,1
277486,0,2,9,0,1032,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5,2,1
345312,0,1,3,0,32,48,0,0,0,0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
458504,1,0,1,0,1804,330,0,0,0,1,...,0.03,0.01,0.02,0.0,0.0,0.0,0.0,0,0,0


In [13]:
# Rebuild the complete testing rows (features plus the trailing target columns)
# by looking up the row labels of the testing split in raw_data.
test_indexes = np.asarray(X_test.index)
Testing_Dataset = raw_data.loc[test_indexes]
Testing_Dataset.head(n=5)

Unnamed: 0,0,1,2,3,4,5,7,9,10,11,...,34,35,36,37,38,39,40,41,42,43
398609,0,2,9,0,520,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5,2,1
339581,0,2,9,0,1032,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5,2,1
302,0,0,0,0,310,1481,0,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
460294,0,0,11,2,0,0,0,0,0,0,...,0.06,0.0,0.0,0.0,0.0,1.0,1.0,4,2,1
39666,2,0,1,0,1960,400,0,0,0,1,...,0.02,0.01,0.02,0.0,0.0,0.0,0.0,0,0,0


In [14]:
# Split the training rows by category code (column 42) or by the binary attack flag (column 43).
# NOTE(review): the numeric codes are assigned by pd.factorize in order of first appearance,
# so the code-to-category pairing below depends on the data order; verify against attacks_cat.
Normal_Training_Sub_Dataset = Training_Dataset[Training_Dataset[42] == 0]  # Normal
Attack_Training_Sub_Dataset = Training_Dataset[Training_Dataset[43] == 1]  # Attack
U2R_Training_Sub_Dataset = Training_Dataset[Training_Dataset[42] == 1]  # u2r
DOS_Training_Sub_Dataset = Training_Dataset[Training_Dataset[42] == 2]  # dos
R2L_Training_Sub_Dataset = Training_Dataset[Training_Dataset[42] == 3]  # r2l
Probe_Training_Sub_Dataset = Training_Dataset[Training_Dataset[42] == 4]  # probe

In [15]:
def make_model(raw_data, feature_column):
    """Fit a random forest on the feature columns of ``raw_data``.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Frame whose last three columns are excluded from the features;
        every column before them is used as model input.
    feature_column : hashable
        Label of the column in ``raw_data`` used as the training target
        (despite its name, this selects the target, not a feature).

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    X_train = raw_data.iloc[:, :raw_data.shape[1] - 3]
    # Flatten the target column into a 1-D numpy array
    y_train = raw_data[feature_column].values.ravel()

    print("Training model")
    # Random Forest Classifier
    rfc = RandomForestClassifier(n_jobs=-1, random_state=3, n_estimators=102)
    model = rfc.fit(X_train, y_train)

    # This score is computed on the training rows themselves, not on held-out data
    print("Score: ", model.score(X_train, y_train))

    return model

In [16]:
def get_X(dataset):
    """Return the leading feature columns of ``dataset``.

    The number of columns kept is derived from the notebook-level
    ``raw_data`` frame (its column count minus three), not from
    ``dataset`` itself, so any frame is cut at the same column position.
    """
    feature_count = raw_data.shape[1] - 3
    return dataset.iloc[:, :feature_count]


def get_y(dataset, feature_column):
    """Return the column of ``dataset`` labelled ``feature_column``."""
    return dataset[feature_column]

In [17]:
from sklearn.metrics import classification_report

def test_model(model, X_test, y_test, target_names):
    """Print a confusion matrix and a classification report for ``model``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : feature rows to predict on (must expose ``.shape``)
    y_test : true numeric codes for ``X_test`` (must expose ``.shape``)
    target_names : dict
        Maps numeric codes to display names; applied to both the true and
        the predicted codes before they are tabulated.

    Returns
    -------
    None
        All results are printed.
    """
    print("X_test, y_test:", X_test.shape, y_test.shape)

    # Predicting
    print("Predicting")
    y_pred = model.predict(X_test)

    # Translate numeric codes into names so the tables are readable
    to_name = np.vectorize(target_names.get)
    y_test = to_name(y_test)
    y_pred = to_name(y_pred)

    # Making the Confusion Matrix
    print("=================================== Confusion Matrix ==================================")
    pd_cm = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
    print(pd_cm)

    print("============================= Printing Classification Report ==========================")
    print(classification_report(y_test, y_pred))

In [18]:
def test_by_attack_name(model, X_test_category, y_pred_category, cat_id):
    """Evaluate an attack-name model on the rows routed to one category.

    Parameters
    ----------
    model : fitted estimator for the attack names of a single category
    X_test_category : pandas.DataFrame
        Feature rows that were passed to the category model. Not modified.
    y_pred_category : array-like
        Category code predicted for each row of ``X_test_category``,
        in the same row order.
    cat_id : int
        Category code whose rows should be evaluated with ``model``.

    Returns
    -------
    None
        Results are printed by ``get_prediction`` and ``test_model``.
    """
    # Select rows with a boolean mask instead of writing the predictions into
    # column 42 of the caller's frame, which would mutate the caller's data.
    Test_Rows = X_test_category.loc[np.asarray(y_pred_category) == cat_id]
    if Test_Rows.shape[0] == 0:
        # No row was routed to this category, so there is nothing to predict on.
        print("No test rows predicted for category", cat_id)
        return

    X_test = get_X(Test_Rows)

    # Called for its printed shape and class probabilities; test_model predicts again.
    get_prediction(model, X_test)

    # True attack names come from the notebook-level Testing_Dataset, matched by row label
    Testing_Sub_Dataset = Testing_Dataset.loc[Test_Rows.index.values, :]
    y_test = get_y(Testing_Sub_Dataset, 41)
    test_model(model, X_test, y_test, target_names_attack)

In [19]:
def get_prediction(model, X_test):
    """Predict labels for ``X_test``, print the class probabilities, and return the labels.

    ``model`` must expose ``predict`` and ``predict_proba``; ``X_test`` must
    expose ``.shape``.
    """
    print("X_test:", X_test.shape)

    # Predicting
    print("Predicting")
    predicted_labels = model.predict(X_test)

    print("================================ Prediction Probability ===============================")
    class_probabilities = model.predict_proba(X_test)
    print(class_probabilities)

    return predicted_labels

In [20]:
# Lookup tables that translate numeric codes back into display names for the reports
target_names_root = {0: 'normal', 1: 'attack'}
target_names_cat = {code: name for code, name in zip(np.unique(raw_data[42].values.ravel()), attacks_cat)}
target_names_attack = {code: name for code, name in zip(np.unique(labels), attacks)}

In [21]:
# Root model: normal vs. attack, trained on the binary flag in column 43
root_model = make_model(raw_data=Training_Dataset, feature_column=43)

Training model
Score:  0.9999949394761346


In [22]:
# Category model: trained on attack rows only, target is the category code (column 42)
attack_category_model = make_model(raw_data=Attack_Training_Sub_Dataset, feature_column=42)
# One attack-name model per category, target is the attack-name code (column 41)
u2r_model = make_model(raw_data=U2R_Training_Sub_Dataset, feature_column=41)
dos_model = make_model(raw_data=DOS_Training_Sub_Dataset, feature_column=41)
r2l_model = make_model(raw_data=R2L_Training_Sub_Dataset, feature_column=41)
probe_model = make_model(raw_data=Probe_Training_Sub_Dataset, feature_column=41)

Training model
Score:  1.0
Training model
Score:  1.0
Training model
Score:  1.0
Training model
Score:  1.0
Training model
Score:  0.9996953077391835


In [23]:
# Inputs for evaluating the root model: test features and the binary flag (column 43)
X_test_root_model = get_X(dataset=Testing_Dataset)
y_test_root_model = get_y(dataset=Testing_Dataset, feature_column=43)

In [24]:
def test_our_model(X_test, y_test):
    """Evaluate the whole model hierarchy and print a report for every level.

    The root model is evaluated on all rows. Rows it predicts as attacks are
    passed to the category model, and each predicted category is then passed
    to that category's attack-name model.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature rows. Their index labels must exist in the notebook-level
        ``Testing_Dataset``, which supplies the true category and
        attack-name labels for the lower levels.
    y_test : pandas.Series
        True normal/attack flags for ``X_test``.

    Returns
    -------
    None
        All results are printed.
    """
    rule = "======================================================================================"

    def banner(title_line):
        # Print a three-line section header: rule, title, rule
        print(rule)
        print(title_line)
        print(rule)

    # Test Root Model
    banner("================================ Root Model Output ===================================")
    test_model(root_model, X_test, y_test, target_names_root)
    root_model_pred = get_prediction(root_model, X_test)

    # Keep only the rows the root model predicted as attacks (flag 1)
    attack_to_category = X_test.loc[np.asarray(root_model_pred) == 1]
    if attack_to_category.shape[0] == 0:
        # With no predicted attacks there is nothing for the lower-level models to score
        print("No rows predicted as attack; skipping category and attack-name models")
        return

    # Test Category Model
    print("\n\n\n")
    banner("================================ Category Model Output ===============================")
    # Work on an owned copy so helpers receiving this frame never touch a slice of X_test
    X_test_attack_category = get_X(attack_to_category).copy()
    attack_category_model_pred = get_prediction(attack_category_model, X_test_attack_category)

    # True category codes come from Testing_Dataset, matched by row label
    Attack_Testing_Sub_Dataset = Testing_Dataset.loc[attack_to_category.index.values, :]
    y_test_attack_category = get_y(Attack_Testing_Sub_Dataset, 42)
    test_model(attack_category_model, X_test_attack_category, y_test_attack_category, target_names_cat)

    # Attack-name models, one per category code, evaluated in a fixed order
    per_category = (
        ("=================================== U2R Model Output =================================", u2r_model, 1),
        ("=================================== DOS Model Output =================================", dos_model, 2),
        ("=================================== R2L Model Output =================================", r2l_model, 3),
        ("=================================== Probe Model Output ===============================", probe_model, 4),
    )
    for title_line, model, cat_id in per_category:
        print("\n\n\n")
        banner(title_line)
        test_by_attack_name(model, X_test_attack_category, attack_category_model_pred, cat_id)

In [25]:
%%time
test_our_model(X_test_root_model, y_test_root_model)
print('Execution Time: ', timeit.default_timer() - start_time, 'Sec')

X_test, y_test: (98805, 35) (98805,)
Predicting
Predicted  attack  normal
Actual                   
attack      79403       7
normal          2   19393
              precision    recall  f1-score   support

      attack       1.00      1.00      1.00     79410
      normal       1.00      1.00      1.00     19395

   micro avg       1.00      1.00      1.00     98805
   macro avg       1.00      1.00      1.00     98805
weighted avg       1.00      1.00      1.00     98805

X_test: (98805, 35)
Predicting
[[0. 1.]
 [0. 1.]
 [1. 0.]
 ...
 [1. 0.]
 [0. 1.]
 [0. 1.]]




X_test: (79405, 35)
Predicting
[[0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 ...
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]]
X_test, y_test: (79405, 35) (79405,)
Predicting
Predicted    dos  probe  r2l  u2r
Actual                           
dos        78355      0    0    0
normal.        0      1    1    0
probe          0    824    0    0
r2l            0      0  216    0
u2r            0      0    0    8
             

In [26]:
def pred_by_attack_name(model, X_test_category, y_pred_category, cat_id):
    """Predict attack names for the rows routed to one category.

    Parameters
    ----------
    model : fitted estimator for the attack names of a single category
    X_test_category : pandas.DataFrame
        Feature rows that were passed to the category model. Not modified.
    y_pred_category : array-like
        Category code predicted for each row of ``X_test_category``,
        in the same row order.
    cat_id : int
        Category code whose rows should be passed to ``model``.

    Returns
    -------
    numpy.ndarray
        Predictions of ``model`` for the selected rows; an empty integer
        array when no row was routed to ``cat_id``.
    """
    # Select rows with a boolean mask instead of writing the predictions into
    # column 42 of the caller's frame, which would mutate the caller's data.
    Test_Rows = X_test_category.loc[np.asarray(y_pred_category) == cat_id]
    if Test_Rows.shape[0] == 0:
        # Nothing to predict on; return an empty result instead of calling the model
        return np.empty(0, dtype=int)

    return get_prediction(model, get_X(Test_Rows))

In [27]:
def predict_instance(X_test):
    """Run the model hierarchy on ``X_test`` and print each level's predictions.

    The root model's predictions are printed first. Rows it predicts as
    attacks go to the category model, and each predicted category is then
    passed to that category's attack-name model. Levels that receive no rows
    are skipped.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature rows to classify.

    Returns
    -------
    None
        Predictions are printed, not returned.
    """
    # Predict Root Model
    root_model_pred = get_prediction(root_model, X_test)
    print(root_model_pred)

    # Keep only the rows the root model predicted as attacks (flag 1)
    attack_to_category = X_test.loc[np.asarray(root_model_pred) == 1]
    if attack_to_category.shape[0] == 0:
        # Without predicted attacks the category predictions do not exist, so stop here
        # rather than referencing variables that were never assigned.
        return

    # Predict Category Model
    X_test_attack_category = get_X(attack_to_category)
    attack_category_model_pred = get_prediction(attack_category_model, X_test_attack_category)
    print(attack_category_model_pred)

    # Attack-name models keyed by category code, queried in a fixed order
    category_models = (
        (1, u2r_model),    # U2R
        (2, dos_model),    # DOS
        (3, r2l_model),    # R2L
        (4, probe_model),  # Probe
    )
    category_codes = np.asarray(attack_category_model_pred)
    for cat_id, model in category_models:
        category_rows = X_test_attack_category.loc[category_codes == cat_id]
        if category_rows.shape[0] > 0:
            attack_names = get_prediction(model, get_X(category_rows))
            print(attack_names)

In [28]:
# Count training rows per attack name (codes in column 41 translated back to names)
train_value_count = pd.Series(
    np.vectorize(target_names_attack.get)(Training_Dataset[41]), name=0
).value_counts()

In [29]:
# Count testing rows per attack name (codes in column 41 translated back to names)
test_value_count = pd.Series(
    np.vectorize(target_names_attack.get)(Testing_Dataset[41]), name=0
).value_counts()

In [30]:
# Write the training rows to CSV; index=False leaves the row labels out of the file
Training_Dataset.to_csv(path_or_buf='output/train_dataset.csv', sep=',', encoding='utf-8', index=False)

In [31]:
# Write the testing rows to CSV; index=False leaves the row labels out of the file
Testing_Dataset.to_csv(path_or_buf='output/test_dataset.csv', sep=',', encoding='utf-8', index=False)

In [32]:
# Each value_counts result has one entry per distinct attack name,
# so these shapes show how many attack names occur in each split.
print('Train Value Shape: ', train_value_count.shape)
print('Test  Value Shape: ', test_value_count.shape)

Train Value Shape:  (23,)
Test  Value Shape:  (19,)


In [33]:
# Inspect the column labels produced by placing the two count frames side by side
pd.concat(
    [train_value_count.to_frame(), test_value_count.to_frame()],
    axis=1,
    keys=['Train', 'Test'],
).columns

MultiIndex(levels=[['Train', 'Test'], [0]],
           labels=[[0, 1], [0, 0]])

In [34]:
# Train and test counts side by side, sorted by training count in descending order;
# null cells (attack names absent from one of the splits) are highlighted in red.
(
    pd.concat(
        [train_value_count.to_frame(), test_value_count.to_frame()],
        axis=1,
        keys=['Train', 'Test'],
    )
    .sort_values(by=[('Train', 0)], ascending=False)
    .style.highlight_null(null_color='red')
)

Unnamed: 0_level_0,Train,Test
Unnamed: 0_level_1,0,0
smurf.,224480,56310.0
neptune.,85849,21352.0
normal.,77883,19395.0
back.,1781,422.0
satan.,1253,336.0
ipsweep.,1017,230.0
portsweep.,822,218.0
warezclient.,815,205.0
teardrop.,766,213.0
pod.,211,53.0


In [35]:
# Elapsed time between the start_time recorded in the first cell and this point
stop_time = timeit.default_timer()
print('Execution Time: ', stop_time - start_time, 'Sec')

Execution Time:  68.2332332 Sec
