In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing  import KBinsDiscretizer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.ensemble import RandomForestClassifier

# Data Reading


In [2]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
"dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
"logged_in","num_compromised","root_shell","su_attempted","num_root",
"num_file_creations","num_shells","num_access_files","num_outbound_cmds",
"is_host_login","is_guest_login","count","srv_count","serror_rate",
"srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
"diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
"dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
"dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
official_train_data = pd.read_csv("kddcup.data.gz",header=None,compression='gzip', sep=',',names = col_names)
official_train_data_df = official_train_data.copy()

In [3]:
train_attacks_type = {'normal.': 'normal','back.': 'dos','buffer_overflow.': 'u2r','ftp_write.': 'r2l',
'guess_passwd.': 'r2l','imap.': 'r2l','ipsweep.': 'probe','land.': 'dos',
'loadmodule.': 'u2r','multihop.': 'r2l','neptune.': 'dos',
'nmap.': 'probe','perl.': 'u2r','phf.': 'r2l',
'pod.': 'dos','portsweep.': 'probe','rootkit.': 'u2r',
'satan.': 'probe','smurf.': 'dos','spy.': 'r2l','teardrop.': 'dos','warezclient.': 'r2l','warezmaster.': 'r2l',}
official_train_data_df['label'] = official_train_data_df['label'].replace(train_attacks_type)

In [4]:
# Binary attack type
Binary_attack_type = {'dos' : 1,
                     'normal' : 0,
                     'probe' : 1,
                     'r2l' : 1,
                     'u2r' : 1}
official_train_data_df['label'] = official_train_data_df['label'].replace(Binary_attack_type)

In [5]:
#delete feature num_outbound_cmds cuz all 0
official_train_data_df_cleared = official_train_data_df.drop('num_outbound_cmds', 1)

In [6]:
# encoding nominal features to numerical features
nominal_features = ["protocol_type", "service", "flag"]
binary_features = ["land", "logged_in", "root_shell", "su_attempted", "is_host_login", "is_guest_login"]
numeric_features = ["duration", "src_bytes",
            "dst_bytes", "wrong_fragment", "urgent", "hot",
            "num_failed_logins", "num_compromised", "num_root",
            "num_file_creations", "num_shells", "num_access_files",
            "count", "srv_count", "serror_rate","srv_serror_rate", "rerror_rate", "srv_rerror_rate", 
            "same_srv_rate","diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
            "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
            "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
            "dst_host_rerror_rate", "dst_host_srv_rerror_rate"]

In [7]:
# change nominal features from string to integer
label_encoder = LabelEncoder()
for item in nominal_features:
    official_train_data_df_cleared[item] = label_encoder.fit_transform(official_train_data_df_cleared[item])

In [8]:
#the attack type has been changed to binary attack & normal
# for split training set and testing set
y = official_train_data_df_cleared.label
x = official_train_data_df_cleared.drop('label',axis = 1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=1)

#-----------------------✂---------------------------
# official test dataset reading

In [9]:
col_names = ["duration","protocol_type","service","flag","src_bytes",
"dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
"logged_in","num_compromised","root_shell","su_attempted","num_root",
"num_file_creations","num_shells","num_access_files","num_outbound_cmds",
"is_host_login","is_guest_login","count","srv_count","serror_rate",
"srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
"diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
"dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
"dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
official_test_data = pd.read_csv("test data with corrected label.gz",header=None,compression='gzip', sep=',',names = col_names)
official_test_data_df = official_test_data.copy()

In [10]:
# preprocessing official test data
official_test_attacks_type = {
'normal.': 'normal','back.': 'dos','buffer_overflow.': 'u2r','ftp_write.': 'r2l','guess_passwd.': 'r2l','imap.': 'r2l',
'ipsweep.': 'probe','land.': 'dos','loadmodule.': 'u2r','multihop.': 'r2l','neptune.': 'dos','nmap.': 'probe',
'perl.': 'u2r','phf.': 'r2l','pod.': 'dos','portsweep.': 'probe','rootkit.': 'u2r','satan.': 'probe','smurf.': 'dos',
'spy.': 'r2l','teardrop.': 'dos','warezclient.': 'r2l','warezmaster.': 'r2l','snmpgetattack.':'probe','mscan.': 'probe',
'saint.': 'probe','apache2.': 'dos','mailbomb.': 'dos',
'processtable.': 'dos','udpstorm.': 'dos','httptunnel.':'dos','ps.':'dos','sqlattack.':'dos',
'xterm.':'dos','named.': 'r2l','sendmail.':'r2l','snmpgetattack.':'r2l','snmpguess.':'r2l','worm.': 'r2l','xlock.':'r2l',
'xsnoop.':'r2l'}
official_test_data_df['label'] = official_test_data_df['label'].replace(official_test_attacks_type)

In [11]:
# Binary attack type
Binary_attack_type = {'dos' : 1,
                     'normal' : 0,
                     'probe' : 1,
                     'r2l' : 1,
                     'u2r' : 1}
official_test_data_df['label'] = official_test_data_df['label'].replace(Binary_attack_type)
official_test_data_df['label'].value_counts()

1    250436
0     60593
Name: label, dtype: int64

In [12]:
nominal_features = ["protocol_type", "service", "flag"]
binary_features = ["land", "logged_in", "root_shell", "su_attempted", "is_host_login", "is_guest_login"]
numeric_features = ["duration", "src_bytes",
            "dst_bytes", "wrong_fragment", "urgent", "hot",
            "num_failed_logins", "num_compromised", "num_root",
            "num_file_creations", "num_shells", "num_access_files",
            "count", "srv_count", "serror_rate","srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
            "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
            "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
            "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
            "dst_host_rerror_rate", "dst_host_srv_rerror_rate"]
# change label
label_encoder = LabelEncoder()
for item in nominal_features:
    official_test_data_df[item] = label_encoder.fit_transform(official_test_data_df[item])

In [13]:
X_official_test = official_test_data_df.drop(['label','num_outbound_cmds'], axis = 1)
y_official_test = official_test_data_df['label']
X_official_test_categorical = pd.concat([X_official_test[nominal_features],X_official_test[binary_features]],axis = 1)
X_official_test_continuous = X_official_test[numeric_features]

In [14]:
# prepare dataset to be ready for discretization: 
# X_train_continuous, X_train_categorical, X_test_continuous, X_test_categorical
X_train_continuous = X_train[numeric_features]
X_train_categorical = pd.concat([X_train[nominal_features],X_train[binary_features]],axis = 1)
X_train_continuous_label = pd.concat([X_train[numeric_features],y_train],axis = 1)
X_test_continuous = X_test[numeric_features]
X_test_categorical = pd.concat([X_test[nominal_features],X_test[binary_features]],axis = 1)

##-----------------------✂---------------------------##
# Preprocessing Discretization
'''
X_train_continuous
X_test_continuous
X_official_test_continuous

X_train_categorical
X_test_categorical
X_official_test_categorical

'''

In [15]:
# X_train_EWD
est_ewd = KBinsDiscretizer(n_bins = 5, encode = 'ordinal', strategy = 'uniform')
est_ewd.fit(X_train[numeric_features])
Xtt = est_ewd.transform(X_train[numeric_features])
X_train_continuous_ewd = pd.DataFrame(Xtt)
X_train_continuous_ewd.columns = numeric_features
X_train_ewd = pd.concat([X_train_categorical.reset_index(drop = True), X_train_continuous_ewd], axis = 1)

In [16]:
#  X_test_ewd
X_test_continuous_ewd = est_ewd.transform(X_test[numeric_features])
X_test_continuous_ewd_df = pd.DataFrame(X_test_continuous_ewd)
X_test_continuous_ewd_df.columns = numeric_features
X_test_ewd = pd.concat([X_test_categorical.reset_index(drop = True), X_test_continuous_ewd_df], axis = 1)

In [17]:
# X_official_test_ewd
X_officicial_test_continuous_ewd = est_ewd.transform(X_official_test_continuous)
X_officicial_test_continuous_ewd_df = pd.DataFrame(X_officicial_test_continuous_ewd)
X_officicial_test_continuous_ewd_df.columns = numeric_features
X_official_test_ewd = pd.concat([X_official_test_categorical.reset_index(drop = True),\
                                 X_officicial_test_continuous_ewd_df], axis = 1)

In [18]:
# X_train_kmeans
from sklearn.preprocessing  import KBinsDiscretizer
est_kmeans = KBinsDiscretizer(n_bins = 2, encode = 'ordinal', strategy = 'kmeans')
est_kmeans.fit(X_train[numeric_features])
est_kmeans_transformed = est_kmeans.transform(X_train[numeric_features])
X_train_continuous_kmeans = pd.DataFrame(est_kmeans_transformed)
X_train_continuous_kmeans.columns = numeric_features
X_train_kmeans = pd.concat([X_train_categorical.reset_index(drop = True), X_train_continuous_kmeans], axis = 1)

In [19]:
#  X_test_kmeans
X_test_continuous_kmeans = est_kmeans.transform(X_test[numeric_features])
X_test_continuous_kmeans_df = pd.DataFrame(X_test_continuous_kmeans)
X_test_continuous_kmeans_df.columns = numeric_features
X_test_kmeans = pd.concat([X_test_categorical.reset_index(drop = True), X_test_continuous_kmeans_df], axis = 1)

In [20]:
# X_official_test_kmeans
X_official_test_continuous_kmeans = est_kmeans.transform(X_official_test_continuous)
X_official_test_continuous_kmeans_df = pd.DataFrame(X_official_test_continuous_kmeans)
X_official_test_continuous_kmeans_df.columns = numeric_features
X_official_test_kmeans = pd.concat([X_official_test_categorical.reset_index(drop = True), \
                                    X_official_test_continuous_kmeans_df], axis = 1)

In [21]:
# Entropy Minimization Discretization method
from Orange.data import Table, Domain
import Orange.preprocess as OrangePre
training_domain = Domain.from_numpy(X_train_continuous.values,y_train.values)
training_continuous_table = Table.from_numpy(training_domain,X_train_continuous.values,y_train.values) #v2=1  v1 =0
# setting discretization method
disc_entropy = OrangePre.Discretize()
disc_entropy.method = OrangePre.discretize.EntropyMDL(force = True) 
# discretize the continuous variables in trainning set
train_continuous_discretized = disc_entropy(training_continuous_table)
# transfer discretized data back to dataframe, getting final X_train_EMD
train_continuous_discretized_df = pd.DataFrame(np.array(train_continuous_discretized))
train_continuous_discretized_df.columns = numeric_features + ['label'] # add feature name
train_continuous_discretized_df = train_continuous_discretized_df.drop(['label'], axis = 1)
X_train_emd = pd.concat([X_train_categorical.reset_index(drop = True), train_continuous_discretized_df],axis = 1) # X_train_EMD

In [22]:
# prepare X_test_EMD, fit the learnt discretization method on test data
X_test_continuous = X_test[numeric_features]
test_domain = Domain.from_numpy(X_test_continuous)
X_test_continuous_table = Table.from_numpy(test_domain, X_test_continuous.values)
X_test_continuous_discretized = X_test_continuous_table.transform(train_continuous_discretized.domain)
X_test_continuous_discretized_df = pd.DataFrame(np.array(X_test_continuous_discretized))
X_test_continuous_discretized_df.columns = numeric_features + ['label']
X_test_continuous_discretized_df = X_test_continuous_discretized_df.drop(['label'], axis = 1)
X_test_emd = pd.concat([X_test_categorical.reset_index(drop = True), X_test_continuous_discretized_df], axis = 1)

In [23]:
# prepare X_official_test_EMD, fit the learnt discretization method on official test data
X_official_test_continuous = X_official_test[numeric_features]
official_test_domain = Domain.from_numpy(X_official_test_continuous)
X_official_test_continuous_table = Table.from_numpy(official_test_domain, X_official_test_continuous.values)
X_official_test_continuous_discretized = X_official_test_continuous_table.transform(train_continuous_discretized.domain)
X_official_test_continuous_discretized_df = pd.DataFrame(np.array(X_official_test_continuous_discretized))
X_official_test_continuous_discretized_df.columns = numeric_features + ['label']
X_official_test_continuous_discretized_df = X_official_test_continuous_discretized_df.drop(['label'], axis = 1)
X_official_test_emd = pd.concat([X_official_test_categorical.reset_index(drop = True), X_official_test_continuous_discretized_df], axis = 1)

In [24]:
# Due to a large amount of duplcated values in each feature, it is necessary to build a specific equal frequency disc PKID
#
import math
def EqualFrequencyDisc(train_df, test_df, k, label):#
    disc_train_df = pd.DataFrame()
    disc_test_df = pd.DataFrame()
    for item in train_df.columns:
        unique_list = train_df[item].unique()
        est_efd = KBinsDiscretizer(n_bins = k, encode = 'ordinal', strategy = 'quantile')
        est_efd.fit(unique_list.reshape(-1,1))
        train_col_disc = est_efd.transform(np.array(train_df[item]).reshape(-1,1))
        test_col_disc = est_efd.transform(np.array(test_df[item]).reshape(-1,1))
        disc_train_df = pd.concat([disc_train_df,pd.DataFrame(train_col_disc)], axis = 1)
        disc_test_df = pd.concat([disc_test_df,pd.DataFrame(test_col_disc)], axis = 1)
    disc_train_df.columns = label
    disc_test_df.columns = label
    return disc_train_df, disc_test_df 
    


def EqualFrequencyDisc_pkid(train_df, test_df, label):
    disc_train_df = pd.DataFrame()
    disc_test_df = pd.DataFrame()
    for item in train_df.columns:
        bin_nums = int(math.sqrt(train_df[item].nunique()))
        if bin_nums < 2:
            bin_nums += 1
        else:
            bin_nums += 0
        unique_list = train_df[item].unique()
        est_efd = KBinsDiscretizer(n_bins = bin_nums, encode = 'ordinal', strategy = 'quantile')
        est_efd.fit(unique_list.reshape(-1,1))
        train_col_disc = est_efd.transform(np.array(train_df[item]).reshape(-1,1))
        test_col_disc = est_efd.transform(np.array(test_df[item]).reshape(-1,1))
        disc_train_df = pd.concat([disc_train_df,pd.DataFrame(train_col_disc)], axis = 1)
        disc_test_df = pd.concat([disc_test_df,pd.DataFrame(test_col_disc)], axis = 1)
    disc_train_df.columns = label
    disc_test_df.columns = label
    return disc_train_df, disc_test_df

In [25]:
# EFD X_train_continuous_EFD, X_test_continuous_EFD, k = 5
X_train_continuous_EFD, X_test_continuous_EFD = EqualFrequencyDisc(\
                X_train[numeric_features],X_test[numeric_features],5,numeric_features)

In [26]:
# X_train_efd, X_test_efd
X_train_efd = pd.concat([X_train_categorical.reset_index(drop = True), X_train_continuous_EFD], axis = 1)
X_test_efd = pd.concat([X_test_categorical.reset_index(drop = True), X_test_continuous_EFD], axis = 1)

In [27]:
# X_official_test_continuous_EFD, k = 5
X_train_continuous_EFD, X_official_test_continuous_EFD = EqualFrequencyDisc(\
                X_train[numeric_features],X_official_test[numeric_features],5,numeric_features)

In [28]:
# X_official_test_efd
X_official_test_efd = pd.concat([X_official_test_categorical.reset_index(drop = True), X_official_test_continuous_EFD], axis = 1)

In [29]:
# X_train_pkid, X_test_pkid
X_train_continuous_pkid, X_test_continuous_pkid = EqualFrequencyDisc_pkid(\
                X_train[numeric_features],X_test[numeric_features],numeric_features)


In [30]:
# X_train_pkid, X_test_pkid
X_train_pkid = pd.concat([X_train_categorical.reset_index(drop = True), X_train_continuous_pkid], axis = 1)
X_test_pkid = pd.concat([X_test_categorical.reset_index(drop = True), X_test_continuous_pkid], axis = 1)

In [31]:
# X_official_test_pkid
X_train_continuous_pkid, X_test_official_continuous_pkid = EqualFrequencyDisc(\
                X_train[numeric_features],X_official_test[numeric_features],5,numeric_features)

In [32]:
X_official_test_pkid = pd.concat([X_official_test_categorical.reset_index(drop = True), X_test_official_continuous_pkid], axis = 1)

##-----------------------✂---------------------------##
# Training model & Evaluation

In [None]:
# Prepare the list of unknow cases which does not exist in Training set, check its accuracy

In [33]:
# find the special label which does not exist in training dataset
speical_test_attack = []
for item in official_test_data['label'].unique():
    if np.isin(item,official_train_data['label'].unique(),invert=True):
        speical_test_attack.append(item)

In [34]:
# locate the index
special_attack_index_list = []
for index,row in official_test_data.iterrows():
    if official_test_data['label'][index] in speical_test_attack:
        special_attack_index_list.append(index)

In [35]:
Unknown_cases = y_official_test.loc[special_attack_index_list]

# Naive Bayesian with Gaussian assumption

In [155]:
# Naive Bayesian with Gaussian assumption : X_test
from sklearn.naive_bayes import GaussianNB
X_train_GaussianNB = X_train
y_train_GaussianNB = y_train
# training the model
NB_Gaussian = GaussianNB()
NB_Gaussian.fit(X_train_GaussianNB,y_train_GaussianNB)
expected = y_test
predicted = NB_Gaussian.predict(X_test)
accuracy_gaussian = accuracy_score(expected, predicted)
recall_gaussian = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for Naive Bayes with Gaussian aussmption is:',round(accuracy_gaussian*100,2),'%')
print('Split test data, Recall Rate for Naive Bayes with Gaussian aussmption is:',round(recall_gaussian*100,2),'%')

Accuracy Rate is: 93.69 %
Recall Rate is: 92.36 %


In [166]:
# Naive Bayesian with Gaussian assumption : official_test
from sklearn.naive_bayes import GaussianNB
X_train_GaussianNB = X_train
y_train_GaussianNB = y_train
# training the model
NB_Gaussian = GaussianNB()
NB_Gaussian.fit(X_train_GaussianNB,y_train_GaussianNB)
expected = y_official_test
predicted = NB_Gaussian.predict(X_official_test)
accuracy_gaussian_official = accuracy_score(expected, predicted)
recall_gaussian_official = recall_score(expected, predicted)
print('Official test data, Accuracy Rate for Naive Bayes with Gaussian aussmption is:',round(accuracy_gaussian_official*100,2),'%')
print('Official test data, Recall Rate for Naive Bayes with Gaussian aussmption is:',round(recall_gaussian_official*100,2),'%')

Official test data, Accuracy Rate for Naive Bayes with Gaussian aussmption is: 85.81 %
Official test data, Recall Rate for Naive Bayes with Gaussian aussmption is: 82.92 %


In [70]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print(accuracy_for_unknown_cases)

0.03390463986331358


# Naive Bayes Equal Width Discretization

In [36]:
#Naive Bayesian EWD
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB = MultinomialNB()
MultinomialNB.fit(X_train_ewd,y_train)
expected = y_test
predicted = MultinomialNB.predict(X_test_ewd)
accuracy_ewd = accuracy_score(expected, predicted)
recall_ewd = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for Naive Bayes with ewd is:',round(accuracy_ewd*100,2),'%')
print('Split test data, Recall Rate for Naive Bayes with ewd is:',round(recall_ewd*100,2),'%')

Split test data, Accuracy Rate for Naive Bayes with ewd is: 96.66 %
Split test data, Recall Rate for Naive Bayes with ewd is: 96.17 %


In [37]:
#Naive Bayesian EWD
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB = MultinomialNB()
MultinomialNB.fit(X_train_ewd,y_train)
expected = y_official_test
predicted = MultinomialNB.predict(X_official_test_ewd)
accuracy_ewd_official = accuracy_score(expected, predicted)
recall_ewd_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for Naive Bayes with EMD is:',round(accuracy_ewd_official*100,2),'%')
print('Official test data, Recall Rate for Naive Bayes with EMD is:',round(recall_ewd_official*100,2),'%')

Official test data, Accuracy Rate for Naive Bayes with EMD is: 84.28 %
Official test data, Recall Rate for Naive Bayes with EMD is: 81.04 %


In [38]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 8.12 %


# Naive Bayes Entropy Minimization Discretization

In [176]:
#Naive Bayesian Entropy Minimization Discretization : X_test
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB_emd = MultinomialNB()
MultinomialNB_emd.fit(X_train_emd,y_train)
expected = y_test
predicted = MultinomialNB_emd.predict(X_test_emd)
accuracy_emd = accuracy_score(expected, predicted)
recall_emd = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for Naive Bayes with EMD is:',round(accuracy_emd*100,2),'%')
print('Split test data, Recall Rate for Naive Bayes with EMD is:',round(recall_emd*100,2),'%')

Split test data, Accuracy Rate for Naive Bayes with EMD is: 99.19 %
Split test data, Recall Rate for Naive Bayes with EMD is: 99.23 %


In [177]:
#Naive Bayesian Entropy Minimization Discretization : X_official_test
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB_emd = MultinomialNB()
MultinomialNB_emd.fit(X_train_emd,y_train)
expected = y_official_test
predicted = MultinomialNB_emd.predict(X_official_test_emd)
accuracy_emd_official = accuracy_score(expected, predicted)
recall_emd_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for Naive Bayes with EMD is:',round(accuracy_emd_official*100,2),'%')
print('Official test data, Recall Rate for Naive Bayes with EMD is:',round(recall_emd_official*100,2),'%')

Official test data, Accuracy Rate for Naive Bayes with EMD is: 91.45 %
Official test data, Recall Rate for Naive Bayes with EMD is: 89.96 %


In [178]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 8.21 %


# Naive Bayes Equal Frequency Discretization

In [42]:
#Naive Bayesian efd    : X_test
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB_efd = MultinomialNB()
MultinomialNB_efd.fit(X_train_efd,y_train)
expected = y_test
predicted = MultinomialNB_efd.predict(X_test_efd)
accuracy_efd = accuracy_score(expected, predicted)
recall_efd = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for Naive Bayes with efd is:',round(accuracy_efd*100,2),'%')
print('Split test data, Recall Rate for Naive Bayes with efd is:',round(recall_efd*100,2),'%')

Split test data, Accuracy Rate for Naive Bayes with efd is: 96.85 %
Split test data, Recall Rate for Naive Bayes with efd is: 96.4 %


In [43]:
#Naive Bayesian efd    : X_official_test
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB_efd = MultinomialNB()
MultinomialNB_efd.fit(X_train_efd,y_train)
expected = y_official_test
predicted = MultinomialNB_efd.predict(X_official_test_efd)
accuracy_efd_official = accuracy_score(expected, predicted)
recall_efd_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for Naive Bayes with efd is:',round(accuracy_efd_official*100,2),'%')
print('Official test data, Recall Rate for Naive Bayes with efd is:',round(recall_efd_official*100,2),'%')

Official test data, Accuracy Rate for Naive Bayes with efd is: 84.28 %
Official test data, Recall Rate for Naive Bayes with efd is: 81.03 %


In [44]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 7.07 %


# Naive Bayes Proportional k-Interval Discretization

In [39]:
#Naive Bayesian PKID    : X_test
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB_pkid = MultinomialNB()
MultinomialNB_pkid.fit(X_train_pkid,y_train)
expected = y_test
predicted = MultinomialNB_pkid.predict(X_test_pkid)
accuracy_pkid = accuracy_score(expected, predicted)
recall_pkid = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for Naive Bayes with pkid is:',round(accuracy_pkid*100,2),'%')
print('Split test data, Recall Rate for Naive Bayes with pkid is:',round(recall_pkid*100,2),'%')

Split test data, Accuracy Rate for Naive Bayes with pkid is: 98.11 %
Split test data, Recall Rate for Naive Bayes with pkid is: 98.27 %


In [40]:
#Naive Bayesian efd    : X_official_test_efd
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB_pkid = MultinomialNB()
MultinomialNB_pkid.fit(X_train_pkid,y_train)
expected = y_official_test
predicted = MultinomialNB_pkid.predict(X_official_test_pkid)
accuracy = accuracy_score(expected, predicted)
accuracy_pkid_official = accuracy_score(expected, predicted)
recall_pkid_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for Naive Bayes with pkid is:',round(accuracy_pkid_official*100,2),'%')
print('Official test data, Recall Rate for Naive Bayes with pkid is:',round(recall_pkid_official*100,2),'%')

Official test data, Accuracy Rate for Naive Bayes with pkid is: 77.77 %
Official test data, Recall Rate for Naive Bayes with pkid is: 72.74 %


In [41]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 3.87 %


# Naive Bayes K-means Discretization

In [179]:
#Naive Bayesian Kmeans   : X_test
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB_kmeans = MultinomialNB()
MultinomialNB_kmeans.fit(X_train_kmeans,y_train)
expected = y_test
predicted = MultinomialNB_kmeans.predict(X_test_kmeans)
accuracy_kmeans = accuracy_score(expected, predicted)
recall_kmeans = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for Naive Bayes with kmeans is:',round(accuracy_kmeans*100,2),'%')
print('Split test data, Recall Rate for Naive Bayes with kmeans is:',round(recall_kmeans*100,2),'%')

Split test data, Accuracy Rate for Naive Bayes with kmeans is: 95.01 %
Split test data, Recall Rate for Naive Bayes with kmeans is: 94.21 %


In [45]:
#Naive Bayesian Kmeans   : X_official_test
from sklearn.naive_bayes import MultinomialNB
# training the model
MultinomialNB_kmeans = MultinomialNB()
MultinomialNB_kmeans.fit(X_train_kmeans,y_train)
expected = y_official_test
predicted = MultinomialNB_kmeans.predict(X_official_test_kmeans)
accuracy_kmeans_official = accuracy_score(expected, predicted)
recall_kmeans_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for Naive Bayes with kmeans is:',round(accuracy_kmeans_official*100,2),'%')
print('Official test data, Recall Rate for Naive Bayes with kmeans is:',round(recall_kmeans_official*100,2),'%')

Official test data, Accuracy Rate for Naive Bayes with kmeans is: 79.0 %
Official test data, Recall Rate for Naive Bayes with kmeans is: 74.38 %


In [181]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 7.1 %


# CART Equal Width Discretization

In [46]:
# CART EWD on test data
from sklearn.tree import DecisionTreeClassifier
clf_ewd = DecisionTreeClassifier()
clf_ewd.fit(X_train_ewd, y_train)
expected = y_test
predicted = clf_ewd.predict(X_test_ewd)
accuracy_cart_ewd = accuracy_score(expected, predicted)
recall_cart_ewd = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for CART with ewd is:',round(accuracy_cart_ewd*100,2),'%')
print('Split test data, Recall Rate for CART with ewd is:',round(recall_cart_ewd*100,2),'%')

Split test data, Accuracy Rate for CART with ewd is: 99.94 %
Split test data, Recall Rate for CART with ewd is: 99.93 %


In [47]:
# CART EWD on official test data
clf_ewd = DecisionTreeClassifier()
clf_ewd.fit(X_train_ewd, y_train)
expected = y_official_test
predicted = clf_ewd.predict(X_official_test_ewd)
accuracy_cart_ewd_official = accuracy_score(expected, predicted)
recall_cart_ewd_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for CART with ewd is:',round(accuracy_cart_ewd_official*100,2),'%')
print('Official test data, Recall Rate for CART with ewd is:',round(recall_cart_ewd_official*100,2),'%')

Official test data, Accuracy Rate for CART with ewd is: 91.91 %
Official test data, Recall Rate for CART with ewd is: 90.08 %


In [49]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 9.81 %


# CART Entropy Minimization Discretization

In [50]:
# CART EMD on train data
from sklearn.tree import DecisionTreeClassifier
clf_emd = DecisionTreeClassifier()
clf_emd.fit(X_train_emd, y_train)
expected = y_test
predicted = clf_emd.predict(X_test_emd)
accuracy_cart_emd = accuracy_score(expected, predicted)
recall_cart_emd = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for CART with emd is:',round(accuracy_cart_emd*100,2),'%')
print('Split test data, Recall Rate for CART with emd is:',round(recall_cart_emd*100,2),'%')

Split test data, Accuracy Rate for CART with emd is: 99.99 %
Split test data, Recall Rate for CART with emd is: 100.0 %


In [51]:
# CART EMD on official test data
from sklearn.tree import DecisionTreeClassifier
clf_emd = DecisionTreeClassifier()
clf_emd.fit(X_train_emd, y_train)
expected = y_official_test
predicted = clf_emd.predict(X_official_test_emd)
accuracy_cart_emd_official = accuracy_score(expected, predicted)
recall_cart_emd_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for CART with emd is:',round(accuracy_cart_emd_official*100,2),'%')
print('Official test data, Recall Rate for CART with emd is:',round(recall_cart_emd_official*100,2),'%')

Official test data, Accuracy Rate for CART with emd is: 93.41 %
Official test data, Recall Rate for CART with emd is: 91.94 %


In [54]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 14.44 %


# CART Equal Frequency Discretization

In [55]:
# CART EFD on test data
clf_efd = DecisionTreeClassifier()
clf_efd.fit(X_train_efd, y_train)
expected = y_test
predicted = clf_efd.predict(X_test_efd)
accuracy_cart_efd = accuracy_score(expected, predicted)
recall_cart_efd = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for CART with efd is:',round(accuracy_cart_efd*100,2),'%')
print('Split test data, Recall Rate for CART with efd is:',round(recall_cart_efd*100,2),'%')

Split test data, Accuracy Rate for CART with efd is: 99.98 %
Split test data, Recall Rate for CART with efd is: 99.99 %


In [56]:
# CART EFD on official test data
clf_efd = DecisionTreeClassifier()
clf_efd.fit(X_train_efd, y_train)
expected = y_official_test
predicted = clf_efd.predict(X_official_test_efd)
accuracy_cart_efd_official = accuracy_score(expected, predicted)
recall_cart_efd_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for CART with efd is:',round(accuracy_cart_efd_official*100,2),'%')
print('Official test data, Recall Rate for CART with efd is:',round(recall_cart_efd_official*100,2),'%')

Official test data, Accuracy Rate for CART with efd is: 92.24 %
Official test data, Recall Rate for CART with efd is: 90.52 %


In [57]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 10.89 %


# CART Proportional k-Interval Discretization

In [58]:
# CARY pkid on test data
clf_pkid = DecisionTreeClassifier()
clf_pkid.fit(X_train_pkid, y_train)
expected = y_test
predicted = clf_pkid.predict(X_test_pkid)
accuracy_cart_pkid = accuracy_score(expected, predicted)
recall_cart_pkid = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for CART with pkid is:',round(accuracy_cart_pkid*100,2),'%')
print('Split test data, Recall Rate for CART with pkid is:',round(recall_cart_pkid*100,2),'%')

Split test data, Accuracy Rate for CART with pkid is: 99.99 %
Split test data, Recall Rate for CART with pkid is: 99.99 %


In [60]:
# CART pkid on official data
clf_pkid = DecisionTreeClassifier()
clf_pkid.fit(X_train_pkid, y_train)
expected = y_official_test
predicted = clf_pkid.predict(X_official_test_pkid)
accuracy_cart_pkid_official = accuracy_score(expected, predicted)
recall_cart_pkid_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for CART with pkid is:',round(accuracy_cart_pkid_official*100,2),'%')
print('Official test data, Recall Rate for CART with pkid is:',round(recall_cart_pkid_official*100,2),'%')

Official test data, Accuracy Rate for CART with pkid is: 77.19 %
Official test data, Recall Rate for CART with pkid is: 71.96 %


In [61]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 6.44 %


# CART  k-means Discretization

In [65]:
# CART kmeans on test data
clf_kmeans = DecisionTreeClassifier()
clf_kmeans.fit(X_train_kmeans, y_train)
expected = y_test
predicted = clf_kmeans.predict(X_test_kmeans)
accuracy_cart_kmeans = accuracy_score(expected, predicted)
recall_cart_kmeans = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for CART with kmeans is:',round(accuracy_cart_kmeans*100,2),'%')
print('Split test data, Recall Rate for CART with kmeans is:',round(recall_cart_kmeans*100,2),'%')

Split test data, Accuracy Rate for CART with kmeans is: 99.91 %
Split test data, Recall Rate for CART with kmeans is: 99.9 %


In [66]:
# CART kmeans on official test data
clf_kmeans = DecisionTreeClassifier()
clf_kmeans.fit(X_train_kmeans, y_train)
expected = y_official_test
predicted = clf_kmeans.predict(X_official_test_kmeans)
accuracy_cart_kmeans_official = accuracy_score(expected, predicted)
recall_cart_kmeans_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for CART with kmeans is:',round(accuracy_cart_kmeans_official*100,2),'%')
print('Official test data, Recall Rate for CART with kmeans is:',round(recall_cart_kmeans_official*100,2),'%')

Official test data, Accuracy Rate for CART with kmeans is: 34.32 %
Official test data, Recall Rate for CART with kmeans is: 25.36 %


In [67]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 12.5 %


# RF Equal Width Discretization

In [70]:
rf_ewd = RandomForestClassifier(random_state = 0, bootstrap = True)
rf_ewd.fit(X_train_ewd, y_train)
expected = y_test
predicted = rf_ewd.predict(X_test_ewd)
accuracy_rf_ewd = accuracy_score(expected, predicted)
recall_rf_ewd = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for rf with ewd is:',round(accuracy_rf_ewd*100,2),'%')
print('Split test data, Recall Rate for rf with ewd is:',round(recall_rf_ewd*100,2),'%')

Split test data, Accuracy Rate for rf with ewd is: 99.94 %
Split test data, Recall Rate for rf with ewd is: 99.93 %


In [71]:
expected = y_official_test
predicted = rf_ewd.predict(X_official_test_ewd)
accuracy_rf_ewd_official = accuracy_score(expected, predicted)
recall_rf_ewd_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for rf with ewd is:',round(accuracy_rf_ewd_official*100,2),'%')
print('Official test data, Recall Rate for rf with ewd is:',round(recall_rf_ewd_official*100,2),'%')

Official test data, Accuracy Rate for rf with ewd is: 91.9 %
Official test data, Recall Rate for rf with ewd is: 90.04 %


In [72]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 9.37 %


# RF Entropy Minimization Discretization

In [73]:
from sklearn.ensemble import RandomForestClassifier
rf_emd = RandomForestClassifier(random_state = 0, bootstrap = True)
rf_emd.fit(X_train_emd, y_train)
expected = y_test
predicted = rf_emd.predict(X_test_emd)
accuracy_rf_emd = accuracy_score(expected, predicted)
recall_rf_emd = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for rf with emd is:',round(accuracy_rf_emd*100,2),'%')
print('Split test data, Recall Rate for rf with emd is:',round(recall_rf_emd*100,2),'%')

Split test data, Accuracy Rate for rf with emd is: 100.0 %
Split test data, Recall Rate for rf with emd is: 100.0 %


In [74]:
expected = y_official_test
predicted = rf_emd.predict(X_official_test_emd)
accuracy_rf_emd_official = accuracy_score(expected, predicted)
recall_rf_emd_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for rf with emd is:',round(accuracy_rf_emd_official*100,2),'%')
print('Official test data, Recall Rate for rf with emd is:',round(recall_rf_emd_official*100,2),'%')

Official test data, Accuracy Rate for rf with emd is: 92.3 %
Official test data, Recall Rate for rf with emd is: 90.55 %


In [75]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 5.96 %


# RF Equal Frequency Discretization

In [76]:
rf_efd = RandomForestClassifier(random_state = 0, bootstrap = True)
rf_efd.fit(X_train_efd, y_train)
expected = y_test
predicted = rf_efd.predict(X_test_efd)
accuracy_rf_efd = accuracy_score(expected, predicted)
recall_rf_efd = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for rf with efd is:',round(accuracy_rf_efd*100,2),'%')
print('Split test data, Recall Rate for rf with efd is:',round(recall_rf_efd*100,2),'%')

Split test data, Accuracy Rate for rf with efd is: 99.98 %
Split test data, Recall Rate for rf with efd is: 99.99 %


In [77]:
expected = y_official_test
predicted = rf_efd.predict(X_official_test_efd)
accuracy_rf_efd_official = accuracy_score(expected, predicted)
recall_rf_efd_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for rf with efd is:',round(accuracy_rf_efd_official*100,2),'%')
print('Official test data, Recall Rate for rf with efd is:',round(recall_rf_efd_official*100,2),'%')

Official test data, Accuracy Rate for rf with efd is: 92.27 %
Official test data, Recall Rate for rf with efd is: 90.5 %


In [78]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 10.27 %


# RF Proportional k-Interval Discretization

In [79]:
from sklearn.ensemble import RandomForestClassifier
rf_pkid = RandomForestClassifier(random_state = 0, bootstrap = True)
rf_pkid.fit(X_train_pkid, y_train)
expected = y_test
predicted = rf_pkid.predict(X_test_pkid)
accuracy_rf_pkid = accuracy_score(expected, predicted)
recall_rf_pkid = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for rf with pkid is:',round(accuracy_rf_pkid*100,2),'%')
print('Split test data, Recall Rate for rf with pkid is:',round(recall_rf_pkid*100,2),'%')

Split test data, Accuracy Rate for rf with pkid is: 99.99 %
Split test data, Recall Rate for rf with pkid is: 99.99 %


In [80]:
expected = y_official_test
predicted = rf_pkid.predict(X_official_test_pkid)
accuracy_rf_pkid_official = accuracy_score(expected, predicted)
recall_rf_pkid_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for rf with pkid is:',round(accuracy_rf_pkid_official*100,2),'%')
print('Official test data, Recall Rate for rf with pkid is:',round(recall_rf_pkid_official*100,2),'%')

Official test data, Accuracy Rate for rf with pkid is: 81.12 %
Official test data, Recall Rate for rf with pkid is: 76.63 %


In [81]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 3.88 %


# RF K-means Discretization

In [82]:
rf_kmeans = RandomForestClassifier(random_state = 0, bootstrap = True)
rf_kmeans.fit(X_train_kmeans, y_train)
expected = y_test
predicted = rf_kmeans.predict(X_test_kmeans)
accuracy_rf_kmeans = accuracy_score(expected, predicted)
recall_rf_kmeans = recall_score(expected,predicted)
print('Split test data, Accuracy Rate for rf with kmeans is:',round(accuracy_rf_kmeans*100,2),'%')
print('Split test data, Recall Rate for rf with kmeans is:',round(recall_rf_kmeans*100,2),'%')

Split test data, Accuracy Rate for rf with kmeans is: 99.91 %
Split test data, Recall Rate for rf with kmeans is: 99.9 %


In [83]:
expected = y_official_test
predicted = rf_kmeans.predict(X_official_test_kmeans)
accuracy_rf_kmeans_official = accuracy_score(expected, predicted)
recall_rf_kmeans_official = recall_score(expected,predicted)
print('Official test data, Accuracy Rate for rf with kmeans is:',round(accuracy_rf_kmeans_official*100,2),'%')
print('Official test data, Recall Rate for rf with kmeans is:',round(recall_rf_kmeans_official*100,2),'%')

Official test data, Accuracy Rate for rf with kmeans is: 81.75 %
Official test data, Recall Rate for rf with kmeans is: 77.42 %


In [84]:
accuracy_for_unknown_cases = accuracy_score(Unknown_cases,pd.Series(predicted)[special_attack_index_list])
print('Unknown attack type detected rate is:',round(accuracy_for_unknown_cases*100,2),'%')

Unknown attack type detected rate is: 12.46 %
