In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [2]:
# attach the column names to the dataset
col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]

In [5]:
# load the dataset to the model
df = pd.read_csv("KDDTrain+_2.csv", header=None, names=col_names)
df_test = pd.read_csv("KDDTest+_2.csv", header=None, names=col_names)

In [6]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [7]:
print(df_test['label'].value_counts())

normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178
portsweep           157
ipsweep             141
httptunnel          133
nmap                 73
pod                  41
buffer_overflow      20
multihop             18
named                17
ps                   15
sendmail             14
xterm                13
rootkit              13
teardrop             12
xlock                 9
land                  7
xsnoop                4
ftp_write             3
perl                  2
phf                   2
udpstorm              2
loadmodule            2
worm                  2
sqlattack             2
imap                  1
Name: label, dtype: int64


In [8]:
# Non-numeric columns in the train dataset
print("Train Dataset:")
for col_name in df.columns:
    if df[col_name].dtypes == 'object' :
        char_col = len(df[col_name].unique())
        print("Feature '{col_name}' has {char_col}".format(col_name=col_name,char_col=char_col))

print("")

# Non-numeric columns in the test dataset
print("Test Dataset:")
for col_name in df_test.columns:
    if df_test[col_name].dtypes == 'object' :
        char_col = len(df_test[col_name].unique())
        print("Feature '{col_name}' has {char_col}".format(col_name=col_name,char_col=char_col))

Train Dataset:
Feature 'protocol_type' has 3
Feature 'service' has 66
Feature 'flag' has 11
Feature 'label' has 22

Test Dataset:
Feature 'protocol_type' has 3
Feature 'service' has 64
Feature 'flag' has 11
Feature 'label' has 38


In [9]:
categorical_columns = ['protocol_type', 'service', 'flag']

# Represent Categorical columns in 2D numpy 
df_categorical_columns = df[categorical_columns]
df_test_categorical_columns = df_test[categorical_columns]
df_categorical_columns.head()

Unnamed: 0,protocol_type,service,flag
0,tcp,ftp_data,SF
1,udp,other,SF
2,tcp,private,S0
3,tcp,http,SF
4,tcp,http,SF


In [10]:
# Dummy columns to perform onehotencoding 

# Train protocol 
cat_protocol = sorted(df.protocol_type.unique())
string1 = 'protocol_type_'
cat_protocol2 = [string1 + x for x in cat_protocol]

# Train service
cat_service = sorted(df.service.unique())
string2 = 'service_'
cat_service2 = [string2 + x for x in cat_service]

# train flag
cat_flag = sorted(df.flag.unique())
string3 = 'flag'
cat_flag2 = [string3 + x for x in cat_flag]

dummy_cols = cat_protocol2 + cat_service2 + cat_flag2 

# Test dummy column
cat_service_test = sorted(df_test.service.unique())
cat_service_test2 = [string2 + x for x in cat_service_test]

testdummy_cols = cat_protocol2 + cat_service_test2 + cat_flag2

print(dummy_cols)

['protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs', 'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_hostnames', 'service_http', 'service_http_443', 'service_http_8001', 'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link', 'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns', 'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u', 'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private', 'service_red_i', 'service_remote_job', 'service_rje', 'service_shell', 'service_smtp', 'service_sql

In [11]:
# onehotencoding to change categorical data to numeric
enc = OneHotEncoder()

# Encoding train set
df_cat_cols = enc.fit_transform(df_categorical_columns)
df_encoded_cols = pd.DataFrame(df_cat_cols.toarray(), columns = dummy_cols)

# Encoding test set
df_test_cat_cols = enc.fit_transform(df_test_categorical_columns)
df_test_encoded_cols = pd.DataFrame(df_test_cat_cols.toarray(), columns = testdummy_cols)

df_encoded_cols.head()

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_auth,service_bgp,service_courier,service_csnet_ns,...,flagREJ,flagRSTO,flagRSTOS0,flagRSTR,flagS0,flagS1,flagS2,flagS3,flagSF,flagSH
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# Shape or dimensions of the dataset
df_test_encoded_cols.shape

(22544, 78)

In [13]:
train_service = df['service'].tolist()
test_service = df_test['service'].tolist()
trainnew = list(set(train_service) - set(test_service))
trainnew_service = [string2 + x for x in trainnew]
trainnew_service

['service_red_i', 'service_urh_i', 'service_http_8001']

In [14]:
for col in trainnew_service:
    df_test_encoded_cols[col] = 0
df_test_encoded_cols.shape

(22544, 81)

In [15]:
# Updating train dataset by replacing old non-numeric column with encoded numeric columns
new_df = df.join(df_encoded_cols)
new_df.drop('protocol_type', axis = 1, inplace = True)
new_df.drop('service', axis = 1, inplace = True)
new_df.drop('flag', axis = 1, inplace =True)

# Updating test dataset
new_df_test = df_test.join(df_test_encoded_cols)
new_df_test.drop('protocol_type', axis = 1, inplace = True)
new_df_test.drop('service', axis = 1, inplace = True)
new_df_test.drop('flag', axis = 1, inplace = True)

print(new_df.shape)
print(new_df_test.shape)

new_df.head()

(25192, 119)
(22544, 120)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flagREJ,flagRSTO,flagRSTOS0,flagRSTR,flagS0,flagS1,flagS2,flagS3,flagSF,flagSH
0,0,491,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,146,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,232,8153,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,199,420,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
# take label column
labeldf=new_df['label']
labeldf_test=new_df_test['label']
# change the label column
newlabeldf=labeldf.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
newlabeldf_test=labeldf_test.replace({ 'normal' : 0, 'neptune' : 1 ,'back': 1, 'land': 1, 'pod': 1, 'smurf': 1, 'teardrop': 1,'mailbomb': 1, 'apache2': 1, 'processtable': 1, 'udpstorm': 1, 'worm': 1,
                           'ipsweep' : 2,'nmap' : 2,'portsweep' : 2,'satan' : 2,'mscan' : 2,'saint' : 2
                           ,'ftp_write': 3,'guess_passwd': 3,'imap': 3,'multihop': 3,'phf': 3,'spy': 3,'warezclient': 3,'warezmaster': 3,'sendmail': 3,'named': 3,'snmpgetattack': 3,'snmpguess': 3,'xlock': 3,'xsnoop': 3,'httptunnel': 3,
                           'buffer_overflow': 4,'loadmodule': 4,'perl': 4,'rootkit': 4,'ps': 4,'sqlattack': 4,'xterm': 4})
# put the new label column back
new_df['label'] = newlabeldf
new_df_test['label'] = newlabeldf_test
print(new_df['label'].head())

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64


In [17]:
# Splitting data such that one part has features and other part has only label in numeric

# Train set
X_Label = new_df.drop('label', 1)
Y_Label = new_df.label

# Test set
X_Label_test = new_df_test.drop('label', 1)
Y = new_df_test.label

X_Label.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,flagREJ,flagRSTO,flagRSTOS0,flagRSTR,flagS0,flagS1,flagS2,flagS3,flagSF,flagSH
0,0,491,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0,146,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0,232,8153,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,199,420,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
Y_Label.head()

0    0
1    0
2    1
3    0
4    0
Name: label, dtype: int64

In [19]:
new_cols = list(X_Label)
new_cols_test = list(X_Label_test)
print(new_cols)

['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp', 'service_IRC', 'service_X11', 'service_Z39_50', 'service_auth', 'service_bgp', 'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard', 'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', '

In [20]:
# Fitting the Recursive Feature Elimination techinque to the dataset for feature selection
clf = RandomForestRegressor(n_estimators = 100, n_jobs = -2, random_state = 0)
rfe_clf = RFE(estimator = clf, n_features_to_select = 9, step = 1)
lab_enc = preprocessing.LabelEncoder()
Y_Label_enc = lab_enc.fit_transform(Y_Label)
rfe_clf.fit(X_Label, Y_Label_enc)
X_Label_rfe = rfe_clf.transform(X_Label)
true = rfe_clf.support_
rfe_label = [i for i, x in enumerate(true) if x]
rfe_cols = list(new_cols[i] for i in rfe_label)

In [21]:
print(X_Label_rfe)

[[4.91e+02 0.00e+00 0.00e+00 ... 1.70e-01 0.00e+00 1.00e+00]
 [1.46e+02 0.00e+00 0.00e+00 ... 8.80e-01 0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00 0.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 ...
 [0.00e+00 0.00e+00 0.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00 0.00e+00 ... 0.00e+00 0.00e+00 0.00e+00]
 [0.00e+00 0.00e+00 0.00e+00 ... 1.00e-02 0.00e+00 0.00e+00]]


In [22]:
rfe_clf.ranking_

array([  4,   1,   1,  44,  28,  51,   1,  59,   9,  27,  37,  43,  12,
        21,  55,  26,  77,  76,  18,   1,  31,  24,  41,  39,  45,  29,
        15,  53,  10,   5,   1,   1,   1,   3,  11,  13,   6,  25,  17,
        14,  42, 108,  35, 109,  38, 110, 106,  91,  93,  90,  96,  83,
        32,  84,  34,   1,  98, 100,  20,  36,   1,  64,  80,   2,  81,
        86,  16,  74,  61,  75,  73,  68,  71,  67,  87,  88,  63,  65,
        72,  89,  58,  66,  23,  19, 103,  56, 101,   7, 102, 104,  78,
        97,  47,  95,  85,  99,  60,  82,  22,  40,  50,  69,  52,  92,
        94,  79, 107,  62,  48,  54,  30,   8,  49,  33,  57,  70,  46,
       105])

In [23]:
print('Selected Features :', rfe_cols)

Selected Features : ['src_bytes', 'dst_bytes', 'hot', 'count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'service_ecr_i', 'service_ftp_data']


In [24]:
mod = RandomForestRegressor(n_estimators = 100, n_jobs = -2, random_state = 0)
mod.fit(X_Label_rfe, Y_Label_enc)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-2, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [25]:
# Replace test datasets according to the selected datasets
x = X_Label_test.iloc[:, rfe_label]

In [26]:
# Predict the selected dataset with classifier
Y_Label_pred = mod.predict(x)

# Creating a Confusion Matrix
pd.crosstab(Y, Y_Label_pred, rownames = ['Actual attacks'], colnames = ['Predicted attacks'])

Predicted attacks,0.000000,0.010000,0.020000,0.022862,0.030000,0.040000,0.050000,0.060000,0.070000,0.080000,...,2.450000,2.480000,2.640000,2.680000,3.080000,3.200000,3.560000,3.700000,3.840000,3.850000
Actual attacks,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,8891,20,24,1,34,63,4,19,3,269,...,0,0,1,0,0,0,0,0,0,0
1,626,14,31,4,5,134,25,3,2,17,...,0,0,0,0,0,0,0,0,0,0
2,5,1,2,1,5,4,6,5,3,17,...,0,0,0,0,0,0,0,0,0,0
3,1426,130,234,1,68,4,15,286,43,17,...,3,4,0,1,1,1,1,0,0,0
4,18,1,0,0,3,2,1,2,0,3,...,0,0,0,0,0,0,0,2,1,1


In [6]:
import pickle


In [30]:
model = RandomForestRegressor(n_estimators = 100, n_jobs = -2)
model.fit(x, Y)
pickle.dump(model, open('eye_model.pkl', 'wb'))

In [31]:
load_model = pickle.load(open('eye_model.pkl', 'rb'))
score = load_model.score(x, Y)
print(score)

0.9740163543103803


In [35]:
import socket, subprocess

In [36]:
cmd = "/root/build-files/src/kdd99extractor"

In [37]:
process = subprocess.Popen(cmd, stdout = subprocess.PIPE, stderr = subprocess.STDOUT)

In [38]:
load_model = pickle.load(open('eye_model.pkl', 'rb'))
for line in iter(process.stdout.readline, b""):
    line = line.strip()
    columns = str(line)
    columns = columns.replace("b'", "")
    columns = columns.replace("'", "")
    lis = columns[0:113]
    test_list  = [float(x) for x in lis.split(',')]
    index_list = [4, 5, 8, 9, 20, 21, 22, 2, 2]  
    res_list = [test_list[i] for i in index_list] 
    fin = [np.array(res_list)]
    test = load_model.predict(fin)
    print(test)
    if test < 1:
        print("normal")
    elif test < 2:
        print("dos")
    elif test < 3:
        print("port")
    elif test < 4:
        print("nep")
    else:
        print("work")

[0.18]
normal
[1.114]
dos
[1.06]
dos
[0.18]
normal
[0.69]
normal
[0.62457143]
normal
[2.54]
port
[1.25]
dos
[0.69]
normal
[2.26]
port
[0.676]
normal
[0.33]
normal
[0.41]
normal
[0.69]
normal
[0.63]
normal
[0.96]
normal
[0.68]
normal
[0.54]
normal
[0.91]
normal
[0.57]
normal
[0.9]
normal
[0.92]
normal
[0.15]
normal
[0.19]
normal
[0.19]
normal
[0.64]
normal
[0.58]
normal
[0.05]
normal
[0.03]
normal
[0.55]
normal
[0.03]
normal
[0.55]
normal
[0.03]
normal
[0.75]
normal
[0.56]
normal
[0.03]
normal
[0.03]
normal
[0.45]
normal
[0.65]
normal
[0.03]
normal
[0.51]
normal
[0.03]
normal
[0.57]
normal
[0.06]
normal
[0.83]
normal
[0.07]
normal
[0.62]
normal
[0.86]
normal
[0.06]
normal
[0.74]
normal
[0.75]
normal
[0.07]
normal
[0.87]
normal
[0.87]
normal
[0.06]
normal
[1.08]
dos
[0.06]
normal
[1.09]
dos
[1.12]
dos
[0.05]
normal
[0.97]
normal
[0.96]
normal
[0.97]
normal
[0.23]
normal
[0.97]
normal
[1.]
dos
[0.24]
normal
[1.]
dos
[1.]
dos
[0.21]
normal
[0.96]
normal
[0.16]
normal
[0.97]
normal
[0.96]
n

KeyboardInterrupt: 