In [144]:
import math
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf

OUTDIR = './TechM'
shutil.rmtree(OUTDIR, ignore_errors = True) # start fresh each time

In [145]:
print(tf.__version__)

1.8.0


In [146]:
%bash
gsutil ls -l gs://my-project-98645-machine-hack

   3441513  2019-02-27T10:01:17Z  gs://my-project-98645-machine-hack/Test.txt
  19109423  2019-02-27T10:01:16Z  gs://my-project-98645-machine-hack/Train.txt
                                 gs://my-project-98645-machine-hack/datalab-backups/
TOTAL: 2 objects, 22550936 bytes (21.51 MiB)


In [147]:
%bash
gsutil cp gs://my-project-98645-machine-hack/Test.txt Test.txt
gsutil cp gs://my-project-98645-machine-hack/Train.txt Train.txt

Copying gs://my-project-98645-machine-hack/Test.txt...
/ [0 files][    0.0 B/  3.3 MiB]                                                -- [1 files][  3.3 MiB/  3.3 MiB]                                                
Operation completed over 1 objects/3.3 MiB.                                      
Copying gs://my-project-98645-machine-hack/Train.txt...
/ [0 files][    0.0 B/ 18.2 MiB]                                                -- [0 files][ 16.2 MiB/ 18.2 MiB]                                                - [1 files][ 18.2 MiB/ 18.2 MiB]                                                \
Operation completed over 1 objects/18.2 MiB.                                     


In [148]:
# Load the datasets

train_df = pd.read_csv('Train.txt')
test_df  = pd.read_csv('Test.txt')

In [149]:
# make copy of the datasets

train = train_df.copy()
test  = test_df.copy()

### Exploratory Data Analysis

In [150]:
# Display the dimensions of the data sets

print("train data set dimensions:",train.shape)
print("test data set dimensions:",test.shape)

('train data set dimensions:', (125972, 43))
('test data set dimensions:', (22543, 43))


In [151]:
# Display the header of the train data set

train.head()

Unnamed: 0,0,tcp,ftp_data,SF,491,0.1,0.2,0.3,0.4,0.5,...,0.17.1,0.03,0.17.2,0.00.6,0.00.7,0.00.8,0.05,0.00.9,normal,20
0,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21


In [152]:
# Display the header of the test data set

test.head()

Unnamed: 0,0,tcp,private,REJ,0.1,0.2,0.3,0.4,0.5,0.6,...,0.04.1,0.06.1,0.00.3,0.00.4,0.00.5,0.00.6,1.00.2,1.00.3,neptune,21
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
2,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15
3,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan,11
4,0,tcp,http,SF,267,14515,0,0,0,0,...,1.0,0.0,0.01,0.03,0.01,0.0,0.0,0.0,normal,21


In [153]:
train.columns = ["duration",
"protocol_type",
"service",
"flag",
"src_bytes",
"dst_bytes",
"land",
"wrong_fragment",
"urgent",
"hot",
"num_failed_logins",
"logged_in",
"num_compromised",
"root_shell",
"su_attempted",
"num_root",
"num_file_creations",
"num_shells",
"num_access_files",
"num_outbound_cmds",
"is_host_login",
"is_guest_login",
"count",
"srv_count",
"serror_rate", 
"srv_serror_rate",
"rerror_rate",
"srv_rerror_rate",
"same_srv_rate", 
"diff_srv_rate", 
"srv_diff_host_rate",
"dst_host_count",
"dst_host_srv_count",
"dst_host_same_srv_rate",
"dst_host_diff_srv_rate",
"dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate",
"dst_host_serror_rate",
"dst_host_srv_serror_rate",
"dst_host_rerror_rate",
"dst_host_srv_rerror_rate",
"attack", 
"last_flag"]

In [154]:
test.columns = ["duration",
"protocol_type",
"service",
"flag",
"src_bytes",
"dst_bytes",
"land",
"wrong_fragment",
"urgent",
"hot",
"num_failed_logins",
"logged_in",
"num_compromised",
"root_shell",
"su_attempted",
"num_root",
"num_file_creations",
"num_shells",
"num_access_files",
"num_outbound_cmds",
"is_host_login",
"is_guest_login",
"count",
"srv_count",
"serror_rate", 
"srv_serror_rate",
"rerror_rate",
"srv_rerror_rate",
"same_srv_rate", 
"diff_srv_rate", 
"srv_diff_host_rate",
"dst_host_count",
"dst_host_srv_count",
"dst_host_same_srv_rate",
"dst_host_diff_srv_rate",
"dst_host_same_src_port_rate",
"dst_host_srv_diff_host_rate",
"dst_host_serror_rate",
"dst_host_srv_serror_rate",
"dst_host_rerror_rate",
"dst_host_srv_rerror_rate",
"attack", 
"last_flag"]

In [155]:
# Display the information of the TRAIN data set

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125972 entries, 0 to 125971
Data columns (total 43 columns):
duration                       125972 non-null int64
protocol_type                  125972 non-null object
service                        125972 non-null object
flag                           125972 non-null object
src_bytes                      125972 non-null int64
dst_bytes                      125972 non-null int64
land                           125972 non-null int64
wrong_fragment                 125972 non-null int64
urgent                         125972 non-null int64
hot                            125972 non-null int64
num_failed_logins              125972 non-null int64
logged_in                      125972 non-null int64
num_compromised                125972 non-null int64
root_shell                     125972 non-null int64
su_attempted                   125972 non-null int64
num_root                       125972 non-null int64
num_file_creations             125972 

In [156]:
# Display the information of the TEST data set

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22543 entries, 0 to 22542
Data columns (total 43 columns):
duration                       22543 non-null int64
protocol_type                  22543 non-null object
service                        22543 non-null object
flag                           22543 non-null object
src_bytes                      22543 non-null int64
dst_bytes                      22543 non-null int64
land                           22543 non-null int64
wrong_fragment                 22543 non-null int64
urgent                         22543 non-null int64
hot                            22543 non-null int64
num_failed_logins              22543 non-null int64
logged_in                      22543 non-null int64
num_compromised                22543 non-null int64
root_shell                     22543 non-null int64
su_attempted                   22543 non-null int64
num_root                       22543 non-null int64
num_file_creations             22543 non-null int64
num_

In [157]:
# Display the header of the train data set

train.tail()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
125967,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.06,0.0,0.0,1.0,1.0,0.0,0.0,neptune,20
125968,8,udp,private,SF,105,145,0,0,0,0,...,0.96,0.01,0.01,0.0,0.0,0.0,0.0,0.0,normal,21
125969,0,tcp,smtp,SF,2231,384,0,0,0,0,...,0.12,0.06,0.0,0.0,0.72,0.0,0.01,0.0,normal,18
125970,0,tcp,klogin,S0,0,0,0,0,0,0,...,0.03,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,20
125971,0,tcp,ftp_data,SF,151,0,0,0,0,0,...,0.3,0.03,0.3,0.0,0.0,0.0,0.0,0.0,normal,21


In [158]:
# Display the header of the TEST data set

test.tail()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
22538,0,tcp,smtp,SF,794,333,0,0,0,0,...,0.72,0.06,0.01,0.01,0.01,0.0,0.0,0.0,normal,21
22539,0,tcp,http,SF,317,938,0,0,0,0,...,1.0,0.0,0.01,0.01,0.01,0.0,0.0,0.0,normal,21
22540,0,tcp,http,SF,54540,8314,0,0,0,2,...,1.0,0.0,0.0,0.0,0.0,0.0,0.07,0.07,back,15
22541,0,udp,domain_u,SF,42,42,0,0,0,0,...,0.99,0.01,0.0,0.0,0.0,0.0,0.0,0.0,normal,21
22542,0,tcp,sunrpc,REJ,0,0,0,0,0,0,...,0.08,0.03,0.0,0.0,0.0,0.0,0.44,1.0,mscan,14


In [159]:
# [TRAIN DATA SET] seperate out the Categorical and Numerical features

numerical_feature   = train.dtypes[train.dtypes!= 'object'].index
categorical_feature = train.dtypes[train.dtypes== 'object'].index

print ("There are {} numeric and {} categorical columns in train data"
       .format(numerical_feature.shape[0],categorical_feature.shape[0]))

There are 39 numeric and 4 categorical columns in train data


In [160]:
numerical_feature

Index([u'duration', u'src_bytes', u'dst_bytes', u'land', u'wrong_fragment',
       u'urgent', u'hot', u'num_failed_logins', u'logged_in',
       u'num_compromised', u'root_shell', u'su_attempted', u'num_root',
       u'num_file_creations', u'num_shells', u'num_access_files',
       u'num_outbound_cmds', u'is_host_login', u'is_guest_login', u'count',
       u'srv_count', u'serror_rate', u'srv_serror_rate', u'rerror_rate',
       u'srv_rerror_rate', u'same_srv_rate', u'diff_srv_rate',
       u'srv_diff_host_rate', u'dst_host_count', u'dst_host_srv_count',
       u'dst_host_same_srv_rate', u'dst_host_diff_srv_rate',
       u'dst_host_same_src_port_rate', u'dst_host_srv_diff_host_rate',
       u'dst_host_serror_rate', u'dst_host_srv_serror_rate',
       u'dst_host_rerror_rate', u'dst_host_srv_rerror_rate', u'last_flag'],
      dtype='object')

In [161]:
categorical_feature

Index([u'protocol_type', u'service', u'flag', u'attack'], dtype='object')

In [162]:
train["attack"].unique()

array(['normal', 'neptune', 'warezclient', 'ipsweep', 'portsweep',
       'teardrop', 'nmap', 'satan', 'smurf', 'pod', 'back',
       'guess_passwd', 'ftp_write', 'multihop', 'rootkit',
       'buffer_overflow', 'imap', 'warezmaster', 'phf', 'land',
       'loadmodule', 'spy', 'perl'], dtype=object)

In [163]:
train['attack'].value_counts()

normal             67342
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: attack, dtype: int64

In [164]:
train.describe()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,last_flag
count,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,...,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0,125972.0
mean,287.146929,45567.1,19779.27,0.000198,0.022688,0.000111,0.204411,0.001222,0.395739,0.279253,...,115.653725,0.521244,0.082952,0.148379,0.032543,0.284455,0.278487,0.118832,0.120241,19.504056
std,2604.525522,5870354.0,4021285.0,0.014086,0.253531,0.014366,2.149977,0.045239,0.489011,23.942137,...,110.702886,0.44895,0.188922,0.308998,0.112564,0.444785,0.44567,0.306559,0.31946,2.291512
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0
50%,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,63.0,0.51,0.02,0.0,0.0,0.0,0.0,0.0,0.0,20.0
75%,0.0,276.0,516.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,255.0,1.0,0.07,0.06,0.02,1.0,1.0,0.0,0.0,21.0
max,42908.0,1379964000.0,1309937000.0,1.0,3.0,3.0,77.0,5.0,1.0,7479.0,...,255.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,21.0


In [165]:
d ={'normal':['normal'],
'dos':['neptune','smurf','back','teardrop','pod','land','apache2','udpstorm','processtable','worm'],
'probe':['satan','ipsweep','portsweep','nmap','mscan','saint'],
'r2l':['guess_passwd','ftp_write','imap','phf','multihop','warezmaster','warezclient','spy','xlock','xsnoop','snmpguess','snmpgetattak','httptunnel','sendmail','named'],
'u2r':['buffer_overflow','rootkit','loadmodule','perl','sqlattack','xterm','ps']}

In [166]:
d1 = {k: oldk for oldk, oldv in d.items() for k in oldv}

In [167]:
print (d1)

{'guess_passwd': 'r2l', 'processtable': 'dos', 'named': 'r2l', 'ftp_write': 'r2l', 'spy': 'r2l', 'nmap': 'probe', 'back': 'dos', 'multihop': 'r2l', 'rootkit': 'u2r', 'udpstorm': 'dos', 'snmpguess': 'r2l', 'pod': 'dos', 'perl': 'u2r', 'portsweep': 'probe', 'sqlattack': 'u2r', 'httptunnel': 'r2l', 'sendmail': 'r2l', 'normal': 'normal', 'apache2': 'dos', 'ipsweep': 'probe', 'teardrop': 'dos', 'satan': 'probe', 'loadmodule': 'u2r', 'buffer_overflow': 'u2r', 'mscan': 'probe', 'snmpgetattak': 'r2l', 'saint': 'probe', 'ps': 'u2r', 'xterm': 'u2r', 'phf': 'r2l', 'warezmaster': 'r2l', 'imap': 'r2l', 'warezclient': 'r2l', 'land': 'dos', 'neptune': 'dos', 'worm': 'dos', 'xlock': 'r2l', 'smurf': 'dos', 'xsnoop': 'r2l'}


In [168]:
# [TRAINING DATA SET] Target Label - attack value converted to 5 level values

train['attack'] = train['attack'].map(d1)

In [169]:
# [TEST DATA SET] Target Label - attack value converted to 5 level values

test['attack'] = test['attack'].map(d1)

In [170]:
train['attack'].value_counts()

normal    67342
dos       45927
probe     11656
r2l         995
u2r          52
Name: attack, dtype: int64

In [171]:
test['attack'].value_counts()

normal    9711
dos       7166
r2l       2707
probe     2421
u2r         67
Name: attack, dtype: int64

In [172]:
train['attack']=train['attack'].apply(lambda x: 'normal' if x=='normal' else 'attack')

In [173]:
train['attack'].value_counts()

normal    67342
attack    58630
Name: attack, dtype: int64

In [174]:
train['attack']=train['attack'].apply(lambda x: 0 if x=='normal' else 1)

### Missing values

In [175]:
# [TRAIN DataSet] Missing values - checking

train.isnull().sum().sort_values(ascending=False)

last_flag                      0
num_failed_logins              0
num_access_files               0
num_shells                     0
num_file_creations             0
num_root                       0
su_attempted                   0
root_shell                     0
num_compromised                0
logged_in                      0
hot                            0
is_host_login                  0
urgent                         0
wrong_fragment                 0
land                           0
dst_bytes                      0
src_bytes                      0
flag                           0
service                        0
protocol_type                  0
num_outbound_cmds              0
is_guest_login                 0
attack                         0
dst_host_srv_count             0
dst_host_srv_rerror_rate       0
dst_host_rerror_rate           0
dst_host_srv_serror_rate       0
dst_host_serror_rate           0
dst_host_srv_diff_host_rate    0
dst_host_same_src_port_rate    0
dst_host_d

In [176]:
# [TEST DataSet] Missing values - checking

test.isnull().sum().sort_values(ascending=False)

attack                         471
is_host_login                    0
num_outbound_cmds                0
num_access_files                 0
num_shells                       0
num_file_creations               0
num_root                         0
su_attempted                     0
root_shell                       0
num_compromised                  0
logged_in                        0
last_flag                        0
hot                              0
urgent                           0
wrong_fragment                   0
land                             0
dst_bytes                        0
src_bytes                        0
flag                             0
service                          0
protocol_type                    0
num_failed_logins                0
is_guest_login                   0
count                            0
srv_count                        0
dst_host_srv_rerror_rate         0
dst_host_rerror_rate             0
dst_host_srv_serror_rate         0
dst_host_serror_rate

### Train DataSet categorical features verification

In [177]:
#[TRAIN DataSet] Display of the categorical features
categorical_feature.tolist()

['protocol_type', 'service', 'flag', 'attack']

In [178]:
train.protocol_type.value_counts()

tcp     102688
udp      14993
icmp      8291
Name: protocol_type, dtype: int64

In [179]:
train.service.value_counts()

http           40338
private        21853
domain_u        9043
smtp            7313
ftp_data        6859
eco_i           4586
other           4359
ecr_i           3077
telnet          2353
finger          1767
ftp             1754
auth             955
Z39_50           862
uucp             780
courier          734
bgp              710
whois            693
uucp_path        689
iso_tsap         687
time             654
imap4            647
nnsp             630
vmnet            617
urp_i            602
domain           569
ctf              563
csnet_ns         545
supdup           544
discard          538
http_443         530
               ...  
klogin           433
login            429
ldap             410
netbios_dgm      405
sunrpc           381
netbios_ssn      362
netstat          360
netbios_ns       347
ssh              311
kshell           299
nntp             296
pop_3            264
sql_net          245
IRC              187
ntp_u            168
rje               86
remote_job   

In [180]:
train.flag.value_counts()

SF        74944
S0        34851
REJ       11233
RSTR       2421
RSTO       1562
S1          365
SH          271
S2          127
RSTOS0      103
S3           49
OTH          46
Name: flag, dtype: int64

#### Test data set - Categorical features verification

In [181]:
test.protocol_type.value_counts()

tcp     18879
udp      2621
icmp     1043
Name: protocol_type, dtype: int64

In [182]:
test.service.value_counts()

http           7853
private        4773
telnet         1626
pop_3          1019
smtp            934
domain_u        894
ftp_data        851
other           838
ecr_i           752
ftp             692
imap4           306
eco_i           262
sunrpc          159
finger          136
auth             67
domain           51
uucp             50
iso_tsap         48
uucp_path        46
bgp              46
Z39_50           45
vmnet            43
nnsp             42
link             41
ctf              41
courier          40
whois            40
echo             37
name             37
netbios_ns       36
               ... 
efs              33
mtp              32
systat           32
login            29
daytime          28
exec             27
supdup           27
netstat          26
discard          26
ssh              26
netbios_dgm      25
kshell           24
urp_i            23
hostnames        23
nntp             21
klogin           21
ldap             19
sql_net          18
pm_dump          16


In [183]:
test.flag.value_counts()

SF        14875
REJ        3849
S0         2013
RSTO        773
RSTR        669
S3          249
SH           73
S1           21
S2           15
OTH           4
RSTOS0        2
Name: flag, dtype: int64

#### Feature Engineering

In [184]:
#Src_bytes
#Dst_bytes

# new feature - total_bytes

#[TRAIN DataSet]
train['total_bytes'] = train['src_bytes']+train['dst_bytes']

# [TEST DataSet]
test['total_bytes'] = test['src_bytes']+test['dst_bytes']

In [185]:
train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag,total_bytes
0,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0,15,146
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1,19,0
2,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0,21,8385
3,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,21,619
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.0,0.0,0.0,0.0,1.0,1.0,1,21,0


In [186]:
test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag,total_bytes
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.06,0.0,0.0,0.0,0.0,1.0,1.0,dos,21,0
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21,12983
2,0,icmp,eco_i,SF,20,0,0,0,0,0,...,0.0,1.0,0.28,0.0,0.0,0.0,0.0,probe,15,20
3,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.17,0.03,0.02,0.0,0.0,0.83,0.71,probe,11,15
4,0,tcp,http,SF,267,14515,0,0,0,0,...,0.0,0.01,0.03,0.01,0.0,0.0,0.0,normal,21,14782


##### Scaling of Numeric features

In [187]:
# Numeric features of Dataset without the target feature
numeric=['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate', 'last_flag']

In [188]:
#[TRAIN and TEST Dataset] Normalizing Numerical Features 


#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()


#train[numeric]=scaler.fit_transform(train[numeric])
#test[numeric] = scaler.fit_transform(test[numeric])

In [189]:
train.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag,total_bytes
0,0,udp,other,SF,146,0,0,0,0,0,...,0.6,0.88,0.0,0.0,0.0,0.0,0.0,0,15,146
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.05,0.0,0.0,1.0,1.0,0.0,0.0,1,19,0
2,0,tcp,http,SF,232,8153,0,0,0,0,...,0.0,0.03,0.04,0.03,0.01,0.0,0.01,0,21,8385
3,0,tcp,http,SF,199,420,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,21,619
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.0,0.0,0.0,0.0,1.0,1.0,1,21,0


In [190]:
test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag,total_bytes
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.06,0.0,0.0,0.0,0.0,1.0,1.0,dos,21,0
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21,12983
2,0,icmp,eco_i,SF,20,0,0,0,0,0,...,0.0,1.0,0.28,0.0,0.0,0.0,0.0,probe,15,20
3,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.17,0.03,0.02,0.0,0.0,0.83,0.71,probe,11,15
4,0,tcp,http,SF,267,14515,0,0,0,0,...,0.0,0.01,0.03,0.01,0.0,0.0,0.0,normal,21,14782


In [191]:
# Split the data into features and target label
target   = train['attack']
features = train.drop('attack', axis = 1)

In [192]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'target' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    target, 
                                                    test_size = 0.20, 
                                                    random_state = 12345)

# Show the results of the split
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 100777 samples.
Testing set has 25195 samples.


In [193]:
duration                    = tf.feature_column.numeric_column('duration')
src_bytes                   = tf.feature_column.numeric_column('src_bytes')
dst_bytes                   = tf.feature_column.numeric_column('dst_bytes')
land                        = tf.feature_column.numeric_column('land')
wrong_fragment              = tf.feature_column.numeric_column('wrong_fragment')
urgent                      = tf.feature_column.numeric_column('urgent')
hot                         = tf.feature_column.numeric_column('hot')
num_failed_logins           = tf.feature_column.numeric_column('num_failed_logins')
logged_in                   = tf.feature_column.numeric_column('logged_in')
num_compromised             = tf.feature_column.numeric_column('num_compromised')
root_shell                  = tf.feature_column.numeric_column('root_shell')
su_attempted                = tf.feature_column.numeric_column('su_attempted')
num_root                    = tf.feature_column.numeric_column('num_root')
num_file_creations          = tf.feature_column.numeric_column('num_file_creations')
num_shells                  = tf.feature_column.numeric_column('num_shells')
num_access_files            = tf.feature_column.numeric_column('num_access_files')
num_outbound_cmds           = tf.feature_column.numeric_column('num_outbound_cmds')
is_host_login               = tf.feature_column.numeric_column('is_host_login')
is_guest_login              = tf.feature_column.numeric_column('is_guest_login')
count                       = tf.feature_column.numeric_column('count')
srv_count                   = tf.feature_column.numeric_column('srv_count')
serror_rate                 = tf.feature_column.numeric_column('serror_rate')
srv_serror_rate             = tf.feature_column.numeric_column('srv_serror_rate')
rerror_rate                 = tf.feature_column.numeric_column('rerror_rate')
srv_rerror_rate             = tf.feature_column.numeric_column('srv_rerror_rate')
same_srv_rate               = tf.feature_column.numeric_column('same_srv_rate')
diff_srv_rate               = tf.feature_column.numeric_column('diff_srv_rate')
srv_diff_host_rate          = tf.feature_column.numeric_column('srv_diff_host_rate')
dst_host_count              = tf.feature_column.numeric_column('dst_host_count')
dst_host_srv_count          = tf.feature_column.numeric_column('dst_host_srv_count')
dst_host_same_srv_rate      = tf.feature_column.numeric_column('dst_host_same_srv_rate')
dst_host_diff_srv_rate      = tf.feature_column.numeric_column('dst_host_diff_srv_rate')
dst_host_same_src_port_rate = tf.feature_column.numeric_column('dst_host_same_src_port_rate')
dst_host_srv_diff_host_rate = tf.feature_column.numeric_column('dst_host_srv_diff_host_rate')
dst_host_serror_rate        = tf.feature_column.numeric_column('dst_host_serror_rate')
dst_host_srv_serror_rate    = tf.feature_column.numeric_column('dst_host_srv_serror_rate')
dst_host_rerror_rate        = tf.feature_column.numeric_column('dst_host_rerror_rate')
dst_host_srv_rerror_rate    = tf.feature_column.numeric_column('dst_host_srv_rerror_rate')
last_flag                   = tf.feature_column.numeric_column('last_flag')

In [194]:
train.service.unique()

array(['other', 'private', 'http', 'remote_job', 'ftp_data', 'name',
       'netbios_ns', 'eco_i', 'mtp', 'telnet', 'finger', 'domain_u',
       'supdup', 'uucp_path', 'Z39_50', 'smtp', 'csnet_ns', 'uucp',
       'netbios_dgm', 'urp_i', 'auth', 'domain', 'ftp', 'bgp', 'ldap',
       'ecr_i', 'gopher', 'vmnet', 'systat', 'http_443', 'efs', 'whois',
       'imap4', 'iso_tsap', 'echo', 'klogin', 'link', 'sunrpc', 'login',
       'kshell', 'sql_net', 'time', 'hostnames', 'exec', 'ntp_u',
       'discard', 'nntp', 'courier', 'ctf', 'ssh', 'daytime', 'shell',
       'netstat', 'pop_3', 'nnsp', 'IRC', 'pop_2', 'printer', 'tim_i',
       'pm_dump', 'red_i', 'netbios_ssn', 'rje', 'X11', 'urh_i',
       'http_8001', 'aol', 'http_2784', 'tftp_u', 'harvest'], dtype=object)

In [195]:
test.service.unique()

array(['private', 'ftp_data', 'eco_i', 'telnet', 'http', 'smtp', 'ftp',
       'ldap', 'pop_3', 'courier', 'discard', 'ecr_i', 'imap4',
       'domain_u', 'mtp', 'systat', 'iso_tsap', 'other', 'csnet_ns',
       'finger', 'uucp', 'whois', 'netbios_ns', 'link', 'Z39_50',
       'sunrpc', 'auth', 'netbios_dgm', 'uucp_path', 'vmnet', 'domain',
       'name', 'pop_2', 'http_443', 'urp_i', 'login', 'gopher', 'exec',
       'time', 'remote_job', 'ssh', 'kshell', 'sql_net', 'shell',
       'hostnames', 'echo', 'daytime', 'pm_dump', 'IRC', 'netstat', 'ctf',
       'nntp', 'netbios_ssn', 'tim_i', 'supdup', 'bgp', 'nnsp', 'rje',
       'printer', 'efs', 'X11', 'ntp_u', 'klogin', 'tftp_u'], dtype=object)

In [196]:
protocol_type = tf.feature_column.categorical_column_with_vocabulary_list("protocol_type", ["tcp", "udp", "icmp"])
flag = tf.feature_column.categorical_column_with_hash_bucket(key = "flag",hash_bucket_size = 5000)
service = tf.feature_column.categorical_column_with_hash_bucket(key = "service",hash_bucket_size = 5000)

In [197]:
bucketized_protocol_type = tf.feature_column.embedding_column(categorical_column=protocol_type,dimension=3)
bucketized_flag          = tf.feature_column.embedding_column(categorical_column=flag,dimension=500)
bucketized_service       = tf.feature_column.embedding_column(categorical_column=service,dimension=500)

In [198]:
bucketized_duration                    = tf.feature_column.bucketized_column(duration,boundaries=[100,1000,10000,25000,100000])
bucketized_src_bytes                   = tf.feature_column.bucketized_column(src_bytes,boundaries=[100,1000,5000,10000])
bucketized_dst_bytes                   = tf.feature_column.bucketized_column(dst_bytes,boundaries=[100,1000,5000,10000])
bucketized_land                        = tf.feature_column.bucketized_column(land,boundaries=[0,1])
bucketized_wrong_fragment              = tf.feature_column.bucketized_column(wrong_fragment,boundaries=[0,2,3])
bucketized_urgent                      = tf.feature_column.bucketized_column(urgent,boundaries=[0,2,3])
bucketized_hot                         = tf.feature_column.bucketized_column(hot,boundaries=[25,50,100])
bucketized_num_failed_logins           = tf.feature_column.bucketized_column(num_failed_logins,boundaries=[0,3,5])
bucketized_logged_in                   = tf.feature_column.bucketized_column(logged_in,boundaries=[0,2,4])
bucketized_num_compromised             = tf.feature_column.bucketized_column(num_compromised,boundaries=[1000,5000])
bucketized_root_shell                  = tf.feature_column.bucketized_column(root_shell,boundaries=[0,1])
bucketized_su_attempted                = tf.feature_column.bucketized_column(su_attempted,boundaries=[0,1])
bucketized_num_root                    = tf.feature_column.bucketized_column(num_root,boundaries=[1000,10000])
bucketized_num_file_creations          = tf.feature_column.bucketized_column(num_file_creations,boundaries=[25,50])
bucketized_num_shells                  = tf.feature_column.bucketized_column(num_shells,boundaries=[0,1])
bucketized_num_access_files            = tf.feature_column.bucketized_column(num_access_files,boundaries=[3,6])
bucketized_num_outbound_cmds           = tf.feature_column.bucketized_column(num_outbound_cmds,boundaries=[0,1])
bucketized_is_host_login               = tf.feature_column.bucketized_column(is_host_login,boundaries=[0,1])
bucketized_is_guest_login              = tf.feature_column.bucketized_column(is_guest_login,boundaries=[0,1])
bucketized_count                       = tf.feature_column.bucketized_column(count,boundaries=[25,50,100,150])
bucketized_srv_count                   = tf.feature_column.bucketized_column(srv_count,boundaries=[100,300,500])
bucketized_serror_rate                 = tf.feature_column.bucketized_column(serror_rate,boundaries=[0,1])
bucketized_srv_serror_rate             = tf.feature_column.bucketized_column(srv_serror_rate,boundaries=[0,1])
bucketized_rerror_rate                 = tf.feature_column.bucketized_column(rerror_rate,boundaries=[0,1])
bucketized_srv_rerror_rate             = tf.feature_column.bucketized_column(srv_rerror_rate,boundaries=[0,1])
bucketized_same_srv_rate               = tf.feature_column.bucketized_column(same_srv_rate,boundaries=[1,2])
bucketized_diff_srv_rate               = tf.feature_column.bucketized_column(diff_srv_rate,boundaries=[1,2])
bucketized_srv_diff_host_rate          = tf.feature_column.bucketized_column(srv_diff_host_rate,boundaries=[0,1])
bucketized_dst_host_count              = tf.feature_column.bucketized_column(dst_host_count,boundaries=[0,100,200])
bucketized_dst_host_srv_count          = tf.feature_column.bucketized_column(dst_host_srv_count,boundaries=[50,100,250])
bucketized_dst_host_same_srv_rate      = tf.feature_column.bucketized_column(dst_host_same_srv_rate,boundaries=[0,1])
bucketized_dst_host_diff_srv_rate      = tf.feature_column.bucketized_column(dst_host_diff_srv_rate,boundaries=[0,1])
bucketized_dst_host_same_src_port_rate = tf.feature_column.bucketized_column(dst_host_same_src_port_rate,boundaries=[0,1])
bucketized_dst_host_srv_diff_host_rate = tf.feature_column.bucketized_column(dst_host_srv_diff_host_rate,boundaries=[0,1])
bucketized_dst_host_serror_rate        = tf.feature_column.bucketized_column(dst_host_serror_rate,boundaries=[0,1])
bucketized_dst_host_srv_serror_rate    = tf.feature_column.bucketized_column(dst_host_srv_serror_rate,boundaries=[0,1])
bucketized_dst_host_rerror_rate        = tf.feature_column.bucketized_column(dst_host_rerror_rate,boundaries=[0,1])
bucketized_dst_host_srv_rerror_rate    = tf.feature_column.bucketized_column(dst_host_srv_rerror_rate,boundaries=[0,1])
bucketized_last_flag                   = tf.feature_column.bucketized_column(last_flag,boundaries=[10,50,100])

In [199]:
feat_columns=[bucketized_duration,
bucketized_src_bytes,                  
bucketized_dst_bytes,                  
bucketized_land,                       
bucketized_wrong_fragment,            
bucketized_urgent,                     
bucketized_hot,                       
bucketized_num_failed_logins,         
bucketized_logged_in,                  
bucketized_num_compromised,           
bucketized_root_shell,                
bucketized_su_attempted,               
bucketized_num_root,                   
bucketized_num_file_creations,       
bucketized_num_shells,                 
bucketized_num_access_files,           
bucketized_num_outbound_cmds,          
bucketized_is_host_login,              
bucketized_is_guest_login,             
bucketized_count,                      
bucketized_srv_count ,                 
bucketized_serror_rate,                
bucketized_srv_serror_rate,            
bucketized_rerror_rate,                
bucketized_srv_rerror_rate,            
bucketized_same_srv_rate,             
bucketized_diff_srv_rate,              
bucketized_srv_diff_host_rate,         
bucketized_dst_host_count ,            
bucketized_dst_host_srv_count,        
bucketized_dst_host_same_srv_rate,     
bucketized_dst_host_diff_srv_rate,     
bucketized_dst_host_same_src_port_rate,
bucketized_dst_host_srv_diff_host_rate,
bucketized_dst_host_serror_rate,       
bucketized_dst_host_srv_serror_rate,   
bucketized_dst_host_rerror_rate,       
bucketized_dst_host_srv_rerror_rate,   
bucketized_last_flag,
bucketized_protocol_type,
bucketized_flag,         
bucketized_service]

In [200]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train,y=y_train ,batch_size=10,
                                                 num_epochs=1000,shuffle=True)

In [201]:
myopt = tf.train.AdamOptimizer(learning_rate=0.005)

In [202]:
model = tf.estimator.DNNClassifier(hidden_units=[11,5,5,11,5,5,11],
                                  feature_columns=feat_columns,
                                  optimizer = myopt)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_train_distribute': None, '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f974e2abf90>, '_evaluation_master': '', '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': '/tmp/tmp66z3qP', '_global_id_in_cluster': 0, '_save_summary_steps': 100}


In [203]:
model.train(input_fn=input_func,steps=5000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp66z3qP/model.ckpt.
INFO:tensorflow:loss = 6.9465823, step = 1
INFO:tensorflow:global_step/sec: 10.2257
INFO:tensorflow:loss = 0.21964216, step = 101 (9.786 sec)
INFO:tensorflow:global_step/sec: 11.6847
INFO:tensorflow:loss = 0.30951053, step = 201 (8.558 sec)
INFO:tensorflow:global_step/sec: 12.009
INFO:tensorflow:loss = 0.2703184, step = 301 (8.327 sec)
INFO:tensorflow:global_step/sec: 11.0942
INFO:tensorflow:loss = 0.3692786, step = 401 (9.013 sec)
INFO:tensorflow:global_step/sec: 11.8573
INFO:tensorflow:loss = 4.2493267, step = 501 (8.434 sec)
INFO:tensorflow:global_step/sec: 11.7054
INFO:tensorflow:loss = 0.34377274, step = 601 (8.543 sec)
INFO:tensorflow:global_step/sec: 11.4862
INFO:tensorflow:

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f974e2e2050>

In [204]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(
      x=X_test,
      batch_size=10,
      num_epochs=1,
      shuffle=False)

In [205]:
eval_pred_gen = model.predict(eval_input_func)

In [206]:
predictions = list(eval_pred_gen)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp66z3qP/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [207]:
final_preds = [pred['class_ids'][0] for pred in predictions]

In [208]:
#final_preds

In [209]:
from sklearn.metrics import classification_report,confusion_matrix

In [210]:
print(classification_report(y_test,final_preds))

             precision    recall  f1-score   support

          0       0.97      0.99      0.98     13486
          1       0.99      0.97      0.98     11709

avg / total       0.98      0.98      0.98     25195



In [211]:
print(confusion_matrix(y_test,final_preds))

[[13403    83]
 [  402 11307]]


In [212]:
test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag,total_bytes
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.06,0.0,0.0,0.0,0.0,1.0,1.0,dos,21,0
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21,12983
2,0,icmp,eco_i,SF,20,0,0,0,0,0,...,0.0,1.0,0.28,0.0,0.0,0.0,0.0,probe,15,20
3,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.17,0.03,0.02,0.0,0.0,0.83,0.71,probe,11,15
4,0,tcp,http,SF,267,14515,0,0,0,0,...,0.0,0.01,0.03,0.01,0.0,0.0,0.0,normal,21,14782


In [214]:
test.drop('attack',axis=1,inplace=True)

In [215]:
test.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,last_flag,total_bytes
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,21,0
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,21,12983
2,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,15,20
3,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,11,15
4,0,tcp,http,SF,267,14515,0,0,0,0,...,1.0,0.0,0.01,0.03,0.01,0.0,0.0,0.0,21,14782


In [216]:
test_input_func = tf.estimator.inputs.pandas_input_fn(
      x=test,
      batch_size=10,
      num_epochs=1,
      shuffle=False)

In [217]:
test_pred_gen = model.predict(test_input_func)

In [218]:
test_predictions = list(test_pred_gen)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp66z3qP/model.ckpt-5000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [219]:
test_final_preds = [pred['class_ids'][0] for pred in test_predictions]

In [220]:
test_final_preds

[1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,


In [221]:
test['attack']=test_final_preds

In [223]:
test.attack.value_counts()

0    13705
1     8838
Name: attack, dtype: int64