Reference article: https://scholarworks.rit.edu/cgi/viewcontent.cgi?article=11848&context=theses

Kaggle data: https://www.kaggle.com/solarmainframe/ids-intrusion-csv

In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('ids2018_train.csv')
test = pd.read_csv('ids2018_test.csv')

Obtaining the X, y data

In [3]:
y = train[['Label']]
y_val = test[['Label']]
X = train.drop(columns=['Label'])
X_val = test.drop(columns=['Label'])

In [13]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
ohc = OneHotEncoder()
le = LabelEncoder()
#y = le.fit_transform(y.values.ravel())
#y_val = le.transform(y_val.values.ravel())

y = ohc.fit_transform(y).toarray()
y_val = ohc.transform(y_val).toarray()

Determining the features to use. We will be using sklearns `SelectFromModel`

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3)

In [31]:
sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier())

In [32]:
sel.get_support()

array([ True,  True, False,  True, False, False, False, False, False,
       False, False, False, False,  True, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True,  True,  True,  True, False,
       False, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True, False, False,  True,  True,
        True, False, False, False, False, False, False, False, False])

In [34]:
selected_features = X_train.columns[(sel.get_support())]
print(selected_features)

Index(['Flow Duration', 'Tot Fwd Pkts', 'TotLen Fwd Pkts', 'Flow IAT Mean',
       'Flow IAT Max', 'Flow IAT Min', 'Fwd Header Len', 'Bwd Header Len',
       'Fwd Pkts/s', 'Bwd Pkts/s', 'ACK Flag Cnt', 'URG Flag Cnt',
       'Subflow Fwd Pkts', 'Subflow Fwd Byts', 'Init Bwd Win Byts',
       'Fwd Act Data Pkts', 'Fwd Seg Size Min'],
      dtype='object')


Let us choose the top 10 features

In [61]:
importance = list(zip(X_train.columns, sel.estimator_.feature_importances_))
importance.sort(key=lambda x: x[1], reverse=rse=True)

In [65]:
top_features = [i[0] for i in importance[:10]]

In [16]:
X_top = X[top_features]
X_val_top = X_val[top_features]

<b>Selecting the model</b>

In [39]:
''' Normalizing the values '''
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_top = pd.DataFrame(scaler.fit_transform(X_top))
X_val_top = pd.DataFrame(scaler.transform(X_val_top))

In [42]:
X_train_top, X_test_top, y_train, y_test = train_test_split(X_top, y, test_size=0.3)

In [23]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, BatchNormalization

In [44]:
model = Sequential()
model.add(Dense(512, input_dim=17, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 512)               9216      
                                                                 
 batch_normalization_4 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                                 
 dense_11 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_5 (Batc  (None, 256)              1024      
 hNormalization)                                                 
                                                                 
 dense_12 (Dense)            (None, 128)               32896     
                                                                 
 dense_13 (Dense)            (None, 64)               

In [45]:
model.fit(X_train_top, y_train, batch_size=512, epochs=5, validation_data=(X_test_top, y_test) )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x152e110cc10>

In [46]:
model.evaluate(X_val_top, y_val, batch_size=512)



[0.00031956416205503047, 0.9999336004257202]