In [None]:
import numpy as np
import pandas as pd
import time
import warnings
import seaborn as sns
from sklearn import metrics, preprocessing, tree
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score
warnings.filterwarnings("ignore")

In [None]:
np.set_printoptions(precision=3)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
warnings.filterwarnings('ignore')
np.random.seed(8)
%matplotlib inline

In [None]:
def timeit(method):
    def timed(*args, **kw):
        ts = time.time()
        result = method(*args, **kw)
        te = time.time()
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result
    return timed

In [None]:
data = pd.read_csv('DDoS_2019_update_dataset.csv')
data_ = data[(data[' Label']!='BENIGN')&(data[' Label']!='WebDDoS')]
len(data_[' Label'].value_counts())

11

In [None]:
X = data_.drop(['Flow ID', ' Source IP', ' Source Port', ' Destination IP',
       ' Destination Port', ' Protocol', ' Timestamp',' Flow Packets/s','SimillarHTTP','Flow Bytes/s',' Label'],
       axis='columns')
y = data_[' Label']
X.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Inbound
0,217,38,0,16432.0,0.0,440.0,296.0,432.421,32.586,0.0,0.0,0.0,0.0,5.865,13.406,51.0,0.0,217.0,5.865,13.406,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,304,0,175115.207,0.0,296.0,440.0,432.615,32.178,1035.401,0,0,0,0,0,0,0,0,0.0,444.0,432.421,0.0,304,0,0,0,0,0,0,38,16432,0,0,-1,-1,37,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,1458,34,0,14384.0,0.0,440.0,152.0,423.059,68.784,0.0,0.0,0.0,0.0,44.182,201.078,1161.0,0.0,1458.0,44.182,201.078,1161.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,680,0,23319.616,0.0,152.0,440.0,423.543,67.825,4600.255,0,0,0,0,0,0,0,0,0.0,436.0,423.059,0.0,680,0,0,0,0,0,0,34,14384,0,0,-1,-1,33,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,1,2,0,2944.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,-2,0,2000000.0,0.0,1472.0,1472.0,1472.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,2208.0,1472.0,0.0,-2,0,0,0,0,0,0,2,2944,0,0,-1,-1,1,-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1,2,0,458.0,0.0,229.0,229.0,229.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,40,0,2000000.0,0.0,229.0,229.0,229.0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,343.5,229.0,0.0,40,0,0,0,0,0,0,2,458,0,0,-1,-1,1,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,105801,4,0,1438.0,0.0,389.0,330.0,359.5,34.064,0.0,0.0,0.0,0.0,35267.0,61040.939,105751.0,1.0,105801.0,35267.0,61040.939,105751.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,80,0,37.807,0.0,330.0,389.0,353.6,32.316,1044.3,0,0,0,0,0,0,0,0,0.0,442.0,359.5,0.0,80,0,0,0,0,0,0,4,1438,0,0,-1,-1,3,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
X.describe()


Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Bwd PSH Flags,Fwd URG Flags,Bwd URG Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,FIN Flag Count,SYN Flag Count,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,ECE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Fwd Avg Bytes/Bulk,Fwd Avg Packets/Bulk,Fwd Avg Bulk Rate,Bwd Avg Bytes/Bulk,Bwd Avg Packets/Bulk,Bwd Avg Bulk Rate,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Inbound
count,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0,54851.0
mean,1381901.586,9.856,0.055,3868.891,0.522,623.332,610.147,619.41,5.125,0.171,0.039,0.073,0.052,197048.51,302267.627,644152.531,4.618,1381691.024,209392.639,308964.953,644076.503,4.7,61687.897,18217.306,31427.383,57161.139,0.091,0.0,0.0,0.0,0.0,-130345646.649,1.139,1106987.137,398.616,610.078,623.403,619.066,4.991,309.296,0.0,0.0,0.0,0.0,0.171,0.0,0.0,0.0,0.017,895.357,619.41,0.073,-130345646.649,0.0,0.0,0.0,0.0,0.0,0.0,9.856,3868.891,0.055,0.522,1005.607,0.72,6.877,-41816735.995,573.984,818.293,1551.345,90.33,385889.202,88112.105,487559.798,307929.165,0.999
std,9270437.554,390.134,0.439,11352.272,41.234,535.303,542.37,537.078,16.626,12.344,4.455,5.608,4.755,1018287.365,1535572.379,3488005.569,14.676,9270369.626,1089342.072,1579323.761,3487964.594,14.672,1846748.433,550509.115,955605.48,1726863.27,1.848,0.006,0.0,0.0,0.0,2388027464.271,11.119,909428.958,4847.809,542.321,535.317,537.199,16.864,3180.282,0.0,0.004,0.006,0.0,0.376,0.021,0.02,0.0,0.13,820.087,537.078,5.608,2388027464.271,0.0,0.0,0.0,0.0,0.0,0.0,390.134,11352.272,0.439,41.234,2319.739,214.243,25.96,206641022.663,32606.055,41310.964,74265.019,19514.596,2694389.623,820405.655,3449738.995,2245136.804,0.029
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-212543795000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-212543795000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,-1.0,0.0,-1408237563.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,2.0,0.0,458.0,0.0,229.0,229.0,229.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24166.624,0.0,229.0,229.0,229.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,343.5,229.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,458.0,0.0,0.0,-1.0,-1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,2.0,0.0,1398.0,0.0,440.0,414.0,432.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,1000000.0,0.0,414.0,440.0,432.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,574.5,432.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1398.0,0.0,0.0,-1.0,-1.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,112.0,4.0,0.0,2944.0,0.0,1380.0,1380.0,1380.0,0.0,0.0,0.0,0.0,0.0,47.0,16.082,53.0,1.0,72.0,47.0,10.039,50.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,2000000.0,0.0,1380.0,1380.0,1380.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2068.5,1380.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2944.0,0.0,0.0,-1.0,-1.0,1.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,117833407.0,91177.0,42.0,88000.0,4836.0,1472.0,1472.0,1472.0,289.252,1073.0,520.0,520.0,565.257,16408370.667,28420129.932,67569476.0,352.0,117833407.0,16408370.667,28420129.932,67569476.0,275.0,113005942.0,30951597.667,53609737.136,106182305.0,73.0,1.0,0.0,0.0,0.0,88000.0,1360.0,4000000.0,666666.667,1472.0,1472.0,1472.0,521.432,271891.2,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,2208.0,1472.0,520.0,88000.0,0.0,0.0,0.0,0.0,0.0,0.0,91177.0,88000.0,42.0,4836.0,65535.0,28960.0,199.0,1480.0,4553385.0,3320518.997,4695924.0,4553385.0,49225110.0,26985772.847,67569476.0,49225110.0,1.0


In [None]:
X = X.loc[:, (X != 0).any(axis=0)]
print(len(X.columns))

64


In [None]:
X.corr()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Min,Bwd Packet Length Mean,Bwd Packet Length Std,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,Fwd IAT Max,Fwd IAT Min,Bwd IAT Total,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd PSH Flags,Fwd Header Length,Bwd Header Length,Fwd Packets/s,Bwd Packets/s,Min Packet Length,Max Packet Length,Packet Length Mean,Packet Length Std,Packet Length Variance,SYN Flag Count,RST Flag Count,ACK Flag Count,URG Flag Count,CWE Flag Count,Down/Up Ratio,Average Packet Size,Avg Fwd Segment Size,Avg Bwd Segment Size,Fwd Header Length.1,Subflow Fwd Packets,Subflow Fwd Bytes,Subflow Bwd Packets,Subflow Bwd Bytes,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Inbound
Flow Duration,1.000,0.003,0.303,-0.045,0.006,-0.152,-0.147,-0.151,-0.042,0.004,-0.001,0.002,0.005,0.859,0.808,0.884,-0.042,1.000,0.872,0.807,0.884,-0.043,0.303,0.301,0.300,0.303,0.078,0.000,0.007,0.241,-0.181,-0.012,-0.147,-0.152,-0.151,-0.039,-0.012,-0.001,0.000,0.269,-0.003,-0.003,-0.018,-0.146,-0.151,0.002,0.007,0.003,-0.045,0.303,0.006,0.257,0.007,-0.035,0.023,0.108,0.147,0.161,0.010,0.814,0.785,0.883,0.705,0.004
Total Fwd Packets,0.003,1.000,0.096,0.065,0.000,-0.011,-0.011,-0.011,0.004,0.000,-0.000,-0.000,0.000,-0.001,-0.001,-0.001,-0.006,0.003,-0.001,-0.001,-0.001,-0.006,0.006,0.002,0.002,0.003,-0.001,-0.000,-0.005,-0.001,-0.023,-0.002,-0.011,-0.011,-0.011,0.004,0.001,-0.000,-0.000,-0.008,-0.000,-0.000,-0.003,-0.014,-0.011,-0.000,-0.005,1.000,0.065,0.096,0.000,-0.008,-0.000,0.065,0.003,0.000,0.000,0.000,-0.000,-0.001,-0.000,-0.000,-0.001,0.001
Total Backward Packets,0.303,0.096,1.000,-0.042,0.514,-0.139,-0.140,-0.142,0.014,0.482,0.039,0.210,0.392,0.198,0.215,0.263,-0.030,0.303,0.252,0.233,0.263,-0.002,0.365,0.326,0.324,0.337,0.228,0.040,0.007,0.963,-0.150,0.364,-0.140,-0.134,-0.142,0.060,0.140,-0.001,0.040,0.268,0.083,0.072,0.600,-0.135,-0.142,0.210,0.007,0.096,-0.042,1.000,0.514,0.339,0.042,-0.030,0.025,0.043,0.063,0.068,-0.001,0.241,0.260,0.266,0.194,-0.038
Total Length of Fwd Packets,-0.045,0.065,-0.042,1.000,-0.002,0.015,0.002,0.016,0.058,-0.002,-0.001,-0.002,-0.002,-0.055,-0.055,-0.053,-0.069,-0.045,-0.055,-0.055,-0.053,-0.071,-0.011,-0.011,-0.011,-0.011,-0.017,-0.002,-0.082,-0.034,-0.247,-0.028,0.002,0.015,0.016,0.058,0.015,-0.001,-0.002,-0.155,-0.007,-0.007,-0.043,-0.032,0.016,-0.002,-0.082,0.065,1.000,-0.042,-0.002,-0.148,-0.003,0.996,0.023,-0.006,-0.007,-0.007,-0.002,-0.049,-0.037,-0.048,-0.047,0.008
Total Length of Bwd Packets,0.006,0.000,0.514,-0.002,1.000,-0.000,-0.009,-0.008,0.103,0.957,0.216,0.676,0.896,-0.000,0.002,0.007,-0.003,0.004,-0.000,0.002,0.005,-0.003,0.037,0.008,0.010,0.018,0.012,0.022,0.001,0.665,-0.015,0.004,-0.012,0.011,-0.008,0.245,0.496,-0.000,0.022,0.023,0.006,-0.000,0.049,-0.009,-0.008,0.676,0.001,0.000,-0.002,0.514,1.000,0.203,0.012,0.001,0.003,-0.000,-0.000,-0.000,-0.000,-0.002,-0.001,-0.002,-0.002,-0.002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Idle Mean,0.814,-0.001,0.241,-0.049,-0.002,-0.166,-0.160,-0.164,-0.044,-0.002,-0.001,-0.002,-0.002,0.960,0.968,0.957,-0.040,0.814,0.970,0.971,0.957,-0.040,0.179,0.181,0.180,0.180,0.088,-0.001,0.008,0.190,-0.174,-0.012,-0.160,-0.166,-0.164,-0.042,-0.014,-0.001,-0.001,0.312,-0.003,-0.003,-0.017,-0.156,-0.164,-0.002,0.008,-0.001,-0.049,0.241,-0.002,0.295,-0.001,-0.038,0.028,0.109,0.122,0.128,0.029,1.000,0.705,0.974,0.974,0.004
Idle Std,0.785,-0.000,0.260,-0.037,-0.001,-0.125,-0.121,-0.124,-0.033,-0.001,-0.001,-0.001,-0.001,0.723,0.705,0.815,-0.030,0.785,0.744,0.709,0.815,-0.031,0.223,0.227,0.226,0.225,0.071,-0.001,0.006,0.205,-0.131,-0.009,-0.121,-0.125,-0.124,-0.032,-0.010,-0.000,-0.001,0.237,-0.002,-0.002,-0.013,-0.117,-0.124,-0.001,0.006,-0.000,-0.037,0.260,-0.001,0.224,-0.001,-0.028,0.022,0.076,0.106,0.109,-0.000,0.705,1.000,0.829,0.540,0.003
Idle Max,0.883,-0.000,0.266,-0.048,-0.002,-0.164,-0.158,-0.162,-0.043,-0.002,-0.001,-0.002,-0.002,0.952,0.950,0.983,-0.040,0.883,0.966,0.953,0.983,-0.040,0.218,0.220,0.219,0.220,0.088,-0.001,0.008,0.210,-0.172,-0.012,-0.158,-0.164,-0.162,-0.042,-0.014,-0.001,-0.001,0.309,-0.003,-0.003,-0.017,-0.154,-0.162,-0.002,0.008,-0.000,-0.048,0.266,-0.002,0.292,-0.001,-0.037,0.028,0.106,0.126,0.134,0.023,0.974,0.829,1.000,0.907,0.004
Idle Min,0.705,-0.001,0.194,-0.047,-0.002,-0.159,-0.153,-0.157,-0.042,-0.002,-0.001,-0.002,-0.001,0.919,0.938,0.890,-0.038,0.705,0.924,0.940,0.890,-0.038,0.121,0.122,0.121,0.121,0.082,-0.001,0.007,0.153,-0.167,-0.011,-0.153,-0.159,-0.157,-0.040,-0.013,-0.001,-0.001,0.298,-0.003,-0.003,-0.017,-0.149,-0.157,-0.002,0.007,-0.001,-0.047,0.194,-0.002,0.282,-0.001,-0.036,0.027,0.106,0.110,0.115,0.035,0.974,0.540,0.907,1.000,0.004


In [None]:
corr_matrix = X.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.65)]

In [None]:
print(to_drop)

[' Fwd Packet Length Min', ' Fwd Packet Length Mean', 'Bwd Packet Length Max', ' Bwd Packet Length Mean', ' Bwd Packet Length Std', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd Header Length', ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean', ' Packet Length Std', ' Packet Length Variance', ' RST Flag Count', ' CWE Flag Count', ' Average Packet Size', ' Avg Fwd Segment Size', ' Avg Bwd Segment Size', ' Fwd Header Length.1', 'Subflow Fwd Packets', ' Subflow Fwd Bytes', ' Subflow Bwd Packets', ' Subflow Bwd Bytes', 'Init_Win_bytes_forward', ' act_data_pkt_fwd', ' Active Std', ' Active Max', 'Idle Mean', ' Idle Std', ' Idle Max', ' Idle Min']


In [None]:

print(len(X.columns))

64


In [None]:
print(len(to_drop))

40


In [None]:
X = X.drop(to_drop, axis=1)

In [None]:
print(len(X.columns))

24


In [None]:
df = X.corr()
df

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Std,Bwd Packet Length Min,Flow IAT Min,Bwd IAT Total,Bwd IAT Min,Fwd PSH Flags,Fwd Header Length,Fwd Packets/s,Bwd Packets/s,SYN Flag Count,ACK Flag Count,URG Flag Count,Down/Up Ratio,Init_Win_bytes_backward,min_seg_size_forward,Active Mean,Active Min,Inbound
Flow Duration,1.0,0.003,0.303,-0.045,0.006,-0.152,-0.042,-0.001,-0.042,0.303,0.078,0.0,0.007,-0.181,-0.012,-0.001,0.269,-0.003,-0.018,0.007,0.023,0.108,0.01,0.004
Total Fwd Packets,0.003,1.0,0.096,0.065,0.0,-0.011,0.004,-0.0,-0.006,0.006,-0.001,-0.0,-0.005,-0.023,-0.002,-0.0,-0.008,-0.0,-0.003,-0.0,0.003,0.0,-0.0,0.001
Total Backward Packets,0.303,0.096,1.0,-0.042,0.514,-0.139,0.014,0.039,-0.03,0.365,0.228,0.04,0.007,-0.15,0.364,-0.001,0.268,0.083,0.6,0.042,0.025,0.043,-0.001,-0.038
Total Length of Fwd Packets,-0.045,0.065,-0.042,1.0,-0.002,0.015,0.058,-0.001,-0.069,-0.011,-0.017,-0.002,-0.082,-0.247,-0.028,-0.001,-0.155,-0.007,-0.043,-0.003,0.023,-0.006,-0.002,0.008
Total Length of Bwd Packets,0.006,0.0,0.514,-0.002,1.0,-0.0,0.103,0.216,-0.003,0.037,0.012,0.022,0.001,-0.015,0.004,-0.0,0.023,0.006,0.049,0.012,0.003,-0.0,-0.0,-0.002
Fwd Packet Length Max,-0.152,-0.011,-0.139,0.015,-0.0,1.0,-0.122,0.013,0.019,-0.039,-0.057,-0.006,-0.005,0.171,-0.095,-0.005,-0.528,-0.025,-0.146,-0.009,-0.071,-0.02,-0.005,0.018
Fwd Packet Length Std,-0.042,0.004,0.014,0.058,0.103,-0.122,1.0,-0.002,-0.081,-0.006,-0.014,0.015,-0.012,-0.353,-0.025,-0.001,-0.137,-0.002,-0.035,-0.001,0.007,-0.005,-0.001,0.007
Bwd Packet Length Min,-0.001,-0.0,0.039,-0.001,0.216,0.013,-0.002,1.0,-0.001,-0.0,0.014,0.057,0.0,-0.011,0.023,-0.0,-0.004,0.016,0.064,0.0,0.002,-0.0,-0.0,-0.012
Flow IAT Min,-0.042,-0.006,-0.03,-0.069,-0.003,0.019,-0.081,-0.001,1.0,-0.01,-0.008,-0.001,0.007,-0.279,-0.02,-0.001,0.034,0.055,-0.022,-0.001,0.005,-0.003,-0.001,-0.01
Bwd IAT Total,0.303,0.006,0.365,-0.011,0.037,-0.039,-0.006,-0.0,-0.01,1.0,0.014,0.003,0.002,-0.041,-0.003,-0.0,0.073,0.0,0.001,0.0,0.007,0.04,-0.0,0.001


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 123)
print(X_train.shape,X_test.shape)

(43880, 24) (10971, 24)


In [None]:
X_train.head()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Std,Bwd Packet Length Min,Flow IAT Min,Bwd IAT Total,Bwd IAT Min,Fwd PSH Flags,Fwd Header Length,Fwd Packets/s,Bwd Packets/s,SYN Flag Count,ACK Flag Count,URG Flag Count,Down/Up Ratio,Init_Win_bytes_backward,min_seg_size_forward,Active Mean,Active Min,Inbound
16760,1,2,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,40,2000000.0,0.0,0,1,0,0.0,-1,20,0.0,0.0,1
32564,1,2,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,40,2000000.0,0.0,0,1,0,0.0,-1,20,0.0,0.0,1
36249,507,28,0,12320.0,0.0,440.0,0.0,0.0,0.0,0.0,0.0,0,224,55226.824,0.0,0,0,0,0.0,-1,8,0.0,0.0,1
33710,0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,40,0.0,0.0,0,1,0,0.0,-1,20,0.0,0.0,1
7404,1,2,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0,40,2000000.0,0.0,0,1,0,0.0,-1,20,0.0,0.0,1


In [None]:
X_train.describe()

Unnamed: 0,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Std,Bwd Packet Length Min,Flow IAT Min,Bwd IAT Total,Bwd IAT Min,Fwd PSH Flags,Fwd Header Length,Fwd Packets/s,Bwd Packets/s,SYN Flag Count,ACK Flag Count,URG Flag Count,Down/Up Ratio,Init_Win_bytes_backward,min_seg_size_forward,Active Mean,Active Min,Inbound
count,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0
mean,1378647.693,8.081,0.055,3823.152,0.487,625.3,5.152,0.049,4.588,55917.849,0.082,0.0,-123539639.686,1108993.746,393.215,0.0,0.171,0.0,0.017,0.436,-42116355.49,563.899,112.875,0.999
std,9274810.511,25.681,0.439,11252.067,39.021,536.51,16.821,4.981,14.312,1662874.594,1.712,0.005,2099289888.711,908494.685,4115.096,0.005,0.376,0.02,0.131,195.572,207328497.229,33005.462,21818.171,0.029
min,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-212543795000.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1062718975.0,0.0,0.0,0.0
25%,1.0,2.0,0.0,458.0,0.0,229.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,25313.694,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,1.0
50%,1.0,2.0,0.0,1398.0,0.0,440.0,0.0,0.0,1.0,0.0,0.0,0.0,40.0,1000000.0,0.0,0.0,0.0,0.0,0.0,-1.0,20.0,0.0,0.0,1.0
75%,106.0,4.0,0.0,2944.0,0.0,1390.0,0.0,0.0,1.0,0.0,0.0,0.0,40.0,2000000.0,0.0,0.0,0.0,0.0,0.0,-1.0,20.0,0.0,0.0,1.0
max,117833407.0,200.0,42.0,88000.0,4836.0,1472.0,289.252,520.0,352.0,90211923.0,52.0,1.0,88000.0,4000000.0,400000.0,1.0,1.0,1.0,3.0,28960.0,1480.0,4553385.0,4553385.0,1.0


In [None]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)

X_test_s = scaler.transform(X_test)

In [None]:
pd.DataFrame(X_train_s).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
count,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0,43880.0
mean,0.012,0.036,0.001,0.043,0.0,0.425,0.018,0.0,0.013,0.001,0.002,0.0,0.999,0.277,0.001,0.0,0.171,0.0,0.006,0.0,0.96,0.0,0.0,0.999
std,0.079,0.129,0.01,0.128,0.008,0.364,0.058,0.01,0.041,0.018,0.033,0.005,0.01,0.227,0.01,0.005,0.376,0.02,0.044,0.007,0.195,0.007,0.005,0.029
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.005,0.0,0.005,0.0,0.156,0.0,0.0,0.003,0.0,0.0,0.0,1.0,0.006,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
50%,0.0,0.005,0.0,0.016,0.0,0.299,0.0,0.0,0.003,0.0,0.0,0.0,1.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,0.0,0.015,0.0,0.033,0.0,0.944,0.0,0.0,0.003,0.0,0.0,0.0,1.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


**Support Vector Machines**

In [None]:
from sklearn import metrics
from sklearn.svm import SVC
model = SVC()
model.fit(X_train_s, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [None]:
predicted = model.predict(X_test_s)
accuracy = accuracy_score(y_test, predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predicted))

In [None]:

print('b = ',model.intercept_)
print('Indices of support vectors = ', model.support_)
print('Support vectors = ', model.support_vectors_)
print('Number of support vectors for each class = ', model.n_support_)
print('Coefficients of the support vector in the decision function = ', np.abs(model.dual_coef_))

**Logistic** **Regression**


In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train_s, y_train)

In [None]:
predicted = clf.predict(X_test)
accuracy = accuracy_score(y_test, predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test,predicted))

In [None]:
print('w = ',clf.coef_)
print('b = ',clf.intercept_)


**Naive Bayes**

In [None]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_train_s, y_train)


In [None]:
predicted = clf.predict(X_test_s)
accuracy = clf.score(X_test_s,y_test)
print("Accuracy with MultiNomial Naive Bayes algorithm:", round(accuracy*100),"%")

In [None]:
from sklearn.metrics import classification_report
predicted = clf.predict(X_test_s)
print(classification_report(y_test,predicted))

**XGBoosts classifier**

In [None]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.53%


In [None]:
from sklearn.metrics import classification_report
predicted = clf.predict(X_test_s)
print(classification_report(y_test,predicted))

              precision    recall  f1-score   support

        icmp       0.86      0.76      0.81      1054
         tcp       0.98      0.98      0.98     13009
         udp       0.80      0.84      0.82      1937

    accuracy                           0.95     16000
   macro avg       0.88      0.86      0.87     16000
weighted avg       0.95      0.95      0.95     16000



In [None]:
4clf = XGBClassifier(
        eval_metric = 'auc',
        num_class = 2,
        nthread = 4,
        silent = 1,
        )
parameters = {
        'eta': [0.05],
        'learning_rate' : [0.1, 0.01, 0.001, 0.0001],
        'subsample': [1.0],
        'colsample_bytree': [0.9, 1.0]
    }
clf = RandomizedSearchCV(clf, parameters, n_jobs=1, cv=2)
clf.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 61.44%


In [None]:
clf.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.9, eta=0.05,
              eval_metric='auc', gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
              n_jobs=1, nthread=4, num_class=2, objective='multi:softprob',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=1, subsample=1.0, verbosity=1)

**Decision Tree**

In [None]:
from sklearn.tree import DecisionTreeClassifier as dt
clf = dt()

In [None]:
scores = cross_val_score(clf, X_train_s, y_train, cv=5, scoring='f1_macro')

In [None]:
model = dt()
model.fit(X_train_s, y_train)
y_pred = model.predict(X_test_s)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 99.28%


In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

        icmp       0.97      0.98      0.98      1054
         tcp       1.00      1.00      1.00     13009
         udp       0.98      0.97      0.98      1937

    accuracy                           0.99     16000
   macro avg       0.98      0.98      0.98     16000
weighted avg       0.99      0.99      0.99     16000



In [None]:
model

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
parameters = {'max_depth':[1,2,3,4,5], 
              'min_samples_leaf':[1,2,3,4,5], 
              'min_samples_split':[2,3,4,5],
              'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10],
              'criterion' : ['gini','entropy']}

In [None]:
# Fit the model
clf.fit(X_train_s, y_train)
# Make predictions
train_predictions = clf.predict(X_train_s)
test_predictions = clf.predict(X_test_s)

In [None]:
clf

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
print('The Training F1 Score is', f1_score(train_predictions, y_train,average ='weighted'))
print('The Testing F1 Score is', f1_score(test_predictions, y_test,average = 'weighted'))

The Training F1 Score is 0.7776385711980419
The Testing F1 Score is 0.7387604349366222


In [None]:
import sklearn as sk
parameters = {'max_depth':[1,2,3,4,5], 
              'min_samples_leaf':[1,2,3,4,5], 
              'min_samples_split':[2,3,4,5],
              'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10],
              'criterion' : ['gini','entropy']}
scorer = make_scorer(sk.metrics.f1_score,average = 'weighted') 

In [None]:
@timeit
def generate_clf_from_search(grid_or_random, clf, parameters, scorer, X, y):
    if grid_or_random == "Grid":
        search_obj = GridSearchCV(clf, parameters, scoring=scorer)
    elif grid_or_random == "Random":
        search_obj = RandomizedSearchCV(clf, parameters, scoring=scorer)
    fit_obj = search_obj.fit(X, y)
    best_clf = fit_obj.best_estimator_
    return best_clf

In [None]:
best_clf_grid = generate_clf_from_search("Random", 
                                         clf, 
                                         parameters, 
                                         scorer, 
                                         X_train, 
                                         y_train)

'generate_clf_from_search'  8512.41 ms


In [None]:
scores = cross_val_score(best_clf_grid, X_train, y_train, cv=5, scoring='f1_macro')
scores.mean()

0.4595255910817581

In [None]:
best_clf_grid.get_params

<bound method BaseEstimator.get_params of DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=3, max_features=None, max_leaf_nodes=8,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=2, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')>

In [None]:
y_pred = best_clf_grid.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 57.15%


In [None]:
best_clf_grid.fit(X_train, y_train)
# Make predictions using the new model.
best_train_predictions = best_clf_grid.predict(X_train)
best_test_predictions = best_clf_grid.predict(X_test)

# Calculate the f1_score of the new model.
print('The training F1 Score is', f1_score(best_train_predictions, y_train,average = 'weighted'))
print('The testing F1 Score is', f1_score(best_test_predictions, y_test,average = 'weighted'))

The training F1 Score is 0.6964039138247158
The testing F1 Score is 0.6924419024986603


In [None]:
best_clf_random = generate_clf_from_search("Random", 
                                           clf, 
                                           parameters, 
                                           scorer, 
                                           X_train, 
                                           y_train)

'generate_clf_from_search'  8941.57 ms


In [None]:
scores = cross_val_score(best_clf_random, X_train, y_train, cv=5, scoring='f1_macro')
scores.mean()

0.4888527364590144

In [None]:
best_clf_random.fit(X_train_s, y_train)
# Make predictions using the new model.
best_train_predictions = best_clf_random.predict(X_train_s)
best_test_predictions = best_clf_random.predict(X_test)

# Calculate the f1_score of the new model.
print('The training F1 Score is', f1_score(best_train_predictions, y_train,average = 'weighted'))
print('The testing F1 Score is', f1_score(best_test_predictions, y_test,average = 'weighted'))

The training F1 Score is 0.7050907845405145
The testing F1 Score is 0.27947533465361574
