# Support Vector Machines
Support Vector Machine Model using GridSearch.

In [None]:
from google.colab import drive
drive.mount('APA-DDoS-Dataset.csv')

Drive already mounted at APA-DDoS-Dataset.csv; to attempt to forcibly remount, call drive.mount("APA-DDoS-Dataset.csv", force_remount=True).


In [None]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
%matplotlib inline

In [None]:
# Import data set
df=pd.read_csv("/APA-DDoS-Dataset.csv")
df.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,ip.proto,frame.len,tcp.flags.syn,tcp.flags.reset,tcp.flags.push,tcp.flags.ack,...,tcp.seq,tcp.ack,frame.time,Packets,Bytes,Tx Packets,Tx Bytes,Rx Packets,Rx Bytes,Label
0,192.168.1.1,192.168.23.2,2412,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071112000 Mountain Dayli...,8,432,4,216,4,216,DDoS-PSH-ACK
1,192.168.1.1,192.168.23.2,2413,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071138000 Mountain Dayli...,10,540,5,270,5,270,DDoS-PSH-ACK
2,192.168.1.1,192.168.23.2,2414,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071146000 Mountain Dayli...,12,648,6,324,6,324,DDoS-PSH-ACK
3,192.168.1.1,192.168.23.2,2415,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071152000 Mountain Dayli...,10,540,5,270,5,270,DDoS-PSH-ACK
4,192.168.1.1,192.168.23.2,2416,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071159000 Mountain Dayli...,6,324,3,162,3,162,DDoS-PSH-ACK


The data set is presented in a dictionary form:

In [None]:
df.keys()       

Index(['ip.src', 'ip.dst', 'tcp.srcport', 'tcp.dstport', 'ip.proto',
       'frame.len', 'tcp.flags.syn', 'tcp.flags.reset', 'tcp.flags.push',
       'tcp.flags.ack', 'ip.flags.mf', 'ip.flags.df', 'ip.flags.rb', 'tcp.seq',
       'tcp.ack', 'frame.time', 'Packets', 'Bytes', 'Tx Packets', 'Tx Bytes',
       'Rx Packets', 'Rx Bytes', 'Label'],
      dtype='object')

We can grab information and arrays out of this dictionary to set up our data frame and understanding of the features.

In [None]:
df

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,ip.proto,frame.len,tcp.flags.syn,tcp.flags.reset,tcp.flags.push,tcp.flags.ack,...,tcp.seq,tcp.ack,frame.time,Packets,Bytes,Tx Packets,Tx Bytes,Rx Packets,Rx Bytes,Label
0,192.168.1.1,192.168.23.2,2412,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071112000 Mountain Dayli...,8,432,4,216,4,216,DDoS-PSH-ACK
1,192.168.1.1,192.168.23.2,2413,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071138000 Mountain Dayli...,10,540,5,270,5,270,DDoS-PSH-ACK
2,192.168.1.1,192.168.23.2,2414,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071146000 Mountain Dayli...,12,648,6,324,6,324,DDoS-PSH-ACK
3,192.168.1.1,192.168.23.2,2415,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071152000 Mountain Dayli...,10,540,5,270,5,270,DDoS-PSH-ACK
4,192.168.1.1,192.168.23.2,2416,8000,6,54,0,0,1,1,...,1,1,16-Jun 2020 20:18:15.071159000 Mountain Dayli...,6,324,3,162,3,162,DDoS-PSH-ACK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151195,192.168.19.1,192.168.23.2,37360,8000,6,66,0,0,0,1,...,1,1,16-Jun 2020 22:10:46.923006000 Mountain Dayli...,10,1146,6,560,4,586,Benign
151196,192.168.19.1,192.168.23.2,37362,8000,6,66,0,0,0,1,...,1,1,16-Jun 2020 22:10:46.935672000 Mountain Dayli...,10,1151,6,560,4,591,Benign
151197,192.168.19.1,192.168.23.2,37364,8000,6,66,0,0,0,1,...,1,1,16-Jun 2020 22:10:46.957469000 Mountain Dayli...,10,1144,6,560,4,584,Benign
151198,192.168.19.1,192.168.23.2,37366,8000,6,66,0,0,0,1,...,1,1,16-Jun 2020 22:10:46.970971000 Mountain Dayli...,10,1175,6,560,4,615,Benign


In [None]:
# Create data frame
df_feat = pd.DataFrame(df[['tcp.srcport', 
       'frame.len',  'Packets', 'Bytes', 'Tx Packets', 'Tx Bytes',
       'Rx Packets', 'Rx Bytes']].values, 
       columns=['tcp.srcport',
       'frame.len', 'Packets', 'Bytes', 'Tx Packets', 'Tx Bytes',
       'Rx Packets', 'Rx Bytes'], 
       index=df['ip.src'])


In [None]:
df_feat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 151200 entries, 192.168.1.1 to 192.168.19.1
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   tcp.srcport  151200 non-null  int64
 1   frame.len    151200 non-null  int64
 2   Packets      151200 non-null  int64
 3   Bytes        151200 non-null  int64
 4   Tx Packets   151200 non-null  int64
 5   Tx Bytes     151200 non-null  int64
 6   Rx Packets   151200 non-null  int64
 7   Rx Bytes     151200 non-null  int64
dtypes: int64(8)
memory usage: 10.4+ MB


In [None]:
# View data
df_feat.head()

Unnamed: 0_level_0,tcp.srcport,frame.len,Packets,Bytes,Tx Packets,Tx Bytes,Rx Packets,Rx Bytes
ip.src,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
192.168.1.1,2412,54,8,432,4,216,4,216
192.168.1.1,2413,54,10,540,5,270,5,270
192.168.1.1,2414,54,12,648,6,324,6,324
192.168.1.1,2415,54,10,540,5,270,5,270
192.168.1.1,2416,54,6,324,3,162,3,162


## Train Test Split

In [None]:
# Import function
from sklearn.model_selection import train_test_split

In [None]:
# Set up x and y
x = df_feat
y = df['Packets']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.50, random_state=50)

## Train the Support Vector Classifer

In [None]:
# Import model
from sklearn.svm import SVC

# Instantiate the model
model = SVC()

# Fit the model to the training data
model.fit(x_train,y_train)

C controls the cost of missclassification on the training data.  
**Large value:** low bias (because you penalized the cost of misclassification alot) and high variance.  
**Small value: **high bias (not penalizing the cost of missclassficiation as much) and low variance.  

**Gamma**  
**Small:** means Gaussian with a large variance  
**Large:** high bias and low variance  

## Predictions 

In [None]:
# Predict using default values
predictions = model.predict(x_test)

## Evaluation

In [None]:
# Imports
from sklearn.metrics import confusion_matrix,classification_report

# Confusion matrix
print(confusion_matrix(y_test,predictions))

# New line
print('\n')

# Classification report
print(classification_report(y_test,predictions))

[[    0     0     0     0     0     0     0     3     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0   739     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0    42     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0  3638     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0   150     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0  8248     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0   299     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0 10325     0     0     0     0
      0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0   31

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
