## DDoS attack Detection using Logistic Regression

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/ddos-sdn-dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Data

In [5]:
data = pd.read_csv("./dataset_sdn.csv")
data.head()

Unnamed: 0,dt,switch,src,dst,pktcount,bytecount,dur,dur_nsec,tot_dur,flows,...,pktrate,Pairflow,Protocol,port_no,tx_bytes,rx_bytes,tx_kbps,rx_kbps,tot_kbps,label
0,11425,1,10.0.0.1,10.0.0.8,45304,48294064,100,716000000,101000000000.0,3,...,451,0,UDP,3,143928631,3917,0,0.0,0.0,0
1,11605,1,10.0.0.1,10.0.0.8,126395,134737070,280,734000000,281000000000.0,2,...,451,0,UDP,4,3842,3520,0,0.0,0.0,0
2,11425,1,10.0.0.2,10.0.0.8,90333,96294978,200,744000000,201000000000.0,3,...,451,0,UDP,1,3795,1242,0,0.0,0.0,0
3,11425,1,10.0.0.2,10.0.0.8,90333,96294978,200,744000000,201000000000.0,3,...,451,0,UDP,2,3688,1492,0,0.0,0.0,0
4,11425,1,10.0.0.2,10.0.0.8,90333,96294978,200,744000000,201000000000.0,3,...,451,0,UDP,3,3413,3665,0,0.0,0.0,0


In [6]:
m,n = data.shape
print(f"No. of training examples: {m}\nNo. of columns(including label): {n}")

No. of training examples: 104345
No. of columns(including label): 23


In [7]:
# important_features = [
#     'src',
#     'pktcount',
#     'dst',
#     'byteperflow',
#     'pktperflow',
#     'pktrate',
#     'tot_kbps',
#     'rx_kbps',
#     'flows',
#     'bytecount',
#     'dt',
#     'Protocol',
#     'dur',
#     'tot_dur',
#     'label',
# ]
# data = data[important_features]
# data.head()

## 2. Data PreProcessing

The data contains several rows with **NaN (NULL values)**

In [8]:
df1 = data.copy()

# eliminating rows with null values
df1 = df1.dropna()

   The columns dt(Destination IP address), src(Source IP address), dst (Destination port no.) are **less relevant** weighted features in training the model.

In [9]:
X = df1.drop(['dt','src','dst','label'],axis=1)

# Extracting target label from the dataset
y = df1['label']
X.head()

Unnamed: 0,switch,pktcount,bytecount,dur,dur_nsec,tot_dur,flows,packetins,pktperflow,byteperflow,pktrate,Pairflow,Protocol,port_no,tx_bytes,rx_bytes,tx_kbps,rx_kbps,tot_kbps
0,1,45304,48294064,100,716000000,101000000000.0,3,1943,13535,14428310,451,0,UDP,3,143928631,3917,0,0.0,0.0
1,1,126395,134737070,280,734000000,281000000000.0,2,1943,13531,14424046,451,0,UDP,4,3842,3520,0,0.0,0.0
2,1,90333,96294978,200,744000000,201000000000.0,3,1943,13534,14427244,451,0,UDP,1,3795,1242,0,0.0,0.0
3,1,90333,96294978,200,744000000,201000000000.0,3,1943,13534,14427244,451,0,UDP,2,3688,1492,0,0.0,0.0
4,1,90333,96294978,200,744000000,201000000000.0,3,1943,13534,14427244,451,0,UDP,3,3413,3665,0,0.0,0.0


In [10]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

The column "protocol" is symbolic meaning it cannot be handled by the interpreter and is **mapped into a dummy variable** which is understood by the interpreter.

In [11]:
# convert categorial data into dummy variables
X = pd.get_dummies(X)

## 3. Importing Required classes from scikit-learn library

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
X_train.head()

Unnamed: 0,switch,pktcount,bytecount,dur,dur_nsec,tot_dur,flows,packetins,pktperflow,byteperflow,...,Pairflow,port_no,tx_bytes,rx_bytes,tx_kbps,rx_kbps,tot_kbps,Protocol_ICMP,Protocol_TCP,Protocol_UDP
70788,6,68485,3698190,226,764000000,227000000000.0,7,12106,8967,484218,...,1,3,9215077,8265383,275,256.0,531.0,False,True,False
91167,5,923,90454,943,114000000,943000000000.0,9,4942,30,2940,...,1,3,406489737,135917821,2,2.0,4.0,True,False,False
13865,4,63325,67504450,140,629000000,141000000000.0,3,2242,13494,14384604,...,0,2,3660,3926,0,0.0,0.0,False,False,True
86773,2,2,196,2,905000000,2905000000.0,5,4942,0,0,...,1,2,133741305,2564,0,0.0,0.0,True,False,False
12582,4,22331,23268902,70,734000000,70734000000.0,5,1298,9741,10150122,...,0,4,3404,3236,0,0.0,0.0,False,False,True


In [14]:
X_train.shape, y_train.shape

((77879, 21), (77879,))

## 4. Scaling the Data

[Z-Score Normalization](https://towardsai.net/p/machine-learning/machine-learning-standardization-z-score-normalization-with-mathematics) scaling method is used under the hood by the StandardScaler() class.

In [15]:
scaler = StandardScaler()
scaler.fit_transform(X_train, y_train)

array([[ 0.91593663,  0.30502722, -0.70405606, ..., -0.8156974 ,
         1.59670711, -0.68400595],
       [ 0.40393751, -0.99406436, -0.77797943, ...,  1.22594482,
        -0.62628894, -0.68400595],
       [-0.1080616 ,  0.20581002,  0.60334931, ..., -0.8156974 ,
        -0.62628894,  1.46197558],
       ...,
       [ 2.45193397, -0.99283376, -0.77785091, ...,  1.22594482,
        -0.62628894, -0.68400595],
       [-1.64405894,  0.89806151,  1.38972632, ..., -0.8156974 ,
        -0.62628894,  1.46197558],
       [-0.1080616 ,  1.57831464,  2.16247355, ..., -0.8156974 ,
        -0.62628894,  1.46197558]])

## 5. Training the Model

In [16]:
logReg = LogisticRegression(C=0.03,max_iter=10000)
logReg.fit(X_train, y_train)

## 6. Prediction using the trained model

In [17]:
predictions = logReg.predict(X_test)  # Predicted list
predictions[0:5]

array([0, 1, 0, 0, 0], dtype=int64)

## 7. Evaluating the accuracy

In [18]:
accuracy = logReg.score(X_test, y_test)
print(f"Accuracy is {accuracy*100 : 0.2f} %")

Accuracy is  70.17 %
