In [1]:
from IPython.display import display, HTML
# Inject the CSS classes (messagebox / messagelightgreen / messagebrown) that
# the later display(HTML("<h6 class='messagebox ...'>")) banners rely on.
display(HTML("""
<style>
 .messagebox{
     border-radius: 2px;
     padding: 1.25em 1.5em;
     border: 1px solid;
 }
.messagelightgreen{
     border-color: hsl(164deg 95% 38%);
     color: rgb(5 139 102);
     background-color: rgb(236 255 250);
 }
 .messagelightgreen b{
     color:rgb(139 77 5);
 }
 .messagebrown{
     border-color: hsl(35deg 96% 62%);
    color: rgb(143 84 4);
    background-color: rgb(255 245 234);
 }
 .messagebrown b{
     color: rgb(5 139 102);
 }
</style>"""))

In [2]:
import os, re, time, math, tqdm, itertools

In [3]:
import numpy as np

In [4]:
import pandas as pd

In [5]:
import glob

In [6]:
from pathlib import Path

In [7]:
import matplotlib.pyplot as plt

In [8]:
# Global matplotlib look: fivethirtyeight theme plus one batch of font/size
# overrides applied on top of it.
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    'font.size': 14,
    'axes.labelsize': 12,
    'axes.titlesize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 14,
    'figure.figsize': (20,8),
    "ps.useafm": True,
})

In [9]:
import plotly.express as px

In [10]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [11]:
import seaborn as sns

In [12]:
from imblearn.under_sampling import RandomUnderSampler

In [13]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

In [14]:
import lightgbm as lgb

In [15]:
from lightgbm import plot_importance,plot_split_value_histogram

In [16]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier

In [17]:
from sklearn.svm import SVC, LinearSVC

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc

In [20]:
import tensorflow as tf

In [21]:
from tensorflow.keras.layers import Dense, LSTM, Input 

In [22]:
from tensorflow.keras.models import Model, Sequential

In [23]:
from tensorflow.keras.utils import to_categorical

In [24]:
from tensorflow.keras.utils import plot_model

In [25]:
# glob returns matches in arbitrary, filesystem-dependent order. Sorting makes
# the row order of the concatenated frame (and therefore the seeded
# under-sampling and train/test split further down) reproducible across machines.
file_list = sorted(glob.glob("Dataset/*.csv"))

In [26]:
df_list = []

In [None]:
# Rebuild the list from scratch so re-running this cell cannot append the same
# files a second time. low_memory=False parses each file in a single pass, so a
# column gets one consistent dtype instead of the chunk-wise mixed types that
# pandas warns about.
df_list = [pd.read_csv(file_name, low_memory=False) for file_name in file_list]


Columns (85) have mixed types.Specify dtype option on import or set low_memory=False.



In [None]:
# ignore_index=True gives the combined frame one unique 0..n-1 index instead of
# repeating every file's own row labels.
df = pd.concat(df_list, ignore_index=True)

In [None]:
df.head().style.background_gradient(cmap='Spectral')

In [None]:
df.tail(10).style.background_gradient(cmap='coolwarm')

In [None]:
df.shape

In [None]:
# Show the row and column counts of the combined frame as two green banners.
n_rows, n_cols = df.shape
rows_banner = "<h6 class='messagebox messagelightgreen'>No of Rows Available in Dataset <b>{0}</b></h6>"
cols_banner = "<h6 class='messagebox messagelightgreen'>No of Columns Available in Dataset <b>{0}</b></h6>"
display(HTML(rows_banner.format(n_rows)))
display(HTML(cols_banner.format(n_cols)))

In [None]:
df.memory_usage(deep=True)

In [None]:
df.memory_usage(deep=True).sum()

In [None]:
df.columns

In [None]:
df.columns = [col.strip() for col in df.columns]

In [None]:
df.columns

In [None]:
df.info()

In [None]:
# The `mode.use_inf_as_na` option was deprecated in pandas 2.1 and removed in
# pandas 3.0. Converting infinities to NaN explicitly keeps them counted by the
# isna() checks that follow, on every pandas version.
df = df.replace([np.inf, -np.inf], np.nan)

In [None]:
df.isna()

In [None]:
df.isna().sum()

In [None]:
df.duplicated().sum()

In [None]:
df.describe()

In [None]:
label_vc = df["Label"].value_counts()

In [None]:
# Bar chart of the number of rows per label, coloured by label.
label_names = label_vc.index
label_counts = label_vc.values
px.bar(
    label_vc,
    x=label_names,
    y=label_counts,
    color=label_names,
    title="Dataset DDoS Attack Type Count",
    labels={'y': "Attack Count", 'index': "DDoS Attack Name"},
    color_discrete_sequence=px.colors.qualitative.G10,
)

In [None]:
# Only the Label column can hold the "Portmap" class name, so restrict the
# replacement to it instead of scanning (and mutating in place) every column
# of the full frame.
df["Label"] = df["Label"].replace("Portmap", "UDP")

In [None]:
label_vc = df["Label"].value_counts()

In [None]:
# Same per-label bar chart as before, redrawn from the refreshed label counts.
bar_options = dict(
    x=label_vc.index,
    y=label_vc.values,
    color=label_vc.index,
    title="Dataset DDoS Attack Type Count",
    labels={'y': "Attack Count", 'index': "DDoS Attack Name"},
    color_discrete_sequence=px.colors.qualitative.G10,
)
px.bar(label_vc, **bar_options)

In [None]:
# Split the columns by dtype family. 'number' matches every numeric dtype;
# listing only int64/float64 would silently leave out int32/float32/unsigned
# columns and under-report the numeric feature count.
numeric_df = df.select_dtypes(include='number')
object_df = df.select_dtypes(include=['object'])
numeric_cols = numeric_df.columns
object_cols = object_df.columns
print('Numeric Columns: ')
print(numeric_cols, '\n')
print('Object Columns: ')
print(object_cols, '\n')
print('Number of Numeric Features: ', len(numeric_cols))
print('Number of Object Features: ', len(object_cols))

In [None]:
# One brown banner per dtype family, showing how many columns fall in it.
dtype_banners = (
    ("<h6 class='messagebox messagebrown'>Number of Numeric Columns Available in Dataset <b>{0}</b></h6>", numeric_cols),
    ("<h6 class='messagebox messagebrown'>Number of Object Columns Available in Dataset <b>{0}</b></h6>", object_cols),
)
for banner_template, banner_cols in dtype_banners:
    display(HTML(banner_template.format(len(banner_cols))))

In [None]:
object_df.head()

In [None]:
df["SimillarHTTP"].value_counts()

In [None]:
# Count rows per source IP once; the counts feed both the bars and their labels
# (previously value_counts() ran three times over the full frame).
source_ip_counts = df["Source IP"].value_counts()

plt.figure(figsize=(19, 17), dpi=80)
plt.barh(source_ip_counts.index.tolist(), source_ip_counts.to_numpy())

# Bars are drawn in value_counts order, so bar number idx sits at y == idx.
for idx, val in enumerate(source_ip_counts.to_numpy()):
    plt.text(x = val, y = idx-0.2, s = str(val), color='r', size = 13)

plt.xlabel('Number of Requests')
plt.ylabel('IP addres of sender')
plt.title('Number of all reqests')

In [None]:
# Count rows per destination IP once; the counts feed both the bars and their
# labels (previously value_counts() ran three times over the full frame).
destination_ip_counts = df["Destination IP"].value_counts()

plt.figure(figsize=(29, 27), dpi=80)
plt.barh(destination_ip_counts.index.tolist(), destination_ip_counts.to_numpy())

# Bars are drawn in value_counts order, so bar number idx sits at y == idx.
for idx, val in enumerate(destination_ip_counts.to_numpy()):
    plt.text(x = val, y = idx-0.2, s = str(val), color='r', size = 13)

plt.xlabel('Number of Requests From Destination IP')
plt.ylabel('IP addres of Destination')
plt.title('Number of all reqests')

In [None]:
# Requests per source IP: all traffic (green) with the non-BENIGN share drawn
# on top (blue).
all_ip_counts = df["Source IP"].value_counts()
attack_ip_counts = df.loc[df["Label"] != "BENIGN", "Source IP"].value_counts()

# Row of each IP on the categorical y axis; it is fixed by the first barh call.
# The attack subset has its own ordering, so its labels must be placed by IP,
# not by their rank inside the subset.
row_of_ip = {ip: row for row, ip in enumerate(all_ip_counts.index)}

plt.figure(figsize=(12, 17), dpi=80)
plt.barh(all_ip_counts.index.tolist(), all_ip_counts.to_numpy(), color='lawngreen')
plt.barh(attack_ip_counts.index.tolist(), attack_ip_counts.to_numpy(), color='blue')

for ip, val in all_ip_counts.items():
    plt.text(x = val, y = row_of_ip[ip]-0.2, s = str(val), color='r', size = 13)

# White labels annotate the blue (non-BENIGN) bars, so they use the same counts;
# the earlier version printed BENIGN counts next to attack bars.
for ip, val in attack_ip_counts.items():
    plt.text(x = val, y = row_of_ip[ip]-0.2, s = str(val), color='w', size = 13)


plt.xlabel('Number of Requests')
plt.ylabel('IP addres of sender')

plt.title('Number of requests from different IP adress')

In [None]:
# Keep the class names sorted: LabelEncoder assigns integer codes in sorted
# order, so position i of this array must be the class encoded as i when it is
# later used as target_names / axis labels. unique() alone returns the names in
# order of first appearance, which mislabels the classes.
attack_types = np.sort(df["Label"].unique())

In [None]:
attack_types

In [None]:
object_cols = object_cols.tolist()

In [None]:
object_cols.remove("Label")

In [None]:
label_encoder = LabelEncoder()

In [None]:
df['Label']= label_encoder.fit_transform(df['Label'])

In [None]:
df['Label'].unique()

In [None]:
# Non-inplace and tolerant of already-missing columns, so re-running the cell
# is a no-op instead of a KeyError.
df = df.drop(columns=object_cols, errors="ignore")

In [None]:
df.columns

In [None]:
# "Unnamed: 0" only exists when the CSVs were saved with their index; ignore it
# when absent (or already dropped) so the cell is safe to re-run.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
column_list = df.columns.tolist()

In [None]:
len(column_list)

In [None]:
X = df[column_list]

In [None]:
y = df[["Label"]]

In [None]:
rus = RandomUnderSampler(random_state=0)

In [None]:
X, y = rus.fit_resample(X, y)

In [None]:
type(X)

In [None]:
type(y)

In [None]:
df = X

In [None]:
df["Label"] = y

In [None]:
df.head()

In [None]:
label_vc = df["Label"].value_counts()

In [None]:
# label_vc.index holds the integer codes produced by label_encoder, in
# value_counts order. Decode exactly those codes so every bar is named after
# its own class; attack_types is not guaranteed to follow value_counts order.
balanced_label_names = label_encoder.inverse_transform(label_vc.index)
px.bar(label_vc, x=balanced_label_names, y = label_vc.values,
           color=balanced_label_names, title="Dataset DDoS Attack Type Count",
          labels={'y': "Attack Count", 'index': "DDoS Attack Name"}, color_discrete_sequence=px.colors.qualitative.G10)

In [None]:
# Re-run the dtype split on the resampled frame. 'number' matches every numeric
# dtype, so an encoded label or feature stored as int32/float32 is still
# counted (an explicit int64/float64 list would miss it).
numeric_df = df.select_dtypes(include='number')
object_df = df.select_dtypes(include=['object'])
numeric_cols = numeric_df.columns
object_cols = object_df.columns
print('Numeric Columns: ')
print(numeric_cols, '\n')
print('Object Columns: ')
print(object_cols, '\n')
print('Number of Numeric Features: ', len(numeric_cols))
print('Number of Object Features: ', len(object_cols))

In [None]:
# Brown banners with the numeric / object column counts of the resampled frame.
numeric_banner = "<h6 class='messagebox messagebrown'>Number of Numeric Columns Available in Dataset <b>{0}</b></h6>"
object_banner = "<h6 class='messagebox messagebrown'>Number of Object Columns Available in Dataset <b>{0}</b></h6>"
n_numeric = len(numeric_cols)
n_object = len(object_cols)
display(HTML(numeric_banner.format(n_numeric)))
display(HTML(object_banner.format(n_object)))

In [None]:
# LightGBM settings for the multiclass model that is only used to rank features.
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    # Derived from the data instead of a hard-coded 6, so the cell keeps working
    # if the set of attack classes in Dataset/ changes.
    'num_class': int(df["Label"].nunique()),
    'learning_rate': 0.01,
    'min_gain_to_split': 0.2,
    'verbose': 1,
    'num_threads':4,
}

In [None]:
# X was built from column_list, which still contains "Label". Training on it
# hands the target to the model as a feature and makes the importance ranking
# meaningless. Drop it here; errors="ignore" keeps the cell valid if X has
# already been built without the label.
dtrain = lgb.Dataset(
    X.drop(columns=["Label"], errors="ignore"), label=y
)


In [None]:
# Fit the booster for 10 rounds on the prepared LightGBM dataset.
bst = lgb.train(params=params, train_set=dtrain, num_boost_round=10)

In [None]:
def plot_features(booster, figsize, max_num_features=20):
    """Plot the feature-importance chart of a LightGBM booster.

    Parameters
    ----------
    booster : object accepted by ``lightgbm.plot_importance``
        The trained model whose importances are drawn.
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.
    max_num_features : int, default 20
        Maximum number of features shown; the default matches the value that
        used to be hard-coded.

    Returns
    -------
    Whatever ``plot_importance`` returns (the axes the chart was drawn on).
    """
    # The figure handle itself is not needed; only its axes are passed on.
    _, ax = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=ax, max_num_features=max_num_features)

In [None]:
plot_features(bst, (15,15))

In [None]:
df.columns

In [None]:
# Hand-picked subset of columns used as model inputs from here on.
features = [
    "Source Port",
    "Fwd Packet Length Min",
    "Init_Win_bytes_forward",
    "ACK Flag Count",
    "Protocol",
    "Destination Port",
    "Fwd IAT Total",
    "Active Std",
]

In [None]:
X = df[features]

In [None]:
X.head()

In [None]:
scaler = MinMaxScaler()

In [None]:
X = scaler.fit_transform(X)

In [None]:
pd.DataFrame(X, columns=features)

In [None]:
y = df["Label"]

In [None]:
# Split the raw feature columns first and fit the scaler on the training part
# only. X was min-max scaled using the full matrix, so splitting X would let
# test-set minima/maxima leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    df[features], y, train_size=0.70, random_state=2, stratify=y
)
scaler = MinMaxScaler().fit(X_train_raw)
# transform() returns plain arrays, the same type the models received before.
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Seed the forest like the sampler and the split are seeded, so the reported
# accuracy is reproducible on a fresh kernel.
rfc = RandomForestClassifier(random_state=0)

In [None]:
rfc.fit(X_train,y_train)

In [None]:
y_pred=rfc.predict(X_test)

In [None]:
rfc_ac=accuracy_score(y_test, y_pred)*100

In [None]:
display(HTML("<h6 class='messagebox messagelightgreen'>Random Forest Accuracy Score <b>{0}</b></h6>".format(rfc_ac)))

In [None]:
# target_names must follow the encoded label order. label_encoder.classes_ is
# exactly that order (entry i is the class encoded as i), whereas attack_types
# is not guaranteed to be.
print(classification_report(
    y_test, y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))

In [None]:
#X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
#X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

In [None]:
X_train.shape, X_test.shape

In [None]:
len(attack_types)

In [None]:
def create_lstm_model(num_classes=6, hidden_units=50):
    """Build the (uncompiled) neural-network classifier.

    Despite its name, the network has no LSTM layer: it is one ReLU Dense
    hidden layer followed by a softmax output layer.

    Parameters
    ----------
    num_classes : int, default 6
        Width of the softmax output layer; the default is the previously
        hard-coded value.
    hidden_units : int, default 50
        Width of the hidden Dense layer; the default is the previously
        hard-coded value.

    Returns
    -------
    A Keras ``Sequential`` model that still has to be compiled.
    """
    return Sequential([
        Dense(units=hidden_units, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

In [None]:
model = create_lstm_model()

In [None]:
# Adam optimiser, integer-label cross-entropy loss, accuracy as the metric.
compile_options = {
    'optimizer': 'adam',
    'loss': 'sparse_categorical_crossentropy',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [None]:
history = model.fit(X_train, y_train, epochs=300, batch_size=5000,validation_split=0.2)

In [None]:
# Score the network on the held-out split and report loss plus accuracy in percent.
test_results = model.evaluate(X_test, y_test, verbose=1)
test_loss, test_accuracy = test_results[0], test_results[1]
print(f'Test results - Loss: {test_loss} - Accuracy: {test_accuracy*100}%')

In [None]:
# Training vs. validation accuracy per epoch. The val_* series come from the
# validation_split passed to model.fit, not from the test set, so the title
# says "validation"; the legend tells the two curves apart.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title("Plot of accuracy vs epoch for train and validation dataset")
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='lower right')
plt.show()

In [None]:
# Training vs. validation loss per epoch. The val_* series come from the
# validation_split passed to model.fit, not from the test set, so the title
# and legend say "validation".
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title("Plot of loss vs epoch for train and validation dataset")
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_test

In [None]:
y_pred_cm  = np.argmax(y_pred, axis=1)


In [None]:
# target_names must follow the encoded label order. label_encoder.classes_ is
# exactly that order (entry i is the class encoded as i), whereas attack_types
# is not guaranteed to be.
print(classification_report(
    y_test, y_pred_cm,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
))