In [112]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

NUM_OF_ROWS = 25000
ATTACK_NAME = 'DoS'

---

In [113]:
# import the attack sample dataset
dosSamples = pd.read_csv('dos_on_closed_port.csv')
dosSamples

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,3,298.309255,8,1215,298.309255,169.426912,28705.478370,229930.0,257,155.884746,...,162.955351,0,0,0,34.533588,269.389906,34.533588,0.028056,0.003712,0.003228
1,1,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,...,0.000000,0,2,0,0.000068,29330.797203,0.000068,0.000068,0.000068,0.000000
2,1,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,...,0.000000,0,0,0,0.011388,175.622485,0.011388,0.011388,0.011388,0.000000
3,1,425.965517,0,1428,425.965517,581.003204,337564.722949,5198.0,1399,519.800000,...,0.000000,2,28,0,0.685858,42.282804,0.685858,0.142152,0.024495,0.049216
4,0,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,...,0.000000,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,1,36.000000,31,41,36.000000,5.000000,25.000000,31.0,31,31.000000,...,0.000000,0,0,0,0.072331,27.650680,0.072331,0.072331,0.072331,0.000000
447,1,101.000000,101,101,101.000000,0.000000,0.000000,101.0,101,101.000000,...,0.000000,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
448,2,36.000000,31,41,36.000000,5.000000,25.000000,62.0,31,31.000000,...,62.000000,0,0,0,3.988903,1.002782,3.988903,3.873262,1.329634,1.798616
449,1,125.000000,125,125,125.000000,0.000000,0.000000,125.0,125,125.000000,...,0.000000,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [114]:
# get all the relevant attack rows from the attack sample dataset
dosSamples = dosSamples[dosSamples['SYN Flag Count'] >= 50]

In [115]:
# print some general information about the attack samples
print(f'Dataset Shape: {dosSamples.shape}')
dosSamples.reset_index()

Dataset Shape: (20, 26)


Unnamed: 0,index,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,10,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,509,0,0,2.099056,242.489957,2.099056,0.832011,0.004132,0.050063
1,30,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,510,0,0,2.317981,220.019059,2.317981,0.895859,0.004554,0.053371
2,51,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,511,0,0,2.187265,233.625081,2.187265,0.85209,0.004289,0.052811
3,74,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,507,0,0,1.295714,391.290011,1.295714,0.791319,0.002561,0.035777
4,96,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,515,0,0,2.245945,229.302145,2.245945,0.828894,0.00437,0.05065
5,116,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,514,0,0,2.116229,242.884861,2.116229,0.832079,0.004125,0.050332
6,144,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,506,0,0,2.259497,223.943655,2.259497,1.027771,0.004474,0.05809
7,159,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,508,0,0,2.247484,226.030533,2.247484,1.023343,0.004433,0.058794
8,188,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,511,0,0,2.294056,222.749581,2.294056,1.020703,0.004498,0.058266
9,206,1,0.0,0,0,0.0,0.0,0.0,0.0,0,...,0.0,511,0,0,2.154165,237.21488,2.154165,0.833956,0.004224,0.051365


In [116]:
# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = dosSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values
columnsToGather #left with all columns that the values are not 0 (be know for a fact that the data is consistant and there are not missing values in the rows)

['Number of Ports',
 'SYN Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Total',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [117]:
hulkSamples = dosSamples.head(10)
goldeneyeSamples = dosSamples[10:]

## Hulk Samples Dataset

In [118]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (float(hulkSamples[col].min() * 0.85), float(hulkSamples[col].max() * 1.15)) for col in columnsToGather}
MinMaxDict['Number of Ports'] = (1, 1)
MinMaxDict

{'Number of Ports': (1, 1),
 'SYN Flag Count': (430.09999999999997, 592.25),
 'Flow Duration': (1.1013570189476012, 2.6656781554222104),
 'Packets Per Second': (187.0162003563554, 449.98351258846293),
 'IAT Total': (1.1013570189476012, 2.6656781554222104),
 'IAT Max': (0.6726210594177245, 1.1819366455078124),
 'IAT Mean': (0.00217659489910585, 0.00523708871399246),
 'IAT Std': (0.030410694611119716, 0.06761263560541449)}

In [119]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['SYN Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (1, 1),
 'SYN Flag Count': (430, 592),
 'Flow Duration': (1.1013570189476012, 2.6656781554222104),
 'Packets Per Second': (187.0162003563554, 449.98351258846293),
 'IAT Total': (1.1013570189476012, 2.6656781554222104),
 'IAT Max': (0.6726210594177245, 1.1819366455078124),
 'IAT Mean': (0.00217659489910585, 0.00523708871399246),
 'IAT Std': (0.030410694611119716, 0.06761263560541449)}

In [120]:
# creating an empty dataframe before adding values to it
hulkDataset = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(hulkSamples.columns))), columns=hulkSamples.columns)
hulkDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
# adding zeros to all columns that should not have any values
zeroColumns = [col for col in hulkSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    hulkDataset[col] = int(0)
zeroColumns

['Average Packet Size',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Fwd Segment Size Avg',
 'Bwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'ACK Flag Count',
 'RST Flag Count']

In [122]:
# adding the rest of the attack feature values to the dataset at random based on the smaple data
hulkDataset['Number of Ports'] = np.full(shape=NUM_OF_ROWS, fill_value=1, dtype=int)
hulkDataset['SYN Flag Count'] = np.random.randint(MinMaxDict['SYN Flag Count'][0], MinMaxDict['SYN Flag Count'][1], NUM_OF_ROWS)

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
firstCorrelation = ['SYN Flag Count', 'Flow Duration']
independent_col = hulkSamples[firstCorrelation[0]].values.reshape(-1, 1) 
dependent_cols = hulkSamples[firstCorrelation[1]].values.reshape(-1, 1) 

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]
factor = float(scaling_factors.flatten()[0])

for index, row in hulkDataset.iterrows():
    # calculate a random small delta of the factor for adding some randomness
    delta = random.uniform(factor * 0.1, factor * 0.2) # select a dantom
    updatedFactor = factor + random.choice([-1, 1]) * delta

    # calculate the value we want to add into the dataset in the given row
    for col in ['Flow Duration', 'IAT Total']:
        hulkDataset.loc[index, col] = row['SYN Flag Count'] * updatedFactor

In [123]:
hulkDataset

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,1,0,0,0,0,0,0,0,0,0,...,0,520,0,0,2.462459,0.0,2.462459,0.0,0.0,0.0
1,1,0,0,0,0,0,0,0,0,0,...,0,586,0,0,2.061910,0.0,2.061910,0.0,0.0,0.0
2,1,0,0,0,0,0,0,0,0,0,...,0,455,0,0,2.214281,0.0,2.214281,0.0,0.0,0.0
3,1,0,0,0,0,0,0,0,0,0,...,0,439,0,0,1.521603,0.0,1.521603,0.0,0.0,0.0
4,1,0,0,0,0,0,0,0,0,0,...,0,569,0,0,2.109094,0.0,2.109094,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,1,0,0,0,0,0,0,0,0,0,...,0,484,0,0,2.368068,0.0,2.368068,0.0,0.0,0.0
24996,1,0,0,0,0,0,0,0,0,0,...,0,442,0,0,2.149381,0.0,2.149381,0.0,0.0,0.0
24997,1,0,0,0,0,0,0,0,0,0,...,0,438,0,0,1.600161,0.0,1.600161,0.0,0.0,0.0
24998,1,0,0,0,0,0,0,0,0,0,...,0,517,0,0,2.434701,0.0,2.434701,0.0,0.0,0.0


In [124]:
# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
secondCorrelation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']
independent_col = dosSamples[secondCorrelation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = dosSamples[secondCorrelation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,float(factor)) for name, factor in zip(secondCorrelation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Packets Per Second', 28.830840308166604)
('IAT Max', 0.3666720155316033)
('IAT Mean', 0.0016812305332870267)
('IAT Std', 0.01659843824523716)


In [125]:
durationToPacketsCorr = [x * y for x, y in zip(hulkSamples['Flow Duration'].values, hulkSamples['Packets Per Second'].values)]
durationToPacketsCorr = float(np.mean(durationToPacketsCorr))
durationToPacketsCorr

510.2

In [126]:
#iterating over all rows we need to add values
for index, row in hulkDataset.iterrows():
    for col, factor in scaling_factors: 
        # calculate a random small delta of the factor for adding some randomness
        if col == 'Packets Per Second':
            delta = random.uniform(durationToPacketsCorr * 0.1, durationToPacketsCorr * 0.2) # select a delta
            updatedFactor = durationToPacketsCorr + delta
            hulkDataset.loc[index, col] = updatedFactor / row['Flow Duration']
        else:
            if col == 'IAT Std':
                delta = random.uniform(factor * 0.55, factor * 0.8)
                updatedFactor = factor + random.choices([-1, 1], weights=[2, 1], k=1)[0] * delta
            else:
                delta = random.uniform(factor * 0.1, factor * 0.2) # select a delta
                updatedFactor = factor + random.choice([-1, 1]) * delta
            hulkDataset.loc[index, col] = row['Flow Duration'] * updatedFactor

In [127]:
# adding a label to the dataset
hulkDataset['Label'] = ATTACK_NAME

In [128]:
hulkSamples

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
10,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,509,0,0,2.099056,242.489957,2.099056,0.832011,0.004132,0.050063
30,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,510,0,0,2.317981,220.019059,2.317981,0.895859,0.004554,0.053371
51,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,511,0,0,2.187265,233.625081,2.187265,0.85209,0.004289,0.052811
74,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,507,0,0,1.295714,391.290011,1.295714,0.791319,0.002561,0.035777
96,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,515,0,0,2.245945,229.302145,2.245945,0.828894,0.00437,0.05065
116,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,514,0,0,2.116229,242.884861,2.116229,0.832079,0.004125,0.050332
144,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,506,0,0,2.259497,223.943655,2.259497,1.027771,0.004474,0.05809
159,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,508,0,0,2.247484,226.030533,2.247484,1.023343,0.004433,0.058794
188,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,511,0,0,2.294056,222.749581,2.294056,1.020703,0.004498,0.058266
206,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,511,0,0,2.154165,237.21488,2.154165,0.833956,0.004224,0.051365


In [129]:
x = hulkDataset[hulkDataset['SYN Flag Count'] > 500]
x = x[x['SYN Flag Count'] < 520]
x[0:10]

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std,Label
5,1,0,0,0,0,0,0,0,0,0,...,507,0,0,2.410857,253.288356,2.410857,0.712582,0.003426,0.011304,DoS
11,1,0,0,0,0,0,0,0,0,0,...,509,0,0,1.835289,308.189505,1.835289,0.803572,0.002542,0.009436,DoS
13,1,0,0,0,0,0,0,0,0,0,...,515,0,0,2.36591,258.486401,2.36591,1.003408,0.003494,0.011689,DoS
47,1,0,0,0,0,0,0,0,0,0,...,504,0,0,2.385886,245.120097,2.385886,1.011484,0.003602,0.011652,DoS
48,1,0,0,0,0,0,0,0,0,0,...,514,0,0,2.550492,236.765647,2.550492,0.76779,0.00497,0.015473,DoS
53,1,0,0,0,0,0,0,0,0,0,...,516,0,0,2.394539,248.079347,2.394539,0.706696,0.004796,0.014403,DoS
60,1,0,0,0,0,0,0,0,0,0,...,519,0,0,1.816127,320.577155,1.816127,0.754397,0.003498,0.049975,DoS
66,1,0,0,0,0,0,0,0,0,0,...,516,0,0,1.878225,316.777123,1.878225,0.788119,0.003577,0.007247,DoS
85,1,0,0,0,0,0,0,0,0,0,...,507,0,0,1.729303,329.019347,1.729303,0.538178,0.00234,0.008329,DoS
86,1,0,0,0,0,0,0,0,0,0,...,509,0,0,1.727525,347.551394,1.727525,0.547263,0.002609,0.008249,DoS


In [130]:
hulkDataset

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std,Label
0,1,0,0,0,0,0,0,0,0,0,...,520,0,0,2.462459,237.957881,2.462459,0.723773,0.003364,0.010271,DoS
1,1,0,0,0,0,0,0,0,0,0,...,586,0,0,2.061910,287.488662,2.061910,0.903525,0.003082,0.014825,DoS
2,1,0,0,0,0,0,0,0,0,0,...,455,0,0,2.214281,269.651455,2.214281,0.682438,0.004186,0.015051,DoS
3,1,0,0,0,0,0,0,0,0,0,...,439,0,0,1.521603,389.067564,1.521603,0.643477,0.002091,0.006721,DoS
4,1,0,0,0,0,0,0,0,0,0,...,569,0,0,2.109094,278.115565,2.109094,0.880922,0.004023,0.062987,DoS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,1,0,0,0,0,0,0,0,0,0,...,484,0,0,2.368068,256.536960,2.368068,1.015942,0.004499,0.069065,DoS
24996,1,0,0,0,0,0,0,0,0,0,...,442,0,0,2.149381,266.870941,2.149381,0.709225,0.004175,0.062894,DoS
24997,1,0,0,0,0,0,0,0,0,0,...,438,0,0,1.600161,363.356004,1.600161,0.511137,0.003011,0.046634,DoS
24998,1,0,0,0,0,0,0,0,0,0,...,517,0,0,2.434701,234.205465,2.434701,1.002276,0.003290,0.014537,DoS


In [131]:
hulkDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,...,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,510.17956,0.0,0.0,2.120891,285.685338,2.120891,0.778334,0.003568,0.027096
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,47.069709,0.0,0.0,0.380452,51.983447,0.380452,0.184747,0.000845,0.023311
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,430.0,0.0,0.0,1.433413,191.429201,1.433413,0.424677,0.001958,0.004834
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,469.0,0.0,0.0,1.80126,240.878081,1.80126,0.640388,0.002934,0.010015
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,510.0,0.0,0.0,2.083445,281.472395,2.083445,0.760116,0.003485,0.013606
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,551.0,0.0,0.0,2.436406,325.544225,2.436406,0.891339,0.004083,0.049549
max,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,591.0,0.0,0.0,2.947164,423.286252,2.947164,1.293877,0.005868,0.086875


## Goldeneye Samples Dataset

In [132]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (float(goldeneyeSamples[col].min() * 0.85), float(goldeneyeSamples[col].max() * 1.15)) for col in columnsToGather}
MinMaxDict['Number of Ports'] = (1, 1)
MinMaxDict

# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['SYN Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (1, 1),
 'SYN Flag Count': (512, 704),
 'Flow Duration': (3.511592209339142, 9.859183549880981),
 'Packets Per Second': (60.7765842849364, 168.41007290857817),
 'IAT Total': (3.511592209339142, 9.859183549880981),
 'IAT Max': (0.9012591481208801, 5.05841863155365),
 'IAT Mean': (0.005813894386323005, 0.01610977704228914),
 'IAT Std': (0.04190411015533854, 0.20705873768088517)}

In [133]:
# creating an empty dataframe before adding values to it
goldeneyeDataset = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(goldeneyeSamples.columns))), columns=goldeneyeSamples.columns)
goldeneyeDataset.head(3)

# adding zeros to all columns that should not have any values
zeroColumns = [col for col in goldeneyeSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    goldeneyeDataset[col] = int(0)
zeroColumns

['Average Packet Size',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Fwd Segment Size Avg',
 'Bwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'ACK Flag Count',
 'RST Flag Count']

In [134]:
# adding the rest of the attack feature values to the dataset at random based on the smaple data
goldeneyeDataset['Number of Ports'] = np.full(shape=NUM_OF_ROWS, fill_value=1, dtype=int)
goldeneyeDataset['SYN Flag Count'] = np.random.randint(MinMaxDict['SYN Flag Count'][0], MinMaxDict['SYN Flag Count'][1], NUM_OF_ROWS)

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
firstCorrelation = ['SYN Flag Count', 'Flow Duration']
independent_col = goldeneyeSamples[firstCorrelation[0]].values.reshape(-1, 1) 
dependent_cols = goldeneyeSamples[firstCorrelation[1]].values.reshape(-1, 1) 

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]
factor = float(scaling_factors.flatten()[0])

for index, row in goldeneyeDataset.iterrows():
    # calculate a random small delta of the factor for adding some randomness
    delta = random.uniform(factor * 0.1, factor * 0.2) # select a dantom
    updatedFactor = factor + random.choice([-1, 1]) * delta

    # calculate the value we want to add into the dataset in the given row
    for col in ['Flow Duration', 'IAT Total']:
        goldeneyeDataset.loc[index, col] = row['SYN Flag Count'] * updatedFactor

In [135]:
# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
secondCorrelation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']
independent_col = dosSamples[secondCorrelation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = dosSamples[secondCorrelation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,float(factor)) for name, factor in zip(secondCorrelation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)
    
durationToPacketsCorr = [x * y for x, y in zip(goldeneyeSamples['Flow Duration'].values, goldeneyeSamples['Packets Per Second'].values)]
durationToPacketsCorr = float(np.mean(durationToPacketsCorr))
durationToPacketsCorr

('Packets Per Second', 28.830840308166604)
('IAT Max', 0.3666720155316033)
('IAT Mean', 0.0016812305332870267)
('IAT Std', 0.01659843824523716)


608.5

In [136]:
#iterating over all rows we need to add values
for index, row in goldeneyeDataset.iterrows():
    for col, factor in scaling_factors: 
        # calculate a random small delta of the factor for adding some randomness
        if col == 'Packets Per Second':
            delta = random.uniform(durationToPacketsCorr * 0.1, durationToPacketsCorr * 0.2) # select a delta
            updatedFactor = durationToPacketsCorr + delta
            goldeneyeDataset.loc[index, col] = updatedFactor / row['Flow Duration']
        else:
            if col == 'IAT Std':
                delta = random.uniform(factor * 0.55, factor * 0.8)
                updatedFactor = factor + random.choices([-1, 1], weights=[2, 1], k=1)[0] * delta
            else:
                delta = random.uniform(factor * 0.1, factor * 0.2) # select a delta
                updatedFactor = factor + random.choice([-1, 1]) * delta
            goldeneyeDataset.loc[index, col] = row['Flow Duration'] * updatedFactor
            
# adding a label to the dataset
goldeneyeDataset['Label'] = ATTACK_NAME

In [137]:
goldeneyeSamples

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
232,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,610,0,0,5.542298,110.062652,5.542298,1.652793,0.009101,0.075181
257,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,603,0,0,5.690761,105.961222,5.690761,2.867497,0.009453,0.117805
279,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,605,0,0,4.61412,131.11926,4.61412,1.12908,0.007639,0.051523
302,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,605,0,0,4.131285,146.443542,4.131285,1.060305,0.00684,0.049299
324,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,611,0,0,5.122786,119.271042,5.122786,1.362833,0.008398,0.060642
345,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,611,0,0,6.088568,100.352004,6.088568,2.312779,0.009981,0.096719
366,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,606,0,0,5.658547,107.094632,5.658547,1.969987,0.009353,0.085119
392,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,613,0,0,6.801349,90.12918,6.801349,1.857952,0.011113,0.089024
416,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,613,0,0,8.573203,71.501864,8.573203,4.398625,0.014009,0.180051
439,1,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,608,0,0,5.06431,120.05584,5.06431,1.097122,0.008343,0.051962


In [138]:
goldeneyeDataset

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std,Label
0,1,0,0,0,0,0,0,0,0,0,...,694,0,0,5.369312,132.491696,5.369312,2.318164,0.007536,0.039299,DoS
1,1,0,0,0,0,0,0,0,0,0,...,597,0,0,6.658063,100.659962,6.658063,2.891693,0.012753,0.033305,DoS
2,1,0,0,0,0,0,0,0,0,0,...,669,0,0,7.293305,94.235748,7.293305,2.241670,0.014143,0.208607,DoS
3,1,0,0,0,0,0,0,0,0,0,...,701,0,0,5.418209,134.505731,5.418209,2.209862,0.008146,0.032307,DoS
4,1,0,0,0,0,0,0,0,0,0,...,681,0,0,5.396446,126.571122,5.396446,1.588967,0.008162,0.029960,DoS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,1,0,0,0,0,0,0,0,0,0,...,538,0,0,5.877292,115.698274,5.877292,2.488357,0.008625,0.041033,DoS
24996,1,0,0,0,0,0,0,0,0,0,...,564,0,0,6.137707,116.127557,6.137707,2.655081,0.012309,0.040393,DoS
24997,1,0,0,0,0,0,0,0,0,0,...,696,0,0,5.876105,122.272146,5.876105,1.837717,0.008345,0.031808,DoS
24998,1,0,0,0,0,0,0,0,0,0,...,691,0,0,5.224980,133.351826,5.224980,1.567833,0.010399,0.031983,DoS


In [139]:
dosDataset = pd.concat([hulkDataset, goldeneyeDataset], axis=0)
dosDataset = dosDataset.sample(frac=1, random_state=42).reset_index(drop=True)
print(dosDataset.shape)

# save the dataset
dosDataset.to_csv('dos_closed_port_dataset.csv', index=False)

(50000, 27)


---