In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

NUM_OF_ROWS = 7500     # number of synthetic attack rows to generate
ATTACK_NAME = 'DoS'    # value written into the 'Label' column of the generated dataset

# Seed the stdlib and NumPy global random generators so that
# Restart Kernel -> Run All regenerates the same synthetic dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [4]:
# Show every column when a DataFrame is displayed (no column limit)
pd.set_option('display.max_columns', None)

---

In [5]:
# Import the attack sample dataset from CSV into dosSamples.
# The bare expression on the last line renders the frame as the cell output.
dosSamples = pd.read_csv('dos_hulk_samples.csv')
dosSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1,98.000000,93,103,5.00000,25.000000,39,39,39.000000,39,0.000000,49,49.000000,49,0.000000,0.000000,0.0,0.000000,0,0,0,0.071406,28.008801,0.071406,0.071406,0.000000
1,0,115.000000,115,115,0.00000,0.000000,0,0,0.000000,0,0.000000,81,81.000000,81,0.000000,0.000000,0.0,0.000000,0,0,0,8.089847,1.112506,1.024562,1.011231,0.034083
2,1,127.500000,92,163,35.50000,1260.250000,174,58,58.000000,58,0.000000,129,129.000000,129,0.000000,0.000000,0.0,0.000000,0,0,0,0.004730,1268.502646,0.003781,0.000946,0.001426
3,1,92.000000,92,92,0.00000,0.000000,348,58,58.000000,58,0.000000,0,0.000000,0,0.000000,0.000000,0.0,0.000000,0,0,0,0.004415,1359.065939,0.004412,0.000883,0.001764
4,1,113.448176,54,490,119.39751,14255.765358,792575,456,79.448176,20,119.397510,0,0.000000,0,0.000000,46.704391,0.0,105.578127,2819,5896,1941,3.921313,2544.045958,0.080743,0.000393,0.001147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,1,363.903226,94,1362,438.60625,192375.442248,5892,1308,368.250000,40,479.145007,1305,247.666667,49,381.032749,0.000000,0.0,392.800000,0,0,0,25.923333,1.195834,13.771338,0.864111,2.772981
208,1,127.500000,92,163,35.50000,1260.250000,348,58,58.000000,58,0.000000,129,129.000000,129,0.000000,0.000000,0.0,116.000000,0,0,0,15.059787,0.796824,14.958914,1.369072,4.297560
209,1,92.000000,92,92,0.00000,0.000000,580,58,58.000000,58,0.000000,0,0.000000,0,0.000000,0.000000,0.0,145.000000,0,0,0,15.060015,0.664010,14.957140,1.673335,4.696633
210,1,698.000000,698,698,0.00000,0.000000,4648,664,664.000000,664,0.000000,0,0.000000,0,0.000000,0.000000,0.0,1162.000000,0,0,0,6.758404,1.035747,2.047888,1.126401,0.749836


In [6]:
# NOTE(review): disabled one-off clean-up, kept for reference only. Presumably this is
# how an earlier export was normalised (drop 'IAT Total', rename 'Average Packet Size'
# to 'Average Packet Length', write a new CSV) -- confirm, then delete this cell or
# move it into a separate preparation notebook.
# dosSamples.drop('IAT Total', axis=1, inplace=True)
# dosSamples = dosSamples.rename(columns={'Average Packet Size': 'Average Packet Length'})
# dosSamples.to_csv('dos_hulk_new_samples2.csv', index=False)

In [7]:
# Keep only the sample rows whose 'RST Flag Count' is above 20; these are the
# rows used as attack samples by the following cells.
isAttackRow = dosSamples['RST Flag Count'].gt(20)
dosSamples = dosSamples.loc[isAttackRow]

In [8]:
# print some general information about the attack samples
print(f'Dataset Shape: {dosSamples.shape}')
# Display only: reset_index() returns a new frame in which the old row labels
# become an 'index' column. The result is not assigned, so dosSamples itself
# keeps its original (filtered) row labels.
dosSamples.reset_index()

Dataset Shape: (22, 26)


Unnamed: 0,index,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4,1,113.448176,54,490,119.39751,14255.765358,792575,456,79.448176,20,119.39751,0,0.0,0,0.0,46.704391,0.0,105.578127,2819,5896,1941,3.921313,2544.045958,0.080743,0.000393,0.001147
1,6,1,111.399578,54,492,118.554999,14055.287716,770745,458,77.399578,20,118.554999,0,0.0,0,0.0,45.408817,0.0,97.897244,2411,5932,2245,4.324354,2302.771733,0.041172,0.000434,0.001078
2,17,1,114.086974,54,490,122.233724,14941.083217,799268,456,80.086974,20,122.233724,0,0.0,0,0.0,48.782766,0.0,102.260491,1790,6418,2440,3.992678,2499.575369,0.052491,0.0004,0.000896
3,23,1,115.067034,54,488,123.774303,15320.077971,809049,454,81.067034,20,123.774303,0,0.0,0,0.0,50.176052,0.0,95.881607,1477,6596,2554,5.106365,1954.423561,0.100327,0.000512,0.001462
4,29,1,113.853212,54,488,122.117633,14912.716396,798053,454,79.853212,20,122.117633,0,0.0,0,0.0,48.652892,0.0,103.724071,1767,6383,2478,4.209765,2374.004139,0.047036,0.000421,0.000916
5,32,1,117.572573,54,489,125.957802,15865.367856,834890,455,83.572573,20,125.957802,0,0.0,0,0.0,52.133133,0.0,103.893728,1520,6990,2320,4.390662,2275.283341,0.070261,0.00044,0.001078
6,36,1,116.129676,54,490,122.376793,14976.079363,818915,456,82.129676,20,122.376793,0,0.0,0,0.0,49.261559,0.0,104.147908,2318,6829,1677,10.304537,967.632019,4.278063,0.001034,0.044491
7,42,1,110.064299,54,489,116.821153,13647.181842,758285,455,76.064299,20,116.821153,0,0.0,0,0.0,43.86127,0.0,104.432585,2632,5751,2172,3.477777,2866.486457,0.037094,0.000349,0.000705
8,46,1,106.599037,54,490,111.687568,12474.112822,723885,456,72.599037,20,111.687568,0,0.0,0,0.0,39.632233,0.0,85.092865,3146,5531,1760,9.178418,1086.352771,2.364858,0.000921,0.027002
9,54,1,104.720705,54,489,109.30536,11947.661642,701903,455,70.720705,20,109.30536,0,0.0,0,0.0,38.039194,0.0,84.070308,3115,5297,1848,16.610516,597.513051,1.989192,0.001674,0.032656


In [9]:
# Decide which columns need synthesized values for the attack dataset:
# treat every 0 as missing, then drop only the columns that are missing in
# *every* attack row (how="all"), i.e. the columns that are zero throughout.
zeroAsMissing = dosSamples.replace(0, np.nan)
nonZeroFrame = zeroAsMissing.dropna(axis=1, how="all")
columnsToGather = nonZeroFrame.columns.tolist()
# Per the author, the samples contain no genuinely missing values, so what is
# left is every column holding at least one non-zero value.
columnsToGather

['Number of Ports',
 'Average Packet Length',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Std',
 'Packet Length Variance',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min',
 'Fwd Packet Length Std',
 'Fwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'SYN Flag Count',
 'ACK Flag Count',
 'RST Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [10]:
# Approximate value range per column, stored as {column: (low, high)}:
# the observed minimum is scaled by 0.85 and the observed maximum by 1.15.
MinMaxDict = {}
for col in columnsToGather:
    observed = dosSamples[col]
    MinMaxDict[col] = (observed.min() * 0.85, observed.max() * 1.15)
# 'Number of Ports' is pinned to exactly 1 instead of being widened.
MinMaxDict['Number of Ports'] = (1, 1)
MinMaxDict

{'Number of Ports': (1, 1),
 'Average Packet Length': (np.float64(84.26142361541321),
  np.float64(135.20845845845844)),
 'Packet Length Min': (np.float64(45.9), np.float64(62.099999999999994)),
 'Packet Length Max': (np.float64(413.95), np.float64(565.8)),
 'Packet Length Std': (np.float64(85.69969727608765),
  np.float64(144.85147217068953)),
 'Packet Length Variance': (np.float64(8640.515427309489),
  np.float64(18245.17303479656)),
 'Total Length of Fwd Packet': (np.float64(446822.05),
  np.float64(960123.4999999999)),
 'Fwd Packet Length Max': (np.float64(385.05), np.float64(526.6999999999999)),
 'Fwd Packet Length Mean': (np.float64(55.36142361541321),
  np.float64(96.10845845845844)),
 'Fwd Packet Length Min': (np.float64(17.0), np.float64(23.0)),
 'Fwd Packet Length Std': (np.float64(85.69969727608765),
  np.float64(144.85147217068953)),
 'Fwd Segment Size Avg': (np.float64(26.95029736092182),
  np.float64(59.9531031031031)),
 'Subflow Fwd Bytes': (np.float64(60.07364130434783)

In [11]:
# The flag counters are whole numbers, but scaling their bounds by 0.85 / 1.15
# produced floats; truncate the bounds of those columns back to int.
intColumns = ['SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for key in list(MinMaxDict):
    if key not in intColumns:
        continue
    MinMaxDict[key] = tuple(map(int, MinMaxDict[key]))
MinMaxDict

{'Number of Ports': (1, 1),
 'Average Packet Length': (np.float64(84.26142361541321),
  np.float64(135.20845845845844)),
 'Packet Length Min': (np.float64(45.9), np.float64(62.099999999999994)),
 'Packet Length Max': (np.float64(413.95), np.float64(565.8)),
 'Packet Length Std': (np.float64(85.69969727608765),
  np.float64(144.85147217068953)),
 'Packet Length Variance': (np.float64(8640.515427309489),
  np.float64(18245.17303479656)),
 'Total Length of Fwd Packet': (np.float64(446822.05),
  np.float64(960123.4999999999)),
 'Fwd Packet Length Max': (np.float64(385.05), np.float64(526.6999999999999)),
 'Fwd Packet Length Mean': (np.float64(55.36142361541321),
  np.float64(96.10845845845844)),
 'Fwd Packet Length Min': (np.float64(17.0), np.float64(23.0)),
 'Fwd Packet Length Std': (np.float64(85.69969727608765),
  np.float64(144.85147217068953)),
 'Fwd Segment Size Avg': (np.float64(26.95029736092182),
  np.float64(59.9531031031031)),
 'Subflow Fwd Bytes': (np.float64(60.07364130434783)

### Creating the dataset

In [12]:
# Allocate the synthetic dataset: NUM_OF_ROWS rows of 0.0 with the same column
# layout as the attack samples. Later cells fill the columns in.
blankValues = np.zeros(shape=(NUM_OF_ROWS, dosSamples.shape[1]))
dosDataset = pd.DataFrame(data=blankValues, columns=dosSamples.columns)
dosDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Columns that are zero in every attack sample (i.e. not in columnsToGather)
# stay zero in the synthetic data; store them as integer zeros.
gatherSet = set(columnsToGather)
zeroColumns = list(filter(lambda name: name not in gatherSet, dosSamples.columns))
for zeroCol in zeroColumns:
    dosDataset[zeroCol] = 0
zeroColumns

['Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Bwd Segment Size Avg']

In [14]:
# Preview the first rows after the zero-only columns were reassigned
dosDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculating the correlation between columns

### First group

In [15]:
# Fit a proportional (no-intercept) relation from 'SYN Flag Count' to each of
# the other flag counters in the attack samples, so that synthetic ACK / RST
# counts can be derived from a synthetic SYN count.
firstCorrelation = ['SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
driverName, targetNames = firstCorrelation[0], firstCorrelation[1:]
independent_col = dosSamples[driverName].to_numpy().reshape(-1, 1)  # 'SYN Flag Count' as an (n, 1) column
dependent_cols = dosSamples[targetNames].to_numpy()  # one column per dependent counter

# Least squares with a single regressor yields one multiplier per dependent column.
fitResult = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)

# Pair every dependent column name with its multiplier.
scaling_factors = list(zip(targetNames, fitResult[0].ravel()))
for val in scaling_factors:
    print(val)

('ACK Flag Count', np.float64(1.9518394723650978))
('RST Flag Count', np.float64(0.6629111435788864))


In [16]:
# Draw the driver column 'SYN Flag Count' as random integers.
# MinMaxDict already holds the sample range scaled by 0.85 / 1.15; the extra
# 0.85 / 1.15 applied here widens that range a second time.
# NOTE(review): confirm the second widening is intentional.
# The bounds are truncated to int explicitly instead of passing floats to randint.
synLow = int(MinMaxDict['SYN Flag Count'][0] * 0.85)
synHigh = int(MinMaxDict['SYN Flag Count'][1] * 1.15)  # exclusive upper bound
dosDataset['SYN Flag Count'] = np.random.randint(synLow, synHigh, NUM_OF_ROWS)

# Derive the dependent flag counters from SYN one whole column at a time.
# This replaces a per-row iterrows() / .loc loop but keeps its sampling scheme:
#   value = trunc(SYN * (factor +/- delta)),  delta ~ Uniform(0.1, 0.2) * factor,
#   with the sign chosen 50/50 for every row and column.
synCounts = dosDataset['SYN Flag Count'].to_numpy()
for col, factor in scaling_factors:
    delta = np.random.uniform(factor * 0.1, factor * 0.2, NUM_OF_ROWS)
    sign = np.random.choice([-1, 1], size=NUM_OF_ROWS)
    # np.trunc drops the fractional part (like int() for these positive values)
    # while keeping the column as floats, matching the previous scalar writes.
    dosDataset[col] = np.trunc(synCounts * (factor + sign * delta))

In [17]:
# Preview: only the three flag-count columns have been filled at this point
dosDataset.head(10)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2697,6008.0,1453.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,3575,8089.0,1989.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2387,3917.0,1785.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,3753,6344.0,2135.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,3583,8131.0,2751.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,4727,10793.0,3661.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2365,5349.0,1274.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2217,5160.0,1682.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,4678,10173.0,2739.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2620,5834.0,1524.0,0.0,0.0,0.0,0.0,0.0


### Second group

In [18]:
# Sample 'Flow Duration' uniformly for every synthetic row. The stored
# (low, high) range is stretched once more here: low * 0.9 and high * 1.05.
durationLow, durationHigh = MinMaxDict['Flow Duration']
randValues = np.random.uniform(low=durationLow * 0.9, high=durationHigh * 1.05, size=NUM_OF_ROWS)

# Write the sampled durations into the dataset
dosDataset['Flow Duration'] = randValues

In [19]:
# Fit a proportional (no-intercept) relation from 'Flow Duration' to the rate
# and inter-arrival-time columns of the attack samples.
secondCorrelation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']
driverName, targetNames = secondCorrelation[0], secondCorrelation[1:]
independent_col = dosSamples[driverName].to_numpy().reshape(-1, 1)  # 'Flow Duration' as an (n, 1) column
dependent_cols = dosSamples[targetNames].to_numpy()  # one column per dependent feature

# Least squares with a single regressor yields one multiplier per dependent column.
fitResult = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)

# Pair every dependent column name with its multiplier.
# The 'Packets Per Second' multiplier is printed here, but the generation loop
# further down uses durationToPacketsCorr for that column instead.
scaling_factors = list(zip(targetNames, fitResult[0].ravel()))
for val in scaling_factors:
    print(val)

('Packets Per Second', np.float64(29.47374840796544))
('IAT Max', np.float64(0.28959035080285817))
('IAT Mean', np.float64(0.00010689125614908151))
('IAT Std', np.float64(0.0035898854422333083))


In [20]:
# Mean of ('Flow Duration' * 'Packets Per Second') over the attack samples.
# Presumably this is the average number of packets per flow -- TODO confirm units.
flowDurations = dosSamples['Flow Duration'].values
packetRates = dosSamples['Packets Per Second'].values
durationToPacketsCorr = np.mean(flowDurations * packetRates)
durationToPacketsCorr

np.float64(9827.272727272728)

In [21]:
# Fill the columns that depend on 'Flow Duration' one whole column at a time.
# This replaces a per-row iterrows() / .loc loop but keeps its sampling scheme.
flowDuration = dosDataset['Flow Duration'].to_numpy()

for col, factor in scaling_factors:
    if col == 'Packets Per Second':
        # Inverse relation: rate = (duration * rate product) / duration.
        # The fitted factor is ignored for this column; the mean product
        # durationToPacketsCorr is jittered by 3.5%-17.5% in a random direction.
        delta = np.random.uniform(durationToPacketsCorr * 0.035, durationToPacketsCorr * 0.175, NUM_OF_ROWS)
        sign = np.random.choice([-1, 1], size=NUM_OF_ROWS)
        dosDataset[col] = (durationToPacketsCorr + sign * delta) / flowDuration
        continue

    if col == 'IAT Std' or col == 'IAT Max':
        # Wide jitter (70%-95% of the factor); shrinking is twice as likely as growing.
        delta = np.random.uniform(factor * 0.7, factor * 0.95, NUM_OF_ROWS)
        sign = np.random.choice([-1, 1], size=NUM_OF_ROWS, p=[2 / 3, 1 / 3])
    else:
        # Small jitter (5%-15% of the factor) in a random direction.
        delta = np.random.uniform(factor * 0.05, factor * 0.15, NUM_OF_ROWS)
        sign = np.random.choice([-1, 1], size=NUM_OF_ROWS)
    dosDataset[col] = flowDuration * (factor + sign * delta)

In [22]:
# Summary statistics of the synthetic dataset as filled in so far
dosDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3220.763333,6282.6584,2138.924,26.000244,616.729886,5.551433,0.002781,0.067976
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1241.387657,2636.478574,900.962418,13.528833,614.195515,7.243577,0.001479,0.089434
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1066.0,1672.0,571.0,2.663051,165.05821,0.046628,0.000247,0.000613
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2158.0,4105.0,1397.0,14.375145,260.103733,0.835051,0.001528,0.010456
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3236.5,6167.5,2084.5,25.763718,380.083051,1.903218,0.002748,0.023077
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4284.0,8151.75,2799.0,37.848072,684.698132,7.795317,0.003983,0.095651
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5387.0,12601.0,4276.0,49.567866,4139.166715,27.762177,0.006061,0.345853


In [23]:
# Spot-check: up to the first 40 synthetic rows with 5 < 'Flow Duration' < 10
x = dosDataset.loc[dosDataset['Flow Duration'].lt(10)]
x.loc[x['Flow Duration'].gt(5)].iloc[:40]

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,3753,6344.0,2135.0,5.346412,1562.581408,0.421987,0.000526,0.002632
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,3583,8131.0,2751.0,7.185809,1315.390184,4.025162,0.000689,0.007479
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2268,3663.0,1688.0,6.020076,1846.570256,3.090459,0.000568,0.004778
27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,1757,2908.0,1288.0,6.521642,1665.785754,3.64958,0.00061,0.042324
61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,1970,4404.0,1484.0,9.295798,907.17849,0.753773,0.000921,0.002862
65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,1286,2114.0,1018.0,5.402131,1744.952041,2.964594,0.00051,0.001308
77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,5000,10816.0,3874.0,6.316354,1480.523416,3.329116,0.00059,0.041167
83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,1362,2323.0,781.0,7.5018,1458.080108,3.804169,0.000695,0.003832
86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,4646,7981.0,3601.0,6.021674,1870.834233,3.333521,0.000715,0.00189
88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,4906,10719.0,2883.0,8.680165,1303.845105,4.674996,0.000995,0.002393


In [24]:
# Spot-check: rows 20-39 (by position) of those with 25 < 'Flow Duration' < 50.5
x = dosDataset.loc[dosDataset['Flow Duration'].lt(50.5)]
x.loc[x['Flow Duration'].gt(25)].iloc[20:40]

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,1424,2460.0,786.0,48.134234,184.351554,2.463256,0.004818,0.318886
34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2624,5652.0,1447.0,28.911274,314.334038,16.097407,0.002855,0.007634
36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,4346,9947.0,2342.0,41.976836,274.507414,23.50627,0.00476,0.010232
37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,4033,9192.0,2357.0,34.725566,303.980217,18.654441,0.004093,0.025814
38,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,1807,3936.0,1055.0,38.012583,277.05543,18.902276,0.004505,0.017261
40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2090,3587.0,1646.0,26.788248,406.293697,0.913172,0.003207,0.175449
41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,3328,7195.0,1833.0,39.966599,205.88717,22.486377,0.003959,0.013398
42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2682,4603.0,2048.0,45.293643,192.024926,24.675469,0.005266,0.277904
44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2661,4255.0,2073.0,28.908334,360.46863,2.243714,0.003269,0.190927
45,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2946,6672.0,2332.0,28.825544,370.57892,1.475443,0.002892,0.019168


In [25]:
# Spot-check: the first 20 synthetic rows with 'Flow Duration' below 6
dosDataset.loc[dosDataset['Flow Duration'].lt(6)].head(20)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,3753,6344.0,2135.0,5.346412,1562.581408,0.421987,0.000526,0.002632
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,4727,10793.0,3661.0,3.326944,2508.777182,0.050328,0.000311,0.001466
33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,5166,8528.0,2872.0,4.00419,2203.518316,2.233861,0.000382,0.003313
35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2235,3691.0,1723.0,3.348121,2599.560523,0.261768,0.000325,0.002996
43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,1365,2207.0,1022.0,3.315962,3101.123493,0.196377,0.000307,0.002841
65,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,1286,2114.0,1018.0,5.402131,1744.952041,2.964594,0.00051,0.001308
94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,1289,2184.0,687.0,4.224302,2461.571476,0.290878,0.000517,0.00137
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,2610,4520.0,1481.0,5.350366,1562.43892,0.443348,0.000625,0.032954
137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,4247,7235.0,2509.0,3.077062,2998.804279,0.251161,0.000364,0.001256
161,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,3687,8122.0,2852.0,5.709842,1420.479937,0.292291,0.000538,0.001663


### Third group

In [26]:
# Fit a proportional (no-intercept) relation from 'Average Packet Length' to
# the remaining packet-length features of the attack samples.
# NOTE(review): the list reuses the name firstCorrelation from the first group.
firstCorrelation = ['Average Packet Length', 'Packet Length Std', 'Packet Length Variance', 'Total Length of Fwd Packet',
                    'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Fwd Segment Size Avg', 'Subflow Fwd Bytes']
driverName, targetNames = firstCorrelation[0], firstCorrelation[1:]
independent_col = dosSamples[driverName].to_numpy().reshape(-1, 1)  # 'Average Packet Length' as an (n, 1) column
dependent_cols = dosSamples[targetNames].to_numpy()  # one column per dependent feature

# Least squares with a single regressor yields one multiplier per dependent column.
fitResult = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)

# Pair every dependent column name with its multiplier.
scaling_factors = list(zip(targetNames, fitResult[0].ravel()))
for val in scaling_factors:
    print(val)

('Packet Length Std', np.float64(1.053481965176044))
('Packet Length Variance', np.float64(122.15926993685173))
('Total Length of Fwd Packet', np.float64(6798.663741787297))
('Fwd Packet Length Mean', np.float64(0.6901644819681063))
('Fwd Packet Length Std', np.float64(1.053481965176044))
('Fwd Segment Size Avg', np.float64(0.39446115648863966))
('Subflow Fwd Bytes', np.float64(0.8079142101649653))


In [27]:
# Draw the driver column 'Average Packet Length' uniformly.
# MinMaxDict already holds the sample range scaled by 0.85 / 1.15; the extra
# 0.85 / 1.15 applied here widens that range a second time.
avgLow = MinMaxDict['Average Packet Length'][0] * 0.85
avgHigh = MinMaxDict['Average Packet Length'][1] * 1.15
dosDataset['Average Packet Length'] = np.random.uniform(avgLow, avgHigh, NUM_OF_ROWS)

# Derive the dependent packet-length features one whole column at a time.
# This replaces a per-row iterrows() / .loc loop but keeps its sampling scheme:
#   value = driver * (factor +/- delta),  delta ~ Uniform(0.1, 0.2) * factor,
#   with the sign chosen 50/50 for every row and column.
# NOTE(review): every column gets its own independent jitter. In the attack
# samples 'Packet Length Variance' equals 'Packet Length Std' squared and
# 'Fwd Packet Length Std' equals 'Packet Length Std'; the generated rows do
# not keep those relations. Confirm whether that is acceptable.
avgPacketLength = dosDataset['Average Packet Length'].to_numpy()
for col, factor in scaling_factors:
    delta = np.random.uniform(factor * 0.1, factor * 0.2, NUM_OF_ROWS)
    sign = np.random.choice([-1, 1], size=NUM_OF_ROWS)
    dosDataset[col] = avgPacketLength * (factor + sign * delta)

In [28]:
x = dosDataset[dosDataset['Average Packet Length'] > 114]
x[x['Average Packet Length'] < 120].tail(20)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
7208,0.0,118.98851,0.0,0.0,149.128252,16165.846141,655094.670687,0.0,69.195063,0.0,138.017112,0,0,0,0,51.917493,0,85.545,4593,10128.0,3543.0,17.572032,596.340745,1.323266,0.002061,0.017582
7220,0.0,114.704425,0.0,0.0,100.071327,16471.697669,901426.583541,0.0,64.465667,0.0,106.260136,0,0,0,0,36.638479,0,107.878575,2525,4028.0,1387.0,40.178659,269.460151,2.19873,0.003659,0.01935
7232,0.0,114.985075,0.0,0.0,138.511665,16714.072228,876468.479189,0.0,65.758352,0.0,143.227091,0,0,0,0,53.006379,0,103.046076,1552,2444.0,1213.0,32.331807,320.391616,0.941621,0.003955,0.020933
7249,0.0,119.478642,0.0,0.0,148.197773,12688.937526,723003.7895,0.0,97.722966,0.0,139.289856,0,0,0,0,40.827279,0,86.33804,1326,3023.0,721.0,25.702232,439.192043,12.867375,0.002359,0.023299
7259,0.0,115.072219,0.0,0.0,104.284451,12071.796866,937081.439692,0.0,65.792642,0.0,99.234958,0,0,0,0,52.935639,0,109.76203,1640,3811.0,1270.0,2.862544,3136.328331,1.417223,0.000351,0.018718
7262,0.0,119.116284,0.0,0.0,143.505431,12124.661031,698093.177182,0.0,95.474189,0.0,111.656463,0,0,0,0,41.714851,0,112.793216,3929,6802.0,3102.0,34.144869,332.854292,1.140466,0.003153,0.029168
7269,0.0,114.65545,0.0,0.0,139.668303,15725.264151,624212.378131,0.0,70.950004,0.0,101.003059,0,0,0,0,53.866734,0,104.118591,4558,7990.0,2564.0,7.370891,1547.372351,4.104249,0.000711,0.003794
7311,0.0,119.435364,0.0,0.0,141.375149,17499.599631,689407.970486,0.0,69.845484,0.0,150.901708,0,0,0,0,53.025821,0,77.731589,3273,5373.0,1903.0,15.409046,697.88607,0.467959,0.001444,0.01331
7329,0.0,116.592295,0.0,0.0,142.582149,12285.294555,946582.443348,0.0,96.529375,0.0,100.396918,0,0,0,0,39.063798,0,105.859072,2591,4120.0,1453.0,22.637738,376.754689,0.500144,0.002709,0.019086
7399,0.0,114.544796,0.0,0.0,139.945339,15438.313014,897293.742914,0.0,93.925858,0.0,133.859156,0,0,0,0,54.027543,0,105.067195,3502,6105.0,2766.0,20.013002,528.777677,0.998869,0.001973,0.127231


### Independent Columns

In [29]:
# These columns are not derived from a driver column: each one is sampled
# uniformly on its own from its stored range (stretched once more by
# 0.85 / 1.15) and truncated to an integer.
independantColumns = ['Packet Length Min', 'Packet Length Max', 'Fwd Packet Length Max', 'Fwd Packet Length Min']

for col in independantColumns:
    low, high = MinMaxDict[col]
    sampled = np.random.uniform(low * 0.85, high * 1.15, NUM_OF_ROWS)
    dosDataset[col] = sampled.astype(int)

dosDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,125.734524,40,434,109.215060,17325.263430,7.178628e+05,489,100.177220,23,149.255557,0,0,0,0,55.003533,0,115.157316,2697,6008.0,1453.0,26.101379,436.340808,0.470351,0.003174,0.016770
1,0.0,139.693999,39,600,120.931127,20333.767081,1.045740e+06,467,85.458329,16,131.793950,0,0,0,0,65.699896,0,99.029166,3575,8089.0,1989.0,40.033668,281.039285,22.156566,0.004907,0.022966
2,0.0,141.037015,43,387,133.154151,14299.036958,1.129013e+06,506,107.508314,20,172.974327,0,0,0,0,64.871719,0,93.913428,2387,3917.0,1785.0,34.375691,239.024572,1.503691,0.003397,0.021642
3,0.0,87.214116,69,389,105.358358,12109.392455,6.920213e+05,572,69.100868,25,80.472088,0,0,0,0,40.091871,0,80.905249,3753,6344.0,2135.0,5.346412,1562.581408,0.421987,0.000526,0.002632
4,0.0,81.041873,56,627,73.820046,8514.162811,4.794257e+05,469,62.797516,21,99.141644,0,0,0,0,27.678255,0,73.968437,3583,8131.0,2751.0,7.185809,1315.390184,4.025162,0.000689,0.007479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0.0,78.022965,39,598,73.450632,8173.724490,4.434808e+05,413,59.294096,15,68.540832,0,0,0,0,26.423003,0,54.359915,5161,8079.0,2965.0,7.596529,1128.616558,0.125381,0.000914,0.047769
7496,0.0,105.328649,47,647,95.253263,11422.596968,8.069240e+05,437,59.718967,18,125.599829,0,0,0,0,35.373037,0,75.844115,3489,5841.0,2654.0,4.874747,1821.982721,0.195885,0.000557,0.001800
7497,0.0,80.360378,69,618,70.771714,11504.992591,6.209339e+05,520,62.919459,16,100.146842,0,0,0,0,28.489858,0,75.118196,4068,6744.0,3045.0,41.447755,226.000984,2.588513,0.003845,0.010707
7498,0.0,78.199068,48,449,70.279622,11261.623119,6.015547e+05,379,48.148848,23,91.280378,0,0,0,0,27.584787,0,52.904558,5336,8841.0,4051.0,28.431820,377.429821,2.004969,0.002813,0.186452


### Adding labels and verifying the dataset

In [30]:
# Every synthetic row gets 'Number of Ports' = 1 (integer column) and the
# attack name as its 'Label'.
dosDataset['Number of Ports'] = np.ones(NUM_OF_ROWS, dtype=int)
dosDataset['Label'] = ATTACK_NAME

In [31]:
# Display the dataset now that 'Number of Ports' and 'Label' are set
dosDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std,Label
0,1,125.734524,40,434,109.215060,17325.263430,7.178628e+05,489,100.177220,23,149.255557,0,0,0,0,55.003533,0,115.157316,2697,6008.0,1453.0,26.101379,436.340808,0.470351,0.003174,0.016770,DoS
1,1,139.693999,39,600,120.931127,20333.767081,1.045740e+06,467,85.458329,16,131.793950,0,0,0,0,65.699896,0,99.029166,3575,8089.0,1989.0,40.033668,281.039285,22.156566,0.004907,0.022966,DoS
2,1,141.037015,43,387,133.154151,14299.036958,1.129013e+06,506,107.508314,20,172.974327,0,0,0,0,64.871719,0,93.913428,2387,3917.0,1785.0,34.375691,239.024572,1.503691,0.003397,0.021642,DoS
3,1,87.214116,69,389,105.358358,12109.392455,6.920213e+05,572,69.100868,25,80.472088,0,0,0,0,40.091871,0,80.905249,3753,6344.0,2135.0,5.346412,1562.581408,0.421987,0.000526,0.002632,DoS
4,1,81.041873,56,627,73.820046,8514.162811,4.794257e+05,469,62.797516,21,99.141644,0,0,0,0,27.678255,0,73.968437,3583,8131.0,2751.0,7.185809,1315.390184,4.025162,0.000689,0.007479,DoS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,1,78.022965,39,598,73.450632,8173.724490,4.434808e+05,413,59.294096,15,68.540832,0,0,0,0,26.423003,0,54.359915,5161,8079.0,2965.0,7.596529,1128.616558,0.125381,0.000914,0.047769,DoS
7496,1,105.328649,47,647,95.253263,11422.596968,8.069240e+05,437,59.718967,18,125.599829,0,0,0,0,35.373037,0,75.844115,3489,5841.0,2654.0,4.874747,1821.982721,0.195885,0.000557,0.001800,DoS
7497,1,80.360378,69,618,70.771714,11504.992591,6.209339e+05,520,62.919459,16,100.146842,0,0,0,0,28.489858,0,75.118196,4068,6744.0,3045.0,41.447755,226.000984,2.588513,0.003845,0.010707,DoS
7498,1,78.199068,48,449,70.279622,11261.623119,6.015547e+05,379,48.148848,23,91.280378,0,0,0,0,27.584787,0,52.904558,5336,8841.0,4051.0,28.431820,377.429821,2.004969,0.002813,0.186452,DoS


In [32]:
# Summary statistics of dosSamples, shown as a baseline to compare against
# dosDataset.describe() in the next cell.
dosSamples.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0,22.0
mean,1.0,109.541236,54.0,489.272727,115.336644,13344.256197,743456.272727,455.272727,75.541236,20.0,115.336644,0.0,0.0,0.0,0.0,43.067228,0.0,88.236516,2693.954545,5715.863636,1978.409091,14.33449,1273.183737,3.416355,0.001497,0.042374
std,0.0,4.723316,0.0,1.031957,6.610693,1503.443014,66624.894863,1.031957,4.723316,0.0,6.610693,0.0,0.0,0.0,0.0,5.371901,0.0,12.216166,687.545218,696.524932,410.466594,11.577552,890.026163,4.604678,0.001294,0.055395
min,1.0,99.131087,54.0,487.0,100.823173,10165.312267,525673.0,453.0,65.131087,20.0,100.823173,0.0,0.0,0.0,0.0,31.706232,0.0,70.674872,1477.0,4081.0,1100.0,3.477777,196.600053,0.037094,0.000349,0.000705
25%,1.0,106.7009,54.0,489.0,111.827633,12505.478401,710284.5,455.0,72.7009,20.0,111.827633,0.0,0.0,0.0,0.0,39.767573,0.0,78.726481,2327.5,5252.0,1742.75,4.569588,465.317607,0.072882,0.000458,0.001226
50%,1.0,109.95587,54.0,489.0,116.146897,13490.556399,757735.0,455.0,75.95587,20.0,116.146897,0.0,0.0,0.0,0.0,43.302417,0.0,86.390381,2894.0,5704.0,1904.0,8.535311,1174.495592,1.109663,0.000856,0.015098
75%,1.0,113.082372,54.0,490.0,120.085389,14420.658297,788886.75,456.0,79.082372,20.0,120.085389,0.0,0.0,0.0,0.0,46.672949,0.0,101.169679,3118.0,6270.25,2301.25,21.099838,2195.068396,5.98709,0.002152,0.080042
max,1.0,117.572573,54.0,492.0,125.957802,15865.367856,834890.0,458.0,83.572573,20.0,125.957802,0.0,0.0,0.0,0.0,52.133133,0.0,105.578127,4075.0,6990.0,2662.0,41.052888,2866.486457,16.314272,0.005087,0.200255


In [33]:
# Summary statistics of dosDataset; compare with dosSamples.describe() from
# the previous cell. describe() summarises only numeric columns by default,
# so the string 'Label' column does not appear in the output.
dosDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0
mean,1.0,113.457384,54.724267,500.856,119.386698,13826.526564,770700.9,464.403867,78.315811,19.930133,119.768884,0.0,0.0,0.0,0.0,44.573425,0.0,91.302838,3220.763333,6282.6584,2138.924,26.000244,616.729886,5.551433,0.002781,0.067976
std,0.0,24.379657,9.394552,86.010593,31.669118,3709.376226,204992.9,79.657694,21.028627,3.481523,31.791087,0.0,0.0,0.0,0.0,11.84968,0.0,24.386801,1241.387657,2636.478574,900.962418,13.528833,614.195515,7.243577,0.001479,0.089434
min,1.0,71.643063,39.0,351.0,60.721014,7015.521841,390895.5,327.0,39.789581,14.0,60.723343,0.0,0.0,0.0,0.0,22.863334,0.0,46.43736,1066.0,1672.0,571.0,2.663051,165.05821,0.046628,0.000247,0.000613
25%,1.0,92.390188,46.0,427.0,95.049415,10945.403609,613240.4,396.0,62.051669,17.0,95.474609,0.0,0.0,0.0,0.0,35.420475,0.0,72.420116,2158.0,4105.0,1397.0,14.375145,260.103733,0.835051,0.001528,0.010456
50%,1.0,113.087817,55.0,501.0,116.651541,13516.834114,755216.8,464.0,76.257121,20.0,116.958505,0.0,0.0,0.0,0.0,43.513607,0.0,89.101661,3236.5,6167.5,2084.5,25.763718,380.083051,1.903218,0.002748,0.023077
75%,1.0,134.534394,63.0,573.0,138.89192,16156.989089,896635.2,532.0,91.811777,23.0,139.61929,0.0,0.0,0.0,0.0,52.046032,0.0,106.720708,4284.0,8151.75,2799.0,37.848072,684.698132,7.795317,0.003983,0.095651
max,1.0,155.489352,71.0,650.0,196.226608,22727.161315,1260510.0,605.0,128.388881,26.0,196.395373,0.0,0.0,0.0,0.0,73.362299,0.0,150.638421,5387.0,12601.0,4276.0,49.567866,4139.166715,27.762177,0.006061,0.345853


In [None]:
# save the dataset
# NOTE: the export below is commented out, so running this cell writes nothing.
# Uncomment it to write dosDataset to 'dos_hulk_dataset_new.csv' without the
# index column.
# dosDataset.to_csv('dos_hulk_dataset_new.csv', index=False)

---