In [366]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

NUM_OF_ROWS = 25000
ATTACK_NAME = 'DoS'

In [367]:
pd.set_option('display.max_columns', None)  # Show all columns

---

In [368]:
# import the attack sample dataset
dosSamples = pd.read_csv('dos_hulk_samples.csv')
dosSamples

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,1,122.647059,8,254,122.647059,87.032746,7574.698962,432.0,52,43.200000,254,142.024390,8,86.207206,43.200000,142.024390,54.00,0,0,0,6.996509,7.289350,6.996509,0.999475,0.139930,0.245731
1,1,374.448276,0,1448,374.448276,564.178230,318297.074911,5198.0,1448,433.166667,1428,333.000000,0,534.155408,433.166667,333.000000,0.00,2,28,0,0.851866,34.042913,0.851866,0.171232,0.030424,0.063792
2,1,207.750000,0,1428,207.750000,417.408099,174229.520833,137.0,98,27.400000,1428,336.571429,0,507.738363,27.400000,336.571429,0.00,0,12,0,0.657084,18.262506,0.657084,0.249103,0.059735,0.097944
3,4,554.875000,29,1300,554.875000,593.930181,352753.059375,20535.0,1300,933.409091,479,92.222222,29,136.814374,933.409091,92.222222,1026.75,0,0,0,6.548384,6.108377,6.548384,2.381877,0.167907,0.477337
4,1,36.000000,31,41,36.000000,5.000000,25.000000,31.0,31,31.000000,41,41.000000,41,0.000000,31.000000,41.000000,0.00,0,0,0,0.084193,23.754946,0.084193,0.084193,0.084193,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,1,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.00,0,0,0,0.004064,984.231843,0.004064,0.003923,0.001355,0.001817
484,1,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.00,0,0,0,0.004338,461.039187,0.004338,0.004338,0.004338,0.000000
485,1,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.00,0,0,0,0.004057,492.954575,0.004057,0.004057,0.004057,0.000000
486,0,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,0,0.000000,0,0.000000,0.000000,0.000000,0.00,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [369]:
# get all the relevant attack rows from the attack sample dataset
dosSamples = dosSamples[dosSamples['RST Flag Count'] > 20]

In [370]:
# print some general information about the attack samples
print(f'Dataset Shape: {dosSamples.shape}')
dosSamples.reset_index() 

Dataset Shape: (24, 26)


Unnamed: 0,index,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,5,1,45.420035,0,423,45.420035,117.949181,13912.009258,446161.0,423,45.420035,0,0.0,0,0.0,45.420035,0.0,59.10983,3137,5714,1712,4.714153,2083.725304,4.714153,0.023284,0.00048,0.000922
1,17,1,46.393486,0,424,46.393486,119.411478,14259.101086,458646.0,424,46.393486,0,0.0,0,0.0,46.393486,0.0,59.402409,3166,5950,1588,4.842849,2041.360358,4.842849,0.025724,0.00049,0.000994
2,28,1,44.484143,0,423,44.484143,117.367183,13775.055603,437635.0,423,44.484143,0,0.0,0,0.0,44.484143,0.0,50.320225,3217,5673,1656,5.183131,1898.080531,5.183131,0.699923,0.000527,0.007128
3,46,1,44.893343,0,424,44.893343,118.219382,13975.822296,442379.0,424,44.893343,0,0.0,0,0.0,44.893343,0.0,57.601432,3110,5475,1867,4.983406,1977.362539,4.983406,0.026082,0.000506,0.001039
4,55,1,41.791982,0,424,41.791982,114.141141,13028.200132,408642.0,424,41.791982,0,0.0,0,0.0,41.791982,0.0,41.843334,3407,5238,1706,7.90737,1236.567934,7.90737,2.201838,0.000809,0.024074
5,64,1,41.529954,0,423,41.529954,113.896237,12972.352751,400681.0,423,41.529954,0,0.0,0,0.0,41.529954,0.0,41.564419,3383,5254,1629,9.195835,1049.170617,9.195835,2.554215,0.000953,0.028577
6,81,1,38.396019,0,425,38.396019,109.545382,12000.190683,379967.0,425,38.396019,0,0.0,0,0.0,38.396019,0.0,38.450415,3609,5350,1522,6.783164,1458.906193,6.783164,0.891328,0.000686,0.00965
7,87,1,35.254274,0,423,35.254274,105.911676,11217.283077,344399.0,423,35.254274,0,0.0,0,0.0,35.254274,0.0,35.297632,4171,5020,1080,18.218136,536.223908,18.218136,4.609304,0.001865,0.058279
8,134,1,30.93498,0,424,30.93498,100.479038,10096.037022,132742.0,424,30.93498,0,0.0,0,0.0,30.93498,0.0,57.8901,1822,2128,452,4.387097,978.095564,4.387097,0.955754,0.001023,0.020035
9,172,1,31.931266,0,423,31.931266,101.006373,10202.287451,285242.0,423,31.931266,0,0.0,0,0.0,31.931266,0.0,41.708144,3936,4584,564,26.208671,340.841394,26.208671,8.908422,0.002934,0.113825


In [371]:
# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = dosSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values
columnsToGather #left with all columns that the values are not 0 (be know for a fact that the data is consistant and there are not missing values in the rows)

['Number of Ports',
 'Average Packet Size',
 'Packet Length Max',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'SYN Flag Count',
 'ACK Flag Count',
 'RST Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Total',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [372]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (dosSamples[col].min() * 0.85, dosSamples[col].max() * 1.15) for col in columnsToGather}
MinMaxDict['Number of Ports'] = (1, 1)
MinMaxDict

{'Number of Ports': (1, 1),
 'Average Packet Size': (20.97921723300971, 53.9475732090384),
 'Packet Length Max': (357.0, 488.74999999999994),
 'Packet Length Mean': (20.97921723300971, 53.9475732090384),
 'Packet Length Std': (76.62023264438389, 138.01145733708282),
 'Packet Length Variance': (6906.65888291707, 16562.7498750482),
 'Total Length of Fwd Packet': (112830.7, 532408.6),
 'Fwd Packet Length Max': (357.0, 488.74999999999994),
 'Fwd Packet Length Mean': (20.97921723300971, 53.9475732090384),
 'Fwd Segment Size Avg': (20.97921723300971, 53.9475732090384),
 'Subflow Fwd Bytes': (21.34182098765432, 68.31277036653282),
 'SYN Flag Count': (1548.7, 5321.049999999999),
 'ACK Flag Count': (1808.8, 6842.499999999999),
 'RST Flag Count': (384.2, 3057.85),
 'Flow Duration': (3.7290323495864865, 44.06797716617584),
 'Packets Per Second': (212.5888366664272, 2396.28409958007),
 'IAT Total': (3.7290323495864865, 44.06797716617584),
 'IAT Max': (0.019791364669799725, 24.826610791683194),
 'I

In [373]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (1, 1),
 'Average Packet Size': (20.97921723300971, 53.9475732090384),
 'Packet Length Max': (357.0, 488.74999999999994),
 'Packet Length Mean': (20.97921723300971, 53.9475732090384),
 'Packet Length Std': (76.62023264438389, 138.01145733708282),
 'Packet Length Variance': (6906.65888291707, 16562.7498750482),
 'Total Length of Fwd Packet': (112830.7, 532408.6),
 'Fwd Packet Length Max': (357.0, 488.74999999999994),
 'Fwd Packet Length Mean': (20.97921723300971, 53.9475732090384),
 'Fwd Segment Size Avg': (20.97921723300971, 53.9475732090384),
 'Subflow Fwd Bytes': (21.34182098765432, 68.31277036653282),
 'SYN Flag Count': (1548, 5321),
 'ACK Flag Count': (1808, 6842),
 'RST Flag Count': (384, 3057),
 'Flow Duration': (3.7290323495864865, 44.06797716617584),
 'Packets Per Second': (212.5888366664272, 2396.28409958007),
 'IAT Total': (3.7290323495864865, 44.06797716617584),
 'IAT Max': (0.019791364669799725, 24.826610791683194),
 'IAT Mean': (0.00040796478249536, 0.0

### Creating the dataset

In [374]:
# creating an empty dataframe before adding values to it
dosDataset = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(dosSamples.columns))), columns=dosSamples.columns)
dosDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [375]:
# adding zeros to all columns that should not have any values
zeroColumns = [col for col in dosSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    dosDataset[col] = int(0)
zeroColumns

['Packet Length Min',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Bwd Segment Size Avg']

In [376]:
dosDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Calculating the correct corelation between columns

In [377]:
# inserting values that are the same across multiple columns
sameValueColumns = ['Average Packet Size', 'Packet Length Mean', 'Fwd Packet Length Mean', 'Fwd Segment Size Avg']
sameValueColumns2 = ['Packet Length Max', 'Fwd Packet Length Max']

# Generate random values for the first group of columns
randValues = np.random.uniform(MinMaxDict['Average Packet Size'][0]*0.85, MinMaxDict['Average Packet Size'][1]*1.15, size=NUM_OF_ROWS)

# Assign the same random value across all columns in 'sameValueColumns'
for col in sameValueColumns:
    dosDataset[col] = randValues

# Generate random values for the second group of columns
randValues2 = np.random.uniform(MinMaxDict['Packet Length Max'][0]*0.85, MinMaxDict['Packet Length Max'][1]*1.15, size=NUM_OF_ROWS)

# Assign the same random value across all columns in 'sameValueColumns2'
for col in sameValueColumns2:
    dosDataset[col] = randValues2

In [378]:
dosDataset

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,0.0,43.719346,0,540.854225,43.719346,0.0,0.0,0.0,540.854225,43.719346,0,0,0,0,43.719346,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,37.072393,0,429.572400,37.072393,0.0,0.0,0.0,429.572400,37.072393,0,0,0,0,37.072393,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,60.500871,0,385.000121,60.500871,0.0,0.0,0.0,385.000121,60.500871,0,0,0,0,60.500871,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,28.152962,0,338.680325,28.152962,0.0,0.0,0.0,338.680325,28.152962,0,0,0,0,28.152962,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,37.538500,0,450.066899,37.538500,0.0,0.0,0.0,450.066899,37.538500,0,0,0,0,37.538500,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,0.0,55.901842,0,405.745259,55.901842,0.0,0.0,0.0,405.745259,55.901842,0,0,0,0,55.901842,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24996,0.0,41.801725,0,336.658991,41.801725,0.0,0.0,0.0,336.658991,41.801725,0,0,0,0,41.801725,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24997,0.0,21.284191,0,516.601925,21.284191,0.0,0.0,0.0,516.601925,21.284191,0,0,0,0,21.284191,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24998,0.0,58.038950,0,414.652789,58.038950,0.0,0.0,0.0,414.652789,58.038950,0,0,0,0,58.038950,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### First group

In [379]:
# finding the correlation between the SYN Flag Count column to the rest of the columns in order to create new data
firstCorrelation = ['SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
independent_col = dosSamples[firstCorrelation[0]].values.reshape(-1, 1) #column 'SYN Flag Count'
dependent_cols = dosSamples[firstCorrelation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(firstCorrelation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('ACK Flag Count', 1.3844076416190647)
('RST Flag Count', 0.40640186091498165)


In [380]:
dosDataset['SYN Flag Count'] = np.random.randint(MinMaxDict['SYN Flag Count'][0]*0.85, MinMaxDict['SYN Flag Count'][1]*1.15, NUM_OF_ROWS)

for index, row in dosDataset.iterrows():
    for col, factor in scaling_factors: #iterating over all rows we need to add values
        # calculate the value we want to add into the dataset in the given row
        delta = random.uniform(factor * 0.1, factor * 0.2) # select a delta
        updatedFactor = factor + random.choice([-1, 1]) * delta
        dosDataset.loc[index, col] = int(row['SYN Flag Count'] * updatedFactor)

In [381]:
dosDataset.head(10)

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,0.0,43.719346,0,540.854225,43.719346,0.0,0.0,0.0,540.854225,43.719346,0,0,0,0,43.719346,0,0.0,4579,5249.0,2211.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,37.072393,0,429.5724,37.072393,0.0,0.0,0.0,429.5724,37.072393,0,0,0,0,37.072393,0,0.0,5429,6478.0,1958.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,60.500871,0,385.000121,60.500871,0.0,0.0,0.0,385.000121,60.500871,0,0,0,0,60.500871,0,0.0,5883,7183.0,1995.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,28.152962,0,338.680325,28.152962,0.0,0.0,0.0,338.680325,28.152962,0,0,0,0,28.152962,0,0.0,3593,5843.0,1231.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,37.5385,0,450.066899,37.5385,0.0,0.0,0.0,450.066899,37.5385,0,0,0,0,37.5385,0,0.0,5656,6813.0,1901.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,28.510701,0,530.706874,28.510701,0.0,0.0,0.0,530.706874,28.510701,0,0,0,0,28.510701,0,0.0,6067,7190.0,2046.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,30.779909,0,491.246308,30.779909,0.0,0.0,0.0,491.246308,30.779909,0,0,0,0,30.779909,0,0.0,1628,2534.0,757.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,23.380545,0,437.577178,23.380545,0.0,0.0,0.0,437.577178,23.380545,0,0,0,0,23.380545,0,0.0,3978,6126.0,1426.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,40.663931,0,445.935367,40.663931,0.0,0.0,0.0,445.935367,40.663931,0,0,0,0,40.663931,0,0.0,2123,3513.0,702.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,36.766259,0,478.801167,36.766259,0.0,0.0,0.0,478.801167,36.766259,0,0,0,0,36.766259,0,0.0,1368,1532.0,655.0,0.0,0.0,0.0,0.0,0.0,0.0


### Second group

In [382]:
# Generate random values for the first group of columns
randValues = np.random.uniform(MinMaxDict['Flow Duration'][0]*0.9, MinMaxDict['Flow Duration'][1]*1.05, size=NUM_OF_ROWS)

# Assign the same random value across all columns in `sameValueColumns`
for col in ['Flow Duration', 'IAT Total']:
    dosDataset[col] = randValues

In [383]:
# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
secondCorrelation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']
independent_col = dosSamples[secondCorrelation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = dosSamples[secondCorrelation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(secondCorrelation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Packets Per Second', 25.708888235716653)
('IAT Max', 0.4361636888589877)
('IAT Mean', 0.00010950413530739898)
('IAT Std', 0.00502620850117565)


In [384]:
durationToPacketsCorr = [x * y for x, y in zip(dosSamples['Flow Duration'].values, dosSamples['Packets Per Second'].values)]
durationToPacketsCorr = np.mean(durationToPacketsCorr)
durationToPacketsCorr

9231.0

In [385]:
#iterating over all rows we need to add values
for index, row in dosDataset.iterrows():
    for col, factor in scaling_factors: 
        # calculate a random small delta of the factor for adding some randomness
        if col == 'Packets Per Second':
            delta = random.uniform(durationToPacketsCorr * 0.1, durationToPacketsCorr * 0.2) # select a delta
            updatedFactor = durationToPacketsCorr + delta
            dosDataset.loc[index, col] = updatedFactor / row['Flow Duration']
        else:
            if col == 'IAT Std':
                delta = random.uniform(factor * 0.55, factor * 0.8)
                updatedFactor = factor + random.choices([-1, 1], weights=[2, 1], k=1)[0] * delta
            else:
                delta = random.uniform(factor * 0.1, factor * 0.2) # select a delta
                updatedFactor = factor + random.choice([-1, 1]) * delta
            dosDataset.loc[index, col] = row['Flow Duration'] * updatedFactor

In [386]:
dosDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,0.0,39.866199,0.0,432.799355,39.866199,0.0,0.0,0.0,432.799355,39.866199,0.0,0.0,0.0,0.0,39.866199,0.0,0.0,3725.73724,5154.33424,1514.5526,24.879475,646.726884,24.879475,10.844328,0.002721,0.096614
std,0.0,12.777239,0.0,74.547986,12.777239,0.0,0.0,0.0,74.547986,12.777239,0.0,0.0,0.0,0.0,12.777239,0.0,0.0,1388.645899,2092.957777,616.087744,12.391605,548.524348,12.391605,5.703139,0.001433,0.10183
min,0.0,17.84124,0.0,303.455898,17.84124,0.0,0.0,0.0,303.455898,17.84124,0.0,0.0,0.0,0.0,17.84124,0.0,0.0,1315.0,1466.0,430.0,3.357278,220.071179,3.357278,1.182306,0.000303,0.003501
25%,0.0,28.822303,0.0,368.619761,28.822303,0.0,0.0,0.0,368.619761,28.822303,0.0,0.0,0.0,0.0,28.822303,0.0,0.0,2514.0,3412.75,999.0,14.116629,297.294218,14.116629,6.041365,0.001512,0.029742
50%,0.0,39.868726,0.0,432.426756,39.868726,0.0,0.0,0.0,432.426756,39.868726,0.0,0.0,0.0,0.0,39.868726,0.0,0.0,3729.0,5055.5,1479.0,24.914428,426.051078,24.914428,10.641092,0.002656,0.053002
75%,0.0,50.87003,0.0,497.83828,50.87003,0.0,0.0,0.0,497.83828,50.87003,0.0,0.0,0.0,0.0,50.87003,0.0,0.0,4928.0,6660.0,1964.0,35.718516,750.674536,35.718516,15.147453,0.003815,0.117353
max,0.0,62.039102,0.0,562.035902,62.039102,0.0,0.0,0.0,562.035902,62.039102,0.0,0.0,0.0,0.0,62.039102,0.0,0.0,6118.0,10128.0,2970.0,46.268637,3288.506691,46.268637,24.104482,0.006048,0.417238


In [387]:
x = dosDataset[dosDataset['Flow Duration']<10]
x[x['Flow Duration']>5][0:40]

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
7,0.0,23.380545,0,437.577178,23.380545,0.0,0.0,0.0,437.577178,23.380545,0,0,0,0,23.380545,0,0.0,3978,6126.0,1426.0,7.346772,1454.697531,7.346772,3.534342,0.000933,0.011866
18,0.0,33.427906,0,458.503202,33.427906,0.0,0.0,0.0,458.503202,33.427906,0,0,0,0,33.427906,0,0.0,4456,7371.0,2172.0,8.395567,1250.686027,8.395567,4.16804,0.00078,0.011799
30,0.0,20.711634,0,326.931004,20.711634,0.0,0.0,0.0,326.931004,20.711634,0,0,0,0,20.711634,0,0.0,4107,5113.0,1389.0,6.019688,1758.103993,6.019688,2.188947,0.000757,0.011074
44,0.0,60.972442,0,369.87755,60.972442,0.0,0.0,0.0,369.87755,60.972442,0,0,0,0,60.972442,0,0.0,5259,8443.0,1764.0,6.997423,1526.604534,6.997423,3.616311,0.000638,0.010329
60,0.0,58.045704,0,383.388652,58.045704,0.0,0.0,0.0,383.388652,58.045704,0,0,0,0,58.045704,0,0.0,5547,8761.0,1874.0,9.591789,1126.725567,9.591789,4.61929,0.000885,0.012264
63,0.0,61.503753,0,423.747305,61.503753,0.0,0.0,0.0,423.747305,61.503753,0,0,0,0,61.503753,0,0.0,1363,1528.0,487.0,9.496911,1142.619576,9.496911,3.484547,0.001161,0.012034
73,0.0,48.749526,0,539.334043,48.749526,0.0,0.0,0.0,539.334043,48.749526,0,0,0,0,48.749526,0,0.0,5590,6197.0,2003.0,9.546653,1084.606624,9.546653,4.738988,0.000913,0.082114
74,0.0,57.838429,0,560.872827,57.838429,0.0,0.0,0.0,560.872827,57.838429,0,0,0,0,57.838429,0,0.0,2252,2766.0,1038.0,5.953916,1821.670943,5.953916,3.00908,0.000723,0.009508
75,0.0,38.987168,0,532.070164,38.987168,0.0,0.0,0.0,532.070164,38.987168,0,0,0,0,38.987168,0,0.0,5445,6693.0,1976.0,6.458614,1670.54502,6.458614,3.12388,0.000604,0.01073
76,0.0,39.112918,0,396.778422,39.112918,0.0,0.0,0.0,396.778422,39.112918,0,0,0,0,39.112918,0,0.0,6010,6908.0,2790.0,5.709304,1924.388671,5.709304,2.944223,0.000705,0.012744


In [388]:
x = dosDataset[dosDataset['Flow Duration']<50.5]
x[x['Flow Duration']>25][20:40]

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
39,0.0,37.565278,0,536.88143,37.565278,0.0,0.0,0.0,536.88143,37.565278,0,0,0,0,37.565278,0,0.0,2302,2791.0,780.0,34.51198,302.873703,34.51198,16.97776,0.003074,0.280062
45,0.0,33.393781,0,556.992338,33.393781,0.0,0.0,0.0,556.992338,33.393781,0,0,0,0,33.393781,0,0.0,4614,7612.0,1581.0,36.745153,291.232485,36.745153,13.995291,0.004453,0.081018
46,0.0,27.990831,0,482.159709,27.990831,0.0,0.0,0.0,482.159709,27.990831,0,0,0,0,27.990831,0,0.0,3417,4063.0,1124.0,41.092385,263.38432,41.092385,21.43917,0.005064,0.055076
47,0.0,51.723804,0,429.676598,51.723804,0.0,0.0,0.0,429.676598,51.723804,0,0,0,0,51.723804,0,0.0,2105,2562.0,710.0,25.380461,410.208189,25.380461,8.956011,0.002366,0.207161
48,0.0,41.342347,0,418.4679,41.342347,0.0,0.0,0.0,418.4679,41.342347,0,0,0,0,41.342347,0,0.0,4149,4961.0,1445.0,32.277492,325.966032,32.277492,11.281736,0.004092,0.056352
51,0.0,43.062585,0,527.400116,43.062585,0.0,0.0,0.0,527.400116,43.062585,0,0,0,0,43.062585,0,0.0,3233,5301.0,1471.0,40.614342,265.431655,40.614342,19.760993,0.005298,0.0417
52,0.0,23.231547,0,404.546009,23.231547,0.0,0.0,0.0,404.546009,23.231547,0,0,0,0,23.231547,0,0.0,5090,6113.0,1666.0,25.281584,425.606926,25.281584,9.134844,0.003102,0.028093
53,0.0,32.902996,0,502.113687,32.902996,0.0,0.0,0.0,502.113687,32.902996,0,0,0,0,32.902996,0,0.0,2366,3764.0,848.0,42.37841,254.078516,42.37841,16.295275,0.003967,0.074694
54,0.0,57.184511,0,543.478973,57.184511,0.0,0.0,0.0,543.478973,57.184511,0,0,0,0,57.184511,0,0.0,5939,6665.0,2667.0,36.422382,287.636335,36.422382,14.198378,0.004541,0.048297
55,0.0,53.832302,0,380.768671,53.832302,0.0,0.0,0.0,380.768671,53.832302,0,0,0,0,53.832302,0,0.0,5092,5907.0,2412.0,28.643793,355.085718,28.643793,13.862775,0.002711,0.251417


In [389]:
dosDataset[dosDataset['Flow Duration']<6][:20]

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
16,0.0,53.631509,0,335.235675,53.631509,0.0,0.0,0.0,335.235675,53.631509,0,0,0,0,53.631509,0,0.0,1399,1683.0,459.0,4.102518,2639.365605,4.102518,1.582791,0.0004,0.034018
64,0.0,34.413469,0,327.410313,34.413469,0.0,0.0,0.0,327.410313,34.413469,0,0,0,0,34.413469,0,0.0,5816,7134.0,2082.0,3.935727,2801.591447,3.935727,1.932667,0.000347,0.004603
74,0.0,57.838429,0,560.872827,57.838429,0.0,0.0,0.0,560.872827,57.838429,0,0,0,0,57.838429,0,0.0,2252,2766.0,1038.0,5.953916,1821.670943,5.953916,3.00908,0.000723,0.009508
76,0.0,39.112918,0,396.778422,39.112918,0.0,0.0,0.0,396.778422,39.112918,0,0,0,0,39.112918,0,0.0,6010,6908.0,2790.0,5.709304,1924.388671,5.709304,2.944223,0.000705,0.012744
80,0.0,27.096515,0,467.9207,27.096515,0.0,0.0,0.0,467.9207,27.096515,0,0,0,0,27.096515,0,0.0,4763,5292.0,2291.0,4.905164,2148.336512,4.905164,2.491246,0.000612,0.006615
119,0.0,49.360292,0,405.33108,49.360292,0.0,0.0,0.0,405.33108,49.360292,0,0,0,0,49.360292,0,0.0,4823,7815.0,1707.0,4.272405,2446.59975,4.272405,1.614388,0.000535,0.038297
120,0.0,26.339771,0,524.978793,26.339771,0.0,0.0,0.0,524.978793,26.339771,0,0,0,0,26.339771,0,0.0,1619,2627.0,787.0,4.726404,2192.502192,4.726404,1.835002,0.000424,0.005745
142,0.0,54.979169,0,462.919719,54.979169,0.0,0.0,0.0,462.919719,54.979169,0,0,0,0,54.979169,0,0.0,2925,4676.0,1019.0,5.358311,2027.787274,5.358311,1.920431,0.000654,0.011434
145,0.0,29.403578,0,470.99438,29.403578,0.0,0.0,0.0,470.99438,29.403578,0,0,0,0,29.403578,0,0.0,5448,6203.0,1986.0,4.835853,2135.111569,4.835853,1.780827,0.000627,0.008558
148,0.0,44.061333,0,440.229385,44.061333,0.0,0.0,0.0,440.229385,44.061333,0,0,0,0,44.061333,0,0.0,2011,2471.0,656.0,5.557722,1844.74452,5.557722,2.720335,0.000522,0.011464


### Third group

In [390]:
# finding the correlation between the Packet Length Std column to the rest of the columns in order to create new data
firstCorrelation = ['Packet Length Std', 'Packet Length Variance', 'Total Length of Fwd Packet', 'Subflow Fwd Bytes']
independent_col = dosSamples[firstCorrelation[0]].values.reshape(-1, 1) #column 'Packet Length Std'
dependent_cols = dosSamples[firstCorrelation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(firstCorrelation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Packet Length Variance', 109.29090132096465)
('Total Length of Fwd Packet', 3253.507894214089)
('Subflow Fwd Bytes', 0.41141342596763014)


In [391]:
dosDataset['Packet Length Std'] = np.random.uniform(MinMaxDict['Packet Length Std'][0]*0.85, MinMaxDict['Packet Length Std'][1]*1.15, NUM_OF_ROWS)

for index, row in dosDataset.iterrows():
    for col, factor in scaling_factors: #iterating over all rows we need to add values
        # calculate the value we want to add into the dataset in the given row
        delta = random.uniform(factor * 0.1, factor * 0.2) # select a delta
        updatedFactor = factor + random.choice([-1, 1]) * delta
        dosDataset.loc[index, col] = row['Packet Length Std'] * updatedFactor

In [392]:
x = dosDataset[dosDataset['Packet Length Std'] > 114]
x[x['Packet Length Std'] < 120].tail(20)

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
24755,0.0,51.444411,0,463.738789,51.444411,114.550265,10395.768885,325738.738827,463.738789,51.444411,0,0,0,0,51.444411,0,55.47189,1652,2662.0,552.0,5.610059,1881.175145,5.610059,2.731802,0.000515,0.011951
24756,0.0,49.965742,0,330.199845,49.965742,116.009896,14765.615353,448873.95125,330.199845,49.965742,0,0,0,0,49.965742,0,41.903335,3430,5570.0,1568.0,42.052899,248.230676,42.052899,20.69194,0.005301,0.379881
24785,0.0,44.177182,0,322.145936,44.177182,114.331183,10851.749167,329130.372287,322.145936,44.177182,0,0,0,0,44.177182,0,54.899315,1855,2072.0,620.0,15.780414,697.828041,15.780414,5.78715,0.001509,0.01811
24812,0.0,60.77274,0,317.294776,60.77274,116.435072,14963.905939,443138.103745,317.294776,60.77274,0,0,0,0,60.77274,0,54.350437,6012,9566.0,2692.0,40.657181,255.328858,40.657181,15.647568,0.005109,0.085684
24818,0.0,18.297119,0,557.149045,18.297119,118.846941,14558.349692,446455.282633,557.149045,18.297119,0,0,0,0,18.297119,0,58.328942,3837,6036.0,1858.0,39.651261,259.028629,39.651261,20.624271,0.003689,0.087534
24820,0.0,45.301246,0,404.702117,45.301246,119.721059,10759.224724,447137.97277,404.702117,45.301246,0,0,0,0,45.301246,0,39.683844,3333,5136.0,1212.0,4.82663,2131.580052,4.82663,1.851734,0.000452,0.01029
24823,0.0,45.887402,0,313.232104,45.887402,119.201634,15175.892657,428875.725034,313.232104,45.887402,0,0,0,0,45.887402,0,40.462799,2134,2604.0,982.0,23.714975,453.841708,23.714975,12.1526,0.00228,0.039135
24824,0.0,23.711542,0,530.758707,23.711542,117.917904,14573.501044,426215.332349,530.758707,23.711542,0,0,0,0,23.711542,0,54.946917,1360,2249.0,656.0,32.482324,325.062246,32.482324,11.508103,0.004264,0.036752
24829,0.0,52.561051,0,453.565423,52.561051,115.143272,10668.485342,416551.574628,453.565423,52.561051,0,0,0,0,52.561051,0,54.616263,2715,4403.0,1312.0,16.6428,663.903435,16.6428,6.078141,0.00149,0.026794
24842,0.0,54.639947,0,490.354849,54.639947,114.264181,14954.995664,413979.381743,490.354849,54.639947,0,0,0,0,54.639947,0,55.790807,3337,3999.0,1536.0,21.780793,493.582506,21.780793,7.709531,0.002101,0.026627


### Adding labels and verifiying the dataset

In [393]:
# adding number of ports and a label to the dataset
dosDataset['Number of Ports'] = np.full(shape=NUM_OF_ROWS, fill_value=1, dtype=int)
dosDataset['Label'] = ATTACK_NAME

In [394]:
dosDataset

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std,Label
0,1,43.719346,0,540.854225,43.719346,131.721389,11870.560064,383505.355773,540.854225,43.719346,0,0,0,0,43.719346,0,45.181143,4579,5249.0,2211.0,37.769779,277.492885,37.769779,14.626067,0.004611,0.082211,DoS
1,1,37.072393,0,429.572400,37.072393,84.741981,10297.168947,328610.926963,429.572400,37.072393,0,0,0,0,37.072393,0,29.757310,5429,6478.0,1958.0,10.917602,999.799786,10.917602,5.543607,0.001362,0.021129,DoS
2,1,60.500871,0,385.000121,60.500871,108.243354,10358.786904,403843.880448,385.000121,60.500871,0,0,0,0,60.500871,0,36.214837,5883,7183.0,1995.0,11.879885,925.070588,11.879885,4.613893,0.001545,0.015653,DoS
3,1,28.152962,0,338.680325,28.152962,157.453123,14927.757560,459109.623082,338.680325,28.152962,0,0,0,0,28.152962,0,55.058376,3593,5843.0,1231.0,26.858157,399.491749,26.858157,13.579825,0.002560,0.028482,DoS
4,1,37.538500,0,450.066899,37.538500,74.518119,9371.740459,283271.170360,450.066899,37.538500,0,0,0,0,37.538500,0,26.000313,5656,6813.0,1901.0,25.891052,398.192043,25.891052,9.905088,0.002343,0.226027,DoS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,1,55.901842,0,405.745259,55.901842,157.241271,20416.104115,418819.913256,405.745259,55.901842,0,0,0,0,55.901842,0,74.260823,3973,4518.0,1937.0,31.374805,328.815824,31.374805,12.073613,0.002779,0.046635,DoS
24996,1,41.801725,0,336.658991,41.801725,106.126499,9896.283020,389017.076561,336.658991,41.801725,0,0,0,0,41.801725,0,48.531211,2565,4260.0,1205.0,31.181504,346.165969,31.181504,16.309746,0.002774,0.055049,DoS
24997,1,21.284191,0,516.601925,21.284191,99.872737,12556.933122,262993.972341,516.601925,21.284191,0,0,0,0,21.284191,0,36.666501,3898,6185.0,1833.0,45.132650,232.614583,45.132650,17.601078,0.004056,0.072614,DoS
24998,1,58.038950,0,414.652789,58.038950,91.918561,8908.117124,241811.034602,414.652789,58.038950,0,0,0,0,58.038950,0,30.634205,4373,6681.0,2035.0,37.202142,292.579942,37.202142,13.780624,0.004712,0.296233,DoS


In [395]:
dosSamples.describe()

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
count,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
mean,1.0,37.357836,0.0,422.583333,37.357836,108.125155,11755.445289,348143.375,422.583333,37.357836,0.0,0.0,0.0,0.0,37.357836,0.0,44.226953,3401.458333,4852.5,1453.708333,16.042977,897.153494,16.042977,5.852689,0.001769,0.068695
std,0.0,6.170054,0.0,1.348644,6.170054,8.197321,1746.037949,85890.806041,1.348644,6.170054,0.0,0.0,0.0,0.0,6.170054,0.0,9.298604,677.631167,845.676884,473.800681,10.300608,618.630589,10.300608,6.899729,0.001136,0.075843
min,1.0,24.681432,0.0,420.0,24.681432,90.14145,8125.481039,132742.0,420.0,24.681432,0.0,0.0,0.0,0.0,24.681432,0.0,25.108025,1822.0,2128.0,452.0,4.387097,250.104514,4.387097,0.023284,0.00048,0.000922
25%,1.0,33.484267,0.0,421.0,33.484267,103.297971,10671.527517,311099.0,421.0,33.484267,0.0,0.0,0.0,0.0,33.484267,0.0,37.515929,3158.75,4563.5,1195.75,7.223352,351.652234,7.223352,0.535694,0.000792,0.00594
50%,1.0,37.01772,0.0,423.0,37.01772,107.849765,11631.870743,350404.5,423.0,37.01772,0.0,0.0,0.0,0.0,37.01772,0.0,43.369752,3499.0,4996.0,1533.0,14.54937,695.165152,14.54937,3.042825,0.001516,0.038272
75%,1.0,41.871931,0.0,424.0,41.871931,114.201034,13041.886875,407039.25,424.0,41.871931,0.0,0.0,0.0,0.0,41.871931,0.0,50.60588,3763.75,5278.0,1668.5,21.489634,1265.108809,21.489634,8.228698,0.002845,0.110368
max,1.0,46.910933,0.0,425.0,46.910933,120.009963,14402.391196,462964.0,425.0,46.910933,0.0,0.0,0.0,0.0,46.910933,0.0,59.402409,4627.0,5950.0,2659.0,38.31998,2083.725304,38.31998,21.588357,0.003999,0.231287


In [396]:
dosDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,1.0,39.866199,0.0,432.799355,39.866199,111.994867,12226.966245,364310.070795,432.799355,39.866199,0.0,0.0,0.0,0.0,39.866199,0.0,46.117449,3725.73724,5154.33424,1514.5526,24.879475,646.726884,24.879475,10.844328,0.002721,0.096614
std,0.0,12.777239,0.0,74.547986,12.777239,26.922373,3515.648767,104069.682815,74.547986,12.777239,0.0,0.0,0.0,0.0,12.777239,0.0,13.25779,1388.645899,2092.957777,616.087744,12.391605,548.524348,12.391605,5.703139,0.001433,0.10183
min,1.0,17.84124,0.0,303.455898,17.84124,65.129949,5718.254487,169729.980604,303.455898,17.84124,0.0,0.0,0.0,0.0,17.84124,0.0,21.628454,1315.0,1466.0,430.0,3.357278,220.071179,3.357278,1.182306,0.000303,0.003501
25%,1.0,28.822303,0.0,368.619761,28.822303,88.875615,9440.785818,281812.56796,368.619761,28.822303,0.0,0.0,0.0,0.0,28.822303,0.0,35.624486,2514.0,3412.75,999.0,14.116629,297.294218,14.116629,6.041365,0.001512,0.029742
50%,1.0,39.868726,0.0,432.426756,39.868726,112.109495,11969.481313,357170.507679,432.426756,39.868726,0.0,0.0,0.0,0.0,39.868726,0.0,45.080268,3729.0,5055.5,1479.0,24.914428,426.051078,24.914428,10.641092,0.002656,0.053002
75%,1.0,50.87003,0.0,497.83828,50.87003,135.166423,14480.921771,431430.142129,497.83828,50.87003,0.0,0.0,0.0,0.0,50.87003,0.0,54.706613,4928.0,6660.0,1964.0,35.718516,750.674536,35.718516,15.147453,0.003815,0.117353
max,1.0,62.039102,0.0,562.035902,62.039102,158.713118,20781.090779,619203.126096,562.035902,62.039102,0.0,0.0,0.0,0.0,62.039102,0.0,78.275241,6118.0,10128.0,2970.0,46.268637,3288.506691,46.268637,24.104482,0.006048,0.417238


In [397]:
# save the dataset
# dosDataset.to_csv('dos_hulk_dataset_updated_flows.csv', index=False)

---