In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

# Number of synthetic rows to generate and the label written to each of them.
NUM_OF_ROWS = 11500
ATTACK_NAME = 'DoS'

# The notebook draws from both Python's `random` module and NumPy's global
# generator; seed both so that Restart & Run All reproduces the same dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
# Remove the column-display cap so wide frames are rendered with every column.
pd.options.display.max_columns = None

---

In [4]:
# Load the first set of DoS sample rows; they are used below to derive
# value ranges and scale factors for the synthetic data.
dosSamples = pd.read_csv(filepath_or_buffer='dos_samples_1.csv')
print(f'Dataset Shape: {dosSamples.shape}')
dosSamples

Dataset Shape: (18, 26)


Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1,60.0,60,60,0.0,0.0,259948,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,57.282503,9998,0,0,2.773013,3605.464376,1.589317,0.000277,0.019216
1,1,60.0,60,60,0.0,0.0,259922,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,31.935373,9997,0,0,4.922741,2030.77922,1.713713,0.000492,0.028124
2,1,60.0,60,60,0.0,0.0,259974,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,30.416988,9999,0,0,5.267316,1898.310308,1.850942,0.000527,0.030091
3,1,60.0,60,60,0.0,0.0,236886,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.008564,9111,0,0,44.188204,206.186248,44.076651,0.004851,0.46177
4,1,60.0,60,60,0.0,0.0,256802,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.036906,9877,0,0,161.398189,61.196473,161.285302,0.016342,1.622864
5,1,60.0,60,60,0.0,0.0,259922,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,60.236848,9997,0,0,2.90927,3436.25743,1.548943,0.000291,0.019841
6,1,60.0,60,60,0.0,0.0,256568,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.013181,9868,0,0,81.048627,121.754068,80.927725,0.008214,0.814672
7,1,60.0,60,60,0.0,0.0,259974,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,64.413776,9999,0,0,2.68372,3725.798694,1.323679,0.000268,0.018444
8,1,60.0,60,60,0.0,0.0,259610,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.015633,9985,0,0,101.352469,98.51758,101.220045,0.010151,1.01296
9,1,60.0,60,60,0.0,0.0,259792,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.013017,9992,0,0,45.53635,219.429093,45.40936,0.004558,0.454275


In [5]:
# Work out which feature columns need synthesised values: every column that holds
# at least one non-zero value in the samples.
# Zeros are first turned into NaN, then only the columns that are NaN in *every* row
# (how="all") are dropped; a column with some zeros and some non-zeros is kept.
columnsToGather = (
    dosSamples
    .replace(0, np.nan)
    .dropna(axis=1, how="all")
    .columns
    .tolist()
)
columnsToGather  # the remaining columns are the ones that are zero throughout the samples

['Number of Ports',
 'Average Packet Length',
 'Packet Length Min',
 'Packet Length Max',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min',
 'Fwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'SYN Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [6]:
# Build {column: (lower, upper)} sampling bounds from the samples, widening the
# observed range slightly: 85% of the column minimum up to 110% of the column maximum.
MinMaxDict = {}
for col in columnsToGather:
    observed = dosSamples[col]
    MinMaxDict[col] = (observed.min() * 0.85, observed.max() * 1.1)

# 'Number of Ports' is pinned to exactly 1 instead of being widened.
MinMaxDict['Number of Ports'] = (1, 1)
MinMaxDict

{'Number of Ports': (1, 1),
 'Average Packet Length': (np.float64(51.0), np.float64(66.0)),
 'Packet Length Min': (np.float64(51.0), np.float64(66.0)),
 'Packet Length Max': (np.float64(51.0), np.float64(66.0)),
 'Total Length of Fwd Packet': (np.float64(59891.0), np.float64(286000.0)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Segment Size Avg': (np.float64(5.1), np.float64(6.6000000000000005)),
 'Subflow Fwd Bytes': (np.float64(22.107279314888007),
  np.float64(494.80968858131496)),
 'SYN Flag Count': (np.float64(2303.5), np.float64(11000.0)),
 'Flow Duration': (np.float64(1.3379797458648681),
  np.float64(177.53800797462466)),
 'Packets Per Second': (np.float64(25.737769126305732),
  np.float64(6988.14763743391)),
 'IAT Max': (np.float64(1.125127124786377), np.float64(177.41383211612703)

In [7]:
# The 0.85 / 1.1 widening turns every bound into a float; cast the bounds of the
# integer-valued columns back to int (int() truncates the fractional part).
intColumns = ['SYN Flag Count']
MinMaxDict.update({
    name: tuple(int(bound) for bound in bounds)
    for name, bounds in MinMaxDict.items()
    if name in intColumns
})
MinMaxDict

{'Number of Ports': (1, 1),
 'Average Packet Length': (np.float64(51.0), np.float64(66.0)),
 'Packet Length Min': (np.float64(51.0), np.float64(66.0)),
 'Packet Length Max': (np.float64(51.0), np.float64(66.0)),
 'Total Length of Fwd Packet': (np.float64(59891.0), np.float64(286000.0)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Segment Size Avg': (np.float64(5.1), np.float64(6.6000000000000005)),
 'Subflow Fwd Bytes': (np.float64(22.107279314888007),
  np.float64(494.80968858131496)),
 'SYN Flag Count': (2303, 11000),
 'Flow Duration': (np.float64(1.3379797458648681),
  np.float64(177.53800797462466)),
 'Packets Per Second': (np.float64(25.737769126305732),
  np.float64(6988.14763743391)),
 'IAT Max': (np.float64(1.125127124786377), np.float64(177.41383211612703)),
 'IAT Mean': (np.float64(

In [8]:
# Allocate the synthetic dataset: NUM_OF_ROWS rows of zeros with the same columns
# as the samples. The following cells overwrite the columns one group at a time.
n_features = dosSamples.shape[1]
dosDataset = pd.DataFrame(
    data=np.zeros(shape=(NUM_OF_ROWS, n_features)),
    columns=dosSamples.columns,
)
dosDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Columns that are zero in every sample row stay zero in the synthetic data;
# store them as integer 0 rather than the float 0.0 used for the placeholder frame.
gathered = set(columnsToGather)
zeroColumns = [name for name in dosSamples.columns if name not in gathered]
for zero_col in zeroColumns:
    dosDataset[zero_col] = 0
zeroColumns

['Packet Length Std',
 'Packet Length Variance',
 'Fwd Packet Length Std',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Bwd Segment Size Avg',
 'ACK Flag Count',
 'RST Flag Count']

In [10]:
# Preview the first rows after the always-zero columns were rewritten.
dosDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0


### Same values

In [11]:
# These three packet-length columns always carry one shared value per row, so a
# single random integer vector is drawn and copied into each of them.
# The upper bound is the stored maximum scaled by a further 1.1.
# NOTE(review): the bounds passed to randint are floats here - confirm the
# implicit conversion to integers is the intended behaviour.
same_value = ['Average Packet Length', 'Packet Length Min', 'Packet Length Max']
length_low, length_high = MinMaxDict[same_value[0]]
val = np.random.randint(length_low, length_high * 1.1, NUM_OF_ROWS)

for shared_col in same_value:
    dosDataset[shared_col] = val

In [12]:
# Same idea for the forward packet lengths: one shared random integer per row,
# with the upper bound being the stored maximum scaled by a further 1.25.
same_value2 = ['Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Min']
fwd_low, fwd_high = MinMaxDict[same_value2[0]]
val2 = np.random.randint(fwd_low, fwd_high * 1.25, NUM_OF_ROWS)

for shared_col in same_value2:
    dosDataset[shared_col] = val2

In [13]:
# Inspect the dataset after the shared packet-length columns were filled.
dosDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,71,71,71,0,0,0.0,32,32,32,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
1,0.0,66,66,66,0,0,0.0,33,33,33,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
2,0.0,61,61,61,0,0,0.0,27,27,27,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
3,0.0,53,53,53,0,0,0.0,23,23,23,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
4,0.0,64,64,64,0,0,0.0,22,22,22,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11495,0.0,57,57,57,0,0,0.0,25,25,25,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
11496,0.0,57,57,57,0,0,0.0,29,29,29,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
11497,0.0,69,69,69,0,0,0.0,25,25,25,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0
11498,0.0,64,64,64,0,0,0.0,31,31,31,0,0,0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0


In [14]:
# 'Fwd Segment Size Avg': random integers between 0.9x the stored lower bound and
# 1.5x the stored upper bound. 'Number of Ports': the constant 1 for every row.
seg_low, seg_high = MinMaxDict['Fwd Segment Size Avg']
dosDataset['Fwd Segment Size Avg'] = np.random.randint(seg_low * 0.9, seg_high * 1.5, NUM_OF_ROWS)
dosDataset['Number of Ports'] = np.ones(NUM_OF_ROWS, dtype=int)

In [15]:
# 'SYN Flag Count' is a mixture of two integer draws:
#   rand_values  - anywhere between the stored lower bound and 1.1x the stored upper bound
#   usual_values - a narrower hard-coded band, [8176, 10658)
# The column is written exactly once, at the end of the cell (the previous chained
# assignment also stored rand_values in the column, only to overwrite it below).
syn_low, syn_high = MinMaxDict['SYN Flag Count']
rand_values = np.random.randint(syn_low, syn_high * 1.1, NUM_OF_ROWS)
usual_values = np.random.randint(8176, 10658, NUM_OF_ROWS)

# Choose values randomly (10% from rand_values, 90% from usual_values)
chosen_values = np.where(np.random.rand(NUM_OF_ROWS) > 0.1, usual_values, rand_values)

dosDataset['SYN Flag Count'] = chosen_values

In [16]:
# 'Flow Duration' is a mixture of two uniform draws: the full stored range
# (rand_values) and a narrower hard-coded band (usual_values).
duration_low, duration_high = MinMaxDict['Flow Duration']
rand_values = np.random.uniform(duration_low, duration_high, NUM_OF_ROWS)
usual_values = np.random.uniform(1.654, 45.175, NUM_OF_ROWS)

# Per-row coin flip: about 75% of rows take the usual value, 25% the wide-range one.
take_usual = np.random.rand(NUM_OF_ROWS) > 0.25
chosen_values = np.where(take_usual, usual_values, rand_values)

dosDataset['Flow Duration'] = chosen_values

In [17]:
# 'Subflow Fwd Bytes' is a mixture of two uniform draws: the full stored range
# (rand_values) and a narrower hard-coded band (usual_values).
subflow_low, subflow_high = MinMaxDict['Subflow Fwd Bytes']
rand_values = np.random.uniform(subflow_low, subflow_high, NUM_OF_ROWS)
usual_values = np.random.uniform(13.763, 72.146, NUM_OF_ROWS)

# Per-row coin flip: about 90% of rows take the usual value, 10% the wide-range one.
take_usual = np.random.rand(NUM_OF_ROWS) > 0.1
chosen_values = np.where(take_usual, usual_values, rand_values)

dosDataset['Subflow Fwd Bytes'] = chosen_values

In [18]:
# Inspect the dataset after the independently sampled columns were filled.
dosDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1,71,71,71,0,0,0.0,32,32,32,0,0,0,0,0,6,0,37.295759,8840,0,0,44.584154,0.0,0.0,0.0,0.0
1,1,66,66,66,0,0,0.0,33,33,33,0,0,0,0,0,4,0,68.421188,9899,0,0,161.002999,0.0,0.0,0.0,0.0
2,1,61,61,61,0,0,0.0,27,27,27,0,0,0,0,0,4,0,39.399780,8719,0,0,41.042561,0.0,0.0,0.0,0.0
3,1,53,53,53,0,0,0.0,23,23,23,0,0,0,0,0,5,0,31.781376,10298,0,0,38.203761,0.0,0.0,0.0,0.0
4,1,64,64,64,0,0,0.0,22,22,22,0,0,0,0,0,4,0,43.624580,8498,0,0,41.750730,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11495,1,57,57,57,0,0,0.0,25,25,25,0,0,0,0,0,8,0,65.360942,8603,0,0,41.663423,0.0,0.0,0.0,0.0
11496,1,57,57,57,0,0,0.0,29,29,29,0,0,0,0,0,7,0,34.908807,10161,0,0,159.450784,0.0,0.0,0.0,0.0
11497,1,69,69,69,0,0,0.0,25,25,25,0,0,0,0,0,5,0,36.024674,8274,0,0,15.813844,0.0,0.0,0.0,0.0
11498,1,64,64,64,0,0,0.0,31,31,31,0,0,0,0,0,6,0,35.781599,9704,0,0,23.114284,0.0,0.0,0.0,0.0


### First Correlation

In [19]:
# Fit, on the sample rows, the single scale factor k that maps 'SYN Flag Count'
# onto 'Total Length of Fwd Packet' (least squares with no intercept: y ~= k * x).
first_correlation = ['SYN Flag Count', 'Total Length of Fwd Packet']
predictor_name, target_name = first_correlation
independent_col = dosSamples[predictor_name].values.reshape(-1, 1)  # predictor as an (n, 1) column
dependent_cols = dosSamples[target_name].values  # target values, 1-D

# lstsq returns a tuple whose first item holds the fitted coefficient(s)
lstsq_result = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)

# pair each dependent column name with its fitted factor
scaling_factors = list(zip(first_correlation[1:], lstsq_result[0].flatten()))
for val in scaling_factors:
    print(val)

('Total Length of Fwd Packet', np.float64(26.000000000000007))


In [20]:
# Fill every column listed in scaling_factors from 'SYN Flag Count':
#     value = trunc(SYN count * (factor +/- delta))
# where delta is drawn uniformly from 5%-25% of the fitted factor and its sign is
# chosen at random per row.
# The work is done one whole column at a time with NumPy instead of writing single
# cells through .loc inside iterrows(): that pattern is very slow on thousands of
# rows, and pandas advises against modifying a frame while iterating over it.
row_count = len(dosDataset)
syn_counts = dosDataset['SYN Flag Count'].to_numpy(dtype=float)
for col, factor in scaling_factors:
    delta = np.random.uniform(factor * 0.05, factor * 0.25, size=row_count)
    sign = np.random.choice([-1, 1], size=row_count)
    # np.trunc mirrors the int() truncation of the row-wise version and keeps the column float
    dosDataset[col] = np.trunc(syn_counts * (factor + sign * delta))

In [21]:
# Inspect the dataset after 'Total Length of Fwd Packet' was derived from 'SYN Flag Count'.
dosDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1,71,71,71,0,0,259967.0,32,32,32,0,0,0,0,0,6,0,37.295759,8840,0,0,44.584154,0.0,0.0,0.0,0.0
1,1,66,66,66,0,0,282463.0,33,33,33,0,0,0,0,0,4,0,68.421188,9899,0,0,161.002999,0.0,0.0,0.0,0.0
2,1,61,61,61,0,0,188884.0,27,27,27,0,0,0,0,0,4,0,39.399780,8719,0,0,41.042561,0.0,0.0,0.0,0.0
3,1,53,53,53,0,0,226073.0,23,23,23,0,0,0,0,0,5,0,31.781376,10298,0,0,38.203761,0.0,0.0,0.0,0.0
4,1,64,64,64,0,0,208455.0,22,22,22,0,0,0,0,0,4,0,43.624580,8498,0,0,41.750730,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11495,1,57,57,57,0,0,172126.0,25,25,25,0,0,0,0,0,8,0,65.360942,8603,0,0,41.663423,0.0,0.0,0.0,0.0
11496,1,57,57,57,0,0,213346.0,29,29,29,0,0,0,0,0,7,0,34.908807,10161,0,0,159.450784,0.0,0.0,0.0,0.0
11497,1,69,69,69,0,0,197291.0,25,25,25,0,0,0,0,0,5,0,36.024674,8274,0,0,15.813844,0.0,0.0,0.0,0.0
11498,1,64,64,64,0,0,294985.0,31,31,31,0,0,0,0,0,6,0,35.781599,9704,0,0,23.114284,0.0,0.0,0.0,0.0


### Second correlation

In [22]:
# Fit, on the sample rows, one scale factor per timing feature relative to
# 'Flow Duration' (least squares with no intercept: feature ~= k * duration).
secondCorrelation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']
driver_name = secondCorrelation[0]
driven_names = secondCorrelation[1:]
independent_col = dosSamples[driver_name].values.reshape(-1, 1)  # 'Flow Duration' as an (n, 1) column
dependent_cols = dosSamples[driven_names].values  # one column per dependent feature

# lstsq returns a tuple whose first item holds the fitted coefficients
lstsq_result = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)

# pair each dependent column name with its fitted factor
scaling_factors = list(zip(driven_names, lstsq_result[0].flatten()))
for val in scaling_factors:
    print(val)

# Mean over the samples of (Flow Duration * Packets Per Second); the fill cell
# divides a jittered copy of this constant by the duration to get packets per second.
durationToPacketsCorr = np.mean(
    dosSamples['Flow Duration'].values * dosSamples['Packets Per Second'].values
)
durationToPacketsCorr

('Packets Per Second', np.float64(1.8396896437579322))
('IAT Max', np.float64(0.998429144907052))
('IAT Mean', np.float64(0.00015181064048409737))
('IAT Std', np.float64(0.01191782011351499))


np.float64(8585.611111111111)

In [23]:
# Fill the timing features listed in scaling_factors from 'Flow Duration'.
#   'Packets Per Second' = (durationToPacketsCorr +/- delta) / duration
#   every other column   = duration * (factor +/- delta)
# delta is drawn uniformly from a column-specific fraction of the base value and
# its sign is chosen at random per row.
# The work is done one whole column at a time with NumPy instead of writing single
# cells through .loc inside iterrows(), which is very slow on thousands of rows.
# NOTE(review): when the fitted 'IAT Max' factor is close to 1, the upward jitter
# can make 'IAT Max' larger than 'Flow Duration' for a row - confirm whether the
# value should be capped at the duration.

# (lowest, highest) jitter as a fraction of the base value, per column
jitter_fractions = {
    'Packets Per Second': (0.1, 0.15),
    'IAT Std': (0.1, 0.35),
    'IAT Max': (0.1, 0.225),
}
default_jitter = (0.05, 0.2)  # used for every column not listed above

row_count = len(dosDataset)
durations = dosDataset['Flow Duration'].to_numpy(dtype=float)
for col, factor in scaling_factors:
    is_rate = col == 'Packets Per Second'
    # the packet rate is jittered around the duration*rate constant, not around its fitted factor
    base = durationToPacketsCorr if is_rate else factor
    low_frac, high_frac = jitter_fractions.get(col, default_jitter)

    delta = np.random.uniform(base * low_frac, base * high_frac, size=row_count)
    sign = np.random.choice([-1, 1], size=row_count)
    updatedFactor = base + sign * delta

    dosDataset[col] = updatedFactor / durations if is_rate else durations * updatedFactor

In [24]:
# Inspect the dataset after the duration-dependent timing columns were filled.
dosDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1,71,71,71,0,0,259967.0,32,32,32,0,0,0,0,0,6,0,37.295759,8840,0,0,44.584154,171.899749,50.798248,0.005626,0.462755
1,1,66,66,66,0,0,282463.0,33,33,33,0,0,0,0,0,4,0,68.421188,9899,0,0,161.002999,59.087805,135.761462,0.020391,1.620348
2,1,61,61,61,0,0,188884.0,27,27,27,0,0,0,0,0,4,0,39.399780,8719,0,0,41.042561,233.654783,49.181952,0.006665,0.564986
3,1,53,53,53,0,0,226073.0,23,23,23,0,0,0,0,0,5,0,31.781376,10298,0,0,38.203761,196.175396,31.514714,0.005287,0.394762
4,1,64,64,64,0,0,208455.0,22,22,22,0,0,0,0,0,4,0,43.624580,8498,0,0,41.750730,178.120912,33.843025,0.006807,0.593271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11495,1,57,57,57,0,0,172126.0,25,25,25,0,0,0,0,0,8,0,65.360942,8603,0,0,41.663423,234.182791,46.076501,0.005257,0.444399
11496,1,57,57,57,0,0,213346.0,29,29,29,0,0,0,0,0,7,0,34.908807,10161,0,0,159.450784,60.795571,134.440914,0.020088,2.147862
11497,1,69,69,69,0,0,197291.0,25,25,25,0,0,0,0,0,5,0,36.024674,8274,0,0,15.813844,604.266945,19.252225,0.002876,0.236554
11498,1,64,64,64,0,0,294985.0,31,31,31,0,0,0,0,0,6,0,35.781599,9704,0,0,23.114284,318.983952,26.309245,0.003742,0.190358


In [25]:
# adding a label to the dataset
dosDataset['Label'] = ATTACK_NAME

In [26]:
# Summary statistics of the sample rows, for comparison with the synthetic dataset.
dosSamples.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,1.0,60.0,60.0,60.0,0.0,0.0,223225.888889,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,58.869588,8585.611111,0.0,0.0,50.730569,1261.620636,50.075167,0.007958,0.609617
std,0.0,0.0,0.0,0.0,0.0,0.0,67405.300285,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,98.729675,2592.511549,0.0,0.0,47.078915,1854.329691,47.670273,0.009044,0.59576
min,1.0,60.0,60.0,60.0,0.0,0.0,70460.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,26.008564,2710.0,0.0,0.0,1.574094,30.279728,1.323679,0.000157,0.014803
25%,1.0,60.0,60.0,60.0,0.0,0.0,240935.5,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,26.015761,9266.75,0.0,0.0,5.008885,97.571008,1.74802,0.000501,0.028616
50%,1.0,60.0,60.0,60.0,0.0,0.0,257478.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,26.06135,9903.0,0.0,0.0,44.862277,212.807671,44.743005,0.004704,0.458022
75%,1.0,60.0,60.0,60.0,0.0,0.0,259922.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,52.472023,9997.0,0.0,0.0,75.90785,1997.661992,75.807678,0.01025,1.020746
max,1.0,60.0,60.0,60.0,0.0,0.0,260000.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,449.82699,10000.0,0.0,0.0,161.398189,6352.861489,161.285302,0.033034,1.999864


In [27]:
# Summary statistics of the synthetic dataset, for comparison with the sample rows.
dosDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0,11500.0
mean,1.0,60.992783,60.992783,60.992783,0.0,0.0,239205.904435,27.960087,27.960087,27.960087,0.0,0.0,0.0,0.0,0.0,5.990783,0.0,65.053679,9193.125652,0.0,0.0,39.938569,550.923348,40.001014,0.00606,0.475647
std,0.0,6.06139,6.06139,6.06139,0.0,0.0,51441.714044,3.739457,3.739457,3.739457,0.0,0.0,0.0,0.0,0.0,1.414798,0.0,80.17613,1287.537882,0.0,0.0,39.957494,729.134649,41.091712,0.006163,0.502515
min,1.0,51.0,51.0,51.0,0.0,0.0,45478.0,22.0,22.0,22.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,13.765517,2321.0,0.0,0.0,1.359383,41.417495,1.268659,0.000172,0.01257
25%,1.0,56.0,56.0,56.0,0.0,0.0,202939.75,25.0,25.0,25.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,29.945352,8679.0,0.0,0.0,15.020847,197.719933,14.711754,0.002259,0.168195
50%,1.0,61.0,61.0,61.0,0.0,0.0,238875.0,28.0,28.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,46.14105,9340.0,0.0,0.0,28.251219,305.005187,27.640161,0.004214,0.317222
75%,1.0,66.0,66.0,66.0,0.0,0.0,279014.25,31.0,31.0,31.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,62.206396,10029.0,0.0,0.0,41.794995,570.688363,44.161014,0.006494,0.544703
max,1.0,71.0,71.0,71.0,0.0,0.0,390429.0,34.0,34.0,34.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,494.222011,12094.0,0.0,0.0,177.420602,7105.731898,215.3537,0.032037,2.79593


In [28]:
# Show the sample rows again as a reference for the spot checks that follow.
dosSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1,60.0,60,60,0.0,0.0,259948,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,57.282503,9998,0,0,2.773013,3605.464376,1.589317,0.000277,0.019216
1,1,60.0,60,60,0.0,0.0,259922,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,31.935373,9997,0,0,4.922741,2030.77922,1.713713,0.000492,0.028124
2,1,60.0,60,60,0.0,0.0,259974,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,30.416988,9999,0,0,5.267316,1898.310308,1.850942,0.000527,0.030091
3,1,60.0,60,60,0.0,0.0,236886,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.008564,9111,0,0,44.188204,206.186248,44.076651,0.004851,0.46177
4,1,60.0,60,60,0.0,0.0,256802,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.036906,9877,0,0,161.398189,61.196473,161.285302,0.016342,1.622864
5,1,60.0,60,60,0.0,0.0,259922,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,60.236848,9997,0,0,2.90927,3436.25743,1.548943,0.000291,0.019841
6,1,60.0,60,60,0.0,0.0,256568,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.013181,9868,0,0,81.048627,121.754068,80.927725,0.008214,0.814672
7,1,60.0,60,60,0.0,0.0,259974,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,64.413776,9999,0,0,2.68372,3725.798694,1.323679,0.000268,0.018444
8,1,60.0,60,60,0.0,0.0,259610,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.015633,9985,0,0,101.352469,98.51758,101.220045,0.010151,1.01296
9,1,60.0,60,60,0.0,0.0,259792,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,26.013017,9992,0,0,45.53635,219.429093,45.40936,0.004558,0.454275


In [29]:
# Spot check: up to 20 synthetic rows whose 'Flow Duration' lies strictly between 30 and 31.
x = dosDataset.loc[dosDataset['Flow Duration'].gt(30)]
x.loc[x['Flow Duration'].lt(31)].head(20)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std,Label
56,1,59,59,59,0,0,247764.0,23,23,23,0,0,0,0,0,8,0,30.363599,10507,0,0,30.441386,244.262683,36.56615,0.00538,0.444286,DoS
180,1,68,68,68,0,0,183703.0,29,29,29,0,0,0,0,0,6,0,31.00723,9392,0,0,30.378997,318.168379,24.406078,0.00419,0.314552,DoS
186,1,62,62,62,0,0,338824.0,30,30,30,0,0,0,0,0,6,0,50.59499,10655,0,0,30.913875,309.574645,34.373201,0.004354,0.294748,DoS
188,1,51,51,51,0,0,275697.0,33,33,33,0,0,0,0,0,5,0,52.136735,9272,0,0,30.217017,250.354661,23.52764,0.004849,0.450832,DoS
213,1,69,69,69,0,0,271540.0,26,26,26,0,0,0,0,0,4,0,41.124867,9813,0,0,30.898134,305.806587,35.595672,0.00443,0.422335,DoS
218,1,65,65,65,0,0,258287.0,23,23,23,0,0,0,0,0,8,0,71.375809,9085,0,0,30.060101,244.594059,33.356073,0.003655,0.404351,DoS
468,1,70,70,70,0,0,182702.0,29,29,29,0,0,0,0,0,5,0,70.759783,8352,0,0,30.994596,307.368343,26.107191,0.00525,0.482636,DoS
558,1,65,65,65,0,0,248027.0,31,31,31,0,0,0,0,0,8,0,70.122011,9048,0,0,30.207409,318.638646,36.152265,0.004958,0.416722,DoS
570,1,54,54,54,0,0,205380.0,26,26,26,0,0,0,0,0,5,0,14.529882,8844,0,0,30.842558,319.871542,36.425789,0.003841,0.245296,DoS
593,1,61,61,61,0,0,303145.0,26,26,26,0,0,0,0,0,6,0,323.40385,10449,0,0,30.750234,239.332744,26.260993,0.004006,0.473996,DoS


In [30]:
# Spot check: all synthetic rows whose 'Flow Duration' lies strictly between 2.6 and 2.7.
x = dosDataset.loc[dosDataset['Flow Duration'].gt(2.6)]
x.loc[x['Flow Duration'].lt(2.7)]

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std,Label
513,1,55,55,55,0,0,224248.0,29,29,29,0,0,0,0,0,8,0,15.842304,10440,0,0,2.684264,2772.40904,3.02035,0.00038,0.023813,DoS
1242,1,55,55,55,0,0,247239.0,23,23,23,0,0,0,0,0,4,0,63.963607,10615,0,0,2.655537,2897.435076,2.236656,0.00035,0.04142,DoS
2061,1,58,58,58,0,0,128004.0,26,26,26,0,0,0,0,0,6,0,57.969261,6403,0,0,2.628704,2859.141361,2.113477,0.000369,0.038532,DoS
4313,1,66,66,66,0,0,207218.0,23,23,23,0,0,0,0,0,6,0,25.707276,9587,0,0,2.651385,2795.555547,2.159919,0.000367,0.042395,DoS
4437,1,51,51,51,0,0,244482.0,33,33,33,0,0,0,0,0,5,0,303.037775,8551,0,0,2.635377,2809.407415,2.210914,0.000442,0.039513,DoS
4883,1,56,56,56,0,0,274434.0,27,27,27,0,0,0,0,0,5,0,61.595335,8760,0,0,2.66545,2835.944735,2.12429,0.00033,0.027259,DoS
6093,1,56,56,56,0,0,258200.0,29,29,29,0,0,0,0,0,5,0,43.99187,9140,0,0,2.695809,2711.728689,3.052041,0.000354,0.026417,DoS
7281,1,59,59,59,0,0,259153.0,28,28,28,0,0,0,0,0,5,0,71.140912,9346,0,0,2.682523,2846.578522,3.021036,0.000383,0.023742,DoS
7885,1,67,67,67,0,0,286081.0,24,24,24,0,0,0,0,0,5,0,279.873117,9779,0,0,2.672857,2764.501256,2.19901,0.000436,0.021716,DoS
8015,1,62,62,62,0,0,193601.0,23,23,23,0,0,0,0,0,6,0,62.757897,5999,0,0,2.634584,2823.417169,3.073826,0.000376,0.036102,DoS


---

### Adding rows with low flow duration

In [31]:
# Row count for the second, low-flow-duration batch.
# NOTE: this rebinds the constant set at the top of the notebook, so re-running an
# earlier cell after this one will use 3500 rows instead of the original value.
NUM_OF_ROWS = 3_500

In [32]:
# Load the second set of DoS sample rows.
# NOTE: this rebinds dosSamples, so the first sample set is no longer available
# under that name after this cell has run.
dosSamples = pd.read_csv(filepath_or_buffer='dos_samples_2.csv')
print(f'Dataset Shape: {dosSamples.shape}')
dosSamples

Dataset Shape: (8, 26)


Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1,60.0,60,60,0.0,0.0,251472,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9672,0,0,0.113266,85391.915937,0.002451,1.2e-05,7.7e-05
1,1,60.0,60,60,0.0,0.0,259818,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9993,0,0,0.1256,79562.038842,0.013888,1.3e-05,0.000176
2,1,60.0,60,60,0.0,0.0,259558,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9983,0,0,0.122131,81740.025636,0.001796,1.2e-05,7e-05
3,1,60.0,60,60,0.0,0.0,250952,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9652,0,0,0.155204,62189.191252,0.002296,1.6e-05,8.3e-05
4,1,60.0,60,60,0.0,0.0,259844,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9994,0,0,0.150346,66473.316835,0.005427,1.5e-05,0.0001
5,1,60.0,60,60,0.0,0.0,258440,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9940,0,0,0.146744,67737.00547,0.001699,1.5e-05,8e-05
6,1,60.0,60,60,0.0,0.0,253734,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9759,0,0,0.133626,73032.202973,0.003685,1.4e-05,8.5e-05
7,1,60.0,60,60,0.0,0.0,259740,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9990,0,0,0.150142,66537.031827,0.013583,1.5e-05,0.000234


In [33]:
# Rebuild {column: (lower, upper)} sampling bounds from the current dosSamples:
# 85% of the column minimum up to 110% of the column maximum.
# columnsToGather is not recomputed in this cell; it is whatever an earlier cell
# left in the kernel, so a column that is all zeros in these samples still gets
# an entry (with 0.0 bounds).
MinMaxDict = {}
for col in columnsToGather:
    observed = dosSamples[col]
    MinMaxDict[col] = (observed.min() * 0.85, observed.max() * 1.1)

# 'Number of Ports' is pinned to exactly 1 instead of being widened.
MinMaxDict['Number of Ports'] = (1, 1)
MinMaxDict

{'Number of Ports': (1, 1),
 'Average Packet Length': (np.float64(51.0), np.float64(66.0)),
 'Packet Length Min': (np.float64(51.0), np.float64(66.0)),
 'Packet Length Max': (np.float64(51.0), np.float64(66.0)),
 'Total Length of Fwd Packet': (np.float64(213309.19999999998),
  np.float64(285828.4)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Segment Size Avg': (np.float64(5.1), np.float64(6.6000000000000005)),
 'Subflow Fwd Bytes': (np.float64(0.0), np.float64(0.0)),
 'SYN Flag Count': (np.float64(8204.199999999999),
  np.float64(10993.400000000001)),
 'Flow Duration': (np.float64(0.09627609252929688),
  np.float64(0.17072420120239254)),
 'Packets Per Second': (np.float64(52860.81256459571),
  np.float64(93931.107530648)),
 'IAT Max': (np.float64(0.00144432783126828),
  np.float64(0.01527693

In [34]:
# The 0.85 / 1.1 scaling turns every bound into a float; columns listed in
# intColumns hold whole-number counts, so truncate their bounds back to int.
intColumns = ['SYN Flag Count']
MinMaxDict.update({
    name: tuple(int(bound) for bound in bounds)
    for name, bounds in MinMaxDict.items()
    if name in intColumns
})
MinMaxDict

{'Number of Ports': (1, 1),
 'Average Packet Length': (np.float64(51.0), np.float64(66.0)),
 'Packet Length Min': (np.float64(51.0), np.float64(66.0)),
 'Packet Length Max': (np.float64(51.0), np.float64(66.0)),
 'Total Length of Fwd Packet': (np.float64(213309.19999999998),
  np.float64(285828.4)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(28.6)),
 'Fwd Segment Size Avg': (np.float64(5.1), np.float64(6.6000000000000005)),
 'Subflow Fwd Bytes': (np.float64(0.0), np.float64(0.0)),
 'SYN Flag Count': (8204, 10993),
 'Flow Duration': (np.float64(0.09627609252929688),
  np.float64(0.17072420120239254)),
 'Packets Per Second': (np.float64(52860.81256459571),
  np.float64(93931.107530648)),
 'IAT Max': (np.float64(0.00144432783126828),
  np.float64(0.015276932716369611)),
 'IAT Mean': (np.float64(9.95513313300

In [35]:
# Pre-allocate the synthetic frame: NUM_OF_ROWS rows of 0.0 under the same
# column labels as the sample data. Later cells overwrite the columns.
frame_shape = (NUM_OF_ROWS, len(dosSamples.columns))
dosDataset2 = pd.DataFrame(data=np.zeros(frame_shape), columns=dosSamples.columns)
dosDataset2.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Every sample column that is not listed in columnsToGather is filled with a
# constant integer 0 in the synthetic frame.
zeroColumns = list(filter(lambda name: name not in columnsToGather, dosSamples.columns))
for name in zeroColumns:
    dosDataset2[name] = 0
zeroColumns

['Packet Length Std',
 'Packet Length Variance',
 'Fwd Packet Length Std',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Bwd Segment Size Avg',
 'ACK Flag Count',
 'RST Flag Count']

In [37]:
# These three packet-length columns carry the same value within a row, so a
# single random integer vector is drawn once and written to all of them.
same_value = ['Average Packet Length', 'Packet Length Min', 'Packet Length Max']
pkt_len_low, pkt_len_high = MinMaxDict[same_value[0]]

# np.random.randint is documented for integer bounds (high is exclusive).
# The stored bounds are floats, so truncate them explicitly instead of relying
# on an implicit float -> int conversion; the upper bound gets an extra 10%.
val = np.random.randint(int(pkt_len_low), int(pkt_len_high * 1.1), NUM_OF_ROWS)

for col in same_value:
    dosDataset2[col] = val

In [38]:
# The three forward-packet-length columns also share one value per row, so a
# single random integer vector is drawn once and written to all of them.
same_value2 = ['Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Min']
fwd_len_low, fwd_len_high = MinMaxDict[same_value2[0]]

# np.random.randint is documented for integer bounds (high is exclusive).
# The stored bounds are floats, so truncate them explicitly instead of relying
# on an implicit float -> int conversion; the upper bound gets an extra 25%.
val2 = np.random.randint(int(fwd_len_low), int(fwd_len_high * 1.25), NUM_OF_ROWS)

for col in same_value2:
    dosDataset2[col] = val2

In [39]:
# Columns that are sampled directly from their (widened) MinMaxDict envelope.
seg_low, seg_high = MinMaxDict['Fwd Segment Size Avg']
dur_low, dur_high = MinMaxDict['Flow Duration']
syn_low, syn_high = MinMaxDict['SYN Flag Count']

# np.random.randint is documented for integer bounds (high is exclusive), so the
# scaled float bounds are truncated explicitly rather than passed as floats.
dosDataset2['Fwd Segment Size Avg'] = np.random.randint(int(seg_low * 0.9), int(seg_high * 1.5), NUM_OF_ROWS)
# Flow duration is continuous: uniform draw over the envelope widened by 5% on each side.
dosDataset2['Flow Duration'] = np.random.uniform(dur_low * 0.95, dur_high * 1.05, NUM_OF_ROWS)
# Constant columns: always 1 port and 0 subflow forward bytes.
dosDataset2['Number of Ports'] = np.full(shape=NUM_OF_ROWS, fill_value=1, dtype=int)
dosDataset2['Subflow Fwd Bytes'] = np.full(shape=NUM_OF_ROWS, fill_value=0, dtype=int)
dosDataset2['SYN Flag Count'] = np.random.randint(int(syn_low * 0.9), int(syn_high * 1.1), NUM_OF_ROWS)

In [40]:
# Estimate how 'Total Length of Fwd Packet' scales with 'SYN Flag Count' in the
# sample data, then use that ratio (with random jitter) to fill the synthetic column.
first_correlation = ['SYN Flag Count', 'Total Length of Fwd Packet']
independent_col = dosSamples[first_correlation[0]].values.reshape(-1, 1)  # 'SYN Flag Count' as a single-column matrix
dependent_cols = dosSamples[first_correlation[1]].values  # 'Total Length of Fwd Packet'

# Least-squares fit with no intercept: dependent ~= factor * independent
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name, factor) for name, factor in zip(first_correlation[1:], scaling_factors.flatten())]
for name_and_factor in scaling_factors:
    print(name_and_factor)

# Fill each dependent column for all rows at once instead of writing one cell at
# a time through iterrows()/.loc. Every row gets its own jitter: the factor is
# moved up or down (chosen at random) by 5%-25% of its value.
syn_counts = dosDataset2['SYN Flag Count'].to_numpy()
for col, factor in scaling_factors:
    delta = np.random.uniform(factor * 0.05, factor * 0.25, size=len(syn_counts))
    sign = np.random.choice([-1, 1], size=len(syn_counts))
    # np.trunc drops the fractional part (toward zero) while the column stays float.
    dosDataset2[col] = np.trunc(syn_counts * (factor + sign * delta))

('Total Length of Fwd Packet', np.float64(26.000000000000007))


In [41]:
# Fit, with no intercept, how each timing feature scales with 'Flow Duration'
# in the sample data. The first name is the predictor, the rest are targets.
secondCorrelation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']
predictor_name, *target_names = secondCorrelation
independent_col = dosSamples[predictor_name].values.reshape(-1, 1)  # 'Flow Duration' as a single-column matrix
dependent_cols = dosSamples[target_names].values  # one column per target feature

# lstsq returns several values; only the solution (first element) is kept
solution = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = list(zip(target_names, solution.flatten()))
for pair in scaling_factors:
    print(pair)

# Mean over the samples of (Flow Duration * Packets Per Second)
durationToPacketsCorr = np.mean(
    dosSamples['Flow Duration'].values * dosSamples['Packets Per Second'].values
)
durationToPacketsCorr

('Packets Per Second', np.float64(519129.6051044216))
('IAT Max', np.float64(0.040715922209246726))
('IAT Mean', np.float64(0.00010129001644309197))
('IAT Std', np.float64(0.000825544773212792))


np.float64(9872.875)

In [42]:
# Fill the timing features of the synthetic rows from each row's 'Flow Duration'.
# For every (column, factor) pair a base value is moved up or down (chosen at
# random per row) by a random fraction of itself:
#   * 'Packets Per Second' uses durationToPacketsCorr as the base and is
#     computed as base / Flow Duration;
#   * every other column uses its fitted factor as the base and is computed as
#     Flow Duration * base.
# All rows are filled at once instead of writing one cell at a time through
# iterrows()/.loc.

# (lower, upper) fraction of the base value used as the jitter magnitude.
jitter_fractions = {
    'Packets Per Second': (0.025, 0.075),
    'IAT Std': (0.1, 0.35),
    'IAT Max': (0.15, 0.7),
}
default_jitter = (0.05, 0.2)  # used for any column not listed above

flow_duration = dosDataset2['Flow Duration'].to_numpy()
for col, factor in scaling_factors:
    is_rate = col == 'Packets Per Second'
    base = durationToPacketsCorr if is_rate else factor
    low_frac, high_frac = jitter_fractions.get(col, default_jitter)

    delta = np.random.uniform(base * low_frac, base * high_frac, size=len(flow_duration))
    sign = np.random.choice([-1, 1], size=len(flow_duration))
    jittered = base + sign * delta

    if is_rate:
        dosDataset2[col] = jittered / flow_duration
    else:
        dosDataset2[col] = flow_duration * jittered

In [43]:
# Display the sample frame for reference.
dosSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1,60.0,60,60,0.0,0.0,251472,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9672,0,0,0.113266,85391.915937,0.002451,1.2e-05,7.7e-05
1,1,60.0,60,60,0.0,0.0,259818,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9993,0,0,0.1256,79562.038842,0.013888,1.3e-05,0.000176
2,1,60.0,60,60,0.0,0.0,259558,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9983,0,0,0.122131,81740.025636,0.001796,1.2e-05,7e-05
3,1,60.0,60,60,0.0,0.0,250952,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9652,0,0,0.155204,62189.191252,0.002296,1.6e-05,8.3e-05
4,1,60.0,60,60,0.0,0.0,259844,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9994,0,0,0.150346,66473.316835,0.005427,1.5e-05,0.0001
5,1,60.0,60,60,0.0,0.0,258440,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9940,0,0,0.146744,67737.00547,0.001699,1.5e-05,8e-05
6,1,60.0,60,60,0.0,0.0,253734,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9759,0,0,0.133626,73032.202973,0.003685,1.4e-05,8.5e-05
7,1,60.0,60,60,0.0,0.0,259740,26,26.0,26,0.0,0,0.0,0,0.0,6.0,0.0,0.0,9990,0,0,0.150142,66537.031827,0.013583,1.5e-05,0.000234


In [44]:
# Summary statistics (count, mean, std, min, quartiles, max) of the generated frame.
dosDataset2.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0
mean,1.0,61.091714,61.091714,61.091714,0.0,0.0,253040.994286,27.894,27.894,27.894,0.0,0.0,0.0,0.0,0.0,5.993143,0.0,0.0,9744.11,0.0,0.0,0.135026,75561.767591,0.005481,1.4e-05,0.000111
std,0.0,6.07958,6.07958,6.07958,0.0,0.0,53605.80764,3.797446,3.797446,3.797446,0.0,0.0,0.0,0.0,0.0,1.422057,0.0,0.0,1351.323427,0.0,0.0,0.025126,15188.838244,0.002726,3e-06,3.4e-05
min,1.0,51.0,51.0,51.0,0.0,0.0,145814.0,22.0,22.0,22.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,7383.0,0.0,0.0,0.09147,51017.945639,0.001151,8e-06,5e-05
25%,1.0,56.0,56.0,56.0,0.0,0.0,213075.5,25.0,25.0,25.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,8599.0,0.0,0.0,0.113464,63094.446696,0.003049,1.1e-05,8.6e-05
50%,1.0,61.0,61.0,61.0,0.0,0.0,247336.0,28.0,28.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,9746.0,0.0,0.0,0.134709,72987.715787,0.00513,1.4e-05,0.000106
75%,1.0,66.0,66.0,66.0,0.0,0.0,290695.0,31.0,31.0,31.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,10889.0,0.0,0.0,0.156577,86788.440694,0.00774,1.6e-05,0.000135
max,1.0,71.0,71.0,71.0,0.0,0.0,390800.0,34.0,34.0,34.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,12091.0,0.0,0.0,0.179248,115711.419883,0.012271,2.2e-05,0.000199


In [45]:
# Inspect generated rows whose 'Flow Duration' lies strictly between 0.16 and
# 0.17; show at most the first 20 of them.
x = dosDataset2.loc[dosDataset2['Flow Duration'] > 0.16]
x.loc[x['Flow Duration'] < 0.17].head(20)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
1,1,57,57,57,0,0,195114.0,33,33,33,0,0,0,0,0,6,0,0,9714,0,0,0.166688,57573.855623,0.009306,1.4e-05,0.000114
22,1,61,61,61,0,0,186366.0,28,28,28,0,0,0,0,0,6,0,0,9170,0,0,0.1648,63467.113975,0.002604,1.4e-05,0.000109
26,1,58,58,58,0,0,347168.0,26,26,26,0,0,0,0,0,6,0,0,11159,0,0,0.161816,56446.375445,0.003686,1.9e-05,0.000104
70,1,53,53,53,0,0,242772.0,25,25,25,0,0,0,0,0,4,0,0,11400,0,0,0.162028,58440.98417,0.007877,1.4e-05,0.000112
71,1,51,51,51,0,0,285255.0,23,23,23,0,0,0,0,0,5,0,0,9390,0,0,0.166631,63375.264191,0.00822,1.9e-05,0.00011
82,1,53,53,53,0,0,181497.0,23,23,23,0,0,0,0,0,7,0,0,8319,0,0,0.165354,56528.048248,0.004512,1.9e-05,0.000166
92,1,57,57,57,0,0,213778.0,29,29,29,0,0,0,0,0,4,0,0,10764,0,0,0.162572,64884.305585,0.009907,1.9e-05,0.00015
94,1,64,64,64,0,0,184013.0,30,30,30,0,0,0,0,0,6,0,0,8832,0,0,0.168739,56713.05712,0.010406,2e-05,0.000177
116,1,56,56,56,0,0,202728.0,28,28,28,0,0,0,0,0,8,0,0,10077,0,0,0.160216,64596.479542,0.009977,1.8e-05,0.000158
121,1,71,71,71,0,0,215195.0,32,32,32,0,0,0,0,0,8,0,0,9524,0,0,0.160864,65135.376085,0.009387,1.4e-05,0.000112


In [46]:
# Add a 'Label' column holding ATTACK_NAME in every generated row.
dosDataset2['Label'] = ATTACK_NAME

In [47]:
# Stack the two generated frames row-wise, shuffle all rows with a fixed seed,
# and renumber the index from 0.
mergedDosDataset = (
    pd.concat([dosDataset, dosDataset2], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(mergedDosDataset.shape)

(15000, 27)


In [None]:
# save the dataset
# (currently disabled: uncomment the next line to write mergedDosDataset to CSV without the index column)
# mergedDosDataset.to_csv('dos_hping_dataset_new.csv', index=False)

---