In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

NUM_OF_ROWS = 6000
ATTACK_NAME = 'PortScan'

In [65]:
pd.set_option('display.max_columns', None)  # Show all columns

---

In [66]:
# import the attack sample dataset
portSamples = pd.read_csv('portscan_open_port_samples_1.csv')
portSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4990,57.006307,54,60,2.999593,8.997558,130078,26,26.0,26,0.0,24,20.002407,20,0.098088,2.002399,0,61.357547,5003,4986,4986,1.460154,6841.058966,1.012127,0.000146,0.010134
1,5003,57.007603,54,60,2.999457,8.996741,130208,26,26.0,26,0.0,24,20.003208,20,0.113228,2.003195,0,33.890682,5008,4988,4988,1.520292,6575.052498,1.009149,0.000152,0.010104
2,4985,56.998799,54,60,3.0,8.999999,129792,26,26.0,26,0.0,20,20.0,20,0.0,2.0,0,178.285714,4992,4996,4996,1.420907,7029.312865,0.961251,0.000142,0.009626
3,4937,57.012343,54,60,2.999573,8.997439,130026,26,26.0,26,0.0,24,20.002417,20,0.098305,2.0024,0,31.853503,5001,4964,4964,14.153931,704.044686,10.852338,0.001421,0.110534
4,4995,57.0049,54,60,2.999863,8.999176,130182,26,26.0,26,0.0,24,20.000801,20,0.056608,2.000799,0,34.882637,5007,4992,4992,1.527861,6544.443,1.007949,0.000153,0.01009
5,5020,57.015505,54,60,2.999693,8.998159,130598,26,26.0,26,0.0,24,20.001608,20,0.080193,2.001593,0,219.124161,5023,4974,4974,1.943438,5143.976667,1.0682,0.000194,0.011385
6,4962,57.004834,54,60,2.999593,8.99756,129246,26,26.0,26,0.0,24,20.00242,20,0.098354,2.002414,0,50.964511,4971,4959,4959,1.45254,6836.2998,1.008287,0.000146,0.010125
7,5003,57.012301,54,60,2.999975,8.999849,130520,26,26.0,26,0.0,20,20.0,20,0.0,2.0,0,0.0,5020,4979,4979,0.986577,10135.04233,0.504682,9.9e-05,0.00508
8,4968,57.007404,54,60,2.999324,8.995943,130156,26,26.0,26,0.0,24,20.00401,20,0.12658,2.003995,0,29.487087,5006,4988,4988,6.943519,1439.327787,2.646078,0.000695,0.034648
9,5001,57.003902,54,60,2.999997,8.999985,130104,26,26.0,26,0.0,20,20.0,20,0.0,2.0,0,33.155963,5004,4991,4991,1.93033,5177.871039,1.005448,0.000193,0.0113


In [67]:
portSamples.drop(index=17, inplace=True)
portSamples.reset_index(drop=True, inplace=True)
portSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4990,57.006307,54,60,2.999593,8.997558,130078,26,26.0,26,0.0,24,20.002407,20,0.098088,2.002399,0,61.357547,5003,4986,4986,1.460154,6841.058966,1.012127,0.000146,0.010134
1,5003,57.007603,54,60,2.999457,8.996741,130208,26,26.0,26,0.0,24,20.003208,20,0.113228,2.003195,0,33.890682,5008,4988,4988,1.520292,6575.052498,1.009149,0.000152,0.010104
2,4985,56.998799,54,60,3.0,8.999999,129792,26,26.0,26,0.0,20,20.0,20,0.0,2.0,0,178.285714,4992,4996,4996,1.420907,7029.312865,0.961251,0.000142,0.009626
3,4937,57.012343,54,60,2.999573,8.997439,130026,26,26.0,26,0.0,24,20.002417,20,0.098305,2.0024,0,31.853503,5001,4964,4964,14.153931,704.044686,10.852338,0.001421,0.110534
4,4995,57.0049,54,60,2.999863,8.999176,130182,26,26.0,26,0.0,24,20.000801,20,0.056608,2.000799,0,34.882637,5007,4992,4992,1.527861,6544.443,1.007949,0.000153,0.01009
5,5020,57.015505,54,60,2.999693,8.998159,130598,26,26.0,26,0.0,24,20.001608,20,0.080193,2.001593,0,219.124161,5023,4974,4974,1.943438,5143.976667,1.0682,0.000194,0.011385
6,4962,57.004834,54,60,2.999593,8.99756,129246,26,26.0,26,0.0,24,20.00242,20,0.098354,2.002414,0,50.964511,4971,4959,4959,1.45254,6836.2998,1.008287,0.000146,0.010125
7,5003,57.012301,54,60,2.999975,8.999849,130520,26,26.0,26,0.0,20,20.0,20,0.0,2.0,0,0.0,5020,4979,4979,0.986577,10135.04233,0.504682,9.9e-05,0.00508
8,4968,57.007404,54,60,2.999324,8.995943,130156,26,26.0,26,0.0,24,20.00401,20,0.12658,2.003995,0,29.487087,5006,4988,4988,6.943519,1439.327787,2.646078,0.000695,0.034648
9,5001,57.003902,54,60,2.999997,8.999985,130104,26,26.0,26,0.0,20,20.0,20,0.0,2.0,0,33.155963,5004,4991,4991,1.93033,5177.871039,1.005448,0.000193,0.0113


In [68]:
# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = portSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values
columnsToGather #left with all columns that the values are not 0 (be know for a fact that the data is consistant and there are not missing values in the rows)

['Number of Ports',
 'Average Packet Length',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Std',
 'Packet Length Variance',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min',
 'Fwd Packet Length Std',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Fwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'SYN Flag Count',
 'ACK Flag Count',
 'RST Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [69]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (portSamples[col].min() * 0.85, portSamples[col].max() * 1.075) for col in columnsToGather}
MinMaxDict

{'Number of Ports': (np.float64(2476.9), np.float64(5396.5)),
 'Average Packet Length': (np.float64(48.448978776),
  np.float64(68.9408523695)),
 'Packet Length Min': (np.float64(45.9), np.float64(58.05)),
 'Packet Length Max': (np.float64(51.0), np.float64(79.55)),
 'Packet Length Std': (np.float64(2.54942516115), np.float64(10.7459608509)),
 'Packet Length Variance': (np.float64(7.646551357049999),
  np.float64(107.41923218625)),
 'Total Length of Fwd Packet': (np.float64(102765.0), np.float64(216367.4)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(43.0)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998),
  np.float64(42.9897476605)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(34.4)),
 'Fwd Packet Length Std': (np.float64(0.0), np.float64(5.192345685749999)),
 'Bwd Packet Length Max': (np.float64(17.0), np.float64(43.0)),
 'Bwd Packet Length Mean': (np.float64(17.0), np.float64(21.514086810749998)),
 'Bwd Packet Length Mi

In [70]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Total Length of Fwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (2476, 5396),
 'Average Packet Length': (np.float64(48.448978776),
  np.float64(68.9408523695)),
 'Packet Length Min': (45, 58),
 'Packet Length Max': (51, 79),
 'Packet Length Std': (np.float64(2.54942516115), np.float64(10.7459608509)),
 'Packet Length Variance': (np.float64(7.646551357049999),
  np.float64(107.41923218625)),
 'Total Length of Fwd Packet': (102765, 216367),
 'Fwd Packet Length Max': (22, 43),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998),
  np.float64(42.9897476605)),
 'Fwd Packet Length Min': (22, 34),
 'Fwd Packet Length Std': (np.float64(0.0), np.float64(5.192345685749999)),
 'Bwd Packet Length Max': (17, 43),
 'Bwd Packet Length Mean': (np.float64(17.0), np.float64(21.514086810749998)),
 'Bwd Packet Length Min': (17, 21),
 'Bwd Packet Length Std': (np.float64(0.0), np.float64(0.536213333575)),
 'Fwd Segment Size Avg': (np.float64(0.0), np.float64(2.15429484645)),
 'Subflow Fwd Bytes': (np.float64(0.0), np.float64(1547.23214255)),


### Create the dataframe

In [71]:
# creating an empty dataframe before adding values to it
portDataset = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(portSamples.columns))), columns=portSamples.columns)
portDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# adding zeros to all columns that should not have any values
zeroColumns = [col for col in portSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    portDataset[col] = int(0)
zeroColumns

['Bwd Segment Size Avg']

In [73]:
portDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Independant Columns

In [74]:
independant = ['Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean']

packet_length_max = np.random.randint(MinMaxDict['Fwd Packet Length Max'][0] * 0.9, MinMaxDict['Fwd Packet Length Max'][1] * 1.1, NUM_OF_ROWS)

# Probability of doing X (30%) and Y (70%)
probability = [0.25, 0.75]

# Decide whether to use backward flags (True or False) based on the probability for each row
copy_values = np.random.choice([True, False], size=NUM_OF_ROWS, p=probability)

# Create 'Average Packet Length' and 'Packet Length Min' based on 'Packet Length Max'
packet_length_min = np.where(copy_values, packet_length_max, packet_length_max + np.random.uniform(-4, 4, NUM_OF_ROWS))
packet_length_min = np.minimum(packet_length_min, packet_length_max)

# If True, copy the 'Packet Length Max' values; if False, apply small variation
average_packet_length = np.where(packet_length_max != packet_length_min, (packet_length_max + packet_length_min) / 2, packet_length_min)

# Assign the values to the dataset
portDataset['Fwd Packet Length Max'] = packet_length_max.astype(int)
portDataset['Fwd Packet Length Mean'] = average_packet_length
portDataset['Fwd Packet Length Min'] = packet_length_min.astype(int)

In [75]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21,19.988608,18,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42,42.000000,42,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25,23.055677,21,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29,29.000000,29,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29,29.000000,29,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46,46.000000,46,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41,41.000000,41,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35,35.000000,35,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32,31.658664,31,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
independent = ['Number of Ports', 'Average Packet Length', 'Packet Length Max', 'Bwd Packet Length Max', 'Subflow Fwd Bytes', 'Bwd Packet Length Mean']

# Generate 'Bwd Packet Length Min' values
bwd_min_low, bwd_min_high = MinMaxDict['Bwd Packet Length Min']
bwd_min_values = np.random.randint(bwd_min_low * 0.9, bwd_min_high * 1.05, size=NUM_OF_ROWS)

for col in independent:
    if col == 'Bwd Packet Length Mean':
        rand_values = np.random.uniform(MinMaxDict[col][0]*0.995, MinMaxDict[col][1] * 1.005, NUM_OF_ROWS)
    else:
        rand_values = np.random.randint(MinMaxDict[col][0] * 0.9, MinMaxDict[col][1] * 1.1, NUM_OF_ROWS)

    portDataset[col] = rand_values

# Ensure that Bwd Packet Length Max is always >= Bwd Packet Length Min
portDataset['Bwd Packet Length Min'] = bwd_min_values
portDataset['Bwd Packet Length Max'] = np.maximum(bwd_min_values, portDataset['Bwd Packet Length Max'])  # Fix inconsistencies

# Ensure that Bwd Packet Length Max is always > Bwd Packet Length Mean > Bwd Packet Length Min
invalid_rows = portDataset['Bwd Packet Length Mean'] > portDataset['Bwd Packet Length Max']

# Compute the correct mean for those rows
corrected_means = (portDataset.loc[invalid_rows, 'Bwd Packet Length Min'] + 
                   portDataset.loc[invalid_rows, 'Bwd Packet Length Max']) / 2

# Update only the invalid rows
portDataset.loc[invalid_rows, 'Bwd Packet Length Mean'] = corrected_means

portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,3570,50,0.0,61,0.0,0.0,0.0,21,19.988608,18,0.0,36,19.768188,21,0.0,0.0,0,230,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5912,72,0.0,55,0.0,0.0,0.0,42,42.000000,42,0.0,43,17.006091,20,0.0,0.0,0,581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3772,54,0.0,70,0.0,0.0,0.0,25,23.055677,21,0.0,28,20.692685,16,0.0,0.0,0,1093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5034,67,0.0,49,0.0,0.0,0.0,29,29.000000,29,0.0,21,20.494675,21,0.0,0.0,0,1499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3501,45,0.0,79,0.0,0.0,0.0,29,29.000000,29,0.0,36,17.023478,18,0.0,0.0,0,808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,5833,45,0.0,68,0.0,0.0,0.0,46,46.000000,46,0.0,27,20.350317,18,0.0,0.0,0,1688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5996,5557,53,0.0,67,0.0,0.0,0.0,41,41.000000,41,0.0,19,16.921111,17,0.0,0.0,0,995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5997,4194,55,0.0,67,0.0,0.0,0.0,35,35.000000,35,0.0,24,19.435125,17,0.0,0.0,0,318,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5998,5469,50,0.0,64,0.0,0.0,0.0,32,31.658664,31,0.0,35,20.086560,16,0.0,0.0,0,1026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
half_and_half = ['Packet Length Std', 'Packet Length Variance', 'Fwd Packet Length Std', 
                 'Flow Duration', 'Total Length of Fwd Packet', 'Bwd Packet Length Std', 'Fwd Segment Size Avg']

for col in half_and_half:
    # Generate random values from the uniform distribution (90% - 110% of min-max range)
    rand_values = np.random.uniform(MinMaxDict[col][0]*0.9, MinMaxDict[col][1]*1.1, NUM_OF_ROWS)
    
    # Generate alternative random values based on column-specific conditions
    if col == 'Packet Length Std':
        usual_values = np.random.uniform(2.9, 3.1, NUM_OF_ROWS)
    elif col == 'Packet Length Variance':
        usual_values = np.random.uniform(8.85, 9.15, NUM_OF_ROWS)
    elif col == 'Fwd Packet Length Std':
        rand_values = np.random.uniform(MinMaxDict[col][0], MinMaxDict[col][1]*1.1, NUM_OF_ROWS)
        usual_values = np.zeros(NUM_OF_ROWS)
    elif col == 'Flow Duration':
        rand_values = np.random.uniform(MinMaxDict[col][0]*0.85, MinMaxDict[col][1], NUM_OF_ROWS)
        usual_values = np.random.uniform(0.85, 8.597, NUM_OF_ROWS)
    elif col == 'Total Length of Fwd Packet':
        usual_values = np.random.randint(MinMaxDict[col][0]*0.9, 150000, NUM_OF_ROWS)
    elif col == 'Bwd Packet Length Std':
        rand_values = np.random.uniform(MinMaxDict[col][0], MinMaxDict[col][1]*1.1, NUM_OF_ROWS)
        usual_values = np.random.uniform(0.035, 0.15, NUM_OF_ROWS)
    elif col == 'Fwd Segment Size Avg':
        rand_values = np.random.uniform(MinMaxDict[col][0]*0.95, MinMaxDict[col][1]*1.05, NUM_OF_ROWS)
        usual_values = np.random.uniform(1.99, 2.01, NUM_OF_ROWS)

    # Choose values randomly (20% from rand_values, 80% from usual_values)
    chosen_values = np.where(np.random.rand(NUM_OF_ROWS) > 0.2, usual_values, rand_values)

    portDataset[col] = chosen_values

portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,3570,50,0.0,61,6.488154,9.122663,111236.000000,21,19.988608,18,0.000000,36,19.768188,21,0.043707,1.991511,0,230,0.0,0.0,0.0,6.734543,0.0,0.0,0.0,0.0
1,5912,72,0.0,55,3.019392,8.859341,115936.000000,42,42.000000,42,0.000000,43,17.006091,20,0.055332,2.003651,0,581,0.0,0.0,0.0,1.882942,0.0,0.0,0.0,0.0
2,3772,54,0.0,70,3.084480,97.810513,137965.455764,25,23.055677,21,4.664126,28,20.692685,16,0.050031,2.008204,0,1093,0.0,0.0,0.0,11.245720,0.0,0.0,0.0,0.0
3,5034,67,0.0,49,3.040142,8.888198,132929.000000,29,29.000000,29,0.000000,21,20.494675,21,0.048484,1.996689,0,1499,0.0,0.0,0.0,5.547774,0.0,0.0,0.0,0.0
4,3501,45,0.0,79,2.937269,9.144361,98596.644308,29,29.000000,29,0.000000,36,17.023478,18,0.039530,1.999360,0,808,0.0,0.0,0.0,2.783013,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,5833,45,0.0,68,2.911154,8.943153,133455.000000,46,46.000000,46,0.000000,27,20.350317,18,0.084079,1.929362,0,1688,0.0,0.0,0.0,6.844114,0.0,0.0,0.0,0.0
5996,5557,53,0.0,67,2.903249,8.900729,149771.000000,41,41.000000,41,0.000000,19,16.921111,17,0.117288,2.009920,0,995,0.0,0.0,0.0,5.873060,0.0,0.0,0.0,0.0
5997,4194,55,0.0,67,3.016435,9.018624,139292.000000,35,35.000000,35,0.000000,24,19.435125,17,0.076641,1.994841,0,318,0.0,0.0,0.0,1.159218,0.0,0.0,0.0,0.0
5998,5469,50,0.0,64,3.070927,8.922056,150687.500493,32,31.658664,31,0.000000,35,20.086560,16,0.131664,1.996076,0,1026,0.0,0.0,0.0,7.135865,0.0,0.0,0.0,0.0


### Same Values

In [78]:
# Generate random values for the 'Flow Duration' column
randValues = np.random.randint(MinMaxDict['Packet Length Min'][0]*0.9, MinMaxDict['Packet Length Min'][1]*1.05, size=NUM_OF_ROWS)

# Assign the random values
portDataset['Packet Length Min'] = randValues

portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,3570,50,49,61,6.488154,9.122663,111236.000000,21,19.988608,18,0.000000,36,19.768188,21,0.043707,1.991511,0,230,0.0,0.0,0.0,6.734543,0.0,0.0,0.0,0.0
1,5912,72,58,55,3.019392,8.859341,115936.000000,42,42.000000,42,0.000000,43,17.006091,20,0.055332,2.003651,0,581,0.0,0.0,0.0,1.882942,0.0,0.0,0.0,0.0
2,3772,54,41,70,3.084480,97.810513,137965.455764,25,23.055677,21,4.664126,28,20.692685,16,0.050031,2.008204,0,1093,0.0,0.0,0.0,11.245720,0.0,0.0,0.0,0.0
3,5034,67,48,49,3.040142,8.888198,132929.000000,29,29.000000,29,0.000000,21,20.494675,21,0.048484,1.996689,0,1499,0.0,0.0,0.0,5.547774,0.0,0.0,0.0,0.0
4,3501,45,55,79,2.937269,9.144361,98596.644308,29,29.000000,29,0.000000,36,17.023478,18,0.039530,1.999360,0,808,0.0,0.0,0.0,2.783013,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,5833,45,47,68,2.911154,8.943153,133455.000000,46,46.000000,46,0.000000,27,20.350317,18,0.084079,1.929362,0,1688,0.0,0.0,0.0,6.844114,0.0,0.0,0.0,0.0
5996,5557,53,58,67,2.903249,8.900729,149771.000000,41,41.000000,41,0.000000,19,16.921111,17,0.117288,2.009920,0,995,0.0,0.0,0.0,5.873060,0.0,0.0,0.0,0.0
5997,4194,55,56,67,3.016435,9.018624,139292.000000,35,35.000000,35,0.000000,24,19.435125,17,0.076641,1.994841,0,318,0.0,0.0,0.0,1.159218,0.0,0.0,0.0,0.0
5998,5469,50,48,64,3.070927,8.922056,150687.500493,32,31.658664,31,0.000000,35,20.086560,16,0.131664,1.996076,0,1026,0.0,0.0,0.0,7.135865,0.0,0.0,0.0,0.0


### Correlation

In [79]:
first_correlation = ['SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[first_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[first_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(first_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('ACK Flag Count', np.float64(0.9934935261651908))
('RST Flag Count', np.float64(0.9933592536718514))


In [80]:
# adding the rest of the attack feature values to the dataset at random based on the smaple data
portDataset['SYN Flag Count'] = np.random.randint(MinMaxDict['SYN Flag Count'][0]*0.85, MinMaxDict['SYN Flag Count'][1]*1.1, NUM_OF_ROWS)

for index, row in portDataset.iterrows():
    for col, factor in zip(first_correlation[1:], scaling_factors): #iterating over all rows we need to add values to except 'Number of Ports'
        # calculate a random small delta of the factor for adding some randomness
        delta = random.uniform(factor[1] * 0.01, factor[1] * 0.02) # select a dantom (how much is 1% and 2% from factor)

        # apply the randomness to the calculated number
        updatedFactor = factor[1] + (-1) * delta

        # calculate the value we want to add into the dataset in the given row
        portDataset.loc[index, col] = int(row['SYN Flag Count'] * updatedFactor)

In [81]:
second_correlation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[second_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[second_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(second_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Packets Per Second', np.float64(78.30849469959969))
('IAT Max', np.float64(0.6321809769264504))
('IAT Mean', np.float64(0.00010107180700798969))
('IAT Std', np.float64(0.006820131130425529))


In [82]:
durationToPacketsCorr = [x * y for x, y in zip(portSamples['Flow Duration'].values, portSamples['Packets Per Second'].values)]
durationToPacketsCorr = np.mean(durationToPacketsCorr)
durationToPacketsCorr

np.float64(9940.88888921109)

In [83]:
# adding the rest of the attack feature values to the dataset at random based on the smaple data
for index, row in portDataset.iterrows():
    for col, factor in zip(second_correlation[1:], scaling_factors): #iterating over all rows we need to add values to except 'Number of Ports'
        if col == 'Packets Per Second':
            delta = random.uniform(durationToPacketsCorr * 0.075, durationToPacketsCorr * 0.1) # select a delta
            updatedFactor = durationToPacketsCorr + random.choice([-1, 1]) * delta
            portDataset.loc[index, col] = updatedFactor / row['Flow Duration']
        else:
            # calculate a random small delta of the factor for adding some randomness
            delta = random.uniform(factor[1] * 0.01, factor[1] * 0.02) # select a dantom (how much is 1% and 2% from factor)

            # apply the randomness to the calculated number
            updatedFactor = factor[1] + random.choice([-1, 1]) * delta

            # calculate the value we want to add into the dataset in the given row
            portDataset.loc[index, col] = row['Flow Duration'] * updatedFactor

In [84]:
# adding a label to the dataset
portDataset['Label'] = ATTACK_NAME

In [85]:
portSamples.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0,18.0
mean,4848.222222,57.459618,54.0,61.555556,3.513727,15.084284,134114.666667,27.555556,26.884977,26.333333,0.283675,24.666667,20.002894,20.0,0.1139,1.874946,0.0,152.51972,4986.777778,4954.444444,4953.777778,5.742802,4944.269115,3.658145,0.000579,0.039199
std,491.206505,1.681917,0.0,4.527332,1.702665,21.622031,17063.62018,4.527332,3.302369,1.414214,1.136493,5.861138,0.003808,0.0,0.146877,0.472518,0.0,330.387193,85.082351,83.26091,83.743695,9.974609,2783.707692,6.380214,0.001009,0.068487
min,2914.0,56.998799,54.0,60.0,2.999324,8.995943,120900.0,26.0,26.0,26.0,0.0,20.0,20.0,20.0,0.0,0.0,0.0,0.0,4650.0,4641.0,4641.0,0.986577,235.28316,0.504682,9.9e-05,0.00508
25%,4963.5,57.004851,54.0,60.0,2.999574,8.997443,130084.5,26.0,26.0,26.0,0.0,21.0,20.0002,20.0,0.014152,2.0,0.0,33.339643,5001.5,4966.5,4966.5,1.463707,2663.73993,1.006845,0.000147,0.010095
50%,4998.0,57.007502,54.0,60.0,2.999775,8.998651,130195.0,26.0,26.0,26.0,0.0,24.0,20.002409,20.0,0.098137,2.001196,0.0,35.352346,5006.5,4982.5,4982.5,1.691956,5844.73954,1.010638,0.000172,0.010876
75%,5003.0,57.012333,54.0,60.0,2.999995,8.999973,130390.0,26.0,26.0,26.0,0.0,24.0,20.003212,20.0,0.113304,2.00241,0.0,149.053673,5012.0,4988.0,4988.0,4.097363,6808.495571,1.460001,0.00041,0.021448
max,5020.0,64.131025,54.0,74.0,9.996243,99.924867,201272.0,40.0,39.990463,32.0,4.830089,40.0,20.013104,20.0,0.498803,2.003995,0.0,1439.285714,5030.0,4996.0,4996.0,41.979205,10135.04233,25.286814,0.004251,0.276435


In [86]:
portDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,4104.975833,58.449667,49.525833,64.945333,3.820296,19.28844,129577.931762,32.588333,32.212271,31.648,0.548504,30.740333,19.040614,17.981667,0.134526,1.813167,0.0,842.0635,4643.306833,4543.417667,4542.545333,8.401852,2532.025295,5.312439,0.00085,0.05731
std,1069.244046,9.136504,5.770345,11.677403,2.040141,25.159791,29335.517834,8.060626,8.080106,8.172427,1.321334,8.95116,1.463635,2.004246,0.116261,0.475156,0.0,485.196661,745.89602,730.255757,729.730705,9.505513,2193.386884,6.01352,0.000961,0.064888
min,2228.0,43.0,40.0,45.0,2.294579,7.193303,92488.0,19.0,17.07015,15.0,0.0,15.0,15.0,15.0,0.000146,0.00591,0.0,0.0,3359.0,3271.0,3272.0,0.740757,198.734764,0.473858,7.4e-05,0.005151
25%,3186.5,51.0,45.0,55.0,2.958409,8.943149,108576.0,26.0,25.0,25.0,0.0,22.0,17.904185,16.0,0.068597,1.991783,0.0,431.0,3989.0,3905.75,3902.75,3.242581,1268.307279,2.045858,0.000327,0.022073
50%,4112.0,59.0,50.0,65.0,3.022271,9.034316,124644.0,33.0,32.0,32.0,0.0,31.0,19.0,18.0,0.102235,1.998399,0.0,843.5,4638.5,4539.0,4534.5,5.508206,1797.912381,3.48133,0.000557,0.037701
75%,5041.0,66.0,55.0,75.0,3.083851,9.129248,141239.25,40.0,39.0,39.0,0.0,39.0,20.222363,20.0,0.137212,2.004548,0.0,1252.0,5294.0,5182.25,5182.0,7.871045,3089.602869,4.973721,0.000794,0.053619
max,5934.0,74.0,59.0,85.0,11.816027,118.13405,237883.117053,46.0,46.0,46.0,5.701243,46.0,21.621289,21.0,0.589333,2.260774,0.0,1700.0,5946.0,5848.0,5842.0,45.099843,14700.78448,29.049538,0.004641,0.313105


In [87]:
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Total Length of Fwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for col in intColumns:
    portDataset[col] = portDataset[col].astype(int)

portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std,Label
0,3570,50,49,61,6.488154,9.122663,111236,21,19.988608,18,0.000000,36,19.768188,21,0.043707,1.991511,0,230,5284,5176,5172,6.734543,1608.620971,4.211153,0.000670,0.045114,PortScan
1,5912,72,58,55,3.019392,8.859341,115936,42,42.000000,42,0.000000,43,17.006091,20,0.055332,2.003651,0,581,4073,3999,3990,1.882942,4865.685705,1.176692,0.000194,0.012651,PortScan
2,3772,54,41,70,3.084480,97.810513,137965,25,23.055677,21,4.664126,28,20.692685,16,0.050031,2.008204,0,1093,4908,4807,4813,11.245720,816.952584,7.205803,0.001117,0.075518,PortScan
3,5034,67,48,49,3.040142,8.888198,132929,29,29.000000,29,0.000000,21,20.494675,21,0.048484,1.996689,0,1499,4447,4351,4335,5.547774,1928.757932,3.468613,0.000553,0.037275,PortScan
4,3501,45,55,79,2.937269,9.144361,98596,29,29.000000,29,0.000000,36,17.023478,18,0.039530,1.999360,0,808,3864,3793,3779,2.783013,3847.258991,1.739369,0.000277,0.019177,PortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,5833,45,47,68,2.911154,8.943153,133455,46,46.000000,46,0.000000,27,20.350317,18,0.084079,1.929362,0,1688,4526,4434,4415,6.844114,1576.071907,4.252928,0.000680,0.045939,PortScan
5996,5557,53,58,67,2.903249,8.900729,149771,41,41.000000,41,0.000000,19,16.921111,17,0.117288,2.009920,0,995,4057,3971,3972,5.873060,1820.851348,3.764703,0.000601,0.040733,PortScan
5997,4194,55,56,67,3.016435,9.018624,139292,35,35.000000,35,0.000000,24,19.435125,17,0.076641,1.994841,0,318,5822,5691,5668,1.159218,9428.500695,0.720920,0.000119,0.007806,PortScan
5998,5469,50,48,64,3.070927,8.922056,150687,32,31.658664,31,0.000000,35,20.086560,16,0.131664,1.996076,0,1026,3715,3650,3644,7.135865,1518.931815,4.559245,0.000710,0.049239,PortScan


---

In [88]:
# save the dataset
# portDataset.to_csv('port_scan_open_ports_type1_dataset_new.csv', index=False)

---

### Second sample dataset

---

In [89]:
NUM_OF_ROWS = 6000

In [90]:
# import the attack sample dataset
portSamples = pd.read_csv('portscan_open_port_samples_2.csv')
portSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,3198,59.99655,58,60,0.082993,0.006888,165516,26,26.0,26,0.0,24,24.0,24,0.0,2.006912,0.0,26.077832,6366,11,11,9.874222,645.82303,1.088071,0.001549,0.01638
1,4163,59.998305,58,60,0.058205,0.003388,214526,26,26.0,26,0.0,24,24.0,24,0.0,2.003394,0.0,28.954785,8251,7,7,9.240387,893.685517,0.05886,0.001119,0.005757
2,2728,59.998156,58,60,0.060706,0.003685,140842,26,26.0,26,0.0,24,24.0,24,0.0,2.003692,0.0,31.487145,5417,5,5,5.565601,974.198454,0.049144,0.001027,0.004635
3,3607,59.996941,58,60,0.078163,0.006109,186680,26,26.0,26,0.0,24,24.0,24,0.0,2.006128,0.0,26.047161,7180,11,11,10.963778,655.887048,1.096967,0.001525,0.015087
4,4047,59.998252,58,60,0.059109,0.003494,208000,26,26.0,26,0.0,24,24.0,24,0.0,2.0035,0.0,29.005717,8000,7,7,8.881072,901.580345,0.067206,0.001109,0.004755
5,3473,59.996815,58,60,0.079745,0.006359,179322,26,26.0,26,0.0,24,24.0,24,0.0,2.00638,0.0,26.060456,6897,11,11,10.75229,642.46779,1.097064,0.001557,0.015597
6,4015,59.998231,58,60,0.059455,0.003535,205582,26,26.0,26,0.0,24,24.0,24,0.0,2.003541,0.0,29.193695,7907,7,7,8.70982,908.629566,0.058229,0.001101,0.004645
7,3484,59.997114,58,60,0.075924,0.005765,179894,26,26.0,26,0.0,24,24.0,24,0.0,2.005781,0.0,26.048943,6919,10,10,10.701885,647.456033,1.092027,0.001545,0.016018
8,4038,59.998251,58,60,0.05912,0.003495,207922,26,26.0,26,0.0,24,24.0,24,0.0,2.003501,0.0,29.023171,7997,7,7,8.978749,891.438213,0.070668,0.001122,0.006004
9,3827,59.996868,58,60,0.079083,0.006254,198926,26,26.0,26,0.0,24,24.0,24,0.0,2.006274,0.0,26.044252,7651,12,12,10.75696,712.375993,1.095676,0.001404,0.014864


In [91]:
smallPortSamples = portSamples.iloc[[11, 12]]

portSamples.drop(index=11, inplace=True)
portSamples.drop(index=12, inplace=True)
portSamples.drop(index=17, inplace=True)
portSamples.reset_index(drop=True, inplace=True)
portSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,3198,59.99655,58,60,0.082993,0.006888,165516,26,26.0,26,0.0,24,24.0,24,0.0,2.006912,0.0,26.077832,6366,11,11,9.874222,645.82303,1.088071,0.001549,0.01638
1,4163,59.998305,58,60,0.058205,0.003388,214526,26,26.0,26,0.0,24,24.0,24,0.0,2.003394,0.0,28.954785,8251,7,7,9.240387,893.685517,0.05886,0.001119,0.005757
2,2728,59.998156,58,60,0.060706,0.003685,140842,26,26.0,26,0.0,24,24.0,24,0.0,2.003692,0.0,31.487145,5417,5,5,5.565601,974.198454,0.049144,0.001027,0.004635
3,3607,59.996941,58,60,0.078163,0.006109,186680,26,26.0,26,0.0,24,24.0,24,0.0,2.006128,0.0,26.047161,7180,11,11,10.963778,655.887048,1.096967,0.001525,0.015087
4,4047,59.998252,58,60,0.059109,0.003494,208000,26,26.0,26,0.0,24,24.0,24,0.0,2.0035,0.0,29.005717,8000,7,7,8.881072,901.580345,0.067206,0.001109,0.004755
5,3473,59.996815,58,60,0.079745,0.006359,179322,26,26.0,26,0.0,24,24.0,24,0.0,2.00638,0.0,26.060456,6897,11,11,10.75229,642.46779,1.097064,0.001557,0.015597
6,4015,59.998231,58,60,0.059455,0.003535,205582,26,26.0,26,0.0,24,24.0,24,0.0,2.003541,0.0,29.193695,7907,7,7,8.70982,908.629566,0.058229,0.001101,0.004645
7,3484,59.997114,58,60,0.075924,0.005765,179894,26,26.0,26,0.0,24,24.0,24,0.0,2.005781,0.0,26.048943,6919,10,10,10.701885,647.456033,1.092027,0.001545,0.016018
8,4038,59.998251,58,60,0.05912,0.003495,207922,26,26.0,26,0.0,24,24.0,24,0.0,2.003501,0.0,29.023171,7997,7,7,8.978749,891.438213,0.070668,0.001122,0.006004
9,3827,59.996868,58,60,0.079083,0.006254,198926,26,26.0,26,0.0,24,24.0,24,0.0,2.006274,0.0,26.044252,7651,12,12,10.75696,712.375993,1.095676,0.001404,0.014864


In [92]:
# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = portSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values
columnsToGather #left with all columns that the values are not 0 (be know for a fact that the data is consistant and there are not missing values in the rows)

['Number of Ports',
 'Average Packet Length',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Std',
 'Packet Length Variance',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Fwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'SYN Flag Count',
 'ACK Flag Count',
 'RST Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [93]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (portSamples[col].min() * 0.85, portSamples[col].max() * 1.075) for col in columnsToGather}
MinMaxDict

{'Number of Ports': (np.float64(2318.7999999999997), np.float64(4798.8)),
 'Average Packet Length': (np.float64(50.997067586639474),
  np.float64(64.49817752482441)),
 'Packet Length Min': (np.float64(49.3), np.float64(62.349999999999994)),
 'Packet Length Max': (np.float64(51.0), np.float64(64.5)),
 'Packet Length Std': (np.float64(0.04947392386561522),
  np.float64(0.08921783960191092)),
 'Packet Length Variance': (np.float64(0.0028796107560714003),
  np.float64(0.007404486421611374)),
 'Total Length of Fwd Packet': (np.float64(119715.7),
  np.float64(247581.09999999998)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(27.95)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(27.95)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(27.95)),
 'Bwd Packet Length Max': (np.float64(20.4), np.float64(25.799999999999997)),
 'Bwd Packet Length Mean': (np.float64(20.4), np.float64(25.799999999999997)),
 'Bwd Packet Length Min

In [94]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Total Length of Fwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (2318, 4798),
 'Average Packet Length': (np.float64(50.997067586639474),
  np.float64(64.49817752482441)),
 'Packet Length Min': (49, 62),
 'Packet Length Max': (51, 64),
 'Packet Length Std': (np.float64(0.04947392386561522),
  np.float64(0.08921783960191092)),
 'Packet Length Variance': (np.float64(0.0028796107560714003),
  np.float64(0.007404486421611374)),
 'Total Length of Fwd Packet': (119715, 247581),
 'Fwd Packet Length Max': (22, 27),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(27.95)),
 'Fwd Packet Length Min': (22, 27),
 'Bwd Packet Length Max': (20, 25),
 'Bwd Packet Length Mean': (np.float64(20.4), np.float64(25.799999999999997)),
 'Bwd Packet Length Min': (20, 25),
 'Fwd Segment Size Avg': (np.float64(1.7028844988486243),
  np.float64(2.157430097392397)),
 'Subflow Fwd Bytes': (np.float64(22.137614558785017),
  np.float64(33.84868097473731)),
 'SYN Flag Count': (4604, 9522),
 'ACK Flag Count': (4, 12),
 'RST Flag Count': (4, 

### Creating the dataframe

In [95]:
# creating an empty dataframe before adding values to it
portDataset2 = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(portSamples.columns))), columns=portSamples.columns)
portDataset2.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
# adding zeros to all columns that should not have any values
zeroColumns = [col for col in portSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    portDataset2[col] = int(0)
zeroColumns

['Fwd Packet Length Std', 'Bwd Packet Length Std', 'Bwd Segment Size Avg']

In [97]:
portSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,3198,59.99655,58,60,0.082993,0.006888,165516,26,26.0,26,0.0,24,24.0,24,0.0,2.006912,0.0,26.077832,6366,11,11,9.874222,645.82303,1.088071,0.001549,0.01638
1,4163,59.998305,58,60,0.058205,0.003388,214526,26,26.0,26,0.0,24,24.0,24,0.0,2.003394,0.0,28.954785,8251,7,7,9.240387,893.685517,0.05886,0.001119,0.005757
2,2728,59.998156,58,60,0.060706,0.003685,140842,26,26.0,26,0.0,24,24.0,24,0.0,2.003692,0.0,31.487145,5417,5,5,5.565601,974.198454,0.049144,0.001027,0.004635
3,3607,59.996941,58,60,0.078163,0.006109,186680,26,26.0,26,0.0,24,24.0,24,0.0,2.006128,0.0,26.047161,7180,11,11,10.963778,655.887048,1.096967,0.001525,0.015087
4,4047,59.998252,58,60,0.059109,0.003494,208000,26,26.0,26,0.0,24,24.0,24,0.0,2.0035,0.0,29.005717,8000,7,7,8.881072,901.580345,0.067206,0.001109,0.004755
5,3473,59.996815,58,60,0.079745,0.006359,179322,26,26.0,26,0.0,24,24.0,24,0.0,2.00638,0.0,26.060456,6897,11,11,10.75229,642.46779,1.097064,0.001557,0.015597
6,4015,59.998231,58,60,0.059455,0.003535,205582,26,26.0,26,0.0,24,24.0,24,0.0,2.003541,0.0,29.193695,7907,7,7,8.70982,908.629566,0.058229,0.001101,0.004645
7,3484,59.997114,58,60,0.075924,0.005765,179894,26,26.0,26,0.0,24,24.0,24,0.0,2.005781,0.0,26.048943,6919,10,10,10.701885,647.456033,1.092027,0.001545,0.016018
8,4038,59.998251,58,60,0.05912,0.003495,207922,26,26.0,26,0.0,24,24.0,24,0.0,2.003501,0.0,29.023171,7997,7,7,8.978749,891.438213,0.070668,0.001122,0.006004
9,3827,59.996868,58,60,0.079083,0.006254,198926,26,26.0,26,0.0,24,24.0,24,0.0,2.006274,0.0,26.044252,7651,12,12,10.75696,712.375993,1.095676,0.001404,0.014864


In [98]:
random_values = ['Average Packet Length', 'Packet Length Std', 'Packet Length Variance', 'Fwd Segment Size Avg', 'Subflow Fwd Bytes']

for col in random_values:
    if col == 'Subflow Fwd Bytes':
        val = np.random.uniform(MinMaxDict[col][0]*0.995, MinMaxDict[col][1]*1.005, size=NUM_OF_ROWS)
    else:
        val = np.random.uniform(MinMaxDict[col][0]*0.9, MinMaxDict[col][1]*1.1, size=NUM_OF_ROWS)
    portDataset2[col] = val

In [99]:
same_value1 = ['Packet Length Min', 'Packet Length Max']
val1 = np.random.randint(MinMaxDict[same_value1[0]][0]*0.9, MinMaxDict[same_value1[0]][1]*1.05, size=NUM_OF_ROWS)

same_value2 = ['Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Min']
val2 = np.random.randint(MinMaxDict[same_value2[0]][0]*0.9, MinMaxDict[same_value2[0]][1]*1.05, size=NUM_OF_ROWS)

same_value3 = ['Bwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Mean']
val3 = np.random.randint(MinMaxDict[same_value3[0]][0]*0.9, MinMaxDict[same_value3[0]][1]*1.05, size=NUM_OF_ROWS)

for col in same_value1:
    if col == 'Packet Length Min':
        portDataset2[col] = val1
    else:
        portDataset2[col] = [val + np.random.randint(2, 8) for val in val1]

for col in same_value2:
    portDataset2[col] = val2

for col in same_value3:
    portDataset2[col] = val3

In [100]:
first_correlation = ['Number of Ports', 'Total Length of Fwd Packet', 'SYN Flag Count', 'ACK Flag Count']

# finding the correlation between the SYN Flag Count column to the rest of the columns in order to create new data
independent_col = portSamples[first_correlation[0]].values.reshape(-1, 1) #column 'SYN Flag Count'
dependent_cols = portSamples[first_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(first_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Total Length of Fwd Packet', np.float64(51.603268694587804))
('SYN Flag Count', np.float64(1.9847411036379927))
('ACK Flag Count', np.float64(0.0022734815333307983))


In [101]:
portDataset2['Number of Ports'] = np.random.randint(MinMaxDict['Number of Ports'][0]*0.85, MinMaxDict['Number of Ports'][1]*1.1, NUM_OF_ROWS)

for index, row in portDataset2.iterrows():
    for col, factor in scaling_factors: #iterating over all rows we need to add values
        # calculate the value we want to add into the dataset in the given row
        delta = random.uniform(factor * 0.1, factor * 0.2) # select a delta
        updatedFactor = factor + random.choice([-1, 1]) * delta
        portDataset2.loc[index, col] = int(row['Number of Ports'] * updatedFactor)
        if col == 'ACK Flag Count':
            portDataset2.loc[index, 'RST Flag Count'] = int(row['Number of Ports'] * updatedFactor) #copy the value to RST column


In [102]:
# Generate random values for the 'Flow Duration' column
randValues = np.random.uniform(MinMaxDict['Flow Duration'][0]*0.9, MinMaxDict['Flow Duration'][1]*1.05, size=NUM_OF_ROWS)

# Assign the random values
portDataset2['Flow Duration'] = randValues

In [103]:
# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
secondCorrelation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']
independent_col = portSamples[secondCorrelation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[secondCorrelation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(secondCorrelation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

durationToPacketsCorr = [x * y for x, y in zip(portSamples['Flow Duration'].values, portSamples['Packets Per Second'].values)]
durationToPacketsCorr = np.mean(durationToPacketsCorr)
durationToPacketsCorr

('Packets Per Second', np.float64(81.07547767708505))
('IAT Max', np.float64(0.05919619316767066))
('IAT Mean', np.float64(0.0001340921963392785))
('IAT Std', np.float64(0.0010486295444490786))


np.float64(7516.125)

In [104]:
#iterating over all rows we need to add values
for index, row in portDataset2.iterrows():
    for col, factor in scaling_factors: 
        # calculate a random small delta of the factor for adding some randomness
        if col == 'Packets Per Second':
            delta = random.uniform(durationToPacketsCorr * 0.075, durationToPacketsCorr * 0.1) # select a delta
            updatedFactor = durationToPacketsCorr + random.choice([-1, 1]) * delta
            portDataset2.loc[index, col] = updatedFactor / row['Flow Duration']
        else:
            if col == 'IAT Std' or col == 'IAT Max':
                delta = random.uniform(factor * 0.5, factor * 0.8)
                updatedFactor = factor + random.choices([-1, 1], weights=[1, 2], k=1)[0] * delta  
            else:
                delta = random.uniform(factor * 0.1, factor * 0.2) # select a delta
                updatedFactor = factor + random.choice([-1, 1]) * delta
            portDataset2.loc[index, col] = row['Flow Duration'] * updatedFactor

In [105]:
portSamples.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0,16.0
mean,3782.0625,59.997635,58.0,60.0,0.068096,0.004724,195191.75,26.0,26.0,26.0,0.0,24.0,24.0,24.0,0.0,2.004736,0.0,27.88073,7507.375,8.75,8.75,9.529528,804.528544,0.518116,0.001279,0.009613
std,431.957478,0.000669,0.0,0.0,0.009604,0.001335,22068.30596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001342,0.0,1.769354,848.780998,2.081666,2.081666,1.421124,136.161725,0.526051,0.000226,0.005487
min,2728.0,59.99655,58.0,60.0,0.058205,0.003388,140842.0,26.0,26.0,26.0,0.0,24.0,24.0,24.0,0.0,2.003394,0.0,26.044252,5417.0,5.0,5.0,5.565601,642.46779,0.049144,0.001027,0.003061
25%,3570.25,59.996937,58.0,60.0,0.059371,0.003525,184359.5,26.0,26.0,26.0,0.0,24.0,24.0,24.0,0.0,2.003531,0.0,26.055982,7090.75,7.0,7.0,8.838259,650.235297,0.061723,0.001093,0.004727
50%,3921.0,59.998011,58.0,60.0,0.063039,0.003974,202254.0,26.0,26.0,26.0,0.0,24.0,24.0,24.0,0.0,2.003982,0.0,28.891462,7779.0,8.0,8.0,9.340837,892.561865,0.105977,0.001121,0.00665
75%,4052.75,59.998236,58.0,60.0,0.078206,0.006116,208747.5,26.0,26.0,26.0,0.0,24.0,24.0,24.0,0.0,2.006135,0.0,29.042916,8028.75,11.0,11.0,10.753457,915.198956,1.095999,0.001538,0.015236
max,4464.0,59.998305,58.0,60.0,0.082993,0.006888,230308.0,26.0,26.0,26.0,0.0,24.0,24.0,24.0,0.0,2.006912,0.0,31.487145,8858.0,12.0,12.0,11.127804,974.198454,1.098471,0.001557,0.01638


In [106]:
portDataset2.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0,6000.0
mean,3613.028,58.4116,54.030667,58.563167,0.071251,0.005401,186506.654,22.976833,22.976833,22.976833,0.0,21.506,21.506,21.506,0.0,1.953806,0.0,27.992281,7194.998167,7.717167,7.717167,8.403936,977.962065,0.60374,0.001128,0.010658
std,952.978497,7.146114,6.027419,6.27651,0.015562,0.001607,57305.163406,2.581745,2.581745,2.581745,0.0,2.303443,2.303443,2.303443,0.0,0.243213,0.0,3.457435,2215.829463,2.53599,2.53599,2.378525,321.234403,0.361512,0.000367,0.006447
min,1970.0,45.908241,44.0,46.0,0.04454,0.002593,82281.0,19.0,19.0,19.0,0.0,18.0,18.0,18.0,0.0,1.532733,0.0,22.031027,3146.0,3.0,3.0,4.259694,539.979964,0.050907,0.000465,0.000943
25%,2790.0,52.327416,49.0,53.0,0.057691,0.003998,140656.25,21.0,21.0,21.0,0.0,19.0,19.0,19.0,0.0,1.74522,0.0,25.021268,5410.0,6.0,6.0,6.318483,719.097911,0.222919,0.000833,0.0038
50%,3587.5,58.341712,54.0,59.0,0.07123,0.005431,183161.5,23.0,23.0,23.0,0.0,22.0,22.0,22.0,0.0,1.956425,0.0,27.967211,7016.5,8.0,8.0,8.39364,895.396728,0.613986,0.001107,0.010752
75%,4450.0,64.426588,59.0,64.0,0.08497,0.006794,223226.5,25.0,25.0,25.0,0.0,24.0,24.0,24.0,0.0,2.166654,0.0,30.943575,8650.0,9.0,9.0,10.468523,1182.543356,0.918197,0.001379,0.016204
max,5276.0,70.9416,64.0,71.0,0.098135,0.008144,324786.0,27.0,27.0,27.0,0.0,25.0,25.0,25.0,0.0,2.373081,0.0,34.016402,12511.0,14.0,14.0,12.560428,1927.728442,1.325951,0.002014,0.023603


In [107]:
# adding a label to the dataset
portDataset2['Label'] = ATTACK_NAME

In [108]:
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Total Length of Fwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for col in intColumns:
    portDataset2[col] = portDataset2[col].astype(int)

portDataset2['Fwd Packet Length Mean'] = portDataset2['Fwd Packet Length Mean'].astype(float)
portDataset2['Bwd Packet Length Mean'] = portDataset2['Bwd Packet Length Mean'].astype(float)

portDataset2

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std,Label
0,2794,54.872049,45,47,0.077043,0.008111,160594,19,19.0,19,0,23,23.0,23,0,2.132231,0,25.515568,4909,7,7,8.933326,765.880167,0.829897,0.001341,0.002409,PortScan
1,2499,46.850817,62,69,0.070230,0.007122,151461,22,22.0,22,0,24,24.0,24,0,1.809882,0,22.284008,5847,4,4,12.477972,556.376353,1.183853,0.001914,0.004432,PortScan
2,3090,49.360336,53,55,0.070385,0.003434,135795,20,20.0,20,0,19,19.0,19,0,1.553098,0,28.217070,5356,8,8,6.033483,1362.617575,0.564125,0.000708,0.002295,PortScan
3,2067,70.135152,60,63,0.052959,0.006806,123005,22,22.0,22,0,19,19.0,19,0,2.179775,0,24.334222,4690,4,4,11.141372,732.104951,1.113265,0.001331,0.005178,PortScan
4,3007,57.670632,46,53,0.066763,0.007178,126966,23,23.0,23,0,22,22.0,22,0,1.573986,0,22.215481,4875,8,8,12.288279,665.725037,1.129530,0.001425,0.020022,PortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,4188,65.704809,52,54,0.076606,0.007510,190749,19,19.0,19,0,22,22.0,22,0,2.234670,0,25.743280,7026,8,8,9.668225,845.740414,0.885718,0.001148,0.016258,PortScan
5996,3525,56.276542,62,66,0.074306,0.006492,157930,20,20.0,20,0,23,23.0,23,0,1.864899,0,29.495420,7905,9,9,8.539361,808.981546,0.760401,0.001358,0.015209,PortScan
5997,4335,66.937412,46,49,0.094518,0.006089,266720,20,20.0,20,0,22,22.0,22,0,1.906370,0,33.101514,7555,11,11,9.937619,686.126740,1.054984,0.001182,0.005044,PortScan
5998,3083,47.856033,59,66,0.057204,0.003915,190528,22,22.0,22,0,18,18.0,18,0,1.641143,0,32.050878,6798,5,5,8.073046,839.613617,0.768218,0.000880,0.014054,PortScan


---

### Creating more rows base on small subset of samples that is slightly different

In [109]:
smallPortSamples
NUM_OF_ROWS = 3000

In [110]:
# creating an empty dataframe before adding values to it
portDataset3 = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(smallPortSamples.columns))), columns=smallPortSamples.columns)

# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = smallPortSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values

# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (smallPortSamples[col].min() * 0.85, smallPortSamples[col].max() * 1.1) for col in columnsToGather}

# adding zeros to all columns that should not have any values
zeroColumns = [col for col in smallPortSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    portDataset3[col] = int(0)
zeroColumns

['Bwd Packet Length Std', 'Fwd Segment Size Avg', 'Bwd Segment Size Avg']

In [111]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Total Length of Fwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (850, 1100),
 'Average Packet Length': (np.float64(62.86623634558094),
  np.float64(81.35640326975478)),
 'Packet Length Min': (56, 72),
 'Packet Length Max': (62, 81),
 'Packet Length Std': (np.float64(0.4774370048814869),
  np.float64(0.6185460879795043)),
 'Packet Length Variance': (np.float64(0.2681718748590646),
  np.float64(0.3478175117770441)),
 'Total Length of Fwd Packet': (68238, 177012),
 'Fwd Packet Length Max': (34, 44),
 'Fwd Packet Length Mean': (np.float64(33.96615231458437),
  np.float64(43.956295008691335)),
 'Fwd Packet Length Min': (27, 35),
 'Fwd Packet Length Std': (np.float64(0.47802647711329393),
  np.float64(0.6193114828292697)),
 'Bwd Packet Length Max': (34, 44),
 'Bwd Packet Length Mean': (np.float64(34.0), np.float64(44.0)),
 'Bwd Packet Length Min': (34, 44),
 'Subflow Fwd Bytes': (np.float64(34.118732851085056),
  np.float64(44.264661654135345)),
 'SYN Flag Count': (1703, 4418),
 'ACK Flag Count': (12, 33),
 'RST Flag Count': (4, 11),


In [112]:
random_values = ['Average Packet Length', 'Packet Length Std', 'Packet Length Variance', 'Fwd Packet Length Std', 'Subflow Fwd Bytes', 'Number of Ports']

for col in random_values:
    val = np.random.uniform(MinMaxDict[col][0]*0.95, MinMaxDict[col][1]*1.05, size=NUM_OF_ROWS)
    portDataset3[col] = val

In [113]:
smallPortSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
11,1000,73.960367,66,74,0.561691,0.315496,160920,40,39.960268,32,0.562384,40,40.0,40,0.0,0.0,0.0,40.139686,4017,30,10,12.911425,312.668816,3.273951,0.003199,0.057305
12,1000,73.960278,66,74,0.562315,0.316198,80280,40,39.960179,32,0.56301,40,40.0,40,0.0,0.0,0.0,40.240602,2004,15,5,4.624056,435.548349,1.099811,0.002297,0.026689


In [114]:
same_value1 = ['Packet Length Min', 'Packet Length Max']
val1 = np.random.randint(MinMaxDict[same_value1[0]][0]*0.9, MinMaxDict[same_value1[0]][1]*1.05, size=NUM_OF_ROWS)

same_value2 = ['Bwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Mean']
val2 = np.random.randint(MinMaxDict[same_value2[0]][0]*0.9, MinMaxDict[same_value2[0]][1]*1.05, size=NUM_OF_ROWS)


for col in same_value1:
    if col == 'Packet Length Min':
        portDataset3[col] = val1
    else:
        portDataset3[col] = [val + np.random.randint(2, 14) for val in val1]

for col in same_value2:
    portDataset3[col] = val2

In [115]:
independant = ['Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean']

packet_length_max = np.random.randint(MinMaxDict['Fwd Packet Length Max'][0] * 0.9, MinMaxDict['Fwd Packet Length Max'][1] * 1.1, NUM_OF_ROWS)

# Create 'Average Packet Length' and 'Packet Length Min' based on 'Packet Length Max'
packet_length_min = packet_length_max - np.random.randint(2, 16, NUM_OF_ROWS)

# If True, copy the 'Packet Length Max' values; if False, apply small variation
average_packet_length = np.where(packet_length_max != packet_length_min, (packet_length_max + packet_length_min) / 2, packet_length_min)

# Assign the values to the dataset
portDataset3['Fwd Packet Length Max'] = packet_length_max.astype(int)
portDataset3['Fwd Packet Length Mean'] = average_packet_length
portDataset3['Fwd Packet Length Min'] = packet_length_min.astype(int)

In [116]:
portDataset3

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,828.397838,82.698616,69,71,0.623411,0.357392,0.0,34,30.5,27,0.489409,44,44,44,0,0,0,32.612740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,885.623149,69.039923,50,55,0.498475,0.303532,0.0,33,31.5,30,0.541476,41,41,41,0,0,0,38.181662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1106.069980,74.457036,69,74,0.623210,0.316214,0.0,35,31.0,27,0.520460,33,33,33,0,0,0,35.979352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,947.327140,74.243697,60,72,0.595113,0.304597,0.0,35,31.5,28,0.585399,43,43,43,0,0,0,42.295034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1066.185670,78.571226,65,73,0.513212,0.290556,0.0,45,38.0,31,0.536893,34,34,34,0,0,0,38.685801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,976.176547,76.443478,63,72,0.548970,0.278417,0.0,30,27.0,24,0.541902,38,38,38,0,0,0,37.306759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2996,1077.255040,60.223921,68,76,0.515061,0.255874,0.0,33,31.5,30,0.584745,41,41,41,0,0,0,43.626746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2997,1121.678676,84.726783,50,54,0.544731,0.278125,0.0,42,37.5,33,0.540584,41,41,41,0,0,0,42.110632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2998,1131.603142,80.199196,55,65,0.502208,0.274385,0.0,35,29.5,24,0.525551,45,45,45,0,0,0,36.028578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
first_correlation = ['SYN Flag Count', 'ACK Flag Count', 'RST Flag Count', 'Total Length of Fwd Packet']

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = smallPortSamples[first_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = smallPortSamples[first_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(first_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)
    
# adding the rest of the attack feature values to the dataset at random based on the smaple data
portDataset3['SYN Flag Count'] = np.random.randint(MinMaxDict['SYN Flag Count'][0]*0.9, MinMaxDict['SYN Flag Count'][1]*1.05, NUM_OF_ROWS)

for index, row in portDataset3.iterrows():
    for col, factor in zip(first_correlation[1:], scaling_factors): #iterating over all rows we need to add values to except 'Number of Ports'
        # calculate a random small delta of the factor for adding some randomness
        delta = random.uniform(factor[1] * 0.01, factor[1] * 0.02) # select a dantom (how much is 1% and 2% from factor)

        # apply the randomness to the calculated number
        updatedFactor = factor[1] + (-1) * delta

        # calculate the value we want to add into the dataset in the given row
        portDataset3.loc[index, col] = int(row['SYN Flag Count'] * updatedFactor)

('ACK Flag Count', np.float64(0.007471601883754736))
('RST Flag Count', np.float64(0.002490533961251579))
('Total Length of Fwd Packet', np.float64(40.05977281507004))


In [118]:
portDataset3

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,828.397838,82.698616,69,71,0.623411,0.357392,142683.0,34,30.5,27,0.489409,44,44,44,0,0,0,32.612740,3627,26.0,8.0,0.0,0.0,0.0,0.0,0.0
1,885.623149,69.039923,50,55,0.498475,0.303532,125607.0,33,31.5,30,0.541476,41,41,41,0,0,0,38.181662,3178,23.0,7.0,0.0,0.0,0.0,0.0,0.0
2,1106.069980,74.457036,69,74,0.623210,0.316214,74921.0,35,31.0,27,0.520460,33,33,33,0,0,0,35.979352,1892,13.0,4.0,0.0,0.0,0.0,0.0,0.0
3,947.327140,74.243697,60,72,0.595113,0.304597,130249.0,35,31.5,28,0.585399,43,43,43,0,0,0,42.295034,3290,24.0,8.0,0.0,0.0,0.0,0.0,0.0
4,1066.185670,78.571226,65,73,0.513212,0.290556,139710.0,45,38.0,31,0.536893,34,34,34,0,0,0,38.685801,3543,26.0,8.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,976.176547,76.443478,63,72,0.548970,0.278417,142752.0,30,27.0,24,0.541902,38,38,38,0,0,0,37.306759,3606,26.0,8.0,0.0,0.0,0.0,0.0,0.0
2996,1077.255040,60.223921,68,76,0.515061,0.255874,147331.0,33,31.5,30,0.584745,41,41,41,0,0,0,43.626746,3731,27.0,9.0,0.0,0.0,0.0,0.0,0.0
2997,1121.678676,84.726783,50,54,0.544731,0.278125,131085.0,42,37.5,33,0.540584,41,41,41,0,0,0,42.110632,3308,24.0,8.0,0.0,0.0,0.0,0.0,0.0
2998,1131.603142,80.199196,55,65,0.502208,0.274385,78093.0,35,29.5,24,0.525551,45,45,45,0,0,0,36.028578,1989,14.0,4.0,0.0,0.0,0.0,0.0,0.0


In [119]:
# Generate random values for the 'Flow Duration' column
randValues = np.random.uniform(MinMaxDict['Flow Duration'][0]*0.9, MinMaxDict['Flow Duration'][1]*1.05, size=NUM_OF_ROWS)

# Assign the random values
portDataset3['Flow Duration'] = randValues

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
secondCorrelation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']
independent_col = smallPortSamples[secondCorrelation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = smallPortSamples[secondCorrelation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name, factor) for name, factor in zip(secondCorrelation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

durationToPacketsCorr = [x * y for x, y in zip(smallPortSamples['Flow Duration'].values, smallPortSamples['Packets Per Second'].values)]
durationToPacketsCorr = np.mean(durationToPacketsCorr)
durationToPacketsCorr

('Packets Per Second', np.float64(32.17131779515319))
('IAT Max', np.float64(0.2517824943859383))
('IAT Mean', np.float64(0.00027607674297135994))
('IAT Std', np.float64(0.004589897159462554))


np.float64(3025.5)

In [120]:
# adding the rest of the attack feature values to the dataset at random based on the smaple data
for index, row in portDataset3.iterrows():
    for col, factor in zip(secondCorrelation[1:], scaling_factors): #iterating over all rows we need to add values to except 'Number of Ports'
        if col == 'Packets Per Second':
            delta = random.uniform(durationToPacketsCorr * 0.1, durationToPacketsCorr * 0.2) # select a delta
            updatedFactor = durationToPacketsCorr + random.choices([-1, 1], weights=[2, 1], k=1)[0] * delta
            portDataset3.loc[index, col] = updatedFactor / row['Flow Duration']
        elif col == 'IAT Mean':
            delta = random.uniform(factor[1] * 0.5, factor[1] * 0.8) # select a dantom (how much is 1% and 2% from factor)
            updatedFactor = factor[1] + delta
            portDataset3.loc[index, col] = row['Flow Duration'] * updatedFactor
        else:
            # calculate a random small delta of the factor for adding some randomness
            delta = random.uniform(factor[1] * 0.15, factor[1] * 0.35) # select a dantom (how much is 1% and 2% from factor)

            # apply the randomness to the calculated number
            updatedFactor = factor[1] + random.choice([-1, 1]) * delta

            # calculate the value we want to add into the dataset in the given row
            portDataset3.loc[index, col] = row['Flow Duration'] * updatedFactor

In [121]:
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Total Length of Fwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for col in intColumns:
    portDataset3[col] = portDataset3[col].astype(int)

portDataset3['Bwd Packet Length Mean'] = portDataset3['Bwd Packet Length Mean'].astype(float)

portDataset3

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,828,82.698616,69,71,0.623411,0.357392,142683,34,30.5,27,0.489409,44,44.0,44,0,0,0,32.612740,3627,26,8,9.893118,246.842850,3.048363,0.004549,0.059420
1,885,69.039923,50,55,0.498475,0.303532,125607,33,31.5,30,0.541476,41,41.0,41,0,0,0,38.181662,3178,23,7,6.868218,518.538250,2.268850,0.003260,0.025926
2,1106,74.457036,69,74,0.623210,0.316214,74921,35,31.0,27,0.520460,33,33.0,33,0,0,0,35.979352,1892,13,4,12.637468,197.603837,3.962413,0.005787,0.048868
3,947,74.243697,60,72,0.595113,0.304597,130249,35,31.5,28,0.585399,43,43.0,43,0,0,0,42.295034,3290,24,8,8.010062,331.088927,1.657928,0.003400,0.044548
4,1066,78.571226,65,73,0.513212,0.290556,139710,45,38.0,31,0.536893,34,34.0,34,0,0,0,38.685801,3543,26,8,12.995890,266.969961,2.310697,0.005493,0.073818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,976,76.443478,63,72,0.548970,0.278417,142752,30,27.0,24,0.541902,38,38.0,38,0,0,0,37.306759,3606,26,8,14.715669,166.872606,2.414693,0.006344,0.051230
2996,1077,60.223921,68,76,0.515061,0.255874,147331,33,31.5,30,0.584745,41,41.0,41,0,0,0,43.626746,3731,27,9,11.732520,214.680398,3.435259,0.005546,0.066078
2997,1121,84.726783,50,54,0.544731,0.278125,131085,42,37.5,33,0.540584,41,41.0,41,0,0,0,42.110632,3308,24,8,6.384057,559.370798,1.904853,0.003020,0.035056
2998,1131,80.199196,55,65,0.502208,0.274385,78093,35,29.5,24,0.525551,45,45.0,45,0,0,0,36.028578,1989,14,4,3.771304,704.460376,0.706167,0.001736,0.013124


In [122]:
smallPortSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
11,1000,73.960367,66,74,0.561691,0.315496,160920,40,39.960268,32,0.562384,40,40.0,40,0.0,0.0,0.0,40.139686,4017,30,10,12.911425,312.668816,3.273951,0.003199,0.057305
12,1000,73.960278,66,74,0.562315,0.316198,80280,40,39.960179,32,0.56301,40,40.0,40,0.0,0.0,0.0,40.240602,2004,15,5,4.624056,435.548349,1.099811,0.002297,0.026689


In [123]:
portDataset3.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0
mean,981.234667,72.622875,62.198333,69.715667,0.551311,0.311095,121760.604333,38.484,34.1745,29.865,0.55162,37.490333,37.490333,37.490333,0.0,0.0,0.0,39.277562,3085.750333,22.209667,7.075333,9.216118,363.727362,2.338281,0.004201,0.042201
std,100.818228,7.370066,7.093831,7.852433,0.056279,0.03176,35329.699293,5.140741,5.540415,6.580568,0.05683,4.500295,4.500295,4.500295,0.0,0.0,0.0,4.055446,895.396252,6.590664,2.217949,3.248584,168.616224,1.043167,0.001499,0.018766
min,807.0,59.731265,50.0,52.0,0.453583,0.254786,60176.0,30.0,22.5,15.0,0.454298,30.0,30.0,30.0,0.0,0.0,0.0,32.41604,1532.0,11.0,3.0,3.537417,162.595787,0.58586,0.001488,0.010682
25%,893.75,66.339666,56.0,64.0,0.503036,0.283907,91766.75,34.0,30.0,25.0,0.502198,34.0,34.0,34.0,0.0,0.0,0.0,35.81357,2322.0,17.0,5.0,6.419671,240.429921,1.512001,0.002919,0.027703
50%,986.0,72.58833,62.0,70.0,0.551514,0.312332,121892.0,39.0,34.0,30.0,0.551303,37.0,37.0,37.0,0.0,0.0,0.0,39.222481,3085.5,22.0,7.0,9.315347,310.138794,2.182434,0.004234,0.039268
75%,1070.0,78.925814,68.0,76.0,0.599412,0.33822,151459.75,43.0,38.5,35.0,0.601039,41.0,41.0,41.0,0.0,0.0,0.0,42.713089,3834.25,28.0,9.0,11.950812,447.655545,2.992287,0.00543,0.053965
max,1154.0,85.422017,74.0,87.0,0.649451,0.365204,183475.0,47.0,46.0,45.0,0.650266,45.0,45.0,45.0,0.0,0.0,0.0,46.471925,4637.0,34.0,11.0,14.908517,1014.454792,5.044449,0.007303,0.091739


In [124]:
# adding a label to the dataset
portDataset3['Label'] = ATTACK_NAME

---

### Merging all three attack types tougether into one dataset

In [125]:
# sample dos attack dataset
mergedPortDataset = pd.concat([portDataset, portDataset2, portDataset3], axis=0)
mergedPortDataset = mergedPortDataset.sample(frac=1, random_state=42).reset_index(drop=True)
print(mergedPortDataset.shape)

(15000, 27)


In [None]:
# save the dataset
# mergedPortDataset.to_csv('port_scan_open_ports_dataset_new.csv', index=False)