In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

NUM_OF_ROWS = 7500
ATTACK_NAME = 'PortScan'

In [3]:
pd.set_option('display.max_columns', None)  # Show all columns

---

In [4]:
# import the attack sample dataset
portSamples = pd.read_csv('portscan_closed_port_samples_1.csv')
print(f'Dataset Shape: {portSamples.shape}')
portSamples

Dataset Shape: (19, 26)


Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1970,60.0,60,60,0.0,0.0,102154,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.679028,3929,0,0,39.64862,99.095504,0.101249,0.010094,0.017669
1,1980,60.0,60,60,0.0,0.0,102778,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.674799,3953,0,0,39.988767,98.85276,0.090225,0.010119,0.016015
2,1800,60.0,60,60,0.0,0.0,93366,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.072605,3591,0,0,37.254382,96.391345,1.103064,0.010377,0.029695
3,4942,59.998174,58,60,0.060403,0.003649,256074,26,26.0,26,0.0,24,24.0,24,0.0,2.003655,0.0,26.026425,9849,9,9,28.134188,350.3922,1.101169,0.002854,0.017863
4,3416,59.998822,58,60,0.048532,0.002355,176410,26,26.0,26,0.0,24,24.0,24,0.0,2.002358,0.0,26.38893,6785,4,4,39.957571,169.905223,0.137244,0.005887,0.018825
5,1410,60.0,60,60,0.0,0.0,73060,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.761905,2810,0,0,39.887447,70.448229,0.14227,0.0142,0.029792
6,3314,59.998782,58,60,0.049349,0.002435,170612,26,26.0,26,0.0,24,24.0,24,0.0,2.002438,0.0,26.039683,6562,4,4,38.899376,168.794481,1.100685,0.005925,0.021482
7,5019,59.99819,58,60,0.060138,0.003617,258336,26,26.0,26,0.0,24,24.0,24,0.0,2.003623,0.0,28.282899,9936,9,9,11.427656,870.257194,0.038543,0.001149,0.002328
8,1930,74.0,74,74,0.0,0.0,154400,40,40.0,40,0.0,0,0.0,0,0.0,0.0,0.0,40.103896,3860,0,0,39.700051,97.229094,1.017906,0.010288,0.034187
9,1999,74.0,74,74,0.0,0.0,159200,40,40.0,40,0.0,0,0.0,0,0.0,0.0,0.0,41.020356,3980,0,0,39.994987,99.512472,0.215444,0.010052,0.03092


In [5]:
# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = portSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values
columnsToGather #left with all columns that the values are not 0 (be know for a fact that the data is consistant and there are not missing values in the rows)

['Number of Ports',
 'Average Packet Length',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Std',
 'Packet Length Variance',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Fwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'SYN Flag Count',
 'ACK Flag Count',
 'RST Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [6]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (portSamples[col].min() * 0.85, portSamples[col].max() * 1.15) for col in columnsToGather}
MinMaxDict

{'Number of Ports': (np.float64(850.0), np.float64(5771.849999999999)),
 'Average Packet Length': (np.float64(50.998447961046864), np.float64(85.1)),
 'Packet Length Min': (np.float64(49.3), np.float64(85.1)),
 'Packet Length Max': (np.float64(51.0), np.float64(85.1)),
 'Packet Length Std': (np.float64(0.0), np.float64(0.06946344940596497)),
 'Packet Length Variance': (np.float64(0.0), np.float64(0.00419580069858701)),
 'Total Length of Fwd Packet': (np.float64(62101.0),
  np.float64(297086.39999999997)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(46.0)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(46.0)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(46.0)),
 'Bwd Packet Length Max': (np.float64(0.0), np.float64(27.599999999999998)),
 'Bwd Packet Length Mean': (np.float64(0.0), np.float64(27.599999999999998)),
 'Bwd Packet Length Min': (np.float64(0.0), np.float64(27.599999999999998)),
 'Fwd Segment Size Avg

In [7]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (850, 5771),
 'Average Packet Length': (np.float64(50.998447961046864), np.float64(85.1)),
 'Packet Length Min': (49, 85),
 'Packet Length Max': (51, 85),
 'Packet Length Std': (np.float64(0.0), np.float64(0.06946344940596497)),
 'Packet Length Variance': (np.float64(0.0), np.float64(0.00419580069858701)),
 'Total Length of Fwd Packet': (np.float64(62101.0),
  np.float64(297086.39999999997)),
 'Fwd Packet Length Max': (22, 46),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(46.0)),
 'Fwd Packet Length Min': (22, 46),
 'Bwd Packet Length Max': (0, 27),
 'Bwd Packet Length Mean': (np.float64(0.0), np.float64(27.599999999999998)),
 'Bwd Packet Length Min': (0, 27),
 'Fwd Segment Size Avg': (np.float64(0.0), np.float64(2.3042034724337497)),
 'Subflow Fwd Bytes': (np.float64(22.122461632279702),
  np.float64(47.18832343063808)),
 'SYN Flag Count': (2388, 11426),
 'ACK Flag Count': (0, 10),
 'RST Flag Count': (0, 10),
 'Flow Duration': (np.float64(

---

### Creating the dataset

In [8]:
# creating an empty dataframe before adding values to it
portDataset = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(portSamples.columns))), columns=portSamples.columns)
portDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# adding zeros to all columns that should not have any values
zeroColumns = [col for col in portSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    portDataset[col] = int(0)
zeroColumns

['Fwd Packet Length Std', 'Bwd Packet Length Std', 'Bwd Segment Size Avg']

In [10]:
portDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### First Correlation

In [11]:
first_correlation = ['Number of Ports', 'Total Length of Fwd Packet', 'SYN Flag Count']

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[first_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[first_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(first_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Total Length of Fwd Packet', np.float64(58.833888228624055))
('SYN Flag Count', np.float64(2.0010731757589486))


In [12]:
# adding the rest of the attack feature values to the dataset at random based on the smaple data
portDataset['Number of Ports'] = np.random.randint(MinMaxDict['Number of Ports'][0]*0.9, MinMaxDict['Number of Ports'][1]*1.10, NUM_OF_ROWS)

for index, row in portDataset.iterrows():
    for col, factor in zip(first_correlation[1:], scaling_factors): #iterating over all rows we need to add values to except 'Number of Ports'
        # calculate a random small delta of the factor for adding some randomness
        delta = random.uniform(factor[1] * 0.01, factor[1] * 0.02) # select a delta

        # apply the randomness to the calculated number
        updatedFactor = factor[1] + random.choice([-1, 1]) * delta

        # calculate the value we want to add into the dataset in the given row
        portDataset.loc[index, col] = row['Number of Ports'] * updatedFactor

In [13]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4937,0.0,0.0,0.0,0.0,0.0,295274.555873,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,10057.038098,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4520,0.0,0.0,0.0,0.0,0.0,268804.626273,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,9138.931602,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2213,0.0,0.0,0.0,0.0,0.0,128394.170370,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,4351.886434,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2071,0.0,0.0,0.0,0.0,0.0,124265.264100,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,4102.207878,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5604,0.0,0.0,0.0,0.0,0.0,324824.284105,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,11345.584191,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,3356,0.0,0.0,0.0,0.0,0.0,194063.260100,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,6844.536637,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7496,843,0.0,0.0,0.0,0.0,0.0,50575.792839,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,1718.858214,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7497,3745,0.0,0.0,0.0,0.0,0.0,217891.302069,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,7593.121309,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,3945,0.0,0.0,0.0,0.0,0.0,234586.325884,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,7794.812460,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Second Correlation

In [14]:
second_correlation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[second_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[second_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(second_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Packets Per Second', np.float64(3.4232599999586326))
('IAT Max', np.float64(0.016503150061282924))
('IAT Mean', np.float64(0.00024361763844503648))
('IAT Std', np.float64(0.0007367524780914209))


In [15]:
# Generate random values for the 'Flow Duration' column
randValues = np.random.uniform(MinMaxDict['Flow Duration'][0]*0.9, MinMaxDict['Flow Duration'][1]*1.05, size=NUM_OF_ROWS)

# Assign the random values
portDataset['Flow Duration'] = randValues

portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4937,0.0,0.0,0.0,0.0,0.0,295274.555873,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,10057.038098,0.0,0.0,40.182439,0.0,0.0,0.0,0.0
1,4520,0.0,0.0,0.0,0.0,0.0,268804.626273,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,9138.931602,0.0,0.0,22.833001,0.0,0.0,0.0,0.0
2,2213,0.0,0.0,0.0,0.0,0.0,128394.170370,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,4351.886434,0.0,0.0,17.000133,0.0,0.0,0.0,0.0
3,2071,0.0,0.0,0.0,0.0,0.0,124265.264100,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,4102.207878,0.0,0.0,28.337542,0.0,0.0,0.0,0.0
4,5604,0.0,0.0,0.0,0.0,0.0,324824.284105,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,11345.584191,0.0,0.0,32.033318,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,3356,0.0,0.0,0.0,0.0,0.0,194063.260100,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,6844.536637,0.0,0.0,27.990015,0.0,0.0,0.0,0.0
7496,843,0.0,0.0,0.0,0.0,0.0,50575.792839,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,1718.858214,0.0,0.0,17.851424,0.0,0.0,0.0,0.0
7497,3745,0.0,0.0,0.0,0.0,0.0,217891.302069,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,7593.121309,0.0,0.0,47.610778,0.0,0.0,0.0,0.0
7498,3945,0.0,0.0,0.0,0.0,0.0,234586.325884,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,7794.812460,0.0,0.0,22.613233,0.0,0.0,0.0,0.0


In [16]:
durationToPacketsCorr = [x * y for x, y in zip(portSamples['Flow Duration'].values, portSamples['Packets Per Second'].values)]
durationToPacketsCorr = np.mean(durationToPacketsCorr)
durationToPacketsCorr

np.float64(4832.368421052632)

In [17]:
#iterating over all rows we need to add values
for index, row in portDataset.iterrows():
    for col, factor in scaling_factors: 
        # calculate a random small delta of the factor for adding some randomness
        if col == 'Packets Per Second':
            delta = random.uniform(durationToPacketsCorr * 0.25, durationToPacketsCorr * 0.65) # select a delta
            updatedFactor = durationToPacketsCorr + random.choice([-1, 1]) * delta
            portDataset.loc[index, col] = updatedFactor / row['Flow Duration']
        else:
            if col == 'IAT Std':
                delta = random.uniform(factor * 0.35, factor * 0.65)
                updatedFactor = factor + random.choice([-1, 1]) * delta  
            else:
                delta = random.uniform(factor * 0.1, factor * 0.2) # select a delta
                updatedFactor = factor + random.choice([-1, 1]) * delta

            if col == 'IAT Max':
                delta = random.uniform(factor * 0.6, factor * 0.99)
                updatedFactor = factor + random.choices([-1, 1], weights=[1, 3], k=1)[0] * delta  
                portDataset.loc[index, col] = (row['Flow Duration'] * updatedFactor) * 2.3
            else:
                portDataset.loc[index, col] = row['Flow Duration'] * updatedFactor

In [18]:
portSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1970,60.0,60,60,0.0,0.0,102154,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.679028,3929,0,0,39.64862,99.095504,0.101249,0.010094,0.017669
1,1980,60.0,60,60,0.0,0.0,102778,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.674799,3953,0,0,39.988767,98.85276,0.090225,0.010119,0.016015
2,1800,60.0,60,60,0.0,0.0,93366,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.072605,3591,0,0,37.254382,96.391345,1.103064,0.010377,0.029695
3,4942,59.998174,58,60,0.060403,0.003649,256074,26,26.0,26,0.0,24,24.0,24,0.0,2.003655,0.0,26.026425,9849,9,9,28.134188,350.3922,1.101169,0.002854,0.017863
4,3416,59.998822,58,60,0.048532,0.002355,176410,26,26.0,26,0.0,24,24.0,24,0.0,2.002358,0.0,26.38893,6785,4,4,39.957571,169.905223,0.137244,0.005887,0.018825
5,1410,60.0,60,60,0.0,0.0,73060,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.761905,2810,0,0,39.887447,70.448229,0.14227,0.0142,0.029792
6,3314,59.998782,58,60,0.049349,0.002435,170612,26,26.0,26,0.0,24,24.0,24,0.0,2.002438,0.0,26.039683,6562,4,4,38.899376,168.794481,1.100685,0.005925,0.021482
7,5019,59.99819,58,60,0.060138,0.003617,258336,26,26.0,26,0.0,24,24.0,24,0.0,2.003623,0.0,28.282899,9936,9,9,11.427656,870.257194,0.038543,0.001149,0.002328
8,1930,74.0,74,74,0.0,0.0,154400,40,40.0,40,0.0,0,0.0,0,0.0,0.0,0.0,40.103896,3860,0,0,39.700051,97.229094,1.017906,0.010288,0.034187
9,1999,74.0,74,74,0.0,0.0,159200,40,40.0,40,0.0,0,0.0,0,0.0,0.0,0.0,41.020356,3980,0,0,39.994987,99.512472,0.215444,0.010052,0.03092


In [19]:
x = portDataset[portDataset['Flow Duration']>=39]
x[x['Flow Duration']<=40][0:40]

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
36,1966,0.0,0.0,0.0,0.0,0.0,117759.17021,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,3884.444479,0.0,0.0,39.311879,84.885958,0.393558,0.011423,0.046166
42,2117,0.0,0.0,0.0,0.0,0.0,122256.197017,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,4162.508263,0.0,0.0,39.01351,78.730533,0.289826,0.00791,0.047362
48,6052,0.0,0.0,0.0,0.0,0.0,360817.42259,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,11939.385899,0.0,0.0,39.883511,159.889737,2.759253,0.010836,0.011994
172,4747,0.0,0.0,0.0,0.0,0.0,284381.596049,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,9344.26143,0.0,0.0,39.093587,158.671735,2.886182,0.008341,0.047272
189,2106,0.0,0.0,0.0,0.0,0.0,121733.60101,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,4139.46642,0.0,0.0,39.225096,58.393239,2.883273,0.008228,0.042921
207,3121,0.0,0.0,0.0,0.0,0.0,186223.817425,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,6359.99362,0.0,0.0,39.931152,57.614982,2.635112,0.010956,0.018836
209,1325,0.0,0.0,0.0,0.0,0.0,76978.000999,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,2680.878275,0.0,0.0,39.525102,170.818292,2.852815,0.007931,0.014178
247,790,0.0,0.0,0.0,0.0,0.0,45900.360804,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,1604.758241,0.0,0.0,39.900528,74.883295,0.520687,0.011451,0.011111
281,1668,0.0,0.0,0.0,0.0,0.0,99478.813355,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,3297.380751,0.0,0.0,39.547225,170.716154,2.787919,0.007822,0.043997
385,5144,0.0,0.0,0.0,0.0,0.0,297088.285049,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,10179.399801,0.0,0.0,39.512898,62.553433,2.486133,0.008594,0.014117


### Independant Columns

In [20]:
independant = ['Packet Length Std', 'Packet Length Variance', 'Fwd Segment Size Avg']

for col in independant:
    # Generate random values from the uniform distribution
    rand_values = np.random.uniform(MinMaxDict[col][0], MinMaxDict[col][1]*1.1, NUM_OF_ROWS)

    # Randomly choose between 0 or the generated random value
    chosen_values = np.where(np.random.rand(NUM_OF_ROWS) > 0.5, rand_values, 0)

    portDataset[col] = chosen_values

portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4937,0.0,0.0,0.0,0.000000,0.000000,295274.555873,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,10057.038098,0.0,0.0,40.182439,180.143925,3.009695,0.011235,0.012214
1,4520,0.0,0.0,0.0,0.003872,0.003258,268804.626273,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,9138.931602,0.0,0.0,22.833001,113.880880,0.240233,0.006287,0.027636
2,2213,0.0,0.0,0.0,0.044876,0.000000,128394.170370,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,4351.886434,0.0,0.0,17.000133,408.039316,1.055007,0.004928,0.018415
3,2071,0.0,0.0,0.0,0.000000,0.002396,124265.264100,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,4102.207878,0.0,0.0,28.337542,99.117687,0.276214,0.006210,0.033231
4,5604,0.0,0.0,0.0,0.000000,0.000254,324824.284105,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,11345.584191,0.0,0.0,32.033318,88.725163,2.203730,0.008772,0.012725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,3356,0.0,0.0,0.0,0.032028,0.000000,194063.260100,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,6844.536637,0.0,0.0,27.990015,80.013492,0.208922,0.005879,0.028841
7496,843,0.0,0.0,0.0,0.013753,0.001992,50575.792839,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,1718.858214,0.0,0.0,17.851424,147.091992,1.145207,0.003612,0.007270
7497,3745,0.0,0.0,0.0,0.016975,0.000645,217891.302069,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1.693204,0,0.0,7593.121309,0.0,0.0,47.610778,61.405820,3.174924,0.009950,0.019841
7498,3945,0.0,0.0,0.0,0.000000,0.000000,234586.325884,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,7794.812460,0.0,0.0,22.613233,293.336424,1.555944,0.004516,0.026353


In [21]:
x = portDataset[portDataset['Packet Length Std']>0]
x

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
1,4520,0.0,0.0,0.0,0.003872,0.003258,268804.626273,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,9138.931602,0.0,0.0,22.833001,113.880880,0.240233,0.006287,0.027636
2,2213,0.0,0.0,0.0,0.044876,0.000000,128394.170370,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,4351.886434,0.0,0.0,17.000133,408.039316,1.055007,0.004928,0.018415
6,3633,0.0,0.0,0.0,0.032684,0.004093,210657.846386,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.413083,0,0.0,7392.209610,0.0,0.0,33.618235,104.628747,2.125304,0.009772,0.036171
10,2848,0.0,0.0,0.0,0.072205,0.000000,164576.911249,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,5631.113280,0.0,0.0,15.491164,410.920202,1.118119,0.004221,0.017774
11,5446,0.0,0.0,0.0,0.043144,0.002819,324518.571331,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1.879781,0,0.0,10733.156294,0.0,0.0,18.045376,178.947220,1.216476,0.005008,0.020725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7493,1008,0.0,0.0,0.0,0.018003,0.002565,60324.784883,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,1996.216213,0.0,0.0,24.737124,144.462337,0.130288,0.006783,0.026927
7495,3356,0.0,0.0,0.0,0.032028,0.000000,194063.260100,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,6844.536637,0.0,0.0,27.990015,80.013492,0.208922,0.005879,0.028841
7496,843,0.0,0.0,0.0,0.013753,0.001992,50575.792839,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,1718.858214,0.0,0.0,17.851424,147.091992,1.145207,0.003612,0.007270
7497,3745,0.0,0.0,0.0,0.016975,0.000645,217891.302069,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1.693204,0,0.0,7593.121309,0.0,0.0,47.610778,61.405820,3.174924,0.009950,0.019841


### Same Values

In [22]:
same_values = ['Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Min'] # 'Subflow Fwd Bytes' is approximatly the same as values in here

for col in same_values:
    # Generate random values for the 'Flow Duration' column
    randValues = np.random.uniform(MinMaxDict[col][0]*0.9, MinMaxDict[col][1]*1.1, size=NUM_OF_ROWS)

    # 'Subflow Fwd Bytes' is approximatly the same as values in ['Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Min']
    # Create a new column 'Subflow Fwd Bytes' with slightly adjusted values from randValues
    adjustment_factor = np.random.uniform(0.9995, 1.0005, size=NUM_OF_ROWS)
    subflow_fwd_bytes = randValues * adjustment_factor
    portDataset['Subflow Fwd Bytes'] = subflow_fwd_bytes

    # Assign the random values
    portDataset[col] = randValues

In [23]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4937,0.0,0.0,0.0,0.000000,0.000000,295274.555873,40.180754,36.599654,37.034497,0,0.0,0.0,0.0,0,0.000000,0,37.029879,10057.038098,0.0,0.0,40.182439,180.143925,3.009695,0.011235,0.012214
1,4520,0.0,0.0,0.0,0.003872,0.003258,268804.626273,29.845001,31.501220,27.964617,0,0.0,0.0,0.0,0,0.000000,0,27.964431,9138.931602,0.0,0.0,22.833001,113.880880,0.240233,0.006287,0.027636
2,2213,0.0,0.0,0.0,0.044876,0.000000,128394.170370,25.712947,44.438594,26.412433,0,0.0,0.0,0.0,0,0.000000,0,26.401763,4351.886434,0.0,0.0,17.000133,408.039316,1.055007,0.004928,0.018415
3,2071,0.0,0.0,0.0,0.000000,0.002396,124265.264100,44.929845,49.128014,30.082936,0,0.0,0.0,0.0,0,0.000000,0,30.086774,4102.207878,0.0,0.0,28.337542,99.117687,0.276214,0.006210,0.033231
4,5604,0.0,0.0,0.0,0.000000,0.000254,324824.284105,34.457008,41.735428,34.734899,0,0.0,0.0,0.0,0,0.000000,0,34.727302,11345.584191,0.0,0.0,32.033318,88.725163,2.203730,0.008772,0.012725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,3356,0.0,0.0,0.0,0.032028,0.000000,194063.260100,25.862227,36.416253,35.684380,0,0.0,0.0,0.0,0,0.000000,0,35.697690,6844.536637,0.0,0.0,27.990015,80.013492,0.208922,0.005879,0.028841
7496,843,0.0,0.0,0.0,0.013753,0.001992,50575.792839,28.887713,42.583445,45.463583,0,0.0,0.0,0.0,0,0.000000,0,45.451422,1718.858214,0.0,0.0,17.851424,147.091992,1.145207,0.003612,0.007270
7497,3745,0.0,0.0,0.0,0.016975,0.000645,217891.302069,37.938322,40.330022,28.918628,0,0.0,0.0,0.0,0,1.693204,0,28.911283,7593.121309,0.0,0.0,47.610778,61.405820,3.174924,0.009950,0.019841
7498,3945,0.0,0.0,0.0,0.000000,0.000000,234586.325884,43.497938,21.070000,36.681889,0,0.0,0.0,0.0,0,0.000000,0,36.676634,7794.812460,0.0,0.0,22.613233,293.336424,1.555944,0.004516,0.026353


### Approximate Values

In [24]:
approx_same = ['Average Packet Length', 'Packet Length Min', 'Packet Length Max']

# Generate random values for 'Packet Length Max'
packet_length_max = np.random.randint(MinMaxDict['Packet Length Max'][0] * 0.9, MinMaxDict['Packet Length Max'][1] * 1.1, NUM_OF_ROWS)

# Decide whether to copy or adjust based on a condition or randomly
copy_values = np.random.choice([True, False], size=NUM_OF_ROWS)  # Randomly decide whether to copy values or not

# Create 'Average Packet Length' and 'Packet Length Min' based on 'Packet Length Max'
packet_length_min = np.where(copy_values, packet_length_max, packet_length_max + np.random.uniform(-2, 2, NUM_OF_ROWS))
packet_length_min = np.minimum(packet_length_min, packet_length_max)

# If True, copy the 'Packet Length Max' values; if False, apply small variation
average_packet_length = np.where(packet_length_max != packet_length_min, (packet_length_max + packet_length_min) / 2, packet_length_min)

# Assign the values to the dataset
portDataset['Packet Length Max'] = packet_length_max
portDataset['Average Packet Length'] = average_packet_length
portDataset['Packet Length Min'] = packet_length_min.astype(int)

In [25]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4937,57.049265,56,58,0.000000,0.000000,295274.555873,40.180754,36.599654,37.034497,0,0.0,0.0,0.0,0,0.000000,0,37.029879,10057.038098,0.0,0.0,40.182439,180.143925,3.009695,0.011235,0.012214
1,4520,70.000000,70,70,0.003872,0.003258,268804.626273,29.845001,31.501220,27.964617,0,0.0,0.0,0.0,0,0.000000,0,27.964431,9138.931602,0.0,0.0,22.833001,113.880880,0.240233,0.006287,0.027636
2,2213,55.523608,55,56,0.044876,0.000000,128394.170370,25.712947,44.438594,26.412433,0,0.0,0.0,0.0,0,0.000000,0,26.401763,4351.886434,0.0,0.0,17.000133,408.039316,1.055007,0.004928,0.018415
3,2071,54.000000,54,54,0.000000,0.002396,124265.264100,44.929845,49.128014,30.082936,0,0.0,0.0,0.0,0,0.000000,0,30.086774,4102.207878,0.0,0.0,28.337542,99.117687,0.276214,0.006210,0.033231
4,5604,90.175678,89,91,0.000000,0.000254,324824.284105,34.457008,41.735428,34.734899,0,0.0,0.0,0.0,0,0.000000,0,34.727302,11345.584191,0.0,0.0,32.033318,88.725163,2.203730,0.008772,0.012725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,3356,86.000000,86,86,0.032028,0.000000,194063.260100,25.862227,36.416253,35.684380,0,0.0,0.0,0.0,0,0.000000,0,35.697690,6844.536637,0.0,0.0,27.990015,80.013492,0.208922,0.005879,0.028841
7496,843,50.000000,50,50,0.013753,0.001992,50575.792839,28.887713,42.583445,45.463583,0,0.0,0.0,0.0,0,0.000000,0,45.451422,1718.858214,0.0,0.0,17.851424,147.091992,1.145207,0.003612,0.007270
7497,3745,80.534720,80,81,0.016975,0.000645,217891.302069,37.938322,40.330022,28.918628,0,0.0,0.0,0.0,0,1.693204,0,28.911283,7593.121309,0.0,0.0,47.610778,61.405820,3.174924,0.009950,0.019841
7498,3945,84.000000,84,84,0.000000,0.000000,234586.325884,43.497938,21.070000,36.681889,0,0.0,0.0,0.0,0,0.000000,0,36.676634,7794.812460,0.0,0.0,22.613233,293.336424,1.555944,0.004516,0.026353


In [26]:
x = portDataset[portDataset['Packet Length Min'] != portDataset['Packet Length Max']]
x

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4937,57.049265,56,58,0.000000,0.000000,295274.555873,40.180754,36.599654,37.034497,0,0.0,0.0,0.0,0,0.000000,0,37.029879,10057.038098,0.0,0.0,40.182439,180.143925,3.009695,0.011235,0.012214
2,2213,55.523608,55,56,0.044876,0.000000,128394.170370,25.712947,44.438594,26.412433,0,0.0,0.0,0.0,0,0.000000,0,26.401763,4351.886434,0.0,0.0,17.000133,408.039316,1.055007,0.004928,0.018415
4,5604,90.175678,89,91,0.000000,0.000254,324824.284105,34.457008,41.735428,34.734899,0,0.0,0.0,0.0,0,0.000000,0,34.727302,11345.584191,0.0,0.0,32.033318,88.725163,2.203730,0.008772,0.012725
5,5144,52.284883,51,53,0.000000,0.000679,297358.701661,38.772935,37.781717,46.943290,0,0.0,0.0,0.0,0,0.000000,0,46.947381,10399.129545,0.0,0.0,47.438348,53.909875,3.547930,0.013391,0.052545
18,5771,66.591748,66,67,0.074084,0.003747,344086.249295,47.168871,36.540603,35.383995,0,0.0,0.0,0.0,0,0.000000,0,35.395585,11730.193077,0.0,0.0,31.085231,54.510684,1.891587,0.006524,0.012062
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7488,4435,89.187365,88,90,0.000000,0.000000,264321.776006,34.992920,23.380719,33.110557,0,0.0,0.0,0.0,0,0.000000,0,33.108890,9012.084752,0.0,0.0,36.621530,195.255641,0.486279,0.007227,0.044231
7491,2679,44.482920,43,45,0.033342,0.003466,159897.498815,33.318848,35.580122,48.852091,0,0.0,0.0,0.0,0,0.000000,0,48.835596,5282.001369,0.0,0.0,14.257342,451.304005,1.057787,0.004009,0.004654
7493,1008,46.354484,45,47,0.018003,0.002565,60324.784883,45.249622,41.246484,34.440676,0,0.0,0.0,0.0,0,0.000000,0,34.443671,1996.216213,0.0,0.0,24.737124,144.462337,0.130288,0.006783,0.026927
7497,3745,80.534720,80,81,0.016975,0.000645,217891.302069,37.938322,40.330022,28.918628,0,0.0,0.0,0.0,0,1.693204,0,28.911283,7593.121309,0.0,0.0,47.610778,61.405820,3.174924,0.009950,0.019841


### Backwards Packets with Flags

In [27]:
backward_flags = ['Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Min', 'ACK Flag Count', 'RST Flag Count']

# Probability of doing X (30%) and Y (70%)
probability = [0.25, 0.75]

# Decide whether to use backward flags (True or False) based on the probability for each row
has_backward_flags = np.random.choice([True, False], size=NUM_OF_ROWS, p=probability)

# Check if the value should be True or False for each row
for i in range(NUM_OF_ROWS):
    if has_backward_flags[i]:
        # If True, generate random values for Bwd Packet Length and Flag Count
        bwd_vector = np.random.randint(16, MinMaxDict['Bwd Packet Length Max'][1] * 1.15)
        flag_vector = np.random.randint(2, MinMaxDict['ACK Flag Count'][1] * 1.15)
        
        # Apply values for the first 3 backward flags
        for col in backward_flags[:3]:
            portDataset.at[i, col] = bwd_vector
        
        # Apply values for the remaining 2 flags
        for col in backward_flags[3:]:
            portDataset.at[i, col] = flag_vector
    else:
        # If False, set only the current row to zero for all backward flags
        for col in backward_flags:
            portDataset.at[i, col] = 0

In [28]:
portDataset[portDataset['ACK Flag Count'] > 0]

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
4,5604,90.175678,89,91,0.000000,0.000254,324824.284105,34.457008,41.735428,34.734899,0,23.0,23.0,23.0,0,0.000000,0,34.727302,11345.584191,9.0,9.0,32.033318,88.725163,2.203730,0.008772,0.012725
10,2848,85.000000,85,85,0.072205,0.000000,164576.911249,25.375523,50.060674,22.860171,0,28.0,28.0,28.0,0,0.000000,0,22.856665,5631.113280,8.0,8.0,15.491164,410.920202,1.118119,0.004221,0.017774
14,3650,61.000000,61,61,0.000000,0.000000,217595.912648,20.816929,43.970331,22.745187,0,22.0,22.0,22.0,0,0.000000,0,22.743801,7403.043366,10.0,10.0,30.080201,106.413304,0.361120,0.006084,0.013765
21,5585,52.000000,52,52,0.024073,0.003334,334109.197655,25.458332,45.456422,28.865347,0,29.0,29.0,29.0,0,0.000000,0,28.867337,11367.100969,6.0,6.0,33.445168,181.929503,2.283397,0.009757,0.011769
23,5428,57.000000,57,57,0.034348,0.000000,323484.247327,29.344231,21.800088,41.407482,0,29.0,29.0,29.0,0,0.000000,0,41.395436,10751.279709,9.0,9.0,10.483303,238.318229,0.099812,0.002293,0.003809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7482,3155,76.000000,76,76,0.051779,0.000000,187677.807444,20.172264,36.238480,20.716782,0,24.0,24.0,24.0,0,0.000000,0,20.716209,6438.055724,6.0,6.0,26.604917,240.403496,0.130447,0.007227,0.008672
7488,4435,89.187365,88,90,0.000000,0.000000,264321.776006,34.992920,23.380719,33.110557,0,26.0,26.0,26.0,0,0.000000,0,33.108890,9012.084752,9.0,9.0,36.621530,195.255641,0.486279,0.007227,0.044231
7489,1919,46.000000,46,46,0.000000,0.000841,111426.442614,32.844473,24.790139,38.364161,0,21.0,21.0,21.0,0,0.000000,0,38.351179,3774.926550,4.0,4.0,31.317870,239.690570,1.950801,0.006389,0.032424
7490,943,65.000000,65,65,0.025608,0.000000,56425.981323,31.496450,35.245175,22.283953,0,26.0,26.0,26.0,0,1.746702,0,22.283034,1855.640971,5.0,5.0,28.142971,253.704660,2.037563,0.005948,0.009341


---

In [29]:
# making the SYN Flag Count column have int values instead of floats
portDataset['SYN Flag Count'] = portDataset['SYN Flag Count'].astype(int)

# adding a label to the dataset
portDataset['Label'] = ATTACK_NAME

In [30]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std,Label
0,4937,57.049265,56,58,0.000000,0.000000,295274.555873,40.180754,36.599654,37.034497,0,0.0,0.0,0.0,0,0.000000,0,37.029879,10057,0.0,0.0,40.182439,180.143925,3.009695,0.011235,0.012214,PortScan
1,4520,70.000000,70,70,0.003872,0.003258,268804.626273,29.845001,31.501220,27.964617,0,0.0,0.0,0.0,0,0.000000,0,27.964431,9138,0.0,0.0,22.833001,113.880880,0.240233,0.006287,0.027636,PortScan
2,2213,55.523608,55,56,0.044876,0.000000,128394.170370,25.712947,44.438594,26.412433,0,0.0,0.0,0.0,0,0.000000,0,26.401763,4351,0.0,0.0,17.000133,408.039316,1.055007,0.004928,0.018415,PortScan
3,2071,54.000000,54,54,0.000000,0.002396,124265.264100,44.929845,49.128014,30.082936,0,0.0,0.0,0.0,0,0.000000,0,30.086774,4102,0.0,0.0,28.337542,99.117687,0.276214,0.006210,0.033231,PortScan
4,5604,90.175678,89,91,0.000000,0.000254,324824.284105,34.457008,41.735428,34.734899,0,23.0,23.0,23.0,0,0.000000,0,34.727302,11345,9.0,9.0,32.033318,88.725163,2.203730,0.008772,0.012725,PortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,3356,86.000000,86,86,0.032028,0.000000,194063.260100,25.862227,36.416253,35.684380,0,0.0,0.0,0.0,0,0.000000,0,35.697690,6844,0.0,0.0,27.990015,80.013492,0.208922,0.005879,0.028841,PortScan
7496,843,50.000000,50,50,0.013753,0.001992,50575.792839,28.887713,42.583445,45.463583,0,0.0,0.0,0.0,0,0.000000,0,45.451422,1718,0.0,0.0,17.851424,147.091992,1.145207,0.003612,0.007270,PortScan
7497,3745,80.534720,80,81,0.016975,0.000645,217891.302069,37.938322,40.330022,28.918628,0,0.0,0.0,0.0,0,1.693204,0,28.911283,7593,0.0,0.0,47.610778,61.405820,3.174924,0.009950,0.019841,PortScan
7498,3945,84.000000,84,84,0.000000,0.000000,234586.325884,43.497938,21.070000,36.681889,0,0.0,0.0,0.0,0,0.000000,0,36.676634,7794,0.0,0.0,22.613233,293.336424,1.555944,0.004516,0.026353,PortScan


In [31]:
portSamples.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,2387.526316,66.631196,66.105263,66.631579,0.01411,0.000764,150824.842105,32.631579,32.631579,32.631579,0.0,6.315789,6.315789,6.315789,0.0,1.053397,0.0,33.236968,4830.789474,1.578947,1.578947,36.960596,161.40442,0.627105,0.008909,0.02711
std,1100.883199,7.182221,7.730853,7.181848,0.024427,0.001351,48603.855729,7.181848,7.181848,7.181848,0.0,10.857934,10.857934,10.857934,0.0,1.026725,0.0,7.314582,2104.368646,3.005842,3.005842,6.933406,182.503211,0.719782,0.003132,0.011919
min,1000.0,59.998174,58.0,60.0,0.0,0.0,73060.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.026425,2810.0,0.0,0.0,11.427656,70.448229,0.038543,0.001149,0.002328
25%,1880.0,59.999411,59.0,60.0,0.0,0.0,112749.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.230768,3760.0,0.0,0.0,38.380666,96.947314,0.130432,0.008012,0.019438
50%,1994.0,60.0,60.0,60.0,0.0,0.0,158960.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,28.282899,3974.0,0.0,0.0,39.700051,99.53883,0.152234,0.010049,0.029792
75%,2618.5,74.0,74.0,74.0,0.024266,0.001178,163941.0,40.0,40.0,40.0,0.0,12.0,12.0,12.0,0.0,2.001179,0.0,41.00323,5232.0,2.0,2.0,39.960304,133.336021,1.100399,0.010318,0.030047
max,5019.0,74.0,74.0,74.0,0.060403,0.003649,258336.0,40.0,40.0,40.0,0.0,24.0,24.0,24.0,0.0,2.003655,0.0,41.033325,9936.0,9.0,9.0,40.052014,870.257194,2.878662,0.0142,0.063127


In [32]:
portDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0,7500.0
mean,3550.968267,68.264119,68.0196,68.384133,0.018854,0.001162,208883.77946,35.328129,35.210824,35.303778,0.0,5.8212,5.8212,5.8212,0.0,0.624078,0.0,35.303785,7104.996,1.4992,1.4992,28.393535,206.789925,1.501894,0.006901,0.021003
std,1600.567696,13.899827,13.913129,13.898381,0.024647,0.001489,94165.486711,8.906122,8.897981,8.838581,0.0,10.280071,10.280071,10.280071,0.0,0.81259,0.0,8.838702,3206.05948,2.896328,2.896328,11.415405,152.772117,1.011941,0.003006,0.014253
min,765.0,44.028407,43.0,45.0,0.0,0.0,44323.613088,19.814116,19.89251,19.801053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.795268,1517.0,0.0,0.0,8.742338,35.744813,0.003817,0.001727,0.002304
25%,2187.75,56.0,56.0,56.0,0.0,0.0,128344.706603,27.523675,27.465843,27.682087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.681341,4370.0,0.0,0.0,18.534095,91.53514,0.595463,0.004399,0.010036
50%,3504.5,68.0,68.0,68.0,0.0,5.6e-05,206262.088967,35.436135,35.274516,35.21869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.223442,7000.0,0.0,0.0,28.357522,167.419641,1.46662,0.006712,0.016013
75%,4946.0,80.0,80.0,80.0,0.03811,0.002343,290194.318428,43.125867,42.951795,42.917419,0.0,16.0,16.0,16.0,0.0,1.238314,0.0,42.92328,9895.25,2.0,2.0,38.055683,259.24079,2.364221,0.00906,0.031138
max,6347.0,92.0,92.0,92.0,0.076396,0.004611,380603.081701,50.588107,50.596188,50.59752,0.0,30.0,30.0,30.0,0.0,2.534536,0.0,50.602521,12947.0,10.0,10.0,48.352818,905.322431,3.634606,0.014092,0.058377


---

### Creating a dataset from the second sample on close port attack

In [33]:
NUM_OF_ROWS = 12500 #this is not the final number, it will be reduced because we will choose 7,500 rows out of the second dataset

In [34]:
portSamples = pd.read_csv('portscan_closed_port_samples_2.csv')
print(f'Dataset Shape: {portSamples.shape}')
portSamples

Dataset Shape: (10, 26)


Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,100,60.0,60,60,0.0,0.0,5200,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.368421,200,0,0,2.920097,68.490873,1.103361,0.014674,0.082441
1,120,60.0,60,60,0.0,0.0,6240,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.130435,240,0,0,3.34079,71.839295,1.102872,0.013978,0.075529
2,120,60.0,60,60,0.0,0.0,6240,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.130435,240,0,0,3.325836,72.162309,1.10776,0.013916,0.076588
3,140,60.0,60,60,0.0,0.0,7280,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.962963,280,0,0,3.753263,74.601753,1.109797,0.013453,0.071323
4,240,60.0,60,60,0.0,0.0,12480,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.553191,480,0,0,5.749811,83.481006,1.105315,0.012004,0.05814
5,180,60.0,60,60,0.0,0.0,9360,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.742857,360,0,0,4.554911,79.03557,1.103638,0.012688,0.063769
6,280,60.0,60,60,0.0,0.0,14560,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.472727,560,0,0,6.582215,85.077743,1.1115,0.011775,0.055129
7,150,60.0,60,60,0.0,0.0,7800,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.896552,300,0,0,3.944551,76.054284,1.105556,0.013192,0.069209
8,190,60.0,60,60,0.0,0.0,9880,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.702703,380,0,0,4.743357,80.112036,1.105079,0.012515,0.061978
9,220,60.0,60,60,0.0,0.0,11440,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.604651,440,0,0,5.354074,82.180411,1.10544,0.012196,0.058847


In [35]:
# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = portSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values
columnsToGather #left with all columns that the values are not 0 (be know for a fact that the data is consistant and there are not missing values in the rows)

['Number of Ports',
 'Average Packet Length',
 'Packet Length Min',
 'Packet Length Max',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min',
 'Fwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'SYN Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [36]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (portSamples[col].min() * 0.85, portSamples[col].max() * 1.15) for col in columnsToGather}
MinMaxDict

{'Number of Ports': (np.float64(85.0), np.float64(322.0)),
 'Average Packet Length': (np.float64(51.0), np.float64(69.0)),
 'Packet Length Min': (np.float64(51.0), np.float64(69.0)),
 'Packet Length Max': (np.float64(51.0), np.float64(69.0)),
 'Total Length of Fwd Packet': (np.float64(4420.0), np.float64(16744.0)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(29.9)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(29.9)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(29.9)),
 'Fwd Segment Size Avg': (np.float64(1.7), np.float64(2.3)),
 'Subflow Fwd Bytes': (np.float64(22.501818181818187),
  np.float64(31.47368421052631)),
 'SYN Flag Count': (np.float64(170.0), np.float64(644.0)),
 'Flow Duration': (np.float64(2.482082545757294),
  np.float64(7.56954733133316)),
 'Packets Per Second': (np.float64(58.21724190720356),
  np.float64(97.83940407299949)),
 'IAT Max': (np.float64(0.9374411106109619), np.float64(1.278225028

In [37]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Total Length of Fwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'SYN Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (85, 322),
 'Average Packet Length': (np.float64(51.0), np.float64(69.0)),
 'Packet Length Min': (51, 69),
 'Packet Length Max': (51, 69),
 'Total Length of Fwd Packet': (4420, 16744),
 'Fwd Packet Length Max': (22, 29),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(29.9)),
 'Fwd Packet Length Min': (22, 29),
 'Fwd Segment Size Avg': (np.float64(1.7), np.float64(2.3)),
 'Subflow Fwd Bytes': (np.float64(22.501818181818187),
  np.float64(31.47368421052631)),
 'SYN Flag Count': (170, 644),
 'Flow Duration': (np.float64(2.482082545757294),
  np.float64(7.56954733133316)),
 'Packets Per Second': (np.float64(58.21724190720356),
  np.float64(97.83940407299949)),
 'IAT Max': (np.float64(0.9374411106109619), np.float64(1.278225028514862)),
 'IAT Mean': (np.float64(0.010008734901817199),
  np.float64(0.016874933063085323)),
 'IAT Std': (np.float64(0.046859943603463065),
  np.float64(0.09480762832623796))}

### Creating the dataset

In [38]:
# creating an empty dataframe before adding values to it
portDataset2 = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(portSamples.columns))), columns=portSamples.columns)

# adding zeros to all columns that should not have any values
zeroColumns = [col for col in portSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    portDataset2[col] = int(0)
zeroColumns

['Packet Length Std',
 'Packet Length Variance',
 'Fwd Packet Length Std',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Bwd Segment Size Avg',
 'ACK Flag Count',
 'RST Flag Count']

In [39]:
first_correlation = ['Number of Ports', 'Total Length of Fwd Packet', 'SYN Flag Count']

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[first_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[first_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(first_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)
    
# adding the rest of the attack feature values to the dataset at random based on the smaple data
portDataset2['Number of Ports'] = np.random.randint(MinMaxDict['Number of Ports'][0]*0.9, MinMaxDict['Number of Ports'][1]*1.10, NUM_OF_ROWS)

for index, row in portDataset2.iterrows():
    for col, factor in zip(first_correlation[1:], scaling_factors): #iterating over all rows we need to add values to except 'Number of Ports'
        # calculate a random small delta of the factor for adding some randomness
        delta = random.uniform(factor[1] * 0.01, factor[1] * 0.02) # select a delta

        # apply the randomness to the calculated number
        updatedFactor = factor[1] + random.choice([-1, 1]) * delta

        # calculate the value we want to add into the dataset in the given row
        portDataset2.loc[index, col] = int(row['Number of Ports'] * updatedFactor)

portDataset2

('Total Length of Fwd Packet', np.float64(51.99999999999999))
('SYN Flag Count', np.float64(2.0))


Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,103,0.0,0.0,0.0,0,0,5268.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,203.0,0,0,0.0,0.0,0.0,0.0,0.0
1,240,0.0,0.0,0.0,0,0,12268.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,472.0,0,0,0.0,0.0,0.0,0.0,0.0
2,195,0.0,0.0,0.0,0,0,10292.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,395.0,0,0,0.0,0.0,0.0,0.0,0.0
3,87,0.0,0.0,0.0,0,0,4451.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,177.0,0,0,0.0,0.0,0.0,0.0,0.0
4,206,0.0,0.0,0.0,0,0,10567.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,405.0,0,0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,129,0.0,0.0,0.0,0,0,6810.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,253.0,0,0,0.0,0.0,0.0,0.0,0.0
12496,90,0.0,0.0,0.0,0,0,4734.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,182.0,0,0,0.0,0.0,0.0,0.0,0.0
12497,85,0.0,0.0,0.0,0,0,4370.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,166.0,0,0,0.0,0.0,0.0,0.0,0.0
12498,129,0.0,0.0,0.0,0,0,6803.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,254.0,0,0,0.0,0.0,0.0,0.0,0.0


In [40]:
second_correlation = ['Number of Ports', 'Flow Duration', 'IAT Mean', 'IAT Std'] #'Packets Per Second', 'IAT Max',

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[second_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[second_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(second_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Flow Duration', np.float64(0.024958470011985968))
('IAT Mean', np.float64(6.650583610346264e-05))
('IAT Std', np.float64(0.000336893054787924))


In [41]:
#iterating over all rows we need to add values
for index, row in portDataset2.iterrows():
    for col, factor in scaling_factors: 
        # calculate a random small delta of the factor for adding some randomness
        if col == 'Flow Duration':
            delta = random.uniform(factor * 0.05, factor * 0.1) # select a delta
        elif col == 'Packets Per Second' or col == 'IAT Max':
            delta = 0
            #delta = random.uniform(factor * 0.005, factor * 0.1)# select a delta
        elif col == 'IAT Std':
            delta = random.uniform(factor * 0.05, factor * 0.2) * random.choice([-1, 1]) # select a delta
        else:
            delta = random.uniform(factor * 0.1, factor * 0.25) # select a delta
        updatedFactor = factor + delta
        portDataset2.loc[index, col] = row['Number of Ports'] * updatedFactor

In [42]:
second_correlation = ['Flow Duration', 'Packets Per Second', 'IAT Max']

packets_per_second = 63.5 + (portDataset2['Flow Duration'] - 2.0) * (35 / 7.5)  # Linear transformation
portDataset2['Packets Per Second'] = np.clip(packets_per_second, 63.5, 98.75)  # Ensure within range

iat_max = 1.100 + (portDataset2['Flow Duration'] - 2.0) * (0.013 / 7.5) + np.random.uniform(-0.002, 0.002, size=NUM_OF_ROWS)
portDataset2['IAT Max'] = np.clip(iat_max, 1.100, 1.113)  # Ensure within range

In [43]:
portDataset2

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,103,0.0,0.0,0.0,0,0,5268.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,203.0,0,0,2.703809,66.784442,1.102714,0.008549,0.040128
1,240,0.0,0.0,0.0,0,0,12268.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,472.0,0,0,6.587334,84.907561,1.107393,0.019312,0.071118
2,195,0.0,0.0,0.0,0,0,10292.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,395.0,0,0,5.268214,78.751664,1.104832,0.015151,0.072729
3,87,0.0,0.0,0.0,0,0,4451.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,177.0,0,0,2.374970,65.249860,1.100837,0.007020,0.026771
4,206,0.0,0.0,0.0,0,0,10567.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,405.0,0,0,5.565912,80.140920,1.106042,0.016379,0.073788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,129,0.0,0.0,0.0,0,0,6810.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,253.0,0,0,3.423030,70.140806,1.102867,0.010456,0.037291
12496,90,0.0,0.0,0.0,0,0,4734.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,182.0,0,0,2.395965,65.347836,1.100000,0.007224,0.025526
12497,85,0.0,0.0,0.0,0,0,4370.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,166.0,0,0,2.278774,64.800944,1.101035,0.006998,0.023059
12498,129,0.0,0.0,0.0,0,0,6803.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,254.0,0,0,3.516828,70.578529,1.104009,0.009489,0.039791


In [44]:
x = portDataset2[portDataset2['Number of Ports'] > 119]
x[x['Number of Ports'] < 121]

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
1287,120,0.0,0.0,0.0,0,0,6337.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,244.0,0,0,3.162396,68.924517,1.100939,0.009231,0.037249
1526,120,0.0,0.0,0.0,0,0,6361.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,244.0,0,0,3.195952,69.081112,1.102986,0.009233,0.032553
1698,120,0.0,0.0,0.0,0,0,6338.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,242.0,0,0,3.158521,68.906431,1.101105,0.009357,0.045873
1775,120,0.0,0.0,0.0,0,0,6307.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,244.0,0,0,3.192017,69.062748,1.100414,0.009958,0.043101
1779,120,0.0,0.0,0.0,0,0,6302.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,235.0,0,0,3.244399,69.307197,1.101834,0.008844,0.04826
1820,120,0.0,0.0,0.0,0,0,6168.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,235.0,0,0,3.278059,69.464274,1.102407,0.009762,0.033957
1832,120,0.0,0.0,0.0,0,0,6321.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,242.0,0,0,3.223624,69.210244,1.102578,0.009534,0.037166
2227,120,0.0,0.0,0.0,0,0,6174.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,236.0,0,0,3.247085,69.319728,1.104043,0.009164,0.032752
3006,120,0.0,0.0,0.0,0,0,6352.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,242.0,0,0,3.153613,68.883525,1.100908,0.009527,0.03729
3380,120,0.0,0.0,0.0,0,0,6165.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,244.0,0,0,3.225318,69.218149,1.100218,0.009256,0.037765


In [45]:
portSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,100,60.0,60,60,0.0,0.0,5200,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.368421,200,0,0,2.920097,68.490873,1.103361,0.014674,0.082441
1,120,60.0,60,60,0.0,0.0,6240,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.130435,240,0,0,3.34079,71.839295,1.102872,0.013978,0.075529
2,120,60.0,60,60,0.0,0.0,6240,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.130435,240,0,0,3.325836,72.162309,1.10776,0.013916,0.076588
3,140,60.0,60,60,0.0,0.0,7280,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.962963,280,0,0,3.753263,74.601753,1.109797,0.013453,0.071323
4,240,60.0,60,60,0.0,0.0,12480,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.553191,480,0,0,5.749811,83.481006,1.105315,0.012004,0.05814
5,180,60.0,60,60,0.0,0.0,9360,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.742857,360,0,0,4.554911,79.03557,1.103638,0.012688,0.063769
6,280,60.0,60,60,0.0,0.0,14560,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.472727,560,0,0,6.582215,85.077743,1.1115,0.011775,0.055129
7,150,60.0,60,60,0.0,0.0,7800,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.896552,300,0,0,3.944551,76.054284,1.105556,0.013192,0.069209
8,190,60.0,60,60,0.0,0.0,9880,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.702703,380,0,0,4.743357,80.112036,1.105079,0.012515,0.061978
9,220,60.0,60,60,0.0,0.0,11440,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.604651,440,0,0,5.354074,82.180411,1.10544,0.012196,0.058847


### Independant Columns

In [46]:
portDataset2['Fwd Segment Size Avg'] = np.full(NUM_OF_ROWS, 2.0)

portDataset2['Subflow Fwd Bytes'] = np.random.uniform(MinMaxDict['Subflow Fwd Bytes'][0]*0.95, MinMaxDict['Subflow Fwd Bytes'][1]*1.05, NUM_OF_ROWS)

portDataset2

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,103,0.0,0.0,0.0,0,0,5268.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,26.187555,203.0,0,0,2.703809,66.784442,1.102714,0.008549,0.040128
1,240,0.0,0.0,0.0,0,0,12268.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,31.012061,472.0,0,0,6.587334,84.907561,1.107393,0.019312,0.071118
2,195,0.0,0.0,0.0,0,0,10292.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,31.496547,395.0,0,0,5.268214,78.751664,1.104832,0.015151,0.072729
3,87,0.0,0.0,0.0,0,0,4451.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,24.395142,177.0,0,0,2.374970,65.249860,1.100837,0.007020,0.026771
4,206,0.0,0.0,0.0,0,0,10567.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,23.055181,405.0,0,0,5.565912,80.140920,1.106042,0.016379,0.073788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,129,0.0,0.0,0.0,0,0,6810.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,30.602902,253.0,0,0,3.423030,70.140806,1.102867,0.010456,0.037291
12496,90,0.0,0.0,0.0,0,0,4734.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,30.837832,182.0,0,0,2.395965,65.347836,1.100000,0.007224,0.025526
12497,85,0.0,0.0,0.0,0,0,4370.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,22.659213,166.0,0,0,2.278774,64.800944,1.101035,0.006998,0.023059
12498,129,0.0,0.0,0.0,0,0,6803.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,32.215885,254.0,0,0,3.516828,70.578529,1.104009,0.009489,0.039791


In [47]:
same_values1 = ['Average Packet Length', 'Packet Length Min', 'Packet Length Max']

# Generate random values for the 'Flow Duration' column
randValues = np.random.randint(MinMaxDict['Average Packet Length'][0]*0.95, MinMaxDict['Average Packet Length'][1]*1.05, size=NUM_OF_ROWS)

# Assign the random values
for col in same_values1:
    portDataset2[col] = randValues

In [48]:
same_values2 = ['Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Min']

# Generate random values for the 'Flow Duration' column
randValues = np.random.randint(MinMaxDict['Fwd Packet Length Max'][0]*0.95, MinMaxDict['Fwd Packet Length Max'][1]*1.05, size=NUM_OF_ROWS)

# Assign the random values
for col in same_values2:
    portDataset2[col] = randValues

In [49]:
portDataset2

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,103,61,61,61,0,0,5268.0,22,22,22,0,0,0,0,0,2.0,0,26.187555,203.0,0,0,2.703809,66.784442,1.102714,0.008549,0.040128
1,240,70,70,70,0,0,12268.0,29,29,29,0,0,0,0,0,2.0,0,31.012061,472.0,0,0,6.587334,84.907561,1.107393,0.019312,0.071118
2,195,48,48,48,0,0,10292.0,24,24,24,0,0,0,0,0,2.0,0,31.496547,395.0,0,0,5.268214,78.751664,1.104832,0.015151,0.072729
3,87,54,54,54,0,0,4451.0,24,24,24,0,0,0,0,0,2.0,0,24.395142,177.0,0,0,2.374970,65.249860,1.100837,0.007020,0.026771
4,206,69,69,69,0,0,10567.0,20,20,20,0,0,0,0,0,2.0,0,23.055181,405.0,0,0,5.565912,80.140920,1.106042,0.016379,0.073788
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,129,52,52,52,0,0,6810.0,28,28,28,0,0,0,0,0,2.0,0,30.602902,253.0,0,0,3.423030,70.140806,1.102867,0.010456,0.037291
12496,90,70,70,70,0,0,4734.0,24,24,24,0,0,0,0,0,2.0,0,30.837832,182.0,0,0,2.395965,65.347836,1.100000,0.007224,0.025526
12497,85,69,69,69,0,0,4370.0,24,24,24,0,0,0,0,0,2.0,0,22.659213,166.0,0,0,2.278774,64.800944,1.101035,0.006998,0.023059
12498,129,69,69,69,0,0,6803.0,24,24,24,0,0,0,0,0,2.0,0,32.215885,254.0,0,0,3.516828,70.578529,1.104009,0.009489,0.039791


In [50]:
portSamples.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,174.0,60.0,60.0,60.0,0.0,0.0,9048.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,26.856494,348.0,0.0,0.0,4.426891,77.303528,1.106032,0.013039,0.067295
std,58.727241,0.0,0.0,0.0,0.0,0.0,3053.816556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291425,117.454483,0.0,0.0,1.188862,5.538596,0.002827,0.00096,0.009113
min,100.0,60.0,60.0,60.0,0.0,0.0,5200.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,26.472727,200.0,0.0,0.0,2.920097,68.490873,1.102872,0.011775,0.055129
25%,125.0,60.0,60.0,60.0,0.0,0.0,6500.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,26.629164,250.0,0.0,0.0,3.443908,72.77217,1.103998,0.012276,0.059629
50%,165.0,60.0,60.0,60.0,0.0,0.0,8580.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,26.819704,330.0,0.0,0.0,4.249731,77.544927,1.105377,0.01294,0.066489
75%,212.5,60.0,60.0,60.0,0.0,0.0,11050.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,27.088567,425.0,0.0,0.0,5.201395,81.663317,1.107209,0.0138,0.074478
max,280.0,60.0,60.0,60.0,0.0,0.0,14560.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,27.368421,560.0,0.0,0.0,6.582215,85.077743,1.1115,0.014674,0.082441


In [51]:
portDataset2.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0,12500.0
mean,215.3072,59.53264,59.53264,59.53264,0.0,0.0,11195.69376,24.49376,24.49376,24.49376,0.0,0.0,0.0,0.0,0.0,2.0,0.0,27.182248,430.1676,0.0,0.0,5.776319,81.122179,1.10655,0.016818,0.072544
std,80.624645,6.930037,6.930037,6.930037,0.0,0.0,4195.266521,2.882233,2.882233,2.882233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.35964,161.465179,0.0,0.0,2.164003,10.0975,0.003834,0.00633,0.029037
min,76.0,48.0,48.0,48.0,0.0,0.0,3875.0,20.0,20.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,21.376749,149.0,0.0,0.0,1.992244,63.5,1.1,0.005565,0.020522
25%,145.0,53.0,53.0,53.0,0.0,0.0,7546.0,22.0,22.0,22.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,24.275084,290.0,0.0,0.0,3.879441,72.270724,1.1033,0.011314,0.047767
50%,215.0,60.0,60.0,60.0,0.0,0.0,11166.0,24.0,24.0,24.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,27.146572,429.0,0.0,0.0,5.768967,81.088515,1.106531,0.01676,0.071564
75%,285.0,66.0,66.0,66.0,0.0,0.0,14842.25,27.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,30.066986,569.0,0.0,0.0,7.655417,89.891945,1.109805,0.022279,0.094217
max,353.0,71.0,71.0,71.0,0.0,0.0,18712.0,29.0,29.0,29.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,33.044361,719.0,0.0,0.0,9.678227,98.75,1.113,0.029234,0.142637


In [52]:
# adding a label to the dataset
portDataset2['Label'] = ATTACK_NAME

In [53]:
portDataset2 = portDataset2[portDataset2['Number of Ports'] >= 120]
portDataset2.shape

(10547, 27)

In [54]:
portDataset2 = portDataset2.sample(n=7500, random_state = 42) 
portDataset2.shape

(7500, 27)

---

### Merging the two samples tougether

In [55]:
mergedPortDataset = pd.concat([portDataset, portDataset2], axis=0)
mergedPortDataset = mergedPortDataset.sample(frac=1, random_state=42).reset_index(drop=True)
mergedPortDataset.shape

(15000, 27)

In [None]:
# save the dataset
# mergedPortDataset.to_csv('port_scan_closed_port_dataset_new.csv', index=False)