In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

NUM_OF_ROWS = 7000
ATTACK_NAME = 'PortScan'

In [112]:
pd.set_option('display.max_columns', None)  # Show all columns

---

In [113]:
# import the attack sample dataset
portSamples = pd.read_csv('portscan_closed_port_samples_1.csv')
print(f'Dataset Shape: {portSamples.shape}')
portSamples

Dataset Shape: (19, 26)


Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1970,60.0,60,60,0.0,0.0,102154,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.679028,3929,0,0,39.64862,99.095504,0.101249,0.010094,0.017669
1,1980,60.0,60,60,0.0,0.0,102778,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.674799,3953,0,0,39.988767,98.85276,0.090225,0.010119,0.016015
2,1800,60.0,60,60,0.0,0.0,93366,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.072605,3591,0,0,37.254382,96.391345,1.103064,0.010377,0.029695
3,4942,59.998174,58,60,0.060403,0.003649,256074,26,26.0,26,0.0,24,24.0,24,0.0,2.003655,0.0,26.026425,9849,9,9,28.134188,350.3922,1.101169,0.002854,0.017863
4,3416,59.998822,58,60,0.048532,0.002355,176410,26,26.0,26,0.0,24,24.0,24,0.0,2.002358,0.0,26.38893,6785,4,4,39.957571,169.905223,0.137244,0.005887,0.018825
5,1410,60.0,60,60,0.0,0.0,73060,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.761905,2810,0,0,39.887447,70.448229,0.14227,0.0142,0.029792
6,3314,59.998782,58,60,0.049349,0.002435,170612,26,26.0,26,0.0,24,24.0,24,0.0,2.002438,0.0,26.039683,6562,4,4,38.899376,168.794481,1.100685,0.005925,0.021482
7,5019,59.99819,58,60,0.060138,0.003617,258336,26,26.0,26,0.0,24,24.0,24,0.0,2.003623,0.0,28.282899,9936,9,9,11.427656,870.257194,0.038543,0.001149,0.002328
8,1930,74.0,74,74,0.0,0.0,154400,40,40.0,40,0.0,0,0.0,0,0.0,0.0,0.0,40.103896,3860,0,0,39.700051,97.229094,1.017906,0.010288,0.034187
9,1999,74.0,74,74,0.0,0.0,159200,40,40.0,40,0.0,0,0.0,0,0.0,0.0,0.0,41.020356,3980,0,0,39.994987,99.512472,0.215444,0.010052,0.03092


In [114]:
# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = portSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values
columnsToGather #left with all columns that the values are not 0 (be know for a fact that the data is consistant and there are not missing values in the rows)

['Number of Ports',
 'Average Packet Length',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Std',
 'Packet Length Variance',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Fwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'SYN Flag Count',
 'ACK Flag Count',
 'RST Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [115]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (portSamples[col].min() * 0.85, portSamples[col].max() * 1.15) for col in columnsToGather}
MinMaxDict

{'Number of Ports': (np.float64(850.0), np.float64(5771.849999999999)),
 'Average Packet Length': (np.float64(50.998447961046864), np.float64(85.1)),
 'Packet Length Min': (np.float64(49.3), np.float64(85.1)),
 'Packet Length Max': (np.float64(51.0), np.float64(85.1)),
 'Packet Length Std': (np.float64(0.0), np.float64(0.06946344940596497)),
 'Packet Length Variance': (np.float64(0.0), np.float64(0.00419580069858701)),
 'Total Length of Fwd Packet': (np.float64(62101.0),
  np.float64(297086.39999999997)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(46.0)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(46.0)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(46.0)),
 'Bwd Packet Length Max': (np.float64(0.0), np.float64(27.599999999999998)),
 'Bwd Packet Length Mean': (np.float64(0.0), np.float64(27.599999999999998)),
 'Bwd Packet Length Min': (np.float64(0.0), np.float64(27.599999999999998)),
 'Fwd Segment Size Avg

In [116]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (850, 5771),
 'Average Packet Length': (np.float64(50.998447961046864), np.float64(85.1)),
 'Packet Length Min': (49, 85),
 'Packet Length Max': (51, 85),
 'Packet Length Std': (np.float64(0.0), np.float64(0.06946344940596497)),
 'Packet Length Variance': (np.float64(0.0), np.float64(0.00419580069858701)),
 'Total Length of Fwd Packet': (np.float64(62101.0),
  np.float64(297086.39999999997)),
 'Fwd Packet Length Max': (22, 46),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(46.0)),
 'Fwd Packet Length Min': (22, 46),
 'Bwd Packet Length Max': (0, 27),
 'Bwd Packet Length Mean': (np.float64(0.0), np.float64(27.599999999999998)),
 'Bwd Packet Length Min': (0, 27),
 'Fwd Segment Size Avg': (np.float64(0.0), np.float64(2.3042034724337497)),
 'Subflow Fwd Bytes': (np.float64(22.122461632279702),
  np.float64(47.18832343063808)),
 'SYN Flag Count': (2388, 11426),
 'ACK Flag Count': (0, 10),
 'RST Flag Count': (0, 10),
 'Flow Duration': (np.float64(

---

### Creating the dataset

In [117]:
# creating an empty dataframe before adding values to it
portDataset = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(portSamples.columns))), columns=portSamples.columns)
portDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [118]:
# adding zeros to all columns that should not have any values
zeroColumns = [col for col in portSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    portDataset[col] = int(0)
zeroColumns

['Fwd Packet Length Std', 'Bwd Packet Length Std', 'Bwd Segment Size Avg']

In [119]:
portDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### First Correlation

In [120]:
first_correlation = ['Number of Ports', 'Total Length of Fwd Packet', 'SYN Flag Count']

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[first_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[first_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(first_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Total Length of Fwd Packet', np.float64(58.833888228624055))
('SYN Flag Count', np.float64(2.0010731757589486))


In [121]:
# adding the rest of the attack feature values to the dataset at random based on the smaple data
portDataset['Number of Ports'] = np.random.randint(MinMaxDict['Number of Ports'][0]*0.9, MinMaxDict['Number of Ports'][1]*1.10, NUM_OF_ROWS)

for index, row in portDataset.iterrows():
    for col, factor in zip(first_correlation[1:], scaling_factors): #iterating over all rows we need to add values to except 'Number of Ports'
        # calculate a random small delta of the factor for adding some randomness
        delta = random.uniform(factor[1] * 0.01, factor[1] * 0.02) # select a delta

        # apply the randomness to the calculated number
        updatedFactor = factor[1] + random.choice([-1, 1]) * delta

        # calculate the value we want to add into the dataset in the given row
        portDataset.loc[index, col] = row['Number of Ports'] * updatedFactor

In [122]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4417,0.0,0.0,0.0,0.0,0.0,263920.150327,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,8714.197595,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4716,0.0,0.0,0.0,0.0,0.0,281673.975108,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,9305.115783,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5193,0.0,0.0,0.0,0.0,0.0,311522.517231,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,10541.551606,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1532,0.0,0.0,0.0,0.0,0.0,89090.151896,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,3025.591673,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,781,0.0,0.0,0.0,0.0,0.0,45094.008045,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,1591.559097,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,829,0.0,0.0,0.0,0.0,0.0,49368.296436,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,1631.159206,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6996,2973,0.0,0.0,0.0,0.0,0.0,172933.587508,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,6059.350364,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6997,4238,0.0,0.0,0.0,0.0,0.0,254241.052815,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,8380.049983,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,2062,0.0,0.0,0.0,0.0,0.0,119023.163373,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,4194.423093,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Second Correlation

In [123]:
second_correlation = ['Flow Duration', 'Packets Per Second', 'IAT Max', 'IAT Mean', 'IAT Std']

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[second_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[second_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(second_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Packets Per Second', np.float64(3.4232599999586326))
('IAT Max', np.float64(0.016503150061282924))
('IAT Mean', np.float64(0.00024361763844503648))
('IAT Std', np.float64(0.0007367524780914209))


In [124]:
# Generate random values for the 'Flow Duration' column
randValues = np.random.uniform(MinMaxDict['Flow Duration'][0]*0.9, MinMaxDict['Flow Duration'][1]*1.05, size=NUM_OF_ROWS)

# Assign the random values
portDataset['Flow Duration'] = randValues

portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4417,0.0,0.0,0.0,0.0,0.0,263920.150327,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,8714.197595,0.0,0.0,13.607830,0.0,0.0,0.0,0.0
1,4716,0.0,0.0,0.0,0.0,0.0,281673.975108,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,9305.115783,0.0,0.0,33.445359,0.0,0.0,0.0,0.0
2,5193,0.0,0.0,0.0,0.0,0.0,311522.517231,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,10541.551606,0.0,0.0,14.917589,0.0,0.0,0.0,0.0
3,1532,0.0,0.0,0.0,0.0,0.0,89090.151896,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,3025.591673,0.0,0.0,14.942099,0.0,0.0,0.0,0.0
4,781,0.0,0.0,0.0,0.0,0.0,45094.008045,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,1591.559097,0.0,0.0,47.382791,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,829,0.0,0.0,0.0,0.0,0.0,49368.296436,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,1631.159206,0.0,0.0,42.061354,0.0,0.0,0.0,0.0
6996,2973,0.0,0.0,0.0,0.0,0.0,172933.587508,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,6059.350364,0.0,0.0,32.970398,0.0,0.0,0.0,0.0
6997,4238,0.0,0.0,0.0,0.0,0.0,254241.052815,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,8380.049983,0.0,0.0,39.884547,0.0,0.0,0.0,0.0
6998,2062,0.0,0.0,0.0,0.0,0.0,119023.163373,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,4194.423093,0.0,0.0,41.342092,0.0,0.0,0.0,0.0


In [125]:
durationToPacketsCorr = [x * y for x, y in zip(portSamples['Flow Duration'].values, portSamples['Packets Per Second'].values)]
durationToPacketsCorr = np.mean(durationToPacketsCorr)
durationToPacketsCorr

np.float64(4832.368421052632)

In [126]:
#iterating over all rows we need to add values
for index, row in portDataset.iterrows():
    for col, factor in scaling_factors: 
        # calculate a random small delta of the factor for adding some randomness
        if col == 'Packets Per Second':
            delta = random.uniform(durationToPacketsCorr * 0.25, durationToPacketsCorr * 0.65) # select a delta
            updatedFactor = durationToPacketsCorr + random.choice([-1, 1]) * delta
            portDataset.loc[index, col] = updatedFactor / row['Flow Duration']
        else:
            if col == 'IAT Std':
                delta = random.uniform(factor * 0.35, factor * 0.65)
                updatedFactor = factor + random.choice([-1, 1]) * delta  
            else:
                delta = random.uniform(factor * 0.1, factor * 0.2) # select a delta
                updatedFactor = factor + random.choice([-1, 1]) * delta

            if col == 'IAT Max':
                delta = random.uniform(factor * 0.6, factor * 0.99)
                updatedFactor = factor + random.choices([-1, 1], weights=[1, 3], k=1)[0] * delta  
                portDataset.loc[index, col] = (row['Flow Duration'] * updatedFactor) * 2.3
            else:
                portDataset.loc[index, col] = row['Flow Duration'] * updatedFactor

In [127]:
portSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,1970,60.0,60,60,0.0,0.0,102154,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.679028,3929,0,0,39.64862,99.095504,0.101249,0.010094,0.017669
1,1980,60.0,60,60,0.0,0.0,102778,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.674799,3953,0,0,39.988767,98.85276,0.090225,0.010119,0.016015
2,1800,60.0,60,60,0.0,0.0,93366,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.072605,3591,0,0,37.254382,96.391345,1.103064,0.010377,0.029695
3,4942,59.998174,58,60,0.060403,0.003649,256074,26,26.0,26,0.0,24,24.0,24,0.0,2.003655,0.0,26.026425,9849,9,9,28.134188,350.3922,1.101169,0.002854,0.017863
4,3416,59.998822,58,60,0.048532,0.002355,176410,26,26.0,26,0.0,24,24.0,24,0.0,2.002358,0.0,26.38893,6785,4,4,39.957571,169.905223,0.137244,0.005887,0.018825
5,1410,60.0,60,60,0.0,0.0,73060,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.761905,2810,0,0,39.887447,70.448229,0.14227,0.0142,0.029792
6,3314,59.998782,58,60,0.049349,0.002435,170612,26,26.0,26,0.0,24,24.0,24,0.0,2.002438,0.0,26.039683,6562,4,4,38.899376,168.794481,1.100685,0.005925,0.021482
7,5019,59.99819,58,60,0.060138,0.003617,258336,26,26.0,26,0.0,24,24.0,24,0.0,2.003623,0.0,28.282899,9936,9,9,11.427656,870.257194,0.038543,0.001149,0.002328
8,1930,74.0,74,74,0.0,0.0,154400,40,40.0,40,0.0,0,0.0,0,0.0,0.0,0.0,40.103896,3860,0,0,39.700051,97.229094,1.017906,0.010288,0.034187
9,1999,74.0,74,74,0.0,0.0,159200,40,40.0,40,0.0,0,0.0,0,0.0,0.0,0.0,41.020356,3980,0,0,39.994987,99.512472,0.215444,0.010052,0.03092


In [128]:
x = portDataset[portDataset['Flow Duration']>=39]
x[x['Flow Duration']<=40][0:40]

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
81,3302,0.0,0.0,0.0,0.0,0.0,197813.443116,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,6498.679695,0.0,0.0,39.783577,159.450767,0.275888,0.007909,0.016244
120,6261,0.0,0.0,0.0,0.0,0.0,363434.734335,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,12775.97699,0.0,0.0,39.548559,200.393006,2.843066,0.007962,0.018042
275,4047,0.0,0.0,0.0,0.0,0.0,240812.605977,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,8213.904577,0.0,0.0,39.397919,87.784479,2.600109,0.010883,0.017696
295,4782,0.0,0.0,0.0,0.0,0.0,275866.253672,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,9423.358947,0.0,0.0,39.184663,179.048424,0.361453,0.010804,0.011293
310,2877,0.0,0.0,0.0,0.0,0.0,171774.660827,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,5666.991159,0.0,0.0,39.176322,154.695253,2.832175,0.008032,0.041019
321,4193,0.0,0.0,0.0,0.0,0.0,242230.271522,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,8226.54465,0.0,0.0,39.379332,89.558899,2.428967,0.008294,0.042643
348,3366,0.0,0.0,0.0,0.0,0.0,200959.512592,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,6855.266262,0.0,0.0,39.46692,175.32451,2.776902,0.007903,0.015851
464,2787,0.0,0.0,0.0,0.0,0.0,166383.690229,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,5495.961443,0.0,0.0,39.586201,44.750119,2.790871,0.010907,0.039553
468,3470,0.0,0.0,0.0,0.0,0.0,207574.439961,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,6815.820925,0.0,0.0,39.441822,70.215149,2.684799,0.007725,0.04492
514,1157,0.0,0.0,0.0,0.0,0.0,66815.421715,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,2341.265812,0.0,0.0,39.703251,59.325012,2.548702,0.007793,0.012846


### Independant Columns

In [129]:
independant = ['Packet Length Std', 'Packet Length Variance', 'Fwd Segment Size Avg']

for col in independant:
    # Generate random values from the uniform distribution
    rand_values = np.random.uniform(MinMaxDict[col][0], MinMaxDict[col][1]*1.1, NUM_OF_ROWS)

    # Randomly choose between 0 or the generated random value
    chosen_values = np.where(np.random.rand(NUM_OF_ROWS) > 0.5, rand_values, 0)

    portDataset[col] = chosen_values

portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4417,0.0,0.0,0.0,0.000000,0.001310,263920.150327,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.535872,0,0.0,8714.197595,0.0,0.0,13.607830,550.172541,0.061991,0.002835,0.005011
1,4716,0.0,0.0,0.0,0.000000,0.000000,281673.975108,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,9305.115783,0.0,0.0,33.445359,96.361817,0.151142,0.006583,0.037016
2,5193,0.0,0.0,0.0,0.038577,0.004435,311522.517231,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,10541.551606,0.0,0.0,14.917589,222.811270,0.974353,0.002993,0.005912
3,1532,0.0,0.0,0.0,0.062748,0.003540,89090.151896,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,3025.591673,0.0,0.0,14.942099,523.875182,1.125060,0.003260,0.017891
4,781,0.0,0.0,0.0,0.039077,0.000669,45094.008045,0.0,0.0,0.0,0,0.0,0.0,0.0,0,2.193531,0,0.0,1591.559097,0.0,0.0,47.382791,148.138293,3.012794,0.009427,0.056045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,829,0.0,0.0,0.0,0.000000,0.000000,49368.296436,0.0,0.0,0.0,0,0.0,0.0,0.0,0,2.471052,0,0.0,1631.159206,0.0,0.0,42.061354,183.279428,2.972560,0.008245,0.018306
6996,2973,0.0,0.0,0.0,0.000214,0.004037,172933.587508,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,6059.350364,0.0,0.0,32.970398,221.443528,0.396169,0.009299,0.034399
6997,4238,0.0,0.0,0.0,0.000000,0.000000,254241.052815,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,8380.049983,0.0,0.0,39.884547,63.242084,2.476938,0.011076,0.011326
6998,2062,0.0,0.0,0.0,0.000000,0.000346,119023.163373,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,4194.423093,0.0,0.0,41.342092,60.581781,2.665077,0.008233,0.048181


In [130]:
x = portDataset[portDataset['Packet Length Std']>0]
x

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
2,5193,0.0,0.0,0.0,0.038577,0.004435,311522.517231,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,10541.551606,0.0,0.0,14.917589,222.811270,0.974353,0.002993,0.005912
3,1532,0.0,0.0,0.0,0.062748,0.003540,89090.151896,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,3025.591673,0.0,0.0,14.942099,523.875182,1.125060,0.003260,0.017891
4,781,0.0,0.0,0.0,0.039077,0.000669,45094.008045,0.0,0.0,0.0,0,0.0,0.0,0.0,0,2.193531,0,0.0,1591.559097,0.0,0.0,47.382791,148.138293,3.012794,0.009427,0.056045
6,5100,0.0,0.0,0.0,0.055442,0.000174,295526.939200,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.862930,0,0.0,10060.261645,0.0,0.0,10.494699,642.328994,0.650680,0.002938,0.012256
7,2694,0.0,0.0,0.0,0.028405,0.002617,156446.508272,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,5492.273839,0.0,0.0,20.444808,349.221687,1.504247,0.004379,0.024022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6989,3485,0.0,0.0,0.0,0.066734,0.002315,208916.193698,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,7109.489019,0.0,0.0,44.264651,156.162034,3.075665,0.009227,0.012438
6990,1865,0.0,0.0,0.0,0.035309,0.000000,110957.863692,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.215408,0,0.0,3676.427916,0.0,0.0,9.670269,679.769442,0.689665,0.002644,0.002748
6991,5865,0.0,0.0,0.0,0.023302,0.000000,340365.842988,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.000000,0,0.0,11551.084870,0.0,0.0,13.095636,585.188968,0.964083,0.003677,0.003429
6993,6027,0.0,0.0,0.0,0.059924,0.003717,348103.777490,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.267371,0,0.0,11842.771636,0.0,0.0,10.171054,262.067428,0.682757,0.002040,0.004218


### Same Values

In [131]:
same_values = ['Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Min'] # 'Subflow Fwd Bytes' is approximatly the same as values in here

for col in same_values:
    # Generate random values for the 'Flow Duration' column
    randValues = np.random.uniform(MinMaxDict[col][0]*0.9, MinMaxDict[col][1]*1.1, size=NUM_OF_ROWS)

    # 'Subflow Fwd Bytes' is approximatly the same as values in ['Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Min']
    # Create a new column 'Subflow Fwd Bytes' with slightly adjusted values from randValues
    adjustment_factor = np.random.uniform(0.9995, 1.0005, size=NUM_OF_ROWS)
    subflow_fwd_bytes = randValues * adjustment_factor
    portDataset['Subflow Fwd Bytes'] = subflow_fwd_bytes

    # Assign the random values
    portDataset[col] = randValues

In [132]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4417,0.0,0.0,0.0,0.000000,0.001310,263920.150327,28.217247,45.607043,38.791719,0,0.0,0.0,0.0,0,0.535872,0,38.809063,8714.197595,0.0,0.0,13.607830,550.172541,0.061991,0.002835,0.005011
1,4716,0.0,0.0,0.0,0.000000,0.000000,281673.975108,29.836584,44.504312,32.140509,0,0.0,0.0,0.0,0,0.000000,0,32.138025,9305.115783,0.0,0.0,33.445359,96.361817,0.151142,0.006583,0.037016
2,5193,0.0,0.0,0.0,0.038577,0.004435,311522.517231,34.166581,27.931210,42.739128,0,0.0,0.0,0.0,0,0.000000,0,42.735921,10541.551606,0.0,0.0,14.917589,222.811270,0.974353,0.002993,0.005912
3,1532,0.0,0.0,0.0,0.062748,0.003540,89090.151896,33.663114,23.582331,42.821213,0,0.0,0.0,0.0,0,0.000000,0,42.813172,3025.591673,0.0,0.0,14.942099,523.875182,1.125060,0.003260,0.017891
4,781,0.0,0.0,0.0,0.039077,0.000669,45094.008045,21.237451,45.821838,44.653276,0,0.0,0.0,0.0,0,2.193531,0,44.652399,1591.559097,0.0,0.0,47.382791,148.138293,3.012794,0.009427,0.056045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,829,0.0,0.0,0.0,0.000000,0.000000,49368.296436,41.046342,27.158560,30.490577,0,0.0,0.0,0.0,0,2.471052,0,30.492141,1631.159206,0.0,0.0,42.061354,183.279428,2.972560,0.008245,0.018306
6996,2973,0.0,0.0,0.0,0.000214,0.004037,172933.587508,29.613837,34.681978,50.577238,0,0.0,0.0,0.0,0,0.000000,0,50.585376,6059.350364,0.0,0.0,32.970398,221.443528,0.396169,0.009299,0.034399
6997,4238,0.0,0.0,0.0,0.000000,0.000000,254241.052815,21.254939,21.655572,24.439955,0,0.0,0.0,0.0,0,0.000000,0,24.451166,8380.049983,0.0,0.0,39.884547,63.242084,2.476938,0.011076,0.011326
6998,2062,0.0,0.0,0.0,0.000000,0.000346,119023.163373,42.390573,50.140419,46.200632,0,0.0,0.0,0.0,0,0.000000,0,46.203916,4194.423093,0.0,0.0,41.342092,60.581781,2.665077,0.008233,0.048181


### Approximate Values

In [133]:
approx_same = ['Average Packet Length', 'Packet Length Min', 'Packet Length Max']

# Generate random values for 'Packet Length Max'
packet_length_max = np.random.randint(MinMaxDict['Packet Length Max'][0] * 0.9, MinMaxDict['Packet Length Max'][1] * 1.1, NUM_OF_ROWS)

# Decide whether to copy or adjust based on a condition or randomly
copy_values = np.random.choice([True, False], size=NUM_OF_ROWS)  # Randomly decide whether to copy values or not

# Create 'Average Packet Length' and 'Packet Length Min' based on 'Packet Length Max'
packet_length_min = np.where(copy_values, packet_length_max, packet_length_max + np.random.uniform(-2, 2, NUM_OF_ROWS))
packet_length_min = np.minimum(packet_length_min, packet_length_max)

# If True, copy the 'Packet Length Max' values; if False, apply small variation
average_packet_length = np.where(packet_length_max != packet_length_min, (packet_length_max + packet_length_min) / 2, packet_length_min)

# Assign the values to the dataset
portDataset['Packet Length Max'] = packet_length_max
portDataset['Average Packet Length'] = average_packet_length
portDataset['Packet Length Min'] = packet_length_min.astype(int)

In [134]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,4417,71.000000,71,71,0.000000,0.001310,263920.150327,28.217247,45.607043,38.791719,0,0.0,0.0,0.0,0,0.535872,0,38.809063,8714.197595,0.0,0.0,13.607830,550.172541,0.061991,0.002835,0.005011
1,4716,66.000000,66,66,0.000000,0.000000,281673.975108,29.836584,44.504312,32.140509,0,0.0,0.0,0.0,0,0.000000,0,32.138025,9305.115783,0.0,0.0,33.445359,96.361817,0.151142,0.006583,0.037016
2,5193,86.000000,86,86,0.038577,0.004435,311522.517231,34.166581,27.931210,42.739128,0,0.0,0.0,0.0,0,0.000000,0,42.735921,10541.551606,0.0,0.0,14.917589,222.811270,0.974353,0.002993,0.005912
3,1532,85.000000,85,85,0.062748,0.003540,89090.151896,33.663114,23.582331,42.821213,0,0.0,0.0,0.0,0,0.000000,0,42.813172,3025.591673,0.0,0.0,14.942099,523.875182,1.125060,0.003260,0.017891
4,781,84.000000,84,84,0.039077,0.000669,45094.008045,21.237451,45.821838,44.653276,0,0.0,0.0,0.0,0,2.193531,0,44.652399,1591.559097,0.0,0.0,47.382791,148.138293,3.012794,0.009427,0.056045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,829,86.000000,86,86,0.000000,0.000000,49368.296436,41.046342,27.158560,30.490577,0,0.0,0.0,0.0,0,2.471052,0,30.492141,1631.159206,0.0,0.0,42.061354,183.279428,2.972560,0.008245,0.018306
6996,2973,58.000000,58,58,0.000214,0.004037,172933.587508,29.613837,34.681978,50.577238,0,0.0,0.0,0.0,0,0.000000,0,50.585376,6059.350364,0.0,0.0,32.970398,221.443528,0.396169,0.009299,0.034399
6997,4238,49.494392,48,50,0.000000,0.000000,254241.052815,21.254939,21.655572,24.439955,0,0.0,0.0,0.0,0,0.000000,0,24.451166,8380.049983,0.0,0.0,39.884547,63.242084,2.476938,0.011076,0.011326
6998,2062,91.008691,90,92,0.000000,0.000346,119023.163373,42.390573,50.140419,46.200632,0,0.0,0.0,0.0,0,0.000000,0,46.203916,4194.423093,0.0,0.0,41.342092,60.581781,2.665077,0.008233,0.048181


In [135]:
x = portDataset[portDataset['Packet Length Min'] != portDataset['Packet Length Max']]
x

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
5,863,86.779273,86,87,0.000000,0.001639,51700.181646,42.610156,22.657191,23.409874,0,0.0,0.0,0.0,0,1.344101,0,23.414976,1703.342850,0.0,0.0,13.067482,221.755910,0.147992,0.002715,0.004327
7,2694,79.648581,79,80,0.028405,0.002617,156446.508272,25.187127,32.933219,37.761188,0,0.0,0.0,0.0,0,0.000000,0,37.746673,5492.273839,0.0,0.0,20.444808,349.221687,1.504247,0.004379,0.024022
10,1300,65.296543,64,66,0.012297,0.003006,77641.395447,36.315285,44.594603,24.317694,0,0.0,0.0,0.0,0,0.545722,0,24.305827,2643.629572,0.0,0.0,29.100146,65.957930,1.866508,0.006183,0.011190
16,1449,48.983617,48,49,0.000000,0.000000,86399.410946,25.155413,48.613973,24.541685,0,0.0,0.0,0.0,0,0.743080,0,24.531242,2850.049205,0.0,0.0,25.068187,90.472947,1.685561,0.004994,0.009596
18,4734,78.217926,77,79,0.069203,0.000000,275715.617601,30.937229,49.853940,43.646957,0,0.0,0.0,0.0,0,0.000000,0,43.635395,9621.066981,0.0,0.0,24.275841,249.515379,1.654430,0.006970,0.009793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6990,1865,84.797766,84,85,0.035309,0.000000,110957.863692,49.277452,44.758979,27.302424,0,0.0,0.0,0.0,0,0.215408,0,27.292873,3676.427916,0.0,0.0,9.670269,679.769442,0.689665,0.002644,0.002748
6991,5865,62.470729,61,63,0.023302,0.000000,340365.842988,21.250860,30.450357,27.063466,0,0.0,0.0,0.0,0,0.000000,0,27.076106,11551.084870,0.0,0.0,13.095636,585.188968,0.964083,0.003677,0.003429
6993,6027,71.258837,70,72,0.059924,0.003717,348103.777490,48.609349,22.384413,48.981677,0,0.0,0.0,0.0,0,0.267371,0,48.995231,11842.771636,0.0,0.0,10.171054,262.067428,0.682757,0.002040,0.004218
6997,4238,49.494392,48,50,0.000000,0.000000,254241.052815,21.254939,21.655572,24.439955,0,0.0,0.0,0.0,0,0.000000,0,24.451166,8380.049983,0.0,0.0,39.884547,63.242084,2.476938,0.011076,0.011326


### Backwards Packets with Flags

In [136]:
backward_flags = ['Bwd Packet Length Max', 'Bwd Packet Length Mean', 'Bwd Packet Length Min', 'ACK Flag Count', 'RST Flag Count']

# Probability of doing X (30%) and Y (70%)
probability = [0.25, 0.75]

# Decide whether to use backward flags (True or False) based on the probability for each row
has_backward_flags = np.random.choice([True, False], size=NUM_OF_ROWS, p=probability)

# Check if the value should be True or False for each row
for i in range(NUM_OF_ROWS):
    if has_backward_flags[i]:
        # If True, generate random values for Bwd Packet Length and Flag Count
        bwd_vector = np.random.randint(16, MinMaxDict['Bwd Packet Length Max'][1] * 1.15)
        flag_vector = np.random.randint(2, MinMaxDict['ACK Flag Count'][1] * 1.15)
        
        # Apply values for the first 3 backward flags
        for col in backward_flags[:3]:
            portDataset.at[i, col] = bwd_vector
        
        # Apply values for the remaining 2 flags
        for col in backward_flags[3:]:
            portDataset.at[i, col] = flag_vector
    else:
        # If False, set only the current row to zero for all backward flags
        for col in backward_flags:
            portDataset.at[i, col] = 0

In [137]:
portDataset[portDataset['ACK Flag Count'] > 0]

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
1,4716,66.000000,66,66,0.000000,0.000000,281673.975108,29.836584,44.504312,32.140509,0,21.0,21.0,21.0,0,0.000000,0,32.138025,9305.115783,8.0,8.0,33.445359,96.361817,0.151142,0.006583,0.037016
11,3052,83.000000,83,83,0.000000,0.000000,177299.898709,49.165920,33.321961,40.079788,0,20.0,20.0,20.0,0,0.000000,0,40.080466,6197.427582,6.0,6.0,41.927771,156.507969,3.109156,0.011354,0.015129
12,3004,82.000000,82,82,0.000000,0.003750,173637.752954,25.947233,44.313006,39.163438,0,26.0,26.0,26.0,0,0.000000,0,39.174207,5946.030081,2.0,2.0,48.129402,38.909949,3.490644,0.010194,0.049604
13,2535,92.000000,92,92,0.028605,0.002816,147122.144614,20.861216,48.849744,47.040734,0,28.0,28.0,28.0,0,1.151387,0,47.033581,5162.049038,3.0,3.0,35.316841,67.319687,2.356136,0.007105,0.012698
14,5584,88.000000,88,88,0.036548,0.000000,332271.159950,33.512599,44.841779,22.550307,0,24.0,24.0,24.0,0,1.668038,0,22.539678,10964.761397,5.0,5.0,9.114580,197.297049,0.570479,0.001883,0.010699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6992,4831,80.000000,80,80,0.000000,0.000000,289662.802824,38.724632,20.480519,42.189536,0,25.0,25.0,25.0,0,0.000000,0,42.188386,9500.022610,6.0,6.0,28.385385,85.252001,1.992142,0.005564,0.029927
6994,2926,58.000000,58,58,0.000000,0.000000,173894.383773,23.653552,39.964991,29.538530,0,24.0,24.0,24.0,0,1.171467,0,29.531778,5934.068682,7.0,7.0,10.636684,650.596338,0.803190,0.002207,0.010918
6996,2973,58.000000,58,58,0.000214,0.004037,172933.587508,29.613837,34.681978,50.577238,0,22.0,22.0,22.0,0,0.000000,0,50.585376,6059.350364,8.0,8.0,32.970398,221.443528,0.396169,0.009299,0.034399
6997,4238,49.494392,48,50,0.000000,0.000000,254241.052815,21.254939,21.655572,24.439955,0,27.0,27.0,27.0,0,0.000000,0,24.451166,8380.049983,10.0,10.0,39.884547,63.242084,2.476938,0.011076,0.011326


---

In [138]:
# making the SYN Flag Count column have int values instead of floats
portDataset['SYN Flag Count'] = portDataset['SYN Flag Count'].astype(int)

# adding a label to the dataset
portDataset['Label'] = ATTACK_NAME

In [139]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std,Label
0,4417,71.000000,71,71,0.000000,0.001310,263920.150327,28.217247,45.607043,38.791719,0,0.0,0.0,0.0,0,0.535872,0,38.809063,8714,0.0,0.0,13.607830,550.172541,0.061991,0.002835,0.005011,PortScan
1,4716,66.000000,66,66,0.000000,0.000000,281673.975108,29.836584,44.504312,32.140509,0,21.0,21.0,21.0,0,0.000000,0,32.138025,9305,8.0,8.0,33.445359,96.361817,0.151142,0.006583,0.037016,PortScan
2,5193,86.000000,86,86,0.038577,0.004435,311522.517231,34.166581,27.931210,42.739128,0,0.0,0.0,0.0,0,0.000000,0,42.735921,10541,0.0,0.0,14.917589,222.811270,0.974353,0.002993,0.005912,PortScan
3,1532,85.000000,85,85,0.062748,0.003540,89090.151896,33.663114,23.582331,42.821213,0,0.0,0.0,0.0,0,0.000000,0,42.813172,3025,0.0,0.0,14.942099,523.875182,1.125060,0.003260,0.017891,PortScan
4,781,84.000000,84,84,0.039077,0.000669,45094.008045,21.237451,45.821838,44.653276,0,0.0,0.0,0.0,0,2.193531,0,44.652399,1591,0.0,0.0,47.382791,148.138293,3.012794,0.009427,0.056045,PortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,829,86.000000,86,86,0.000000,0.000000,49368.296436,41.046342,27.158560,30.490577,0,0.0,0.0,0.0,0,2.471052,0,30.492141,1631,0.0,0.0,42.061354,183.279428,2.972560,0.008245,0.018306,PortScan
6996,2973,58.000000,58,58,0.000214,0.004037,172933.587508,29.613837,34.681978,50.577238,0,22.0,22.0,22.0,0,0.000000,0,50.585376,6059,8.0,8.0,32.970398,221.443528,0.396169,0.009299,0.034399,PortScan
6997,4238,49.494392,48,50,0.000000,0.000000,254241.052815,21.254939,21.655572,24.439955,0,27.0,27.0,27.0,0,0.000000,0,24.451166,8380,10.0,10.0,39.884547,63.242084,2.476938,0.011076,0.011326,PortScan
6998,2062,91.008691,90,92,0.000000,0.000346,119023.163373,42.390573,50.140419,46.200632,0,24.0,24.0,24.0,0,0.000000,0,46.203916,4194,6.0,6.0,41.342092,60.581781,2.665077,0.008233,0.048181,PortScan


In [140]:
portSamples.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0
mean,2387.526316,66.631196,66.105263,66.631579,0.01411,0.000764,150824.842105,32.631579,32.631579,32.631579,0.0,6.315789,6.315789,6.315789,0.0,1.053397,0.0,33.236968,4830.789474,1.578947,1.578947,36.960596,161.40442,0.627105,0.008909,0.02711
std,1100.883199,7.182221,7.730853,7.181848,0.024427,0.001351,48603.855729,7.181848,7.181848,7.181848,0.0,10.857934,10.857934,10.857934,0.0,1.026725,0.0,7.314582,2104.368646,3.005842,3.005842,6.933406,182.503211,0.719782,0.003132,0.011919
min,1000.0,59.998174,58.0,60.0,0.0,0.0,73060.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.026425,2810.0,0.0,0.0,11.427656,70.448229,0.038543,0.001149,0.002328
25%,1880.0,59.999411,59.0,60.0,0.0,0.0,112749.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.230768,3760.0,0.0,0.0,38.380666,96.947314,0.130432,0.008012,0.019438
50%,1994.0,60.0,60.0,60.0,0.0,0.0,158960.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,28.282899,3974.0,0.0,0.0,39.700051,99.53883,0.152234,0.010049,0.029792
75%,2618.5,74.0,74.0,74.0,0.024266,0.001178,163941.0,40.0,40.0,40.0,0.0,12.0,12.0,12.0,0.0,2.001179,0.0,41.00323,5232.0,2.0,2.0,39.960304,133.336021,1.100399,0.010318,0.030047
max,5019.0,74.0,74.0,74.0,0.060403,0.003649,258336.0,40.0,40.0,40.0,0.0,24.0,24.0,24.0,0.0,2.003655,0.0,41.033325,9936.0,9.0,9.0,40.052014,870.257194,2.878662,0.0142,0.063127


In [141]:
portDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0
mean,3543.542,68.507098,68.246857,68.637429,0.018948,0.001136,208497.608654,35.121456,35.353173,35.143544,0.0,5.442,5.442,5.442,0.0,0.636058,0.0,35.143652,7089.985714,1.445,1.445,28.493319,208.743714,1.520806,0.006947,0.021034
std,1611.117105,13.760052,13.776332,13.760571,0.024585,0.001473,94892.124348,8.86325,8.944002,8.849725,0.0,9.956359,9.956359,9.956359,0.0,0.822853,0.0,8.84986,3227.335234,2.866183,2.866183,11.395811,155.479896,1.00807,0.003014,0.014317
min,765.0,44.000343,43.0,45.0,0.0,0.0,44161.201803,19.815552,19.895151,19.802619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.801209,1512.0,0.0,0.0,8.749535,35.894366,0.006327,0.001721,0.002363
25%,2157.75,57.0,56.0,57.0,0.0,0.0,126701.773227,27.518401,27.504199,27.346795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.348416,4320.75,0.0,0.0,18.659732,91.868587,0.62385,0.004437,0.009905
50%,3516.0,68.451254,68.0,69.0,0.0,1e-05,206725.876505,34.96465,35.488182,35.357263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.359995,7024.0,0.0,0.0,28.418198,168.494833,1.512596,0.006781,0.016077
75%,4935.0,80.006708,80.0,80.25,0.03772,0.002254,289801.248137,42.867492,43.16413,42.648615,0.0,0.0,0.0,0.0,0.0,1.275408,0.0,42.649071,9874.25,0.0,0.0,38.427487,265.002925,2.379164,0.009115,0.031083
max,6347.0,92.0,92.0,92.0,0.07639,0.004614,380101.76255,50.592978,50.598822,50.590624,0.0,30.0,30.0,30.0,0.0,2.53457,0.0,50.601202,12932.0,10.0,10.0,48.343878,899.959265,3.62834,0.014091,0.058321


---

### Creating a dataset from the second sample on close port attack

In [142]:
NUM_OF_ROWS = 7000

In [143]:
portSamples = pd.read_csv('portscan_closed_port_samples_2.csv')
print(f'Dataset Shape: {portSamples.shape}')
portSamples

Dataset Shape: (10, 26)


Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,100,60.0,60,60,0.0,0.0,5200,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.368421,200,0,0,2.920097,68.490873,1.103361,0.014674,0.082441
1,120,60.0,60,60,0.0,0.0,6240,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.130435,240,0,0,3.34079,71.839295,1.102872,0.013978,0.075529
2,120,60.0,60,60,0.0,0.0,6240,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.130435,240,0,0,3.325836,72.162309,1.10776,0.013916,0.076588
3,140,60.0,60,60,0.0,0.0,7280,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.962963,280,0,0,3.753263,74.601753,1.109797,0.013453,0.071323
4,240,60.0,60,60,0.0,0.0,12480,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.553191,480,0,0,5.749811,83.481006,1.105315,0.012004,0.05814
5,180,60.0,60,60,0.0,0.0,9360,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.742857,360,0,0,4.554911,79.03557,1.103638,0.012688,0.063769
6,280,60.0,60,60,0.0,0.0,14560,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.472727,560,0,0,6.582215,85.077743,1.1115,0.011775,0.055129
7,150,60.0,60,60,0.0,0.0,7800,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.896552,300,0,0,3.944551,76.054284,1.105556,0.013192,0.069209
8,190,60.0,60,60,0.0,0.0,9880,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.702703,380,0,0,4.743357,80.112036,1.105079,0.012515,0.061978
9,220,60.0,60,60,0.0,0.0,11440,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.604651,440,0,0,5.354074,82.180411,1.10544,0.012196,0.058847


In [144]:
# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = portSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values
columnsToGather #left with all columns that the values are not 0 (be know for a fact that the data is consistant and there are not missing values in the rows)

['Number of Ports',
 'Average Packet Length',
 'Packet Length Min',
 'Packet Length Max',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Min',
 'Fwd Segment Size Avg',
 'Subflow Fwd Bytes',
 'SYN Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [145]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (portSamples[col].min() * 0.85, portSamples[col].max() * 1.15) for col in columnsToGather}
MinMaxDict

{'Number of Ports': (np.float64(85.0), np.float64(322.0)),
 'Average Packet Length': (np.float64(51.0), np.float64(69.0)),
 'Packet Length Min': (np.float64(51.0), np.float64(69.0)),
 'Packet Length Max': (np.float64(51.0), np.float64(69.0)),
 'Total Length of Fwd Packet': (np.float64(4420.0), np.float64(16744.0)),
 'Fwd Packet Length Max': (np.float64(22.099999999999998), np.float64(29.9)),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(29.9)),
 'Fwd Packet Length Min': (np.float64(22.099999999999998), np.float64(29.9)),
 'Fwd Segment Size Avg': (np.float64(1.7), np.float64(2.3)),
 'Subflow Fwd Bytes': (np.float64(22.501818181818187),
  np.float64(31.47368421052631)),
 'SYN Flag Count': (np.float64(170.0), np.float64(644.0)),
 'Flow Duration': (np.float64(2.482082545757294),
  np.float64(7.56954733133316)),
 'Packets Per Second': (np.float64(58.21724190720356),
  np.float64(97.83940407299949)),
 'IAT Max': (np.float64(0.9374411106109619), np.float64(1.278225028

In [146]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['Number of Ports', 'Packet Length Min', 'Packet Length Max', 'Total Length of Fwd Packet', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'SYN Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (85, 322),
 'Average Packet Length': (np.float64(51.0), np.float64(69.0)),
 'Packet Length Min': (51, 69),
 'Packet Length Max': (51, 69),
 'Total Length of Fwd Packet': (4420, 16744),
 'Fwd Packet Length Max': (22, 29),
 'Fwd Packet Length Mean': (np.float64(22.099999999999998), np.float64(29.9)),
 'Fwd Packet Length Min': (22, 29),
 'Fwd Segment Size Avg': (np.float64(1.7), np.float64(2.3)),
 'Subflow Fwd Bytes': (np.float64(22.501818181818187),
  np.float64(31.47368421052631)),
 'SYN Flag Count': (170, 644),
 'Flow Duration': (np.float64(2.482082545757294),
  np.float64(7.56954733133316)),
 'Packets Per Second': (np.float64(58.21724190720356),
  np.float64(97.83940407299949)),
 'IAT Max': (np.float64(0.9374411106109619), np.float64(1.278225028514862)),
 'IAT Mean': (np.float64(0.010008734901817199),
  np.float64(0.016874933063085323)),
 'IAT Std': (np.float64(0.046859943603463065),
  np.float64(0.09480762832623796))}

### Creating the dataset

In [147]:
# creating an empty dataframe before adding values to it
portDataset2 = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(portSamples.columns))), columns=portSamples.columns)

# adding zeros to all columns that should not have any values
zeroColumns = [col for col in portSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    portDataset2[col] = int(0)
zeroColumns

['Packet Length Std',
 'Packet Length Variance',
 'Fwd Packet Length Std',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Bwd Segment Size Avg',
 'ACK Flag Count',
 'RST Flag Count']

In [148]:
first_correlation = ['Number of Ports', 'Total Length of Fwd Packet', 'SYN Flag Count']

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[first_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[first_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(first_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)
    
# adding the rest of the attack feature values to the dataset at random based on the smaple data
portDataset2['Number of Ports'] = np.random.randint(MinMaxDict['Number of Ports'][0]*0.9, MinMaxDict['Number of Ports'][1]*1.10, NUM_OF_ROWS)

for index, row in portDataset2.iterrows():
    for col, factor in zip(first_correlation[1:], scaling_factors): #iterating over all rows we need to add values to except 'Number of Ports'
        # calculate a random small delta of the factor for adding some randomness
        delta = random.uniform(factor[1] * 0.01, factor[1] * 0.02) # select a delta

        # apply the randomness to the calculated number
        updatedFactor = factor[1] + random.choice([-1, 1]) * delta

        # calculate the value we want to add into the dataset in the given row
        portDataset2.loc[index, col] = int(row['Number of Ports'] * updatedFactor)

portDataset2

('Total Length of Fwd Packet', np.float64(51.99999999999999))
('SYN Flag Count', np.float64(2.0))


Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,216,0.0,0.0,0.0,0,0,11082.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,436.0,0,0,0.0,0.0,0.0,0.0,0.0
1,151,0.0,0.0,0.0,0,0,7718.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,305.0,0,0,0.0,0.0,0.0,0.0,0.0
2,337,0.0,0.0,0.0,0,0,17336.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,664.0,0,0,0.0,0.0,0.0,0.0,0.0
3,268,0.0,0.0,0.0,0,0,13724.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,542.0,0,0,0.0,0.0,0.0,0.0,0.0
4,332,0.0,0.0,0.0,0,0,17450.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,657.0,0,0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,188,0.0,0.0,0.0,0,0,9582.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,382.0,0,0,0.0,0.0,0.0,0.0,0.0
6996,132,0.0,0.0,0.0,0,0,6733.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,268.0,0,0,0.0,0.0,0.0,0.0,0.0
6997,111,0.0,0.0,0.0,0,0,5879.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,218.0,0,0,0.0,0.0,0.0,0.0,0.0
6998,244,0.0,0.0,0.0,0,0,12474.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,497.0,0,0,0.0,0.0,0.0,0.0,0.0


In [149]:
second_correlation = ['Number of Ports', 'Flow Duration', 'IAT Mean', 'IAT Std'] #'Packets Per Second', 'IAT Max',

# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independent_col = portSamples[second_correlation[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependent_cols = portSamples[second_correlation[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scaling_factors = np.linalg.lstsq(independent_col, dependent_cols, rcond=None)[0]

scaling_factors = [(name,factor) for name, factor in zip(second_correlation[1:], scaling_factors.flatten())]
for val in scaling_factors:
    print(val)

('Flow Duration', np.float64(0.024958470011985968))
('IAT Mean', np.float64(6.650583610346264e-05))
('IAT Std', np.float64(0.000336893054787924))


In [150]:
#iterating over all rows we need to add values
for index, row in portDataset2.iterrows():
    for col, factor in scaling_factors: 
        # calculate a random small delta of the factor for adding some randomness
        if col == 'Flow Duration':
            delta = random.uniform(factor * 0.05, factor * 0.1) # select a delta
        elif col == 'Packets Per Second' or col == 'IAT Max':
            delta = 0
            #delta = random.uniform(factor * 0.005, factor * 0.1)# select a delta
        elif col == 'IAT Std':
            delta = random.uniform(factor * 0.05, factor * 0.2) * random.choice([-1, 1]) # select a delta
        else:
            delta = random.uniform(factor * 0.1, factor * 0.25) # select a delta
        updatedFactor = factor + delta
        portDataset2.loc[index, col] = row['Number of Ports'] * updatedFactor

In [151]:
second_correlation = ['Flow Duration', 'Packets Per Second', 'IAT Max']

packets_per_second = 63.5 + (portDataset2['Flow Duration'] - 2.0) * (35 / 7.5)  # Linear transformation
portDataset2['Packets Per Second'] = np.clip(packets_per_second, 63.5, 98.75)  # Ensure within range

iat_max = 1.100 + (portDataset2['Flow Duration'] - 2.0) * (0.013 / 7.5) + np.random.uniform(-0.002, 0.002, size=NUM_OF_ROWS)
portDataset2['IAT Max'] = np.clip(iat_max, 1.100, 1.113)  # Ensure within range

In [152]:
portDataset2

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,216,0.0,0.0,0.0,0,0,11082.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,436.0,0,0,5.849265,81.463237,1.105062,0.016715,0.076786
1,151,0.0,0.0,0.0,0,0,7718.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,305.0,0,0,4.041739,73.028116,1.101763,0.011497,0.041301
2,337,0.0,0.0,0.0,0,0,17336.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,664.0,0,0,8.917137,95.779971,1.110019,0.027370,0.104946
3,268,0.0,0.0,0.0,0,0,13724.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,542.0,0,0,7.080537,87.209173,1.107031,0.021953,0.074479
4,332,0.0,0.0,0.0,0,0,17450.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,657.0,0,0,9.010491,96.215624,1.112154,0.026590,0.096638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,188,0.0,0.0,0.0,0,0,9582.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,382.0,0,0,4.942773,77.232940,1.105717,0.015219,0.072470
6996,132,0.0,0.0,0.0,0,0,6733.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,268.0,0,0,3.459421,70.310631,1.104079,0.010883,0.036803
6997,111,0.0,0.0,0.0,0,0,5879.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,218.0,0,0,2.991017,68.124747,1.100098,0.008747,0.031883
6998,244,0.0,0.0,0.0,0,0,12474.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,497.0,0,0,6.533386,84.655801,1.109667,0.019293,0.067099


In [153]:
x = portDataset2[portDataset2['Number of Ports'] > 119]
x[x['Number of Ports'] < 121]

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
285,120,0.0,0.0,0.0,0,0,6329.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,243.0,0,0,3.293215,69.535005,1.101828,0.009303,0.047594
832,120,0.0,0.0,0.0,0,0,6340.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,237.0,0,0,3.277081,69.459712,1.103764,0.009853,0.043749
1230,120,0.0,0.0,0.0,0,0,6163.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,244.0,0,0,3.180735,69.010097,1.103475,0.008966,0.038031
1604,120,0.0,0.0,0.0,0,0,6134.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,236.0,0,0,3.151655,68.874392,1.100857,0.00934,0.044558
1627,120,0.0,0.0,0.0,0,0,6308.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,235.0,0,0,3.162845,68.92661,1.100567,0.009292,0.036132
1869,120,0.0,0.0,0.0,0,0,6330.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,237.0,0,0,3.285068,69.496982,1.101083,0.009577,0.0483
2322,120,0.0,0.0,0.0,0,0,6119.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,244.0,0,0,3.234782,69.262316,1.100172,0.008973,0.036747
2562,120,0.0,0.0,0.0,0,0,6117.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,243.0,0,0,3.271269,69.43259,1.103185,0.009234,0.033501
2689,120,0.0,0.0,0.0,0,0,6133.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,235.0,0,0,3.199436,69.097368,1.103901,0.009669,0.033389
3248,120,0.0,0.0,0.0,0,0,6116.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0,0.0,243.0,0,0,3.202952,69.113776,1.102122,0.009874,0.04253


In [154]:
portSamples

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,100,60.0,60,60,0.0,0.0,5200,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.368421,200,0,0,2.920097,68.490873,1.103361,0.014674,0.082441
1,120,60.0,60,60,0.0,0.0,6240,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.130435,240,0,0,3.34079,71.839295,1.102872,0.013978,0.075529
2,120,60.0,60,60,0.0,0.0,6240,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,27.130435,240,0,0,3.325836,72.162309,1.10776,0.013916,0.076588
3,140,60.0,60,60,0.0,0.0,7280,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.962963,280,0,0,3.753263,74.601753,1.109797,0.013453,0.071323
4,240,60.0,60,60,0.0,0.0,12480,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.553191,480,0,0,5.749811,83.481006,1.105315,0.012004,0.05814
5,180,60.0,60,60,0.0,0.0,9360,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.742857,360,0,0,4.554911,79.03557,1.103638,0.012688,0.063769
6,280,60.0,60,60,0.0,0.0,14560,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.472727,560,0,0,6.582215,85.077743,1.1115,0.011775,0.055129
7,150,60.0,60,60,0.0,0.0,7800,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.896552,300,0,0,3.944551,76.054284,1.105556,0.013192,0.069209
8,190,60.0,60,60,0.0,0.0,9880,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.702703,380,0,0,4.743357,80.112036,1.105079,0.012515,0.061978
9,220,60.0,60,60,0.0,0.0,11440,26,26.0,26,0.0,0,0.0,0,0.0,2.0,0.0,26.604651,440,0,0,5.354074,82.180411,1.10544,0.012196,0.058847


### Independant Columns

In [155]:
portDataset2['Fwd Segment Size Avg'] = np.full(NUM_OF_ROWS, 2.0)

portDataset2['Subflow Fwd Bytes'] = np.random.uniform(MinMaxDict['Subflow Fwd Bytes'][0]*0.95, MinMaxDict['Subflow Fwd Bytes'][1]*1.05, NUM_OF_ROWS)

portDataset2

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,216,0.0,0.0,0.0,0,0,11082.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,27.170545,436.0,0,0,5.849265,81.463237,1.105062,0.016715,0.076786
1,151,0.0,0.0,0.0,0,0,7718.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,29.537165,305.0,0,0,4.041739,73.028116,1.101763,0.011497,0.041301
2,337,0.0,0.0,0.0,0,0,17336.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,24.814838,664.0,0,0,8.917137,95.779971,1.110019,0.027370,0.104946
3,268,0.0,0.0,0.0,0,0,13724.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,31.318157,542.0,0,0,7.080537,87.209173,1.107031,0.021953,0.074479
4,332,0.0,0.0,0.0,0,0,17450.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,27.931923,657.0,0,0,9.010491,96.215624,1.112154,0.026590,0.096638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,188,0.0,0.0,0.0,0,0,9582.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,22.978139,382.0,0,0,4.942773,77.232940,1.105717,0.015219,0.072470
6996,132,0.0,0.0,0.0,0,0,6733.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,27.942530,268.0,0,0,3.459421,70.310631,1.104079,0.010883,0.036803
6997,111,0.0,0.0,0.0,0,0,5879.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,31.511257,218.0,0,0,2.991017,68.124747,1.100098,0.008747,0.031883
6998,244,0.0,0.0,0.0,0,0,12474.0,0.0,0.0,0.0,0,0,0,0,0,2.0,0,26.264934,497.0,0,0,6.533386,84.655801,1.109667,0.019293,0.067099


In [156]:
same_values1 = ['Average Packet Length', 'Packet Length Min', 'Packet Length Max']

# Generate random values for the 'Flow Duration' column
randValues = np.random.randint(MinMaxDict['Average Packet Length'][0]*0.95, MinMaxDict['Average Packet Length'][1]*1.05, size=NUM_OF_ROWS)

# Assign the random values
for col in same_values1:
    portDataset2[col] = randValues

In [157]:
same_values2 = ['Fwd Packet Length Max', 'Fwd Packet Length Mean', 'Fwd Packet Length Min']

# Generate random values for the 'Flow Duration' column
randValues = np.random.randint(MinMaxDict['Fwd Packet Length Max'][0]*0.95, MinMaxDict['Fwd Packet Length Max'][1]*1.05, size=NUM_OF_ROWS)

# Assign the random values
for col in same_values2:
    portDataset2[col] = randValues

In [158]:
portDataset2

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
0,216,51,51,51,0,0,11082.0,24,24,24,0,0,0,0,0,2.0,0,27.170545,436.0,0,0,5.849265,81.463237,1.105062,0.016715,0.076786
1,151,50,50,50,0,0,7718.0,26,26,26,0,0,0,0,0,2.0,0,29.537165,305.0,0,0,4.041739,73.028116,1.101763,0.011497,0.041301
2,337,54,54,54,0,0,17336.0,24,24,24,0,0,0,0,0,2.0,0,24.814838,664.0,0,0,8.917137,95.779971,1.110019,0.027370,0.104946
3,268,60,60,60,0,0,13724.0,27,27,27,0,0,0,0,0,2.0,0,31.318157,542.0,0,0,7.080537,87.209173,1.107031,0.021953,0.074479
4,332,55,55,55,0,0,17450.0,22,22,22,0,0,0,0,0,2.0,0,27.931923,657.0,0,0,9.010491,96.215624,1.112154,0.026590,0.096638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,188,70,70,70,0,0,9582.0,21,21,21,0,0,0,0,0,2.0,0,22.978139,382.0,0,0,4.942773,77.232940,1.105717,0.015219,0.072470
6996,132,51,51,51,0,0,6733.0,26,26,26,0,0,0,0,0,2.0,0,27.942530,268.0,0,0,3.459421,70.310631,1.104079,0.010883,0.036803
6997,111,59,59,59,0,0,5879.0,24,24,24,0,0,0,0,0,2.0,0,31.511257,218.0,0,0,2.991017,68.124747,1.100098,0.008747,0.031883
6998,244,59,59,59,0,0,12474.0,28,28,28,0,0,0,0,0,2.0,0,26.264934,497.0,0,0,6.533386,84.655801,1.109667,0.019293,0.067099


In [159]:
portSamples.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,174.0,60.0,60.0,60.0,0.0,0.0,9048.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,26.856494,348.0,0.0,0.0,4.426891,77.303528,1.106032,0.013039,0.067295
std,58.727241,0.0,0.0,0.0,0.0,0.0,3053.816556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291425,117.454483,0.0,0.0,1.188862,5.538596,0.002827,0.00096,0.009113
min,100.0,60.0,60.0,60.0,0.0,0.0,5200.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,26.472727,200.0,0.0,0.0,2.920097,68.490873,1.102872,0.011775,0.055129
25%,125.0,60.0,60.0,60.0,0.0,0.0,6500.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,26.629164,250.0,0.0,0.0,3.443908,72.77217,1.103998,0.012276,0.059629
50%,165.0,60.0,60.0,60.0,0.0,0.0,8580.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,26.819704,330.0,0.0,0.0,4.249731,77.544927,1.105377,0.01294,0.066489
75%,212.5,60.0,60.0,60.0,0.0,0.0,11050.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,27.088567,425.0,0.0,0.0,5.201395,81.663317,1.107209,0.0138,0.074478
max,280.0,60.0,60.0,60.0,0.0,0.0,14560.0,26.0,26.0,26.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,27.368421,560.0,0.0,0.0,6.582215,85.077743,1.1115,0.014674,0.082441


In [160]:
portDataset2.describe()

Unnamed: 0,Number of Ports,Average Packet Length,Packet Length Min,Packet Length Max,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Min,Fwd Packet Length Std,Bwd Packet Length Max,Bwd Packet Length Mean,Bwd Packet Length Min,Bwd Packet Length Std,Fwd Segment Size Avg,Bwd Segment Size Avg,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Max,IAT Mean,IAT Std
count,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0,7000.0
mean,214.145857,59.586571,59.586571,59.586571,0.0,0.0,11132.501286,24.493286,24.493286,24.493286,0.0,0.0,0.0,0.0,0.0,2.0,0.0,27.161529,427.694429,0.0,0.0,5.745508,80.978331,1.106487,0.016731,0.07236
std,81.062718,6.938667,6.938667,6.938667,0.0,0.0,4218.92572,2.874468,2.874468,2.874468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.330783,162.143025,0.0,0.0,2.17626,10.154567,0.003859,0.006379,0.029281
min,76.0,48.0,48.0,48.0,0.0,0.0,3874.0,20.0,20.0,20.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,21.376964,149.0,0.0,0.0,1.992725,63.5,1.1,0.005577,0.020908
25%,143.0,54.0,54.0,54.0,0.0,0.0,7458.25,22.0,22.0,22.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,24.327777,287.0,0.0,0.0,3.850933,72.137687,1.103177,0.011231,0.047491
50%,214.0,60.0,60.0,60.0,0.0,0.0,11110.5,25.0,25.0,25.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,27.152324,426.0,0.0,0.0,5.722592,80.872098,1.106519,0.016669,0.0714
75%,286.0,66.0,66.0,66.0,0.0,0.0,14856.5,27.0,27.0,27.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,29.964122,570.0,0.0,0.0,7.661797,89.92172,1.109749,0.022281,0.094531
max,353.0,71.0,71.0,71.0,0.0,0.0,18713.0,29.0,29.0,29.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,33.046549,720.0,0.0,0.0,9.690797,98.75,1.113,0.029273,0.142202


In [161]:
# adding a label to the dataset
portDataset2['Label'] = ATTACK_NAME

In [162]:
portDataset2 = portDataset2[portDataset2['Number of Ports'] >= 120]

---

### Merging the two samples tougether

In [163]:
mergedPortDataset = pd.concat([portDataset, portDataset2], axis=0)
mergedPortDataset = mergedPortDataset.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# save the dataset
# mergedPortDataset.to_csv('port_scan_closed_port_dataset_new.csv', index=False)