In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import time
import random

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.svm import SVC

NUM_OF_ROWS = 25000
ATTACK_NAME = 'PortScan'

---

In [2]:
# import the attack sample dataset
portSamples = pd.read_csv('port_samples_open_ports.csv')
portSamples

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,5,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,...,0.0,0,0,0,12.741404,1.098780,12.741404,11.226903,0.980108,2.982235
1,1,275.000000,0,1785,275.000000,522.677880,273192.166667,4318.0,1460,359.833333,...,0.0,2,23,1,1.071450,22.399552,1.071450,0.169452,0.046585,0.072713
2,4933,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,...,0.0,9835,18,18,19.905170,494.997029,19.905170,1.138566,0.002020,0.022363
3,1,234.607143,0,1460,234.607143,461.564215,213041.524235,4868.0,1460,347.714286,...,0.0,2,27,1,0.327403,85.521495,0.327403,0.059512,0.012126,0.022799
4,1,0.500000,0,1,0.500000,0.500000,0.250000,0.0,0,0.000000,...,0.0,0,2,0,0.003521,568.026002,0.003521,0.003521,0.003521,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171,1,0.500000,0,1,0.500000,0.500000,0.250000,0.0,0,0.000000,...,0.0,0,2,0,0.056000,35.714289,0.056000,0.056000,0.056000,0.000000
172,1,0.500000,0,1,0.500000,0.500000,0.250000,0.0,0,0.000000,...,0.0,0,2,0,0.051864,38.562462,0.051864,0.051864,0.051864,0.000000
173,124,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,...,0.0,197,4,4,2.326040,86.412958,2.326040,1.300108,0.011630,0.100762
174,0,0.000000,0,0,0.000000,0.000000,0.000000,0.0,0,0.000000,...,0.0,0,0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [3]:
# get all the relevant attack rows from the attack sample dataset
portSamples = portSamples[portSamples['Number of Ports'] >= 200]

In [4]:
# print some general information about the attack samples
print(f'Dataset Shape: {portSamples.shape}')
portSamples

Dataset Shape: (19, 26)


Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
2,4933,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9835,18,18,19.90517,494.997029,19.90517,1.138566,0.00202,0.022363
8,5011,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9970,18,18,18.178246,549.447949,18.178246,1.373936,0.00182,0.02144
12,4956,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9924,18,18,21.178031,469.448742,21.178031,1.13131,0.00213,0.026489
27,4862,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9725,20,20,21.701404,449.049285,21.701404,1.132786,0.002227,0.027287
41,4928,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9835,15,15,17.001811,579.350046,17.001811,1.165532,0.001726,0.018701
52,4983,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9954,17,17,19.567152,509.578501,19.567152,1.128272,0.001963,0.020107
64,4966,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9939,15,15,16.68051,596.744343,16.68051,1.106393,0.001676,0.01739
76,4995,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9950,17,17,19.314654,516.033063,19.314654,1.10483,0.001938,0.022142
81,5003,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9968,17,17,17.780498,561.570322,17.780498,1.13277,0.001781,0.018714
85,4992,0.0,0,0,0.0,0.0,0.0,0.0,0,0.0,...,0.0,9966,16,16,18.764916,531.950158,18.764916,1.098799,0.00188,0.019345


In [5]:
# find the columns that we need to synthesis data for to produce an attack dataset
columnsToGather = portSamples.replace(0, np.nan) #replace all 0 values with null
columnsToGather = columnsToGather.dropna(how="all", axis=1).columns.tolist() #remove all columns where there are null values
columnsToGather #left with all columns that the values are not 0 (be know for a fact that the data is consistant and there are not missing values in the rows)

['Number of Ports',
 'SYN Flag Count',
 'ACK Flag Count',
 'RST Flag Count',
 'Flow Duration',
 'Packets Per Second',
 'IAT Total',
 'IAT Max',
 'IAT Mean',
 'IAT Std']

In [6]:
# find an approximate minimum and maximum values of each column and save that data into a dictionary
MinMaxDict = {col: (portSamples[col].min() * 0.85, portSamples[col].max() * 1.075) for col in columnsToGather}
MinMaxDict

{'Number of Ports': (3299.7, 5386.825),
 'SYN Flag Count': (6560.3, 10727.425),
 'ACK Flag Count': (12.75, 21.5),
 'RST Flag Count': (12.75, 21.5),
 'Flow Duration': (13.694767320156098, 23.32900940179825),
 'Packets Per Second': (381.69189255475305, 641.5001682629122),
 'IAT Total': (13.694767320156098, 23.32900940179825),
 'IAT Max': (0.8745675563812255, 1.4769813895225525),
 'IAT Mean': (0.00142453868557045, 0.0023941922620892902),
 'IAT Std': (0.014781451134644269, 0.029333969202090204)}

In [7]:
# change values to int for the columns that should be int (because when calculating approximate values they will be turned into floats)
intColumns = ['Number of Ports', 'SYN Flag Count', 'ACK Flag Count', 'RST Flag Count']
for key, val in MinMaxDict.items():
    if key in intColumns:
        MinMaxDict[key] = tuple([int(v) for v in val])
MinMaxDict

{'Number of Ports': (3299, 5386),
 'SYN Flag Count': (6560, 10727),
 'ACK Flag Count': (12, 21),
 'RST Flag Count': (12, 21),
 'Flow Duration': (13.694767320156098, 23.32900940179825),
 'Packets Per Second': (381.69189255475305, 641.5001682629122),
 'IAT Total': (13.694767320156098, 23.32900940179825),
 'IAT Max': (0.8745675563812255, 1.4769813895225525),
 'IAT Mean': (0.00142453868557045, 0.0023941922620892902),
 'IAT Std': (0.014781451134644269, 0.029333969202090204)}

In [8]:
# creating an empty dataframe before adding values to it
portDataset = pd.DataFrame(np.zeros((NUM_OF_ROWS, len(portSamples.columns))), columns=portSamples.columns)
portDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# adding zeros to all columns that should not have any values
zeroColumns = [col for col in portSamples.columns if col not in columnsToGather]
for col in zeroColumns:
    portDataset[col] = int(0)
zeroColumns

['Average Packet Size',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'Total Length of Fwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Mean',
 'Bwd Packet Length Max',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packet Length Std',
 'Fwd Segment Size Avg',
 'Bwd Segment Size Avg',
 'Subflow Fwd Bytes']

In [10]:
portDataset.head(3)

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
0,0.0,0,0,0,0,0,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0,0,0,0,0,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0,0,0,0,0,0,0,0,0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# finding the correlation between the Number of Ports column to the rest of the columns in order to create new data
independentCol = portSamples[columnsToGather[0]].values.reshape(-1, 1) #column 'Number of Ports'
dependentCols = portSamples[columnsToGather[1:]].values #the rest of the columns that are not zeros

# find the scaling factors using least squares function
scalingFactors = np.linalg.lstsq(independentCol, dependentCols, rcond=None)[0]

scalingFactors = [(name,factor) for name, factor in zip(columnsToGather[1:], scalingFactors.flatten())]
for val in scalingFactors:
    print(val)

('SYN Flag Count', 1.997759225410172)
('ACK Flag Count', 0.0034908750278541327)
('RST Flag Count', 0.0034800190409305792)
('Flow Duration', 0.0038342276392289583)
('Packets Per Second', 0.10647873613941965)
('IAT Total', 0.0038342276392289583)
('IAT Max', 0.0002372508528349286)
('IAT Mean', 3.897098933467324e-07)
('IAT Std', 4.3287569249504e-06)


In [13]:
# adding the rest of the attack feature values to the dataset at random based on the smaple data
portDataset['Number of Ports'] = np.random.randint(MinMaxDict['Number of Ports'][0]*0.85, MinMaxDict['Number of Ports'][1]*1.075, NUM_OF_ROWS)

for index, row in portDataset.iterrows():
    i = 0
    for col, factor in zip(columnsToGather[1:], scalingFactors): #iterating over all rows we need to add values to except 'Number of Ports'
        # calculate a random small delta of the factor for adding some randomness
        delta = random.uniform(factor[1] * 0.01, factor[1] * 0.02) # select a dantom (how much is 1% and 2% from factor)

        # apply the randomness to the calculated number
        updatedFactor = factor[1] + random.choice([-1, 1]) * delta

        # calculate the value we want to add into the dataset in the given row
        portDataset.loc[index, col] = row['Number of Ports'] * updatedFactor
        i+=1

In [14]:
# making the SYN Flag Count column have int values instead of floats
portDataset['SYN Flag Count'] = portDataset['SYN Flag Count'].astype(int)
portDataset['ACK Flag Count'] = portDataset['ACK Flag Count'].astype(int)
portDataset['RST Flag Count'] = portDataset['RST Flag Count'].astype(int)

# adding a label to the dataset
portDataset['Label'] = ATTACK_NAME

In [15]:
portDataset

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std,Label
0,3066,0,0,0,0,0,0,0,0,0,...,6213,10,10,11.611609,323.075192,11.931991,0.713778,0.001216,0.013480,PortScan
1,3988,0,0,0,0,0,0,0,0,0,...,8069,14,14,15.512145,429.144158,15.503727,0.960733,0.001570,0.017592,PortScan
2,3714,0,0,0,0,0,0,0,0,0,...,7564,13,13,13.986972,400.895137,14.021729,0.867224,0.001465,0.016378,PortScan
3,3600,0,0,0,0,0,0,0,0,0,...,7051,12,12,14.068281,376.838806,13.605383,0.840088,0.001419,0.015401,PortScan
4,5425,0,0,0,0,0,0,0,0,0,...,10705,18,19,21.067675,567.850949,21.133070,1.269302,0.002086,0.023811,PortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,5228,0,0,0,0,0,0,0,0,0,...,10584,18,18,19.825272,564.739893,19.669798,1.253092,0.002072,0.022238,PortScan
24996,5211,0,0,0,0,0,0,0,0,0,...,10302,18,18,20.378241,544.242274,19.630524,1.257655,0.001996,0.022200,PortScan
24997,4421,0,0,0,0,0,0,0,0,0,...,8670,15,15,16.726157,464.297800,17.199425,1.038277,0.001754,0.018779,PortScan
24998,5723,0,0,0,0,0,0,0,0,0,...,11250,20,19,22.235196,618.271011,21.545548,1.342222,0.002201,0.025175,PortScan


In [16]:
portDataset.describe()

Unnamed: 0,Number of Ports,Average Packet Size,Packet Length Min,Packet Length Max,Packet Length Mean,Packet Length Std,Packet Length Variance,Total Length of Fwd Packet,Fwd Packet Length Max,Fwd Packet Length Mean,...,Subflow Fwd Bytes,SYN Flag Count,ACK Flag Count,RST Flag Count,Flow Duration,Packets Per Second,IAT Total,IAT Max,IAT Mean,IAT Std
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,...,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,4304.75012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8599.3944,14.53048,14.47992,16.504972,458.264292,16.502992,1.021244,0.001677,0.018635
std,860.93613,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1725.474987,3.031356,3.023462,3.309532,91.966793,3.309906,0.204858,0.000336,0.00374
min,2804.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5494.0,9.0,9.0,10.538101,292.900607,10.54612,0.652891,0.001073,0.011902
25%,3561.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7104.25,12.0,12.0,13.652658,378.755975,13.635772,0.843434,0.001389,0.015403
50%,4320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8618.0,15.0,15.0,16.565304,459.324035,16.55105,1.023808,0.001682,0.018694
75%,5049.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10087.0,17.0,17.0,19.356857,537.540771,19.365365,1.198306,0.001967,0.021855
max,5788.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11789.0,20.0,20.0,22.607078,628.161676,22.612533,1.400044,0.002299,0.025523


In [17]:
# save the dataset
portDataset.to_csv('port_scan_dataset_updated_flows_open_ports.csv', index=False)

---

In [57]:
# using min-max scaler
# scaler = MinMaxScaler(feature_range=(0, 1))
# df['Scaled'] = scaler.fit_transform(df[['Value']]) 