# Imports

In [6]:
import numpy as np
import pandas as pd
import socket as sk
import struct as st
import datetime as dt
import ipaddress as ip
from time import perf_counter

# Configurations

In [3]:
# Address ranges used by each attack tool, stored as 32-bit integers so they
# can be compared numerically with the converted `source_ip` column.

# slowloris
slowloris_low = int(ip.ip_address('10.128.0.1'))
slowloris_high = int(ip.ip_address('10.128.0.50'))

# slowhttptest
slowhttptest_low = int(ip.ip_address('10.128.0.50'))
slowhttptest_high = int(ip.ip_address('10.128.0.100'))

# slowloris_ng
slowloris_ng_low = int(ip.ip_address('10.128.0.100'))
slowloris_ng_high = int(ip.ip_address('10.128.0.150'))

# TCP flag values (decimal) that each get their own counter feature
tcp_flags = [2, 4, 16, 17, 18, 20, 24, 25, 82, 144, 152, 194]

# Prequel Preprocessing
* This function casts the raw data into a base-10 integer representation.

In [5]:
def prequelProcessing(packages):
    """Cast the raw packet table into numeric and datetime columns.

    Parameters
    ----------
    packages : pandas.DataFrame
        Raw packets. The columns ``source_ip`` and ``dest_ip`` (dotted-quad
        strings), ``tcp_flag`` (hexadecimal strings), ``date``, ``time`` and
        ``data`` are read.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which the IP columns are
        32-bit integers, ``tcp_flag`` is a decimal integer, ``date`` is a
        UTC timestamp combining the original ``date`` and ``time`` strings,
        and the ``data`` and ``time`` columns have been removed.
    """

    def ipv4_to_int(address):
        # Network byte order, so the integer preserves address ordering.
        return st.unpack('!I', sk.inet_aton(address))[0]

    dataset = packages.copy()

    # Casting IP to a single integer
    dataset['source_ip'] = dataset['source_ip'].apply(ipv4_to_int)
    dataset['dest_ip'] = dataset['dest_ip'].apply(ipv4_to_int)

    # Casting hexadecimal to decimal base
    dataset['tcp_flag'] = dataset['tcp_flag'].apply(lambda flag: int(flag, 16))

    # Parsing string to datetime object.
    # Dates may be written as 'YYYY-MM-DD' or 'YYYYMMDD'; dashes are removed
    # first so that both spellings match the explicit format below.
    compact_date = dataset['date'].str.replace('-', '', regex=False)
    dataset['date'] = pd.to_datetime(
        compact_date + ' ' + dataset['time'],
        format='%Y%m%d %H:%M:%S',
        utc=True,
    )

    # Getting rid of useless columns
    return dataset.drop(columns=['data', 'time'])

# Features Engineering

In [6]:
def features(grouped_data, flags=None):
    """Summarise one group of packets as a single-row feature frame.

    Parameters
    ----------
    grouped_data : pandas.DataFrame
        Packets of one group. The columns ``dest_port``, ``dest_ip``,
        ``frame_length`` and ``tcp_flag`` are read.
    flags : iterable of int, optional
        TCP flag values to count. Defaults to the module-level ``tcp_flags``.

    Returns
    -------
    pandas.DataFrame
        One row with ``number_requisitions`` (packets sent to port 80 or
        443), ``number_different_destinations`` (distinct ``dest_ip``
        values), ``mean_frame_length`` and one ``flag_<value>`` counter per
        requested flag, in that column order.
    """
    if flags is None:
        flags = tcp_flags

    # A packet has a single destination port, so membership in {80, 443}
    # counts exactly the packets matching either port.
    to_web_port = grouped_data['dest_port'].isin([80, 443])

    data = {
        'number_requisitions': [to_web_port.sum()],
        'number_different_destinations': [len(np.unique(grouped_data['dest_ip']))],
        'mean_frame_length': [grouped_data['frame_length'].mean()],
    }

    for flag in flags:
        data['flag_' + str(flag)] = [np.sum(grouped_data['tcp_flag'] == flag)]

    return pd.DataFrame(data)

# Normalizing the Data

In [7]:
def turnToPercentil(dataset, summary, column_name):
    """Rescale one column to each row's share of its group total, in place.

    Despite the name, the result is a fraction of the total, not a
    percentile.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to modify. Its rows are selected with ``dataset.loc[key]`` for
        every key of ``summary.index``.
    summary : pandas.DataFrame
        Per-group totals; ``summary[column_name]`` supplies the divisors.
    column_name : str
        Column to rescale.

    Returns
    -------
    None
        ``dataset`` is updated in place. Groups whose total is not strictly
        positive are left untouched.
    """
    # Iterate over (key, total) pairs instead of indexing the Series with a
    # positional integer, which is ambiguous with label-based lookup.
    for key, total in summary[column_name].items():

        if not total > 0:
            continue

        # The shares are fractional: make sure the column can hold floats
        # before writing, rather than relying on implicit upcasting.
        if dataset[column_name].dtype.kind != 'f':
            dataset[column_name] = dataset[column_name].astype(float)

        share = dataset.loc[key, column_name] / total
        dataset.loc[key, column_name] = share.values

In [8]:
def normalizationPerTimePeriod(dataset):
    """Normalise the columns of ``dataset`` by their per-date totals, in place.

    The frame is summed per ``date`` and every column except the one at
    position 2 is passed to ``turnToPercentil`` together with those sums.
    In the frame built by ``features`` position 2 holds
    ``mean_frame_length``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame that can be grouped by ``date``; it is modified in place.
    """
    per_date_totals = dataset.groupby('date').sum()

    # Drop the column at position 2 from the list of columns to rescale.
    columns_to_scale = np.delete(dataset.columns.values, 2)

    for name in columns_to_scale:
        turnToPercentil(dataset, per_date_totals, name)

# Creating the true label array

In [9]:
def generateLabelColumn(grouped):
    """Attach the ground-truth label column ``y`` to the feature frame.

    A row is labelled ``1`` when its ``source_ip`` falls in one of the
    half-open ranges ``[low, high)`` configured for slowloris, slowhttptest
    or slowloris_ng, and ``0`` otherwise.

    Parameters
    ----------
    grouped : pandas.DataFrame
        Feature frame whose index (or columns, after ``reset_index``)
        provides ``date`` and ``source_ip``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``(date, source_ip)`` with the extra integer
        column ``y``.
    """
    # Flatten the index into columns; this also yields a new frame, so the
    # caller's object does not gain a 'y' column as a side effect.
    dataset = grouped.reset_index()

    # setting all IPs with no intruder type
    dataset['y'] = 0

    # finding the attackers and labelling them
    attacker_ranges = (
        (slowloris_low, slowloris_high),
        (slowhttptest_low, slowhttptest_high),
        (slowloris_ng_low, slowloris_ng_high),
    )
    source = dataset['source_ip']
    for low, high in attacker_ranges:
        dataset.loc[(source >= low) & (source < high), 'y'] = 1

    # resuming the original index
    dataset = dataset.set_index(['date', 'source_ip'])

    # 'level_2' is the unnamed helper index level, when the input had one;
    # tolerate its absence instead of raising KeyError.
    return dataset.drop(columns=['level_2'], errors='ignore')

# Gathering those functions into a single one

In [10]:
def preprocessing(packages, frequency):
    """Run the full pipeline: cast, aggregate, normalise and label.

    Parameters
    ----------
    packages : pandas.DataFrame
        Raw packet table, as accepted by ``prequelProcessing``.
    frequency : str
        Time-window size handed to ``pd.Grouper`` as ``freq`` (e.g. ``'1s'``).

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``generateLabelColumn``.
    """
    packets = prequelProcessing(packages)

    # One group per (time window, source IP); each group is reduced to a
    # single row of features.
    window_key = pd.Grouper(key='date', freq=frequency)
    sender_key = pd.Grouper(key='source_ip')
    per_sender = packets.groupby([window_key, sender_key]).apply(features)

    # Normalisation happens in place on the aggregated frame.
    normalizationPerTimePeriod(per_sender)

    # Finally attach the true labels.
    return generateLabelColumn(per_sender)

# Testing

In [7]:
# Load the raw SUEE1 capture; the first CSV column is used as the row index.
packages = pd.read_csv('../dataset/raw/SUEE1.csv', low_memory=False, index_col=[0])
# Preview the first rows
packages.head()

  mask |= (ar1 == a)


Unnamed: 0_level_0,protocol,source_ip,source_port,dest_ip,dest_port,frame_length,tcp_flag,data,date,time
frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,6,192.168.0.1,80,192.168.0.2,39266,66,0x00000011,,1970-01-05,02:41:38
2,6,192.168.0.2,39266,192.168.0.1,80,66,0x00000010,,1970-01-05,02:41:38
3,6,192.168.0.1,80,192.168.0.3,9784,54,0x00000011,,1970-01-05,02:41:38
4,6,192.168.0.3,9784,192.168.0.1,80,54,0x00000010,,1970-01-05,02:41:38
5,6,192.168.0.1,80,192.168.0.4,62170,66,0x00000011,,1970-01-05,02:41:38


In [8]:
# Column dtypes, entry count and memory usage of the raw table
packages.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2089436 entries, 1 to 2089436
Data columns (total 10 columns):
protocol        int64
source_ip       object
source_port     int64
dest_ip         object
dest_port       int64
frame_length    int64
tcp_flag        object
data            object
date            object
time            object
dtypes: int64(4), object(6)
memory usage: 175.4+ MB


In [9]:
# Append the parent directory to the module search path
import sys; sys.path.append('..')

In [10]:
# NOTE(review): this import rebinds `preprocessing`, replacing the function
# of the same name defined earlier in this notebook with the one from
# `modules.preprocessing` — confirm which implementation is intended.
from modules.preprocessing import preprocessing

In [11]:
# Run the pipeline with a '1s' grouping frequency (uses whichever
# `preprocessing` is currently bound in the kernel).
dataset = preprocessing(packages, '1s')

In [12]:
# Preview the first six rows of the result
dataset.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,number_requisitions,number_different_destinations,mean_frame_length,flag_2,flag_4,flag_16,flag_17,flag_18,flag_20,flag_24,flag_25,flag_82,flag_144,flag_152,flag_194,y
date,source_ip,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1970-01-05 01:38:06+00:00,3232235521,0.607143,0.545455,62.142857,0.5,1.0,0.35,0.714286,0.5,0.0,0.529412,0.0,0.0,0.0,0.0,0.0,0
1970-01-05 01:38:06+00:00,3232235529,0.142857,0.090909,55.0,0.25,0.0,0.1,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0
1970-01-05 01:38:06+00:00,3232235531,0.214286,0.090909,54.666667,0.25,0.0,0.15,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0
1970-01-05 01:38:06+00:00,3232235532,0.0,0.090909,66.8,0.0,0.0,0.2,0.142857,0.25,0.0,0.235294,0.0,0.0,0.0,0.0,0.0,0
1970-01-05 01:38:06+00:00,3232235547,0.0,0.090909,67.333333,0.0,0.0,0.15,0.142857,0.25,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0
1970-01-05 01:38:06+00:00,3232235583,0.035714,0.090909,66.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
