In [78]:
from os import makedirs
from os.path import dirname

import pandas as pd
import numpy as np
import datetime
import math

In [79]:
#Load the preprocessed flows: df_orig keeps the CSV contents untouched, df is the working copy.
df_orig = pd.read_csv('../data/preprocessed/preprocessed.csv')
df = df_orig.copy()
df['StartTime'] = pd.to_datetime(df['StartTime'])
#Seconds since 1970-01-01, truncated to whole milliseconds before scaling to seconds.
df['epoch'] = (df['StartTime'] - pd.Timestamp("1970-01-01")).floordiv(pd.Timedelta('1ms')).div(1000)

In [81]:
#Chronological working frame. sort_values already returns a new frame, and
#ignore_index=True renumbers the rows 0..n-1 in time order.
feature_df = df.sort_values(by='StartTime', ignore_index=True)

def create_fwd_bwd_col(f_df, col, name):
    '''
        Split a column into forward-only and backward-only copies.

        Adds two columns to a copy of f_df: `{name}_fwd` and `{name}_bwd`.
        Both start as a copy of `col`; the forward column is set to NaN on
        rows where is_fwd == 0 and the backward column is set to NaN on rows
        where is_fwd == 1.

        f_df: frame containing `col` and an `is_fwd` column.
        col: name of the column to split.
        name: prefix used for the two new column names.
        Returns the new frame; f_df itself is not modified.
    '''
    df = f_df.copy()

    #Forward direction
    #Series.mask is used rather than assigning np.NaN through .loc: the np.NaN
    #alias was removed in NumPy 2.0, and mask upcasts integer columns itself
    #instead of relying on an incompatible-dtype assignment.
    df[f'{name}_fwd'] = df[col].mask(df['is_fwd'] == 0)

    #Backward direction
    df[f'{name}_bwd'] = df[col].mask(df['is_fwd'] == 1)
    return df

In [98]:
def print_status(index, total, percentage=5):
    '''
        Print a progress line every `percentage` percent of `total`.

        index: zero-based position of the item being processed.
        total: number of items overall; nothing is printed when it is <= 0.
        percentage: step between progress lines, in percent of total.
    '''
    #Nothing to report on an empty task (also avoids dividing by zero below).
    if total <= 0:
        return
    #Clamp to 1 so a tiny or zero step can never produce a modulo by zero.
    threshold = max(1, math.ceil(total*(percentage/100)))
    if index % threshold == 0:
        completion = index/total * 100
        print(f'TASK: {completion}% Completed')

def addr_prefix_name(addr):
    '''
        Map an address column name to its one-letter feature prefix.

        'SrcAddr' gives 'S', 'DstAddr' gives 'D', anything else gives ''.
    '''
    for column, prefix in (('SrcAddr', 'S'), ('DstAddr', 'D')):
        if addr == column:
            return prefix
    return ''

def convert_to_int_64(df, columns):
    '''
        Return a copy of df with the given columns cast to int64.

        df: source frame (left unmodified).
        columns: iterable of column names to cast.
    '''
    #A single astype call with a per-column mapping yields a new frame.
    return df.astype({name: 'int64' for name in columns})

def build_tot_flows_time_window(f_df, time):
    '''
        Count forward and backward flows inside a trailing time window.

        f_df: frame with a datetime `StartTime` column and a 0/1 `is_fwd` column.
        time: pandas rolling period (for example '10T' for 10 minutes).
        Returns a copy of f_df with int64 columns `TotFlowFwd_{time}` and
        `TotFlowBwd_{time}` added.
    '''
    fwd_name = f'TotFlowFwd_{time}'
    bwd_name = f'TotFlowBwd_{time}'

    df = f_df.copy()

    #Direction flags side by side so one rolling pass sums both.
    flags = df[['StartTime', 'is_fwd']].copy()
    flags['is_bwd'] = flags['is_fwd'].replace({0:1, 1:0})
    counts = flags.rolling(time, on='StartTime').sum()

    df[fwd_name] = counts['is_fwd']
    df[bwd_name] = counts['is_bwd']
    return convert_to_int_64(df, [fwd_name, bwd_name])

def build_time_features(f_df, col_name, short_name, time):
    '''
        Add rolling sum/min/max/mean/std features for one column, per direction.

        f_df: frame with `StartTime`, `is_fwd` and `col_name` columns.
        col_name: column to aggregate.
        short_name: prefix used in the generated column names.
        time: pandas rolling period.
        Returns a copy of f_df with `{short_name}{Stat}{Fwd|Bwd}_{time}`
        columns added; missing values are filled with 0 and the Sum/Min/Max
        columns are cast to int64.
    '''
    df = f_df.copy()

    #The rolling window needs the column split by direction first
    #(see create_fwd_bwd_col).
    split_df = create_fwd_bwd_col(df, col_name, short_name)
    fwd_col = short_name + '_fwd'
    bwd_col = short_name + '_bwd'

    time_window = split_df[['StartTime', fwd_col, bwd_col]].rolling(time, on='StartTime')

    #Aggregations in the order their columns are appended to the frame.
    aggregations = (
        ('Sum', time_window.sum),
        ('Min', time_window.min),
        ('Max', time_window.max),
        ('Mean', time_window.mean),
        ('Std', time_window.std),
    )

    new_columns = []
    for label, aggregate in aggregations:
        result = aggregate()
        for direction, source in (('Fwd', fwd_col), ('Bwd', bwd_col)):
            target = f'{short_name}{label}{direction}_{time}'
            df[target] = result[source]
            new_columns.append(target)

    #Fill all new columns holding NaN with zero
    df[new_columns] = df[new_columns].fillna(0)

    #Sum, Min and Max (the first six new columns) can be int, convert them.
    return convert_to_int_64(df, new_columns[:6])

In [5]:
#Bytes
# DstBytes is whatever remains of TotBytes once SrcBytes is subtracted.
feature_df['DstBytes'] = feature_df['TotBytes'] - feature_df['SrcBytes']
# Rolling '10T' window features: flow counts first, then per-direction
# sum/min/max/mean/std for total bytes, total packets and source bytes.
# NOTE: the '10T' string is also embedded in the generated column names.
feature_df = build_tot_flows_time_window(feature_df, '10T')
feature_df = build_time_features(feature_df, 'TotBytes', 'TotB', '10T')
feature_df = build_time_features(feature_df, 'TotPkts', 'TotPkt', '10T')
feature_df = build_time_features(feature_df, 'SrcBytes', 'SrcB', '10T')
# Rate features below are currently disabled (left commented out).
# feature_df['Rate'] = feature_df['TotBytes']/feature_df['Dur']
# feature_df['SrcRate'] = feature_df['SrcBytes']/feature_df['Dur']
# feature_df['DstRate'] = feature_df['DstBytes']/feature_df['Dur']

In [99]:
def helper_trim_dict_row_list(key, dictionary, curr_epoch, time_window):
    '''
        Drop expired rows from the tail of dictionary[key].

        Rows are removed from the end of the list while the last row is more
        than `time_window` older than `curr_epoch`. An unseen key is
        initialised with an empty list instead.
    '''
    if key not in dictionary:
        dictionary[key] = []
        return

    rows = dictionary[key]
    while len(rows) and curr_epoch - rows[-1].epoch > time_window:
        rows.pop()

def _summarize_time_window_rows(rows):
    '''Return {source column: (sum, min, max, mean, std)} for a list of flow rows.'''
    source_cols = ('TotBytes', 'TotPkts', 'SrcBytes')
    #No flows in the window: every statistic is reported as 0.
    if len(rows) == 0:
        return {col: (0, 0, 0, 0, 0) for col in source_cols}

    frame = pd.DataFrame(data=rows)
    summary = {}
    for col in source_cols:
        values = frame[col]
        #The sample std of a single value is NaN; report 0 instead.
        spread = values.std() if len(values) > 1 else 0
        summary[col] = (values.sum(), values.min(), values.max(), values.mean(), spread)
    return summary


def build_pkts_bytes_time_window(f_df, minute, addr):
    '''
        Per-address flow statistics over a trailing time window.

        For every row, looks at the flows sharing the row's `addr` value whose
        epoch lies within the last `minute` minutes (the row itself included),
        split by direction, and records the flow count plus
        sum/min/max/mean/std of TotBytes, TotPkts and SrcBytes.

        f_df: frame with `addr`, is_fwd, TotPkts, TotBytes, SrcBytes and epoch
              columns; rows are processed in their existing order.
        minute: window length in minutes.
        addr: address column to group on ('SrcAddr' or 'DstAddr').
        Returns f_df with the new columns appended, named
        `{prefix}{Metric}{Stat}{Fwd|Bwd}_{minute}T`.

        NOTE: a small DataFrame is rebuilt per row and direction, so the run
        time grows with the number of flows held in each window.
    '''
    df = f_df[[addr, 'is_fwd', 'TotPkts', 'TotBytes', 'SrcBytes', 'epoch']].copy()
    total = len(df.index)
    window_range_sec = minute * 60
    prefix = addr_prefix_name(addr)

    #(short name used in output columns, source column)
    metrics = (('TotB', 'TotBytes'), ('TotPkt', 'TotPkts'), ('SrcB', 'SrcBytes'))
    directions = ('Fwd', 'Bwd')
    stat_names = ('Sum', 'Min', 'Max', 'Mean', 'Std')

    flow_counts = {direction: [] for direction in directions}
    stat_values = {
        (short, direction, stat): []
        for short, _ in metrics
        for direction in directions
        for stat in stat_names
    }

    #Per direction: address -> rows currently inside the window, newest first.
    rows_by_addr = {direction: {} for direction in directions}

    #Progress is reported by position so it does not depend on index labels.
    for position, (_, row) in enumerate(df.iterrows()):
        print_status(position, total)
        key = row[addr]

        #Independent of whether the row itself is forward or backward
        for direction in directions:
            helper_trim_dict_row_list(key, rows_by_addr[direction], row.epoch, window_range_sec)

        rows_by_addr['Fwd' if row.is_fwd else 'Bwd'][key].insert(0, row)

        for direction in directions:
            rows = rows_by_addr[direction][key]
            flow_counts[direction].append(len(rows))
            summary = _summarize_time_window_rows(rows)
            for short, source in metrics:
                for stat, value in zip(stat_names, summary[source]):
                    stat_values[(short, direction, stat)].append(value)

    #Build Data Frame (column order: flow counts, then metric -> direction -> stat)
    data = {
        f'{prefix}TotFlow{direction}_{minute}T': flow_counts[direction]
        for direction in directions
    }
    for short, _ in metrics:
        for direction in directions:
            for stat in stat_names:
                data[f'{prefix}{short}{stat}{direction}_{minute}T'] = stat_values[(short, direction, stat)]

    #Reuse f_df's index so the axis=1 concat pairs rows by position even when
    #f_df does not carry a default 0..n-1 index.
    new_features_df = pd.DataFrame(data=data, index=f_df.index)
    return pd.concat([f_df, new_features_df], axis=1)

In [96]:
def _summarize_flow_window_frame(frame):
    '''Return {source column: (sum, min, max, mean, std)} for a frame of flows.'''
    source_cols = ('TotBytes', 'TotPkts', 'SrcBytes')
    #No matching flows: every statistic is reported as 0.
    if len(frame) == 0:
        return {col: (0, 0, 0, 0, 0) for col in source_cols}

    summary = {}
    for col in source_cols:
        values = frame[col]
        #The sample std of a single value is NaN; report 0 instead.
        spread = values.std() if len(values) > 1 else 0
        summary[col] = (values.sum(), values.min(), values.max(), values.mean(), spread)
    return summary


def build_pkts_bytes_x_window(f_df, num, addr):
    '''
        Per-address flow statistics over the last `num` flows.

        For every row, takes the `num` most recent rows (the row itself
        included, all addresses), keeps those sharing the row's `addr` value,
        splits them by direction (is_fwd == 1 / is_fwd == 0) and records the
        flow count, sum/min/max/mean/std of TotBytes, TotPkts and SrcBytes,
        and the epoch gap between the two latest flows of each direction
        (0 when fewer than two exist).

        f_df: frame with `addr`, is_fwd, TotPkts, TotBytes, SrcBytes and epoch
              columns; rows are processed in their existing order.
        num: window size in rows.
        addr: address column to group on ('SrcAddr' or 'DstAddr').
        Returns f_df with the new columns appended, named
        `{prefix}{Metric}{Stat}{Fwd|Bwd}N_{num}` and
        `{prefix}Time2Flow{Fwd|Bwd}N_{num}`.

        NOTE: the whole window is rebuilt as a DataFrame for every row, so the
        run time grows with `num` times the number of rows.
    '''
    df = f_df[[addr, 'is_fwd', 'TotPkts', 'TotBytes', 'SrcBytes', 'epoch']].copy()
    total_length = len(df.index)
    window = []
    prefix = addr_prefix_name(addr)

    #(short name used in output columns, source column)
    metrics = (('TotB', 'TotBytes'), ('TotPkt', 'TotPkts'), ('SrcB', 'SrcBytes'))
    directions = ('Fwd', 'Bwd')
    stat_names = ('Sum', 'Min', 'Max', 'Mean', 'Std')

    flow_counts = {direction: [] for direction in directions}
    flow_gaps = {direction: [] for direction in directions}
    stat_values = {
        (short, direction, stat): []
        for short, _ in metrics
        for direction in directions
        for stat in stat_names
    }

    #Progress is reported by position so it does not depend on index labels.
    for position, (_, row) in enumerate(df.iterrows()):
        print_status(position, total_length)

        #Add row to window, evicting the oldest row once it is full
        if len(window) == num:
            window.pop(0)
        window.append(row)

        #Get all rows with the same address, in forward or backward direction.
        window_df = pd.DataFrame(data=window)
        addr_df = window_df.loc[window_df[addr] == row[addr]]
        frames = {
            'Fwd': addr_df.loc[addr_df.is_fwd == 1],
            'Bwd': addr_df.loc[addr_df.is_fwd == 0],
        }

        for direction in directions:
            frame = frames[direction]
            flow_counts[direction].append(len(frame))

            summary = _summarize_flow_window_frame(frame)
            for short, source in metrics:
                for stat, value in zip(stat_names, summary[source]):
                    stat_values[(short, direction, stat)].append(value)

            #Time Between Flows
            if len(frame) < 2:
                flow_gaps[direction].append(0)
            else:
                flow_gaps[direction].append(frame.iloc[-1].epoch - frame.iloc[-2].epoch)

    #Build Data Frame (column order: flow counts, metric -> direction -> stat, gaps)
    data = {
        f'{prefix}TotFlow{direction}N_{num}': flow_counts[direction]
        for direction in directions
    }
    for short, _ in metrics:
        for direction in directions:
            for stat in stat_names:
                data[f'{prefix}{short}{stat}{direction}N_{num}'] = stat_values[(short, direction, stat)]
    for direction in directions:
        data[f'{prefix}Time2Flow{direction}N_{num}'] = flow_gaps[direction]

    #Reuse f_df's index so the axis=1 concat pairs rows by position even when
    #f_df does not carry a default 0..n-1 index.
    new_features_df = pd.DataFrame(data=data, index=f_df.index)
    return pd.concat([f_df, new_features_df], axis=1)

In [27]:
def helper_trim_dict_value_list(key, dictionary, size):
    '''
        Make room for one more value in dictionary[key].

        Removes values from the front of the list until it holds fewer than
        `size` entries. An unseen key is initialised with an empty list.
    '''
    if key not in dictionary:
        dictionary[key] = []
        return

    values = dictionary[key]
    while len(values) >= size:
        values.pop(0)

def helper_calc_flow_diff(l, window_range_sec):
    '''
        Gap between the last two values of `l`.

        Returns 0 when there are fewer than two values or when the gap is
        larger than `window_range_sec`.
    '''
    if len(l) < 2:
        return 0
    gap = l[-1] - l[-2]
    return 0 if gap > window_range_sec else gap

def build_time_bet_2_flow_time_window(f_df, minutes):
    '''
        Time since the previous flow of the same direction, per address.

        For every row, computes the epoch gap to the previous flow that has
        the same direction (is_fwd) and the same SrcAddr (S columns) or
        DstAddr (D columns). The gap is 0 when there is no previous flow or
        when it exceeds `minutes` minutes. The columns of the opposite
        direction are 0 for that row.

        f_df: frame with SrcAddr, DstAddr, is_fwd and epoch columns; rows are
              processed in their existing order.
        minutes: maximum gap, in minutes, that is still reported.
        Returns f_df with STime2FlowFwd/Bwd and DTime2FlowFwd/Bwd columns
        (suffixed `_{minutes}T`) appended.
    '''
    df = f_df[['SrcAddr', 'DstAddr', 'is_fwd', 'epoch']].copy()
    total = len(df.index)
    window_range_sec = minutes * 60

    #(column prefix, address column)
    sides = (('S', 'SrcAddr'), ('D', 'DstAddr'))
    directions = ('Fwd', 'Bwd')

    #Per (side, direction): address -> epochs of its latest flows (at most two).
    last_epochs = {(side, direction): {} for side, _ in sides for direction in directions}
    #Per (side, direction): one gap value per processed row.
    gaps = {(side, direction): [] for side, _ in sides for direction in directions}

    #Progress is reported by position so it does not depend on index labels.
    for position, (_, row) in enumerate(df.iterrows()):
        print_status(position, total)
        active, idle = ('Fwd', 'Bwd') if row.is_fwd else ('Bwd', 'Fwd')

        for side, addr_col in sides:
            address = row[addr_col]
            seen = last_epochs[(side, active)]

            #Keep only the previous epoch, then add the current one.
            helper_trim_dict_value_list(address, seen, 2)
            seen[address].append(row.epoch)

            gaps[(side, active)].append(helper_calc_flow_diff(seen[address], window_range_sec))
            gaps[(side, idle)].append(0)

    #Build Data Frame
    #Reuse f_df's index so the axis=1 concat pairs rows by position even when
    #f_df does not carry a default 0..n-1 index.
    new_features_df = pd.DataFrame(
        data={
            f'{side}Time2Flow{direction}_{minutes}T': values
            for (side, direction), values in gaps.items()
        },
        index=f_df.index,
    )
    return pd.concat([f_df, new_features_df], axis=1)

In [None]:
# Per-SrcAddr statistics over a sliding window of the 10000 most recent flows (computed row by row).
feature_df = build_pkts_bytes_x_window(feature_df, 10000, 'SrcAddr')

In [None]:
# Per-DstAddr statistics over a sliding window of the 10000 most recent flows (computed row by row).
feature_df = build_pkts_bytes_x_window(feature_df, 10000, 'DstAddr')

In [None]:
# Per-SrcAddr statistics over a trailing 10-minute window (computed row by row).
feature_df = build_pkts_bytes_time_window(feature_df, 10, 'SrcAddr')

In [14]:
# Per-DstAddr statistics over a trailing 10-minute window (computed row by row).
feature_df = build_pkts_bytes_time_window(feature_df, 10, 'DstAddr')

TASK: 0.0% Completed
TASK: 4.999969252042762% Completed
TASK: 9.999938504085524% Completed
TASK: 14.999907756128286% Completed
TASK: 19.999877008171048% Completed
TASK: 24.99984626021381% Completed
TASK: 29.99981551225657% Completed
TASK: 34.999784764299335% Completed
TASK: 39.999754016342095% Completed
TASK: 44.99972326838486% Completed
TASK: 49.99969252042762% Completed
TASK: 54.99966177247039% Completed
TASK: 59.99963102451314% Completed
TASK: 64.99960027655591% Completed
TASK: 69.99956952859867% Completed
TASK: 74.99953878064144% Completed
TASK: 79.99950803268419% Completed
TASK: 84.99947728472696% Completed
TASK: 89.99944653676972% Completed
TASK: 94.99941578881248% Completed
TASK: 99.99938504085524% Completed


In [17]:
# Gap to the previous same-direction flow per SrcAddr/DstAddr; gaps above 10 minutes are reported as 0.
feature_df = build_time_bet_2_flow_time_window(feature_df, 10)

TASK: 0.0% Completed
TASK: 4.999969252042762% Completed
TASK: 9.999938504085524% Completed
TASK: 14.999907756128286% Completed
TASK: 19.999877008171048% Completed
TASK: 24.99984626021381% Completed
TASK: 29.99981551225657% Completed
TASK: 34.999784764299335% Completed
TASK: 39.999754016342095% Completed
TASK: 44.99972326838486% Completed
TASK: 49.99969252042762% Completed
TASK: 54.99966177247039% Completed
TASK: 59.99963102451314% Completed
TASK: 64.99960027655591% Completed
TASK: 69.99956952859867% Completed
TASK: 74.99953878064144% Completed
TASK: 79.99950803268419% Completed
TASK: 84.99947728472696% Completed
TASK: 89.99944653676972% Completed
TASK: 94.99941578881248% Completed
TASK: 99.99938504085524% Completed


In [34]:
#Write Sample to CSV
makedirs(dirname('../data/processed/'), exist_ok=True)
#The sample is written BEFORE epoch is dropped so that it really includes
#epoch, which helps validate the time-based features.
feature_df.head(50).to_csv('../data/processed/sample_processed.csv', index=False) #Includes epoch in sample file to help validate.
feature_df = feature_df.drop(columns=['epoch']) #Remove as it is not part of the feature list
#Write Raw and Features to CSV file.
feature_df.to_csv('../data/processed/processed.csv', index=False)