In [395]:
import pandas as pd
import numpy as np
import datetime

# Load the preprocessed flow records and keep an untouched copy of the raw frame.
df = pd.read_csv('../data/preprocessed/preprocessed.csv')
df_orig = df.copy()

# Parse StartTime into datetimes, then derive whole seconds elapsed since
# 1970-01-01 as an integer-valued 'epoch' column.
df = df.assign(StartTime=pd.to_datetime(df['StartTime']))
df = df.assign(epoch=(df['StartTime'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s'))

In [396]:
# Numeric code for every flow-direction marker found in the 'Dir' column.
# The marker strings are matched exactly, including their leading spaces.
dir_map = {
    'Dir': ['   ->', '   ?>', '  <?>', '  <->', '  <-', '  <?'],
    'dir_map': [1, 2, 3, 4, 5, 6]
}
dirs_df = pd.DataFrame(data=dir_map)

# Left merge keeps every flow; a 'Dir' value missing from the table yields a
# NaN dir_map and therefore counts as neither forward nor backward below.
feature_df = df.copy()
feature_df = feature_df.merge(dirs_df, how='left', on=['Dir'])

# 1/0 indicator columns: code 1 ('   ->') is forward, code 5 ('  <-') is backward.
# Casting back to dir_map's dtype keeps the same dtype the columns had when
# they were built by copying dir_map and overwriting values.
direction_dtype = feature_df.dir_map.dtype
feature_df['is_forward'] = feature_df.dir_map.eq(1).astype(direction_dtype)
feature_df['is_backward'] = feature_df.dir_map.eq(5).astype(direction_dtype)

# Per-direction byte counts: TotBytes where the flow goes in that direction,
# NaN elsewhere so rolling min/max/mean/std skip the other direction's flows.
# Series.where fills with NaN without using the np.NaN alias (removed in
# NumPy 2.0) and without assigning a float into a possibly-integer column.
feature_df['totbytes_f'] = feature_df.TotBytes.where(feature_df.is_forward != 0)

feature_df['totbytes_b'] = feature_df.TotBytes.where(feature_df.is_backward != 0)

In [397]:
# Find the row position at which each concatenated capture starts.
# A new capture is assumed to begin when the epoch drops below the running
# maximum of the current capture AND is smaller than 4000.
# NOTE(review): 4000 is an unexplained threshold on the epoch value; confirm
# what it represents for this dataset.
max_epoch = 0
file_sep_list = []

# Enumerate row positions (not index labels) because the starts are used with
# .iloc below. After the merge feature_df has a default 0..n-1 index, so the
# positions and the labels coincide.
for index, value in enumerate(feature_df.epoch):
    if index == 0:
        max_epoch = value
        file_sep_list.append([index, value])
        continue
    if value > max_epoch:
        max_epoch = value
    if value < max_epoch and value < 4000:
        file_sep_list.append([index, value])
        max_epoch = value

# Slice feature_df into one frame per capture. Each segment runs from its start
# up to the next segment's start; the last one runs to the end of the frame.
# Using len(feature_df.index) as the final stop keeps the very last row, and
# pairing starts with stops also handles the single-segment and empty cases.
segment_starts = [start for start, _ in file_sep_list]
segment_stops = segment_starts[1:] + [len(feature_df.index)]
all_df = [
    feature_df.iloc[start:stop]
    for start, stop in zip(segment_starts, segment_stops)
]


In [398]:
# One frame per capture segment: renumber the rows from 0, then order them by
# StartTime (the index keeps the pre-sort numbering).
split_frames = [
    segment.reset_index(drop=True).sort_values('StartTime')
    for segment in all_df
]

In [399]:
#Total Flow in forward and backward 10 min
# For every row, count the forward / backward flows inside the 10-minute
# time window on StartTime that ends at that row. '10min' is used instead of
# the '10T' alias, which pandas 2.2 deprecates; both spell the same window.
for frames in split_frames:
    for flag_column, feature_column in (
        ('is_forward', 'total_flow_f_time_10'),
        ('is_backward', 'total_flow_b_time_10'),
    ):
        rolled = frames[['StartTime', flag_column]].rolling('10min', on='StartTime').sum()
        frames[feature_column] = rolled[flag_column]

In [400]:
# Sum / min / max / mean / standard deviation of forward and backward byte
# counts over the 10-minute time window on StartTime ending at each row.
# totbytes_f / totbytes_b are NaN for flows in the other direction, so those
# rows do not contribute to the statistics.
# '10min' replaces the '10T' alias, which pandas 2.2 deprecates.
for frames in split_frames:
    rolling_bytes = {
        'f': frames[['StartTime', 'totbytes_f']].rolling('10min', on='StartTime'),
        'b': frames[['StartTime', 'totbytes_b']].rolling('10min', on='StartTime'),
    }

    # Statistic outermost, direction innermost: this yields the column order
    # sum_f, sum_b, min_f, min_b, max_f, max_b, mean_f, mean_b, std_f, std_b.
    for stat in ('sum', 'min', 'max', 'mean', 'std'):
        for direction, roller in rolling_bytes.items():
            source_column = 'totbytes_' + direction
            feature_column = stat + '_size_' + direction + '_time_10'
            frames[feature_column] = getattr(roller, stat)()[source_column]


In [None]:
def src_flow_total(window, f_or_b):
    """Collect the direction flags of window rows sharing the newest row's source.

    Args:
        window: non-empty sequence of rows supporting ``row['column']`` lookup;
            the last element is treated as the current row.
        f_or_b: ``'is_forward'`` selects the forward flag; any other value
            selects ``'is_backward'``.

    Returns:
        A NumPy array with one entry (the flag value, i.e. 1) for every row in
        the window whose ``SrcAddr`` equals the current row's and whose
        selected flag equals 1. The array is empty when nothing matches.
    """
    flag = 'is_forward' if f_or_b == 'is_forward' else 'is_backward'
    newest_src = window[-1]['SrcAddr']

    matched = []
    for entry in window:
        if entry['SrcAddr'] == newest_src and entry[flag] == 1:
            matched.append(entry[flag])
    return np.array(matched)

def sum_flow_window(row, window, f_or_b, num):
    """Push ``row`` into the sliding window and count same-source flows in it.

    Args:
        row: the current row; appended to ``window``.
        window: list of previous rows, mutated in place. When it already holds
            exactly ``num`` rows the oldest one is dropped first.
        f_or_b: direction selector passed on to ``src_flow_total``.
        num: window length at which the oldest row is evicted.

    Returns:
        The sum of the flags returned by ``src_flow_total`` for the updated
        window, or ``0.0`` when no row matches.
    """
    if len(window) == num:
        window.pop(0)
    window.append(row)

    flags = src_flow_total(window, f_or_b)
    if len(flags) == 0:
        return 0.0
    return flags.sum()

def src_windows(window, f_or_b):
    """Collect byte counts of window rows sharing the newest row's source.

    Args:
        window: non-empty sequence of rows supporting ``row['column']`` lookup;
            the last element is treated as the current row.
        f_or_b: ``'is_forward'`` reads ``totbytes_f`` from rows flagged
            ``is_forward == 1``; any other value reads ``totbytes_b`` from rows
            flagged ``is_backward == 1``.

    Returns:
        A NumPy array of the selected byte counts for every row whose
        ``SrcAddr`` equals the current row's. Empty when nothing matches.
    """
    if f_or_b == 'is_forward':
        flag, size_column = 'is_forward', 'totbytes_f'
    else:
        flag, size_column = 'is_backward', 'totbytes_b'

    newest_src = window[-1]['SrcAddr']
    sizes = []
    for entry in window:
        if entry['SrcAddr'] == newest_src and entry[flag] == 1:
            sizes.append(entry[size_column])
    return np.array(sizes)

def sum_bytes_window(row, window, f_or_b, num):
    """Push ``row`` into the sliding window and sum same-source byte counts.

    Args:
        row: the current row; appended to ``window``.
        window: list of previous rows, mutated in place. When it already holds
            exactly ``num`` rows the oldest one is dropped first.
        f_or_b: direction selector passed on to ``src_windows``.
        num: window length at which the oldest row is evicted.

    Returns:
        Sum of the byte counts returned by ``src_windows`` for the updated
        window, or NaN when no row matches.
    """
    if len(window) == num:
        window.pop(0)
    window.append(row)
    arr = src_windows(window, f_or_b)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return arr.sum() if len(arr) != 0 else np.nan

def min_bytes_window(row, window, f_or_b, num):
    """Push ``row`` into the sliding window and take the smallest same-source byte count.

    Args:
        row: the current row; appended to ``window``.
        window: list of previous rows, mutated in place. When it already holds
            exactly ``num`` rows the oldest one is dropped first.
        f_or_b: direction selector passed on to ``src_windows``.
        num: window length at which the oldest row is evicted.

    Returns:
        Minimum of the byte counts returned by ``src_windows`` for the updated
        window, or NaN when no row matches.
    """
    if len(window) == num:
        window.pop(0)
    window.append(row)
    arr = src_windows(window, f_or_b)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return arr.min() if len(arr) != 0 else np.nan

def max_bytes_window(row, window, f_or_b, num):
    """Push ``row`` into the sliding window and take the largest same-source byte count.

    Args:
        row: the current row; appended to ``window``.
        window: list of previous rows, mutated in place. When it already holds
            exactly ``num`` rows the oldest one is dropped first.
        f_or_b: direction selector passed on to ``src_windows``.
        num: window length at which the oldest row is evicted.

    Returns:
        Maximum of the byte counts returned by ``src_windows`` for the updated
        window, or NaN when no row matches.
    """
    if len(window) == num:
        window.pop(0)
    window.append(row)
    arr = src_windows(window, f_or_b)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return arr.max() if len(arr) != 0 else np.nan

def mean_bytes_window(row, window, f_or_b, num):
    """Push ``row`` into the sliding window and average same-source byte counts.

    Args:
        row: the current row; appended to ``window``.
        window: list of previous rows, mutated in place. When it already holds
            exactly ``num`` rows the oldest one is dropped first.
        f_or_b: direction selector passed on to ``src_windows``.
        num: window length at which the oldest row is evicted.

    Returns:
        Mean of the byte counts returned by ``src_windows`` for the updated
        window, or NaN when no row matches.
    """
    if len(window) == num:
        window.pop(0)
    window.append(row)
    arr = src_windows(window, f_or_b)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return arr.mean() if len(arr) != 0 else np.nan

def std_bytes_window(row, window, f_or_b, num):
    """Push ``row`` into the sliding window and take the spread of same-source byte counts.

    Args:
        row: the current row; appended to ``window``.
        window: list of previous rows, mutated in place. When it already holds
            exactly ``num`` rows the oldest one is dropped first.
        f_or_b: direction selector passed on to ``src_windows``.
        num: window length at which the oldest row is evicted.

    Returns:
        Standard deviation of the byte counts returned by ``src_windows`` for
        the updated window, or NaN when no row matches. This is
        ``ndarray.std()`` with its default ``ddof=0`` (population standard
        deviation), so a single matching row yields 0.0.
    """
    if len(window) == num:
        window.pop(0)
    window.append(row)
    arr = src_windows(window, f_or_b)
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return arr.std() if len(arr) != 0 else np.nan

# Count-based features: for every row, statistics over a sliding window of at
# most 5 consecutive rows (the current one included), restricted inside the
# helpers to rows with the same SrcAddr and the requested direction.
# Each entry is (column-name prefix, helper computing the value for one row).
window_features = (
    ('sum_size_', sum_bytes_window),
    ('min_size_', min_bytes_window),
    ('max_size_', max_bytes_window),
    ('mean_size_', mean_bytes_window),
    ('std_size_', std_bytes_window),
    ('total_flow_', sum_flow_window),
)

for frames in split_frames:
    # Narrow per-direction copies taken before any new column is added.
    forward_df = frames[['SrcAddr', 'totbytes_f', 'is_forward']].copy()
    backward_df = frames[['SrcAddr', 'totbytes_b', 'is_backward']].copy()
    directions = (
        ('f', 'is_forward', forward_df),
        ('b', 'is_backward', backward_df),
    )

    for prefix, compute in window_features:
        for tag, flag, subset in directions:
            # Every feature column starts from an empty window; the helper
            # mutates it as the rows stream through in frame order.
            queue = []
            frames[prefix + tag + '_num_5'] = [
                compute(row, queue, flag, 5) for _, row in subset.iterrows()
            ]

In [None]:
# feature_df[['TotBytes', 
#             'min_size_f_time_10', 
#             'min_size_b_time_10', 
#             'max_size_f_time_10',
#             'max_size_b_time_10',
#             'mean_size_f_time_10',
#             'mean_size_b_time_10',
#             'std_size_f_time_10',
#             'std_size_b_time_10',
#             'epoch']].head(25)

In [None]:
#Combine all frames
# Stack the per-capture frames back into one frame with a fresh 0..n-1 index.
# pd.concat replaces the per-frame DataFrame.append loop: append was removed in
# pandas 2.0, and the loop referenced a misspelled name (split_framesp).
final_df = pd.concat(split_frames, ignore_index=True)

#Write to csv file.
final_df.to_csv('../feature.csv', index=False)
