# Robust Test - Noised Values - Features Groups
for each features group in the data we will permute the values by setting randomly noise around the original value.

# Import

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *
import time

# Settings

In [2]:
# set constants
target_label = 'tuple'
learning_model = 'et' # ['rf','et','lightgbm','xgboost']
num_features = ['min_packet_size', 'min_fpkt', 'min_bpkt']
file_name = "new_all_features_"
path = "../Datasets/" + target_label + "_dataset/"
maps_array = [{},{},{},{}]

In [3]:
# function for making model-prediction over the data set and measure the run time 
def timed_prediction(in_data,in_model):
    t = time.process_time()
    predicted = predict_model(in_model, data=in_data)
    elapsed_time = time.process_time() - t
    print("prediction took: " + str(elapsed_time))
    return predicted

In [4]:
# compare answers and labeled test
def compare_prediction_with_answers(in_predicted, in_answers):
    count=0
    index = in_predicted.index
    number_of_rows = len(index)
    errors_arr = []
    for i in range(0,number_of_rows):
        if str(int(in_answers.iloc[i])) != str(int(in_predicted.iloc[i]['Label'])):
            count=count+1
            cur_error = str(in_answers[i]) + "!=" + str(in_predicted.iloc[i]['Label'])
            errors_arr.append(cur_error)
    return count

In [5]:
def get_rand_data(in_data, in_column_indx,noise):
    cur_col=in_data[in_column_indx]
    col_len = len(cur_col)
    max_value = cur_col.max()
    value = cur_col[0]
    min_value = cur_col.min()
    buffer = 0
    for i in range(0,col_len):
        cur_val = cur_col[i]
        buffer = cur_val*noise
        coeff = np.random.randint(0,1)
        if coeff == 0: coeff=-1
        cur_col[i] = cur_val + (coeff * buffer)
    in_data[in_column_indx] = cur_col
    return in_data

In [6]:
# set up features groups
SSL_features = ['fSSL_session_id_len', 'fSSL_num_extensions', 'fcipher_suites', 'ssl_v', ]
size_features = ['size_histogram_1','size_histogram_2','size_histogram_3',
                 'size_histogram_4','size_histogram_5','size_histogram_6',
                 'size_histogram_7','size_histogram_8','size_histogram_9', 'size_histogram_10',
                'fpackets', 'bpackets', 'fbytes', 'bbytes','min_packet_size',
                 'max_packet_size', 'mean_packet_size','sizevar']
peak_features = ['fpeak_features_1','fpeak_features_2','fpeak_features_3',
                 'fpeak_features_4','fpeak_features_5','fpeak_features_6',
                 'fpeak_features_7','fpeak_features_8','fpeak_features_9',
                 'bpeak_features_1','bpeak_features_2','bpeak_features_3',
                 'bpeak_features_4','bpeak_features_5','bpeak_features_6',
                 'bpeak_features_7','bpeak_features_8','bpeak_features_9']
TCP_features = ['SYN_tcp_scale', 'SYN_tcp_winsize']
common_features = ['packet_count','num_keep_alive', 'mean_fttl',
                   'max_fpkt','max_bpkt','std_fpkt','std_bpkt','mean_fpkt','mean_bpkt']
stat_features = ['min_packet_size', 'max_packet_size', 'mean_packet_size',
                 'sizevar', 'std_fiat','max_fiat','max_biat','std_biat','mean_fiat','mean_biat',
                'min_fpkt','min_bpkt','max_fpkt','max_bpkt','std_fpkt','std_bpkt','mean_fpkt','mean_bpkt']
time_features = ['std_fiat','max_fiat','max_biat','std_biat','mean_fiat','mean_biat']

# Read Data Setup Data and Build Model

In [7]:
data = pd.read_csv(path+file_name+target_label+'_train.csv',
                      sep='\t',
                      skiprows=[1])

setup(data=data,
      target=target_label,
      numeric_features=num_features,
      silent=True)

model=create_model(learning_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9664,0.0,0.8571,0.9646,0.9649,0.9606,0.9607
1,0.9763,0.0,0.9277,0.9776,0.975,0.9722,0.9723
2,0.9782,0.0,0.9264,0.9783,0.9772,0.9745,0.9746
3,0.9822,0.0,0.9185,0.9824,0.9817,0.9792,0.9792
4,0.9763,0.0,0.9212,0.9778,0.9765,0.9722,0.9723
5,0.9664,0.0,0.8837,0.9686,0.966,0.9607,0.9607
6,0.9773,0.0,0.8872,0.9769,0.9763,0.9734,0.9734
7,0.9733,0.0,0.8802,0.9724,0.9715,0.9687,0.9688
8,0.9782,0.0,0.8743,0.9767,0.9764,0.9745,0.9746
9,0.9832,0.0,0.8695,0.9815,0.9821,0.9803,0.9803


### Read Test Data

In [8]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Predicte and Check

In [9]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [10]:
predicted = predict_model(model, data=unseen_data)

In [11]:
compare_prediction_with_answers(predicted,answers)

138

Now, for each group we will Read Test Data again, Change values, Predicte, Check and Saves result.

# SSL Features Group

In [12]:
current_group='SSL_Features'
noise=0.05
features_group = SSL_features
print ('current columns are : ' + str(features_group))
for index in range (0,4):
    
    unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])
    
    for i in features_group:
        unseen_data = get_rand_data(unseen_data,i,noise)
    answers = unseen_data[target_label]
    unseen_data = unseen_data.drop(columns=[target_label])
    predicted = predict_model(model, data=unseen_data)
    result = compare_prediction_with_answers(predicted,answers)
    maps_array[index][current_group] = result
    
    noise=noise+0.05

current columns are : ['fSSL_session_id_len', 'fSSL_num_extensions', 'fcipher_suites', 'ssl_v']


# Size Features Group

In [13]:
current_group='Size_features'
noise=0.05
features_group = size_features
print ('current columns are : ' + str(features_group))
for index in range (0,4):
    
    unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])
    
    for i in features_group:
        unseen_data = get_rand_data(unseen_data,i,noise)
    answers = unseen_data[target_label]
    unseen_data = unseen_data.drop(columns=[target_label])
    predicted = predict_model(model, data=unseen_data)
    result = compare_prediction_with_answers(predicted,answers)
    maps_array[index][current_group] = result
    
    noise=noise+0.05

current columns are : ['size_histogram_1', 'size_histogram_2', 'size_histogram_3', 'size_histogram_4', 'size_histogram_5', 'size_histogram_6', 'size_histogram_7', 'size_histogram_8', 'size_histogram_9', 'size_histogram_10', 'fpackets', 'bpackets', 'fbytes', 'bbytes', 'min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar']


# COMMON Features Group

In [14]:
current_group='Common_features'
noise=0.05
features_group = common_features
print ('current columns are : ' + str(features_group))
for index in range (0,4):
    
    unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])
    
    for i in features_group:
        unseen_data = get_rand_data(unseen_data,i,noise)
    answers = unseen_data[target_label]
    unseen_data = unseen_data.drop(columns=[target_label])
    predicted = predict_model(model, data=unseen_data)
    result = compare_prediction_with_answers(predicted,answers)
    maps_array[index][current_group] = result
    
    noise=noise+0.05

current columns are : ['packet_count', 'num_keep_alive', 'mean_fttl', 'max_fpkt', 'max_bpkt', 'std_fpkt', 'std_bpkt', 'mean_fpkt', 'mean_bpkt']


# TCP Features Group

In [15]:
current_group='TCP_features'
noise=0.05
features_group = TCP_features
print ('current columns are : ' + str(features_group))
for index in range (0,4):
    
    unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])
    
    for i in features_group:
        unseen_data = get_rand_data(unseen_data,i,noise)
    answers = unseen_data[target_label]
    unseen_data = unseen_data.drop(columns=[target_label])
    predicted = predict_model(model, data=unseen_data)
    result = compare_prediction_with_answers(predicted,answers)
    maps_array[index][current_group] = result
    
    noise=noise+0.05

current columns are : ['SYN_tcp_scale', 'SYN_tcp_winsize']


# STAT Features Group

In [16]:
current_group='Statistics_features'
noise=0.05
features_group = stat_features
print ('current columns are : ' + str(features_group))
for index in range (0,4):
    
    unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])
    
    for i in features_group:
        unseen_data = get_rand_data(unseen_data,i,noise)
    answers = unseen_data[target_label]
    unseen_data = unseen_data.drop(columns=[target_label])
    predicted = predict_model(model, data=unseen_data)
    result = compare_prediction_with_answers(predicted,answers)
    maps_array[index][current_group] = result
    
    noise=noise+0.05

current columns are : ['min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar', 'std_fiat', 'max_fiat', 'max_biat', 'std_biat', 'mean_fiat', 'mean_biat', 'min_fpkt', 'min_bpkt', 'max_fpkt', 'max_bpkt', 'std_fpkt', 'std_bpkt', 'mean_fpkt', 'mean_bpkt']


# Time Features Group

In [17]:
current_group='Time_features'
noise=0.05
features_group = time_features
print ('current columns are : ' + str(features_group))
for index in range (0,4):
    
    unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])
    
    for i in features_group:
        unseen_data = get_rand_data(unseen_data,i,noise)
    answers = unseen_data[target_label]
    unseen_data = unseen_data.drop(columns=[target_label])
    predicted = predict_model(model, data=unseen_data)
    result = compare_prediction_with_answers(predicted,answers)
    maps_array[index][current_group] = result
    
    noise=noise+0.05

current columns are : ['std_fiat', 'max_fiat', 'max_biat', 'std_biat', 'mean_fiat', 'mean_biat']


# Results Map

In [19]:
noise=0.05
sorted_results_array = []
for index in range (0,4):
    print("\n")
    print("number of errors for group feature with " + str(noise) + " randomly noise around the original value.")
    sorted_results_array.append([(k, v) for k, v in sorted(maps_array[index].items(), key=lambda x: x[1], reverse=True)])
    print(sorted_results_array[index])
    print("\n")
    noise=noise+0.05



number of errors for group feature with 0.05 randomly noise around the original value.
[('TCP_features', 2209), ('SSL_Features', 2059), ('Statistics_features', 188), ('Size_features', 164), ('Common_features', 146), ('Time_features', 142)]




number of errors for group feature with 0.1 randomly noise around the original value.
[('TCP_features', 2209), ('SSL_Features', 2082), ('Statistics_features', 408), ('Size_features', 183), ('Common_features', 155), ('Time_features', 144)]




number of errors for group feature with 0.15000000000000002 randomly noise around the original value.
[('TCP_features', 2337), ('SSL_Features', 2100), ('Statistics_features', 747), ('Size_features', 243), ('Common_features', 159), ('Time_features', 150)]




number of errors for group feature with 0.2 randomly noise around the original value.
[('TCP_features', 2337), ('SSL_Features', 2117), ('Statistics_features', 1119), ('Size_features', 320), ('Common_features', 171), ('Time_features', 149)]




so our prediction is very sensitive to noise with TCP_features and SSL_Features