# Robust Test - Noised values - Columns
for each column in the data we will permute the values by setting randomly noise around the original value.

# Import

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *
import time

# Settings

In [2]:
# set constants
target_label = 'tuple'
learning_model = 'rf' # ['rf','et','lightgbm','xgboost']
num_features = ['min_packet_size', 'min_fpkt', 'min_bpkt']
file_name = "new_all_features_"
path = "../Datasets/" + target_label + "_dataset/"
maps_array = [{},{},{},{}]
interval=0.05
noise=0

In [3]:
# function for making model-prediction over the data set and measure the run time 
def timed_prediction(in_data,in_model):
    t = time.process_time()
    predicted = predict_model(in_model, data=in_data)
    elapsed_time = time.process_time() - t
    print("prediction took: " + str(elapsed_time))
    return predicted

In [4]:
# compare answers and labeled test
def compare_prediction_with_answers(in_predicted, in_answers):
    count=0
    index = in_predicted.index
    number_of_rows = len(index)
    errors_arr = []
    for i in range(0,number_of_rows):
        if str(int(in_answers.iloc[i])) != str(int(in_predicted.iloc[i]['Label'])):
            count=count+1
            cur_error = str(in_answers[i]) + "!=" + str(in_predicted.iloc[i]['Label'])
            errors_arr.append(cur_error)
    return count

In [5]:
def get_rand_dataframe(in_data, in_column_indx,in_noise):
    interval=np.random.randint(0,100)
    interval=interval/100
    cur_col=in_data[in_column_indx]
    col_len = len(cur_col)
    max_value = cur_col.max()
    value = cur_col[0]
    min_value = cur_col.min()
    buffer = 0
    for i in range(0,col_len):
        cur_val = cur_col[i]
        buffer = cur_val*in_noise
        coeff = np.random.randint(0,1)
        if coeff == 0: coeff=-1
        cur_col[i] = cur_val + (coeff * buffer)
    in_data[in_column_indx] = cur_col
    return in_data

# Read Data

In [6]:
data = pd.read_csv(path+r'new_all_features_'+target_label+'_train.csv',
                      sep='\t',
                      skiprows=[1])

# Setup Data and Build Model

In [7]:
setup(data=data,
      target=target_label,
      numeric_features=num_features,
      silent=True)
model=create_model(learning_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9604,0.0,0.8575,0.9592,0.959,0.9537,0.9537
1,0.9585,0.0,0.8297,0.9579,0.957,0.9514,0.9514
2,0.9723,0.0,0.8681,0.9706,0.9703,0.9676,0.9676
3,0.9713,0.0,0.8341,0.968,0.9688,0.9664,0.9664
4,0.9674,0.0,0.8221,0.9657,0.9652,0.9618,0.9618
5,0.9713,0.0,0.8125,0.9686,0.9689,0.9664,0.9665
6,0.9644,0.0,0.8858,0.964,0.9632,0.9583,0.9584
7,0.9743,0.0,0.9023,0.9749,0.9741,0.9699,0.9699
8,0.9683,0.0,0.8565,0.9667,0.9666,0.963,0.963
9,0.9683,0.0,0.8409,0.9672,0.9672,0.9629,0.9629


### Read Test Data

In [8]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Predicte and Check

In [9]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [10]:
predicted = predict_model(model, data=unseen_data)

### number of errors of the model for current test set

In [11]:
compare_prediction_with_answers(predicted,answers)

205

# Add noise and run test
each round noise will be increased in 5%

In [12]:
noise=interval
results = []
results_map = []
for index in range(0,4):
    ### for any clolumn in the data set
    for cur_col in data.columns:
        ### Read Test Data
        unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                          sep='\t',
                          skiprows=[1])
        ### Change column values
        unseen_data = get_rand_dataframe(unseen_data,cur_col,noise)
        ### Predicte and Check
        answers = unseen_data[target_label]
        unseen_data = unseen_data.drop(columns=[target_label])
        predicted = predict_model(model, data=unseen_data)
        results.append(compare_prediction_with_answers(predicted,answers))

    ### creaing map of results and column names for printing
    count = 0
    for cur_col in data.columns:
        maps_array[index][cur_col] = results[count]
        count=count+1
    noise = noise+interval

In [13]:
noise=interval
sorted_results_array = []
for index in range (0,4):
    print("\n")
    print("number of errors for noised column with " + str(noise) + " around the original value")
    sorted_results_array.append([(k, v) for k, v in sorted(maps_array[index].items(), key=lambda x: x[1], reverse=True)])
    print(sorted_results_array[index])
    print("\n")
    noise=noise+interval



number of errors for noised column with 0.05 around the original value
[('tuple', 6189), ('ssl_v', 611), ('SYN_tcp_winsize', 606), ('fcipher_suites', 368), ('fSSL_num_extensions', 261), ('mean_fttl', 257), ('SYN_tcp_scale', 233), ('max_fiat', 221), ('fpeak_features_7', 213), ('bpeak_features_9', 212), ('min_packet_size', 211), ('num_keep_alive', 211), ('bpeak_features_4', 210), ('min_bpkt', 210), ('size_histogram_2', 209), ('fpeak_features_8', 209), ('bpeak_features_2', 209), ('max_fpkt', 209), ('size_histogram_9', 208), ('fpeak_features_5', 208), ('std_fiat', 208), ('fbytes', 208), ('std_fpkt', 208), ('mean_fpkt', 208), ('size_histogram_4', 207), ('size_histogram_5', 207), ('size_histogram_8', 207), ('fpeak_features_6', 207), ('bpeak_features_8', 207), ('packet_count', 207), ('max_packet_size', 207), ('bbytes', 207), ('max_biat', 207), ('max_bpkt', 207), ('size_histogram_3', 206), ('fpeak_features_9', 206), ('bpeak_features_1', 206), ('bpeak_features_3', 206), ('bpeak_features_6', 2

we can see that noise around the original values with rates of 0.05, 0.1, 0.15, 0.2 cdosnt effect on the learning model to predict correctly