# Robust Test - Random in Range - columns
for each column in the data we will permute the values by setting randomly values in the original values range.

# Import

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *
import time
#from DB_scripts import rnd_bln_split_CSV as shf

# Settings

In [2]:
# set constants
target_label = 'tuple'
learning_model = 'rf' # ['rf','et','lightgbm','xgboost']
num_features = ['min_packet_size', 'min_fpkt', 'min_bpkt']
file_name = "new_all_features_"
path = "../Datasets/" + target_label + "_dataset/"

In [3]:
# function for making model-prediction over the data set and measure the run time 
def timed_prediction(in_data,in_model):
    t = time.process_time()
    predicted = predict_model(in_model, data=in_data)
    elapsed_time = time.process_time() - t
    print("prediction took: " + str(elapsed_time))
    return predicted

In [4]:
# compare answers and labeled test
def compare_prediction_with_answers(in_predicted, in_answers):
    count=0
    index = in_predicted.index
    number_of_rows = len(index)
    errors_arr = []
    for i in range(0,number_of_rows):
        if str(int(in_answers.iloc[i])) != str(int(in_predicted.iloc[i]['Label'])):
            count=count+1
            cur_error = str(in_answers[i]) + "!=" + str(in_predicted.iloc[i]['Label'])
            errors_arr.append(cur_error)
#             print("error in line " + str(i) +
#                   " " + str(in_answers[i]) +
#                   "!=" + str(in_predicted.iloc[i]['Label']))
#     print("Errors: " + str(errors_arr))
#     print("Number of error: " + str(count) + " from " +
#           str(number_of_rows) + " test samples \nWhich is "
#           + str(100*count/number_of_rows) + "% of error.")
    return count

In [5]:
# function for checkign the correction of the model-prediction over the data
def check_correction(in_predicted):
    in_answers=in_predicted['Label']
    return compare_prediction_with_answers(in_predicted, in_answers)

In [6]:
# activating balanced random data shuffling
# shf.split_CSV_randomly_balanced(target_label,file_name)

In [7]:
def get_rand_dataframe(in_data):
    np.random.seed(1)
    row_len = len(in_data)
    max_value = 0
    value = 0
    min_value = in_data.values[0]
    for value in in_data:
        if value > max_value: max_value = value
        if value < min_value: min_value = value
#     print ('max_value = ' + str(max_value))
#     print ('min_value = ' + str(min_value))
    if 'int64' == in_data.dtypes:
#         print('data type is ' + str(in_data.dtypes))
#         print('values are: ' + str(in_data.unique()))
        return pd.DataFrame(np.random.randint(min_value, max_value, size=row_len, dtype=np.int64))
    if 'float64' == in_data.dtypes:
#         print('data type is ' + str(in_data.dtypes))
#         print('values are: ' + str(in_data.unique()))
        return pd.DataFrame((max_value - min_value) * np.random.random_sample(row_len) + min_value)

# Read Data

In [8]:
data = pd.read_csv(path+file_name+target_label+'_train.csv',
                      sep='\t',
                      skiprows=[1])

# Setup Data and Build Model

In [9]:
setup(data=data,
      target=target_label,
      numeric_features=num_features,
      silent=True)
model=create_model(learning_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9683,0.0,0.8061,0.965,0.9651,0.9629,0.963
1,0.9683,0.0,0.8545,0.969,0.9655,0.9629,0.963
2,0.9614,0.0,0.8962,0.9642,0.9601,0.9548,0.9549
3,0.9723,0.0,0.8456,0.9712,0.9708,0.9676,0.9676
4,0.9693,0.0,0.85,0.9679,0.9673,0.9641,0.9641
5,0.9674,0.0,0.8684,0.9671,0.9657,0.9618,0.9618
6,0.9713,0.0,0.8387,0.968,0.9688,0.9664,0.9665
7,0.9763,0.0,0.8731,0.9768,0.976,0.9722,0.9723
8,0.9614,0.0,0.8717,0.9629,0.9605,0.9549,0.955
9,0.9653,0.0,0.8805,0.9653,0.9642,0.9594,0.9595


### Read Test Data

In [10]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Predicte and Check

In [11]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [12]:
predicted = predict_model(model, data=unseen_data)

### number of errors of the model for current test set

In [13]:
compare_prediction_with_answers(predicted,answers)

191

# Change column values and run test

In [14]:
results = []
### for any clolumn in the data set
for cur_col in data.columns:
    ### Read Test Data
    unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])
    ### Change column values
    unseen_data[cur_col] = get_rand_dataframe(unseen_data[cur_col])
    ### Predicte and Check
    answers = unseen_data[target_label]
    unseen_data = unseen_data.drop(columns=[target_label])
    predicted = predict_model(model, data=unseen_data)
    results.append(compare_prediction_with_answers(predicted,answers))
    
### creaing map of results and column names for printing
results_map = {}
count = 0
for cur_col in data.columns:
    results_map[cur_col] = results[count]
    count=count+1

In [15]:
### prints results map sorted by values
[(k, v) for k, v in sorted(results_map.items(), key=lambda x: x[1], reverse=True)]

[('tuple', 6188),
 ('fcipher_suites', 410),
 ('fSSL_num_extensions', 315),
 ('std_biat', 252),
 ('ssl_v', 234),
 ('SYN_tcp_winsize', 230),
 ('SYN_tcp_scale', 223),
 ('max_biat', 220),
 ('mean_bpkt', 220),
 ('size_histogram_10', 213),
 ('max_fiat', 211),
 ('mean_fiat', 211),
 ('std_fpkt', 209),
 ('std_bpkt', 209),
 ('bpeak_features_7', 208),
 ('max_fpkt', 208),
 ('bpeak_features_1', 206),
 ('mean_packet_size', 205),
 ('num_keep_alive', 205),
 ('size_histogram_4', 204),
 ('bpeak_features_9', 204),
 ('max_packet_size', 204),
 ('std_fiat', 204),
 ('bpeak_features_8', 203),
 ('size_histogram_2', 202),
 ('size_histogram_6', 202),
 ('fpeak_features_1', 201),
 ('fpeak_features_3', 201),
 ('fpeak_features_5', 201),
 ('bpeak_features_2', 201),
 ('bpeak_features_5', 201),
 ('fbytes', 201),
 ('mean_biat', 200),
 ('min_fpkt', 200),
 ('fpeak_features_8', 199),
 ('bpeak_features_3', 199),
 ('mean_fpkt', 199),
 ('min_packet_size', 198),
 ('bpeak_features_4', 197),
 ('bpackets', 197),
 ('size_histogram

obviously the tuple feature is the most sensative one, but it is the predicted feature so just ignore it.
we can see that the Cipher-suite, SSL-number-of-Extenstion and Standart-Backwards-Internal-Arrival-Time of packets features (each one represented with one sigle column in our dataset) are the most effective (maximum errors) features when noise added.