# Robust Test - Distributed - Columns values
For each column in the data, we permute its values, i.e. we replace them with randomly reordered values that are distributed the same as the original values.

# Import

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *
import time

# Settings

In [2]:
# notebook-wide constants
# name of the target (label) column; also part of the dataset file names
target_label = 'tuple'
# estimator ID handed to create_model
learning_model = 'rf' # ['rf','et','lightgbm','xgboost']
# columns declared as numeric features in setup
num_features = [
    'min_packet_size',
    'min_fpkt',
    'min_bpkt',
]
# common prefix of the dataset file names
file_name = 'new_all_features_'
# directory that holds the dataset files of the chosen target
path = "../Datasets/{}_dataset/".format(target_label)

In [3]:
# run the model prediction over a data set and report how long it took
def timed_prediction(in_data,in_model):
    """Predict ``in_data`` with ``in_model`` and print the elapsed time.

    The time is measured with ``time.process_time`` and printed before the
    result of ``predict_model`` is returned unchanged.
    """
    started_at = time.process_time()
    result = predict_model(in_model, data=in_data)
    duration = time.process_time() - started_at
    print("prediction took: " + str(duration))
    return result

In [4]:
# compare the predicted labels with the true answers of the test set
def compare_prediction_with_answers(in_predicted, in_answers):
    """Count the rows whose predicted label differs from the true answer.

    Rows are matched purely by position, so the index labels of the two
    inputs do not have to agree. Both values are converted with ``int``
    before they are compared.

    Args:
        in_predicted: DataFrame holding the predictions in a 'Label' column.
        in_answers: Series of true labels with at least as many entries as
            ``in_predicted`` has rows.

    Returns:
        The number of mismatching rows.
    """
    number_of_rows = len(in_predicted.index)
    if number_of_rows == 0:
        return 0
    # Fetch the column once instead of materialising every row in the loop.
    predicted_labels = in_predicted['Label']
    count = 0
    for i in range(number_of_rows):
        # .iloc on both sides: label-based access (in_answers[i]) fails
        # when the answers do not carry a default 0..n-1 index.
        if int(in_answers.iloc[i]) != int(predicted_labels.iloc[i]):
            count += 1
    return count

In [7]:
def get_rand_dataframe(in_data):
    """Return the values of ``in_data`` in random order as a new DataFrame.

    The values are shuffled with ``np.random.permutation``; the result is
    built from the raw values only, so it does not carry the index of
    ``in_data``.
    """
    shuffled_values = np.random.permutation(in_data.values)
    shuffled_frame = pd.DataFrame(shuffled_values)
    return shuffled_frame

# Read Data

In [8]:
# Training set of the chosen target. The file name is built from the shared
# `file_name` prefix, exactly like the test-set path, so the two cannot drift.
# skiprows=[1] skips the file line with index 1, i.e. the line right after
# the header (presumably a non-data row - confirm against the CSV).
data = pd.read_csv(path+file_name+target_label+'_train.csv',
                      sep='\t',
                      skiprows=[1])

# Setup Data and Build Model

In [9]:
# initialise the pycaret experiment on the training data ...
setup(
    data=data,
    target=target_label,
    numeric_features=num_features,
    silent=True,
)
# ... and build the estimator selected by `learning_model`
model = create_model(learning_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9703,0.0,0.8551,0.9695,0.9684,0.9653,0.9653
1,0.9614,0.0,0.8026,0.9608,0.9598,0.9549,0.9549
2,0.9683,0.0,0.8566,0.9686,0.9659,0.963,0.9631
3,0.9674,0.0,0.8383,0.9649,0.9656,0.9618,0.9618
4,0.9683,0.0,0.8946,0.9709,0.9667,0.9629,0.963
5,0.9703,0.0,0.848,0.9676,0.9676,0.9653,0.9653
6,0.9644,0.0,0.7839,0.9611,0.9609,0.9583,0.9583
7,0.9624,0.0,0.8159,0.9592,0.9586,0.9559,0.9561
8,0.9693,0.0,0.8183,0.9683,0.9671,0.9641,0.9642
9,0.9614,0.0,0.8007,0.9618,0.9586,0.9548,0.9549


### Read Test Data

In [10]:
# test set of the chosen target, read with the same options as the training set
unseen_data = pd.read_csv(
    "".join([path, file_name, target_label, '_test.csv']),
    skiprows=[1],
    sep='\t',
)

### Predict and Check

In [11]:
# keep the true labels aside, then remove them from the features
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(labels=[target_label], axis=1)

In [12]:
# Predict the test features with the built model; the predictions are later
# read from the 'Label' column by compare_prediction_with_answers.
predicted = predict_model(model, data=unseen_data)

### Number of errors of the model on the current (unmodified) test set

In [13]:
# Number of test rows whose predicted label differs from the true label
# (the value is displayed as the cell output).
compare_prediction_with_answers(predicted,answers)

188

# Permute values and run test
For every column (feature) in the test dataset, we change its values while keeping their original distribution (by shuffling the values), and then test the classification performance of the already built model.

In [None]:
### Read Test Data once; every iteration below permutes a single column on a
### fresh copy, so all other features keep their original values.
test_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                        sep='\t',
                        skiprows=[1])
results = []
### for any column in the data set
### NOTE: data.columns also contains the target column; for that entry the
### true answers themselves are shuffled, so it is not a feature measurement.
for cur_col in data.columns:
    ### Start from the unmodified test data (the target column is still present)
    unseen_data = test_data.copy()
    ### Change column values; assign the raw values so that no index
    ### alignment takes place
    unseen_data[cur_col] = get_rand_dataframe(unseen_data[cur_col]).iloc[:, 0].values
    ### Predict and Check
    answers = unseen_data[target_label]
    unseen_data = unseen_data.drop(columns=[target_label])
    predicted = predict_model(model, data=unseen_data)
    results.append(compare_prediction_with_answers(predicted,answers))

### creating map of results and column names for printing
results_map = dict(zip(data.columns, results))

In [22]:
### shows the (column, number of errors) pairs, highest number of errors first
sorted(results_map.items(), key=lambda item: item[1], reverse=True)

[('tuple', 5296),
 ('fcipher_suites', 278),
 ('fSSL_num_extensions', 244),
 ('ssl_v', 228),
 ('SYN_tcp_scale', 218),
 ('max_fpkt', 217),
 ('mean_packet_size', 203),
 ('SYN_tcp_winsize', 202),
 ('size_histogram_4', 200),
 ('bpeak_features_8', 200),
 ('size_histogram_1', 199),
 ('bpeak_features_6', 199),
 ('min_packet_size', 199),
 ('max_packet_size', 199),
 ('fpeak_features_1', 198),
 ('fpeak_features_8', 198),
 ('bpeak_features_3', 198),
 ('fSSL_session_id_len', 197),
 ('sizevar', 197),
 ('max_biat', 197),
 ('mean_bpkt', 197),
 ('bpeak_features_9', 195),
 ('fpackets', 195),
 ('min_fpkt', 195),
 ('size_histogram_6', 194),
 ('size_histogram_9', 194),
 ('std_bpkt', 194),
 ('num_keep_alive', 194),
 ('size_histogram_3', 193),
 ('fpeak_features_4', 193),
 ('fpeak_features_5', 193),
 ('std_biat', 193),
 ('mean_biat', 193),
 ('fpeak_features_3', 192),
 ('bpeak_features_1', 192),
 ('min_bpkt', 192),
 ('mean_fpkt', 192),
 ('mean_fttl', 192),
 ('size_histogram_2', 191),
 ('std_fiat', 191),
 ('fby

The list above presents the number of prediction errors obtained while permuting the specified feature (all other features were left unchanged).