# Robust Test - Random values - Features Groups
for each group features in the data we will permute the values by setting randomly values in the original values range.

# Import

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *
import time

# Settings

In [2]:
# set constants
target_label = 'tuple'
learning_model = 'rf' # ['rf','et','lightgbm','xgboost']
num_features = ['min_packet_size', 'min_fpkt', 'min_bpkt']
file_name = "new_all_features_"
path = "../Datasets/" + target_label + "_dataset/"

In [3]:
# function for making model-prediction over the data set and measure the run time 
def timed_prediction(in_data,in_model):
    t = time.process_time()
    predicted = predict_model(in_model, data=in_data)
    elapsed_time = time.process_time() - t
    print("prediction took: " + str(elapsed_time))
    return predicted

In [4]:
# compare answers and labeled test
def compare_prediction_with_answers(in_predicted, in_answers):
    count=0
    index = in_predicted.index
    number_of_rows = len(index)
    errors_arr = []
    for i in range(0,number_of_rows):
        if str(int(in_answers.iloc[i])) != str(int(in_predicted.iloc[i]['Label'])):
            count=count+1
            cur_error = str(in_answers[i]) + "!=" + str(in_predicted.iloc[i]['Label'])
            errors_arr.append(cur_error)
#             print("error in line " + str(i) +
#                   " " + str(in_answers[i]) +
#                   "!=" + str(in_predicted.iloc[i]['Label']))
#     print("Errors: " + str(errors_arr))
    print("Number of error: " + str(count) + " from " +
          str(number_of_rows) + " test samples \nWhich is "
          + str(count/number_of_rows) + "% of error.")

In [6]:
def get_rand_column_data(in_data, in_column_indx):
    cur_col=in_data[in_column_indx]
    row_len = len(cur_col)
    max_value = cur_col.max()
    value = cur_col[0]
    min_value = cur_col.min()
    if 'int64' == in_data[in_column_indx].dtypes:
        cur_col = pd.DataFrame(np.random.randint(min_value, max_value, size=row_len, dtype=np.int64))
    if 'float64' == in_data[in_column_indx].dtypes:
        cur_col = pd.DataFrame((max_value - min_value) * np.random.random_sample(row_len) + min_value)
    in_data[in_column_indx] = cur_col
    return in_data

In [8]:
# set up features groups
SSL_features = ['fSSL_session_id_len', 'fSSL_num_extensions', 'fcipher_suites', 'ssl_v', ]
size_features = ['size_histogram_1','size_histogram_2','size_histogram_3',
                 'size_histogram_4','size_histogram_5','size_histogram_6',
                 'size_histogram_7','size_histogram_8','size_histogram_9', 'size_histogram_10',
                'fpackets', 'bpackets', 'fbytes', 'bbytes','min_packet_size',
                 'max_packet_size', 'mean_packet_size','sizevar']
peak_features = ['fpeak_features_1','fpeak_features_2','fpeak_features_3',
                 'fpeak_features_4','fpeak_features_5','fpeak_features_6',
                 'fpeak_features_7','fpeak_features_8','fpeak_features_9',
                 'bpeak_features_1','bpeak_features_2','bpeak_features_3',
                 'bpeak_features_4','bpeak_features_5','bpeak_features_6',
                 'bpeak_features_7','bpeak_features_8','bpeak_features_9']
TCP_features = ['SYN_tcp_scale', 'SYN_tcp_winsize']
common_features = ['packet_count','num_keep_alive', 'mean_fttl',
                   'max_fpkt','max_bpkt','std_fpkt','std_bpkt','mean_fpkt','mean_bpkt']
stat_features = ['min_packet_size', 'max_packet_size', 'mean_packet_size',
                 'sizevar', 'std_fiat','max_fiat','max_biat','std_biat','mean_fiat','mean_biat',
                'min_fpkt','min_bpkt','max_fpkt','max_bpkt','std_fpkt','std_bpkt','mean_fpkt','mean_bpkt']
time_features = ['std_fiat','max_fiat','max_biat','std_biat','mean_fiat','mean_biat']

# Read Data

In [9]:
data = pd.read_csv(path+file_name+target_label+'_train.csv',
                      sep='\t',
                      skiprows=[1])

# Setup Data and Build Model

In [10]:
setup(data=data,
      target=target_label,
      numeric_features=num_features,
      silent=True)
model=create_model(learning_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9668,0.0,0.722,0.9579,0.961,0.9607,0.9608
1,0.9668,0.0,0.7531,0.9671,0.9631,0.9607,0.9608
2,0.9668,0.0,0.7849,0.9598,0.9584,0.9606,0.9608
3,0.9703,0.0,0.7395,0.9647,0.9652,0.9648,0.9649
4,0.9685,0.0,0.7688,0.9668,0.9669,0.9627,0.9628
5,0.965,0.0,0.7187,0.959,0.9577,0.9585,0.9587
6,0.9615,0.0,0.7695,0.9534,0.9563,0.9544,0.9545
7,0.9615,0.0,0.7732,0.9647,0.9606,0.9546,0.9547
8,0.9737,0.0,0.8218,0.9756,0.9729,0.9689,0.969
9,0.9772,0.0,0.8216,0.9716,0.9733,0.973,0.9731


### Read Test Data

In [11]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Predicte and Check

In [12]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [13]:
predicted = predict_model(model, data=unseen_data)

In [14]:
compare_prediction_with_answers(predicted,answers)

Number of error: 98 from 3501 test samples 
Which is 0.02799200228506141% of error.


# SSL Features Group

### Read Test Data

In [15]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Change values

In [16]:
features_group = SSL_features
print ('current columns are : ' + str(features_group))
for i in SSL_features:
    unseen_data = get_rand_column_data(unseen_data,i)

current columns are : ['fSSL_session_id_len', 'fSSL_num_extensions', 'fcipher_suites', 'ssl_v']


### Predicte and Check

In [17]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [18]:
predicted = predict_model(model, data=unseen_data)

In [19]:
compare_prediction_with_answers(predicted,answers)

Number of error: 336 from 3501 test samples 
Which is 0.0959725792630677% of error.


# Size Features Group

### Read Test Data

In [20]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Change values

In [21]:
features_group = size_features
print ('current columns are : ' + str(features_group))
for i in features_group:
    unseen_data = get_rand_column_data(unseen_data,i)

current columns are : ['size_histogram_1', 'size_histogram_2', 'size_histogram_3', 'size_histogram_4', 'size_histogram_5', 'size_histogram_6', 'size_histogram_7', 'size_histogram_8', 'size_histogram_9', 'size_histogram_10', 'fpackets', 'bpackets', 'fbytes', 'bbytes', 'min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar']


### Predicte and Check

In [22]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [23]:
predicted = predict_model(model, data=unseen_data)

In [24]:
compare_prediction_with_answers(predicted,answers)

Number of error: 439 from 3501 test samples 
Which is 0.12539274493002% of error.


# COMMON Features Group

### Read Test Data

In [25]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Change values

In [26]:
features_group = common_features
print ('current columns are : ' + str(features_group))
for i in features_group:
    unseen_data = get_rand_column_data(unseen_data,i)

current columns are : ['packet_count', 'num_keep_alive', 'mean_fttl', 'max_fpkt', 'max_bpkt', 'std_fpkt', 'std_bpkt', 'mean_fpkt', 'mean_bpkt']


### Predicte and Check

In [27]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [28]:
predicted = predict_model(model, data=unseen_data)

In [29]:
compare_prediction_with_answers(predicted,answers)

Number of error: 326 from 3501 test samples 
Which is 0.09311625249928591% of error.


# TCP Features Group

### Read Test Data

In [30]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Change values

In [31]:
features_group = TCP_features
print ('current columns are : ' + str(features_group))
for i in features_group:
    unseen_data = get_rand_column_data(unseen_data,i)

current columns are : ['SYN_tcp_scale', 'SYN_tcp_winsize']


### Predicte and Check

In [32]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [33]:
predicted = predict_model(model, data=unseen_data)

In [34]:
compare_prediction_with_answers(predicted,answers)

Number of error: 274 from 3501 test samples 
Which is 0.07826335332762067% of error.


# STAT Features Group

### Read Test Data

In [35]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Change values

In [36]:
features_group = stat_features
print ('current columns are : ' + str(features_group))
for i in features_group:
    unseen_data = get_rand_column_data(unseen_data,i)

current columns are : ['min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar', 'std_fiat', 'max_fiat', 'max_biat', 'std_biat', 'mean_fiat', 'mean_biat', 'min_fpkt', 'min_bpkt', 'max_fpkt', 'max_bpkt', 'std_fpkt', 'std_bpkt', 'mean_fpkt', 'mean_bpkt']


### Predicte and Check

In [37]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [38]:
predicted = predict_model(model, data=unseen_data)

In [39]:
compare_prediction_with_answers(predicted,answers)

Number of error: 1356 from 3501 test samples 
Which is 0.3873179091688089% of error.


# Time Features Group

### Read Test Data

In [40]:
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

### Change values

In [41]:
features_group = time_features
print ('current columns are : ' + str(features_group))
for i in features_group:
    unseen_data = get_rand_column_data(unseen_data,i)

current columns are : ['std_fiat', 'max_fiat', 'max_biat', 'std_biat', 'mean_fiat', 'mean_biat']


### Predicte and Check

In [42]:
answers = unseen_data[target_label]
unseen_data = unseen_data.drop(columns=[target_label])

In [43]:
predicted = predict_model(model, data=unseen_data)

In [44]:
compare_prediction_with_answers(predicted,answers)

Number of error: 445 from 3501 test samples 
Which is 0.12710654098828905% of error.
