# Robust Test - Distributed values - Features Groups
For each feature group (a set of related columns in the data), we randomly permute the values of every column in the group. Because the shuffled values come from the original column, they follow the same distribution as the original values.

# Import

In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *
import time

# Settings

In [2]:
# Experiment configuration shared by the cells below.
target_label = "tuple"  # column the classifier is trained to predict
learning_model = "rf"  # estimator id given to create_model; options noted by the author: 'rf', 'et', 'lightgbm', 'xgboost'
num_features = [
    "min_packet_size",
    "min_fpkt",
    "min_bpkt",
]  # columns declared as numeric when calling setup()
file_name = "new_all_features_"  # common prefix of the train/test CSV file names
path = f"../Datasets/{target_label}_dataset/"  # directory that holds the CSV files

In [3]:
# Run a model over a data set and report how long the prediction took.
def timed_prediction(in_data,in_model):
    """Predict with ``in_model`` on ``in_data`` and print the elapsed process time.

    Args:
        in_data: data set handed to ``predict_model`` as ``data``.
        in_model: model handed to ``predict_model`` as its first argument.

    Returns:
        Whatever ``predict_model`` returns for the given model and data.
    """
    started_at = time.process_time()
    result = predict_model(in_model, data=in_data)
    # The clock is read again only after the prediction has finished.
    print(f"prediction took: {time.process_time() - started_at}")
    return result

In [4]:
# compare answers and labeled test
def compare_prediction_with_answers(in_predicted, in_answers):
    """Print how many predicted labels disagree with the true answers.

    Rows are matched by position (not by index label), and both values are
    converted with ``int`` before being compared.

    Args:
        in_predicted: DataFrame with a 'Label' column holding the predictions.
        in_answers: pandas Series with the true labels, one per row of
            ``in_predicted`` and in the same order.

    Returns:
        None. The error count and the error percentage are printed.
    """
    number_of_rows = len(in_predicted.index)
    if number_of_rows == 0:
        # Nothing to compare; bail out before the percentage divides by zero.
        print("Number of error: 0 from 0 test samples")
        return

    predicted_labels = in_predicted['Label']
    count = 0
    for i in range(number_of_rows):
        # Positional access on both sides: label-based lookups (answers[i])
        # fail or read the wrong row when the index is not 0..n-1.
        if int(in_answers.iloc[i]) != int(predicted_labels.iloc[i]):
            count += 1

    print("Number of error: " + str(count) + " from " +
          str(number_of_rows) + " test samples \nWhich is "
          + str(100*count/number_of_rows) + "% of error.")

In [7]:
# Feature groups: each list names the columns that get permuted together.
SSL_features = [
    'fSSL_session_id_len',
    'fSSL_num_extensions',
    'fcipher_suites',
    'ssl_v',
]
# size_histogram_1 .. size_histogram_10, followed by the packet/byte size columns.
size_features = [f'size_histogram_{n}' for n in range(1, 11)] + [
    'fpackets', 'bpackets', 'fbytes', 'bbytes',
    'min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar',
]
# fpeak_features_1..9 first, then bpeak_features_1..9.
peak_features = [
    f'{direction}peak_features_{n}'
    for direction in ('f', 'b')
    for n in range(1, 10)
]
TCP_features = ['SYN_tcp_scale', 'SYN_tcp_winsize']
common_features = [
    'packet_count', 'num_keep_alive', 'mean_fttl',
    'max_fpkt', 'max_bpkt',
    'std_fpkt', 'std_bpkt',
    'mean_fpkt', 'mean_bpkt',
]
# Note: this group shares columns with the size, common and time groups.
stat_features = [
    'min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar',
    'std_fiat', 'max_fiat', 'max_biat', 'std_biat', 'mean_fiat', 'mean_biat',
    'min_fpkt', 'min_bpkt', 'max_fpkt', 'max_bpkt',
    'std_fpkt', 'std_bpkt', 'mean_fpkt', 'mean_bpkt',
]
time_features = [
    'std_fiat', 'max_fiat', 'max_biat',
    'std_biat', 'mean_fiat', 'mean_biat',
]

# Read Data

In [8]:
# Load the tab-separated training set.
data = pd.read_csv(
    f"{path}{file_name}{target_label}_train.csv",
    sep="\t",
    skiprows=[1],  # drop the file line at index 1, i.e. the one right after the header
)

# Setup Data and Build Model

In [9]:
# Initialise the PyCaret experiment on the training data, then fit the
# estimator selected by `learning_model`.
# session_id pins PyCaret's random seed so the split, the folds and the
# trained model are the same on every Restart & Run All.
setup(data=data,
      target=target_label,
      numeric_features=num_features,
      session_id=42,
      silent=True)
model = create_model(learning_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9755,0.0,0.876,0.9753,0.9734,0.971,0.9711
1,0.9633,0.0,0.6446,0.955,0.9567,0.9565,0.9566
2,0.9668,0.0,0.7314,0.9667,0.9651,0.9607,0.9607
3,0.9703,0.0,0.8318,0.9688,0.9673,0.9648,0.9649
4,0.9563,0.0,0.7536,0.9537,0.9507,0.9482,0.9484
5,0.9703,0.0,0.8247,0.9603,0.9634,0.9648,0.965
6,0.9598,0.0,0.7481,0.9553,0.956,0.9524,0.9525
7,0.9685,0.0,0.7728,0.966,0.9659,0.9627,0.9628
8,0.9772,0.0,0.7713,0.9659,0.9708,0.973,0.9731
9,0.972,0.0,0.8083,0.9694,0.9685,0.9668,0.9669


### Read Test Data

In [10]:
# Load the tab-separated test set (baseline run, nothing is modified).
unseen_data = pd.read_csv(
    f"{path}{file_name}{target_label}_test.csv",
    sep="\t",
    skiprows=[1],  # drop the file line at index 1, i.e. the one right after the header
)

### Predict and Check

In [11]:
# Keep the true labels aside and leave only the feature columns for prediction.
answers = unseen_data.loc[:, target_label]
unseen_data = unseen_data.drop(target_label, axis="columns")

In [12]:
# Baseline: predict on the unmodified test features with the trained model.
predicted = predict_model(model, data=unseen_data)

In [13]:
# Print the baseline error count and error percentage.
compare_prediction_with_answers(in_predicted=predicted, in_answers=answers)

Number of error: 94 from 3501 test samples 
Which is 0.0268494715795487% of error.


# SSL Features Group

### Read Test Data

In [14]:
# Reload a fresh copy of the test set before perturbing the SSL columns.
unseen_data = pd.read_csv(
    f"{path}{file_name}{target_label}_test.csv",
    sep="\t",
    skiprows=[1],  # drop the file line at index 1, i.e. the one right after the header
)

### Change values

In [15]:
features_group = SSL_features
print ('current columns are : ' + str(features_group))
# Shuffle every column of the group independently: each column keeps its own
# value distribution but no longer lines up with the rest of its row.
# A seeded generator keeps the perturbation reproducible between runs.
rng = np.random.default_rng(42)
# Iterate over features_group (not SSL_features directly) so the loop always
# matches the group printed above, like the other feature-group cells.
for column in features_group:
    unseen_data[column] = rng.permutation(unseen_data[column].values)

current columns are : ['fSSL_session_id_len', 'fSSL_num_extensions', 'fcipher_suites', 'ssl_v']


### Predict and Check

In [16]:
# Keep the true labels aside and leave only the feature columns for prediction.
answers = unseen_data.loc[:, target_label]
unseen_data = unseen_data.drop(target_label, axis="columns")

In [17]:
# Predict on the test features after the SSL-group permutation.
predicted = predict_model(model, data=unseen_data)

In [18]:
# Print the error count and error percentage for the SSL-group run.
compare_prediction_with_answers(in_predicted=predicted, in_answers=answers)

Number of error: 322 from 3501 test samples 
Which is 0.09197372179377321% of error.


# Size Features Group

### Read Test Data

In [19]:
# Reload a fresh copy of the test set before perturbing the size columns.
unseen_data = pd.read_csv(
    f"{path}{file_name}{target_label}_test.csv",
    sep="\t",
    skiprows=[1],  # drop the file line at index 1, i.e. the one right after the header
)

### Change values

In [20]:
features_group = size_features
print ('current columns are : ' + str(features_group))
# Shuffle every column of the group independently: each column keeps its own
# value distribution but no longer lines up with the rest of its row.
# A seeded generator keeps the perturbation reproducible between runs.
rng = np.random.default_rng(42)
for column in features_group:
    unseen_data[column] = rng.permutation(unseen_data[column].values)

current columns are : ['size_histogram_1', 'size_histogram_2', 'size_histogram_3', 'size_histogram_4', 'size_histogram_5', 'size_histogram_6', 'size_histogram_7', 'size_histogram_8', 'size_histogram_9', 'size_histogram_10', 'fpackets', 'bpackets', 'fbytes', 'bbytes', 'min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar']


### Predict and Check

In [21]:
# Keep the true labels aside and leave only the feature columns for prediction.
answers = unseen_data.loc[:, target_label]
unseen_data = unseen_data.drop(target_label, axis="columns")

In [22]:
# Predict on the test features after the size-group permutation.
predicted = predict_model(model, data=unseen_data)

In [23]:
# Print the error count and error percentage for the size-group run.
compare_prediction_with_answers(in_predicted=predicted, in_answers=answers)

Number of error: 254 from 3501 test samples 
Which is 0.07255069980005713% of error.


# COMMON Features Group

### Read Test Data

In [24]:
# Reload a fresh copy of the test set before perturbing the common columns.
unseen_data = pd.read_csv(
    f"{path}{file_name}{target_label}_test.csv",
    sep="\t",
    skiprows=[1],  # drop the file line at index 1, i.e. the one right after the header
)

### Change values

In [25]:
features_group = common_features
print ('current columns are : ' + str(features_group))
# Shuffle every column of the group independently: each column keeps its own
# value distribution but no longer lines up with the rest of its row.
# A seeded generator keeps the perturbation reproducible between runs.
rng = np.random.default_rng(42)
for column in features_group:
    unseen_data[column] = rng.permutation(unseen_data[column].values)

current columns are : ['packet_count', 'num_keep_alive', 'mean_fttl', 'max_fpkt', 'max_bpkt', 'std_fpkt', 'std_bpkt', 'mean_fpkt', 'mean_bpkt']


### Predict and Check

In [26]:
# Keep the true labels aside and leave only the feature columns for prediction.
answers = unseen_data.loc[:, target_label]
unseen_data = unseen_data.drop(target_label, axis="columns")

In [27]:
# Predict on the test features after the common-group permutation.
predicted = predict_model(model, data=unseen_data)

In [28]:
# Print the error count and error percentage for the common-group run.
compare_prediction_with_answers(in_predicted=predicted, in_answers=answers)

Number of error: 190 from 3501 test samples 
Which is 0.054270208511853754% of error.


# TCP Features Group

### Read Test Data

In [29]:
# Reload a fresh copy of the test set before perturbing the TCP columns.
unseen_data = pd.read_csv(
    f"{path}{file_name}{target_label}_test.csv",
    sep="\t",
    skiprows=[1],  # drop the file line at index 1, i.e. the one right after the header
)

### Change values

In [30]:
features_group = TCP_features
print ('current columns are : ' + str(features_group))
# Shuffle every column of the group independently: each column keeps its own
# value distribution but no longer lines up with the rest of its row.
# A seeded generator keeps the perturbation reproducible between runs.
rng = np.random.default_rng(42)
for column in features_group:
    unseen_data[column] = rng.permutation(unseen_data[column].values)

current columns are : ['SYN_tcp_scale', 'SYN_tcp_winsize']


### Predict and Check

In [31]:
# Keep the true labels aside and leave only the feature columns for prediction.
answers = unseen_data.loc[:, target_label]
unseen_data = unseen_data.drop(target_label, axis="columns")

In [32]:
# Predict on the test features after the TCP-group permutation.
predicted = predict_model(model, data=unseen_data)

In [33]:
# Print the error count and error percentage for the TCP-group run.
compare_prediction_with_answers(in_predicted=predicted, in_answers=answers)

Number of error: 292 from 3501 test samples 
Which is 0.08340474150242788% of error.


# STAT Features Group

### Read Test Data

In [34]:
# Reload a fresh copy of the test set before perturbing the statistical columns.
unseen_data = pd.read_csv(
    f"{path}{file_name}{target_label}_test.csv",
    sep="\t",
    skiprows=[1],  # drop the file line at index 1, i.e. the one right after the header
)

### Change values

In [35]:
features_group = stat_features
print ('current columns are : ' + str(features_group))
# Shuffle every column of the group independently: each column keeps its own
# value distribution but no longer lines up with the rest of its row.
# A seeded generator keeps the perturbation reproducible between runs.
rng = np.random.default_rng(42)
for column in features_group:
    unseen_data[column] = rng.permutation(unseen_data[column].values)

current columns are : ['min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar', 'std_fiat', 'max_fiat', 'max_biat', 'std_biat', 'mean_fiat', 'mean_biat', 'min_fpkt', 'min_bpkt', 'max_fpkt', 'max_bpkt', 'std_fpkt', 'std_bpkt', 'mean_fpkt', 'mean_bpkt']


### Predict and Check

In [36]:
# Keep the true labels aside and leave only the feature columns for prediction.
answers = unseen_data.loc[:, target_label]
unseen_data = unseen_data.drop(target_label, axis="columns")

In [37]:
# Predict on the test features after the statistical-group permutation.
predicted = predict_model(model, data=unseen_data)

In [38]:
# Print the error count and error percentage for the statistical-group run.
compare_prediction_with_answers(in_predicted=predicted, in_answers=answers)

Number of error: 505 from 3501 test samples 
Which is 0.14424450157097973% of error.


# Time Features Group

### Read Test Data

In [39]:
# Reload a fresh copy of the test set before perturbing the time columns.
unseen_data = pd.read_csv(
    f"{path}{file_name}{target_label}_test.csv",
    sep="\t",
    skiprows=[1],  # drop the file line at index 1, i.e. the one right after the header
)

### Change values

In [40]:
features_group = time_features
print ('current columns are : ' + str(features_group))
# Shuffle every column of the group independently: each column keeps its own
# value distribution but no longer lines up with the rest of its row.
# A seeded generator keeps the perturbation reproducible between runs.
rng = np.random.default_rng(42)
for column in features_group:
    unseen_data[column] = rng.permutation(unseen_data[column].values)

current columns are : ['std_fiat', 'max_fiat', 'max_biat', 'std_biat', 'mean_fiat', 'mean_biat']


### Predict and Check

In [41]:
# Keep the true labels aside and leave only the feature columns for prediction.
answers = unseen_data.loc[:, target_label]
unseen_data = unseen_data.drop(target_label, axis="columns")

In [42]:
# Predict on the test features after the time-group permutation.
predicted = predict_model(model, data=unseen_data)

In [43]:
# Print the error count and error percentage for the time-group run.
compare_prediction_with_answers(in_predicted=predicted, in_answers=answers)

Number of error: 148 from 3501 test samples 
Which is 0.042273636103970294% of error.
