# Minimal Features
This notebook checks classification performance when using only the minimal, most important features in our dataset.

# Import

In [1]:
import pandas as pd
from pycaret.classification import *
import numpy as np

# Functions and Constants

In [2]:
# Target column to predict; its name also selects the dataset directory.
target_label = 'tuple'
path = f"../Datasets/{target_label}_dataset/"

# Columns handed to setup() as numeric_features.
num_features = ['min_packet_size', 'min_fpkt', 'min_bpkt']


# ---- Feature groups -------------------------------------------------------
# SSL handshake related columns.
SSL_features = [
    'fSSL_session_id_len',
    'fSSL_num_extensions',
    'fcipher_suites',
    'ssl_v',
]

# size_histogram_1 ... size_histogram_10
size_features = [f'size_histogram_{idx}' for idx in range(1, 11)]

# fpeak_features_1..9 followed by bpeak_features_1..9
peak_features = [f'{direction}peak_features_{idx}'
                 for direction in ('f', 'b')
                 for idx in range(1, 10)]

TCP_features = ['SYN_tcp_scale', 'SYN_tcp_winsize']

common_features = [
    'packet_count', 'fpackets', 'bpackets',
    'fbytes', 'bbytes',
    'num_keep_alive', 'mean_fttl',
]

# 'min_fiat' and 'min_biat' are not part of this group.
stat_features = [
    'min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar',
    'std_fiat', 'max_fiat', 'max_biat', 'std_biat', 'mean_fiat', 'mean_biat',
    'min_fpkt', 'min_bpkt', 'max_fpkt', 'max_bpkt',
    'std_fpkt', 'std_bpkt', 'mean_fpkt', 'mean_bpkt',
]

time_features = []

# ---- Direction based groups -----------------------------------------------
# Forward direction: the nine fpeak columns plus the forward statistics
# ('min_fiat' is not included).
forward_features = peak_features[:9] + [
    'std_fiat', 'fpackets', 'fbytes', 'max_fiat',
    'mean_fiat', 'min_fpkt', 'max_fpkt', 'std_fpkt', 'mean_fpkt',
    'fcipher_suites', 'ssl_v', 'mean_fttl',
]

# Backward direction: the nine bpeak columns plus the backward statistics
# ('min_biat' is not included).
backward_features = peak_features[9:] + [
    'bpackets', 'bbytes', 'max_biat', 'std_biat', 'mean_biat',
    'min_bpkt', 'max_bpkt', 'std_bpkt', 'mean_bpkt',
]

# Columns that belong to neither the forward nor the backward list above.
both_features = (
    ['fSSL_session_id_len', 'fSSL_num_extensions']
    + TCP_features
    + size_features
    + ['packet_count', 'min_packet_size', 'max_packet_size',
       'mean_packet_size', 'sizevar', 'num_keep_alive']
)

# Read Data

In [12]:
# Load the tab-separated feature table for the selected target.
# skiprows=[1] skips the file line that directly follows the header row.
data = pd.read_csv(
    f"{path}new_all_features_{target_label}.csv",
    sep='\t',
    skiprows=[1],
)
# Show (rows, columns) of the loaded table.
data.shape

(20632, 60)

# Setup Classifier and Compare

In [8]:
# Baseline experiment: random forest trained on every column of `data`.
# session_id fixes PyCaret's random state, so the split, the cross-validation
# folds and the forest itself are identical on every Restart & Run All.
# Without it the fold scores below change from run to run.
setup(data=data,
      target=target_label,
      numeric_features=num_features,  # force these columns to be treated as numeric
      session_id=42,
      silent=True)
# 'rf' = random forest; create_model prints the per-fold metrics table.
model=create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9737,0.0,0.8757,0.9738,0.9728,0.9692,0.9693
1,0.9619,0.0,0.8546,0.9631,0.9613,0.9555,0.9555
2,0.9723,0.0,0.8698,0.974,0.9716,0.9676,0.9676
3,0.9778,0.0,0.892,0.9772,0.9768,0.9741,0.9741
4,0.9792,0.0,0.8804,0.9769,0.9779,0.9757,0.9757
5,0.9758,0.0,0.9139,0.9756,0.975,0.9716,0.9717
6,0.9737,0.0,0.9338,0.9739,0.9729,0.9692,0.9693
7,0.9626,0.0,0.8296,0.9607,0.961,0.9562,0.9563
8,0.9737,0.0,0.8543,0.9715,0.9721,0.9692,0.9692
9,0.9771,0.0,0.8718,0.9754,0.9756,0.9732,0.9733


# Create new data set

In [9]:
# Reduced dataset: keep only the SSL, common and statistical feature groups
# together with the target column.
features_group = SSL_features + common_features + stat_features
print ('current columns are : ' + str(features_group))
# Select all wanted columns in one step instead of filling an empty frame
# column by column; .copy() keeps new_data independent of `data`.
new_data = data[features_group + [target_label]].copy()

current columns are : ['fSSL_session_id_len', 'fSSL_num_extensions', 'fcipher_suites', 'ssl_v', 'packet_count', 'fpackets', 'bpackets', 'fbytes', 'bbytes', 'num_keep_alive', 'mean_fttl', 'min_packet_size', 'max_packet_size', 'mean_packet_size', 'sizevar', 'std_fiat', 'max_fiat', 'max_biat', 'std_biat', 'mean_fiat', 'mean_biat', 'min_fpkt', 'min_bpkt', 'max_fpkt', 'max_bpkt', 'std_fpkt', 'std_bpkt', 'mean_fpkt', 'mean_bpkt']


In [10]:
# Remove every backward-direction feature that is present in new_data.
# A single drop() call avoids re-copying the frame once per column, and
# errors='ignore' skips names in backward_features that are not columns here.
new_data = new_data.drop(columns=backward_features, errors='ignore')

In [11]:
# Display (rows, columns) of the reduced dataset; the column count includes the target.
new_data.shape

(20632, 21)

We are now using 20 features (plus the target column) instead of 59.

In [13]:
# Reduced experiment: random forest trained on the columns left in new_data.
# The full-feature run forces the num_features columns to numeric; do the same
# here for the ones that are still present so both runs are preprocessed alike.
# session_id fixes PyCaret's random state so the fold scores are reproducible.
setup(data=new_data,
      target=target_label,
      numeric_features=[col for col in num_features if col in new_data.columns],
      session_id=42,
      silent=True)
# 'rf' = random forest; create_model prints the per-fold metrics table.
model=create_model('rf')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9709,0.0,0.8489,0.9709,0.9698,0.966,0.966
1,0.9702,0.0,0.8689,0.9692,0.9686,0.9652,0.9652
2,0.9709,0.0,0.8316,0.9708,0.9695,0.9659,0.966
3,0.973,0.0,0.9184,0.9742,0.9725,0.9684,0.9684
4,0.9598,0.0,0.8827,0.9614,0.9602,0.953,0.953
5,0.9737,0.0,0.9079,0.9739,0.9729,0.9692,0.9692
6,0.9716,0.0,0.8706,0.9707,0.9707,0.9668,0.9668
7,0.9702,0.0,0.8753,0.9709,0.9695,0.9652,0.9652
8,0.9654,0.0,0.8865,0.9646,0.9645,0.9595,0.9595
9,0.9633,0.0,0.8347,0.9622,0.9619,0.957,0.9571


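We can see that using only the SSL, common and statistical features (with the backward-direction features removed) still gives good results, close to those obtained with the full feature set.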