# Low Variance Columns
this notebook will check the mean variance of each column in the data. furthermore we will remove all those low variance columns and compare classification scores of the new data (without the low variance columns) and the original data.

# Import

In [1]:
import pandas as pd
from pycaret.classification import *
import time

# Settings

In [2]:
# set constants
target_label = 'tuple'
learning_model = ['rf'] # ['rf','et','lightgbm','xgboost']
num_features = ['min_packet_size', 'min_fpkt', 'min_bpkt']
file_name = "all_features_"
path = "../Datasets/" + target_label + "_dataset/"
mean_variance_lim = 0.2
low_mean_variance_columns = []
mean_variance_sum = 0
mean_variance_min = float("inf")
mean_variance_max = 0

In [3]:
# function for making model-prediction over the data set and measure the run time 
def timed_prediction(in_data,in_model):
    t = time.process_time()
    predicted = predict_model(in_model, data=in_data)
    elapsed_time = time.process_time() - t
    print("prediction took: " + str(elapsed_time))
    return predicted

In [4]:
# compare answers and labeled test
def compare_prediction_with_answers(in_predicted, in_answers):
    count=0
    index = in_predicted.index
    number_of_rows = len(index)
    errors_arr = []
    for i in range(0,number_of_rows):
        if str(int(in_answers.iloc[i])) != str(int(in_predicted.iloc[i]['Label'])):
            count=count+1
            cur_error = str(in_answers[i]) + "!=" + str(in_predicted.iloc[i]['Label'])
            errors_arr.append(cur_error)
    print("Number of error: " + str(count) + " from " +
          str(number_of_rows) + " test samples \nWhich is "
          + str(100*count/number_of_rows) + "% of error.")
    return count

# Read Data Setup and Create Model

In [5]:
data = pd.read_csv(path+file_name+target_label+'_train.csv',
                      sep='\t',
                      skiprows=[1])

setup(data=data,
      target=target_label,
      numeric_features=num_features,
      silent=True)

model = create_model('et')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9822,0.0,0.8501,0.9789,0.9797,0.9792,0.9792
1,0.9763,0.0,0.8775,0.9761,0.9744,0.9722,0.9723
2,0.9763,0.0,0.9455,0.9785,0.9755,0.9722,0.9723
3,0.9693,0.0,0.8894,0.9688,0.968,0.9641,0.9642
4,0.9693,0.0,0.9014,0.9708,0.9681,0.9641,0.9642
5,0.9743,0.0,0.9025,0.976,0.9742,0.9699,0.9699
6,0.9743,0.0,0.8678,0.9727,0.9726,0.9699,0.9699
7,0.9753,0.0,0.908,0.9752,0.9747,0.9711,0.9711
8,0.9792,0.0,0.9277,0.9784,0.9785,0.9757,0.9757
9,0.9752,0.0,0.8803,0.9728,0.9735,0.971,0.971


# Make Unseen Test

In [8]:
# read unseen data
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

# saving the target column
answers = unseen_data[target_label]

# dropping traget column from test.
unseen_data = unseen_data.drop(columns=[target_label])

In [9]:
predicted = timed_prediction(unseen_data,model)

prediction took: 0.671875


In [10]:
compare_prediction_with_answers(predicted,answers)

Number of error: 169 from 6189 test samples 
Which is 2.730651155275489% of error.


169

# Calculation Mean Variance for each column of the data

In [12]:
total = len(data)
for i in data.columns:
    for value in data[i]:
        if value > mean_variance_max: mean_variance_max = value
        if value < mean_variance_min: mean_variance_min = value
        mean_variance = abs(value - data[i].mean()) / total
        mean_variance_sum = mean_variance_sum + mean_variance
    if mean_variance_lim > mean_variance_sum:
        print(str(i) + " got low mean variance of " +
              str(mean_variance_sum) + " while values are between "
              + str(mean_variance_min) + " and " + str(mean_variance_max))
        low_mean_variance_columns.append(i)
    mean_variance_sum = 0
    mean_variance_max = 0
    mean_variance_min = float("inf")
    
print("\n so all low mean variance columns are: " + str(low_mean_variance_columns))

fSSL_num_compression_methods got low mean variance of 0.0 while values are between 1.0 and 1.0
SYN_MSS got low mean variance of 0.0 while values are between 1460.0 and 1460.0
fSSLv_1 got low mean variance of 0.0 while values are between 0.0 and 0
fSSLv_3 got low mean variance of 0.0 while values are between 0.0 and 0
bpeak_features_7 got low mean variance of 0.08566631681398115 while values are between 0.0 and 120.84436011314392
min_fiat got low mean variance of 3.738576499888164e-09 while values are between 0.0 and 2.2e-05
min_biat got low mean variance of 6.158689389402098e-05 while values are between -1e-06 and 0.058487

 so all low mean variance columns are: ['fSSL_num_compression_methods', 'SYN_MSS', 'fSSLv_1', 'fSSLv_3', 'bpeak_features_7', 'min_fiat', 'min_biat']


# Read Data Setup and Create Model

droping the low variance column and make unseen data test

In [13]:
data = pd.read_csv(path+file_name+target_label+'_train.csv',
                      sep='\t',
                      skiprows=[1])

# droping low variance columns
data.drop(low_mean_variance_columns, axis=1)

setup(data=data,
      target=target_label,
      numeric_features=num_features,
      silent=True)

model = create_model('et')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9802,0.0,0.902,0.9794,0.9792,0.9769,0.9769
1,0.9782,0.0,0.8983,0.9769,0.9768,0.9745,0.9746
2,0.9723,0.0,0.8926,0.9707,0.9708,0.9676,0.9676
3,0.9743,0.0,0.8785,0.9738,0.9736,0.9699,0.9699
4,0.9654,0.0,0.8854,0.9676,0.9653,0.9595,0.9595
5,0.9743,0.0,0.8722,0.9768,0.9743,0.9699,0.97
6,0.9802,0.0,0.8706,0.9812,0.9797,0.9769,0.9769
7,0.9792,0.0,0.8528,0.9769,0.9775,0.9757,0.9757
8,0.9753,0.0,0.8403,0.9749,0.974,0.9711,0.9711
9,0.9792,0.0,0.9321,0.9809,0.9795,0.9757,0.9757


# Make Unseen Test

In [16]:
#read unseen data
unseen_data = pd.read_csv(path+file_name+target_label+'_test.csv',
                      sep='\t',
                      skiprows=[1])

# droping low variance columns
data.drop(low_mean_variance_columns, axis=1)

# saving the target column.
answers = unseen_data[target_label]

# dropping traget column from test.
unseen_data = unseen_data.drop(columns=[target_label])

In [17]:
predicted = timed_prediction(unseen_data,model)

prediction took: 0.609375


In [18]:
compare_prediction_with_answers(predicted,answers)

Number of error: 161 from 6189 test samples 
Which is 2.601389562126353% of error.


161

we can see improvment with our scores while removing 7 low mean variance columns