# Feature selection and row filtering

In [1]:
import ipaddress
import pandas as pd
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, GridSearchCV
%matplotlib inline


def convert_ip2int(ip_add):
    '''
    Convert a textual IP address to its equivalent integer value.

    Parameters:
        ip_add: IPv4 or IPv6 address as a string.

    Returns:
        The integer value of the address, or 0 when the input cannot be
        parsed (0 is the sentinel for an invalid address). A diagnostic
        message is printed in that case; no exception is propagated.
    '''
    try:
        # Check for ':' first: IPv6 addresses may embed a dotted quad
        # (e.g. '::ffff:192.168.1.1'), so a '.' alone does not imply IPv4.
        if ':' in ip_add:  # ipv6
            return int(ipaddress.IPv6Address(ip_add))
        if '.' in ip_add:  # ipv4
            return int(ipaddress.IPv4Address(ip_add))
    except (TypeError, ValueError) as e:
        # TypeError: ip_add does not support the membership test (e.g. None).
        # ValueError: malformed address (AddressValueError subclasses it).
        print('Invalid IP address: ', e)
        return 0  # 0 refers invalid ip address

    # Neither separator present: not recognisable as IPv4 or IPv6.
    print('Invalid IP version', ip_add)
    return 0  # 0 refers invalid ip address


def normalize_data(df_exp):
    '''
    Rescale every column of df_exp with MinMaxScaler.

    The frame is cast to float, its values are fitted and transformed by a
    fresh preprocessing.MinMaxScaler, and the result is wrapped in a new
    DataFrame carrying the same column labels as the input. The returned
    frame is built from the raw scaled array, so it gets a default row index
    rather than the input's index.
    '''
    as_float = df_exp.astype(float)
    scaler = preprocessing.MinMaxScaler()
    scaled_values = scaler.fit_transform(as_float.values)
    return pd.DataFrame(scaled_values, columns=as_float.columns)


# Identify constant and all-unique columns so they can be removed from the dataframe; they do not contribute to the machine learning model
def filter_columns(df):
    '''
    Return the names of columns that hold either a single distinct value or
    a distinct value in every row.

    The selected names are printed before being returned; df itself is not
    modified. Distinct values are counted with Series.nunique() using its
    defaults.
    '''
    row_count = len(df)
    # A column is flagged when its distinct-value count is 1 (constant)
    # or equals the number of rows (every row different).
    cols_delete = [
        name for name in df.columns
        if df[name].nunique() in (1, row_count)
    ]
    print('Columns to remove from the dataset:', cols_delete)
    return cols_delete

In [7]:
# Load the pre-encoded dataset into df_master. The path is relative, so it
# resolves against the notebook's working directory.
# NOTE(review): judging by the file name this is an IoT-23 subset — confirm.
df_master = pd.read_csv('../data/processed/iot_23_small_4_10000000_encoded.csv')
# Bare expression as the cell's last statement so the notebook displays the frame.
df_master

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,id_orig_h,id_orig_p,id_resp_h,id_resp_p,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,...,history_ShArr,history_ShDadAttt,history_ShR,history_ShwA,history_ShwAr,history_ShwR,history_Sr,history_^d,history_^f,history_^hR
0,3232235976,52724,2808329966,80,1.097859,149,119442,0,174,11698,...,0,0,0,0,0,0,0,0,0,0
1,3232235976,52726,2808329966,80,2.018225,149,119442,0,172,11570,...,0,0,0,0,0,0,0,0,0,0
2,3232235976,52728,2808329966,80,1.067555,152,83118,0,124,8120,...,0,0,0,0,0,0,0,0,0,0
3,3232235976,52730,2808329966,80,1.079897,149,88797,0,136,8578,...,0,0,0,0,0,0,0,0,0,0
4,3232235976,52732,2808329966,80,1.093814,149,117700,0,170,11602,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,3232235973,38370,696565946,37215,0.000000,0,0,0,1,40,...,0,0,0,0,0,0,0,0,0,0
9999996,3232235973,38370,3318964195,37215,0.000000,0,0,0,1,40,...,0,0,0,0,0,0,0,0,0,0
9999997,3232235973,38370,3320176062,37215,0.000000,0,0,0,1,40,...,0,0,0,0,0,0,0,0,0,0
9999998,3232235973,38370,3310127778,37215,0.000000,0,0,0,1,40,...,0,0,0,0,0,0,0,0,0,0
