In [1]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)


dataset_base_path = r'/Users/kripik123/Documents/dataset/processedx/'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import scikitplot as skplt
from sklearn.model_selection import train_test_split
import re
from scipy.stats import describe


from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline


In [2]:
types = {
    'dst_port': 'uint32',
    'protocol': 'uint8',
    'timestamp': 'object',
    'flow_duration': 'int64',
    'tot_fwd_pkts': 'uint32',
    'tot_bwd_pkts': 'uint32',
    'totlen_fwd_pkts': 'uint32',
    'totlen_bwd_pkts': 'uint32',
    'fwd_pkt_len_max': 'uint16',
    'fwd_pkt_len_min': 'uint16',
    'fwd_pkt_len_mean': 'float32',
    'fwd_pkt_len_std': 'float32',
    'bwd_pkt_len_max': 'uint16',
    'bwd_pkt_len_min': 'uint16',
    'bwd_pkt_len_mean': 'float32',
    'bwd_pkt_len_std': 'float32',
    'flow_byts_s': 'float64',
    'flow_pkts_s': 'float64',
    'flow_iat_mean': 'float32',
    'flow_iat_std': 'float32',
    'flow_iat_max': 'int64',
    'flow_iat_min': 'int64',
    'fwd_iat_tot': 'int64',
    'fwd_iat_mean': 'float32',
    'fwd_iat_std': 'float32',
    'fwd_iat_max': 'int64',
    'fwd_iat_min': 'int64',
    'bwd_iat_tot': 'uint32',
    'bwd_iat_mean': 'float32',
    'bwd_iat_std': 'float32',
    'bwd_iat_max': 'uint32',
    'bwd_iat_min': 'uint32',
    'fwd_psh_flags': 'uint8',
    'bwd_psh_flags': 'uint8',
    'fwd_urg_flags': 'uint8',
    'bwd_urg_flags': 'uint8',
    'fwd_header_len': 'uint32',
    'bwd_header_len': 'uint32',
    'fwd_pkts_s': 'float32',
    'bwd_pkts_s': 'float32',
    'pkt_len_min': 'uint16',
    'pkt_len_max': 'uint16',
    'pkt_len_mean': 'float32',
    'pkt_len_std': 'float32',
    'pkt_len_var': 'float32',
    'fin_flag_cnt': 'uint8',
    'syn_flag_cnt': 'uint8',
    'rst_flag_cnt': 'uint8',
    'psh_flag_cnt': 'uint8',
    'ack_flag_cnt': 'uint8',
    'urg_flag_cnt': 'uint8',
    'cwe_flag_count': 'uint8',
    'ece_flag_cnt': 'uint8',
    'down_up_ratio': 'uint16',
    'pkt_size_avg': 'float32',
    'fwd_seg_size_avg': 'float32',
    'bwd_seg_size_avg': 'float32',
    'fwd_byts_b_avg': 'uint8',
    'fwd_pkts_b_avg': 'uint8',
    'fwd_blk_rate_avg': 'uint8',
    'bwd_byts_b_avg': 'uint8',
    'bwd_pkts_b_avg': 'uint8',
    'bwd_blk_rate_avg': 'uint8',
    'subflow_fwd_pkts': 'uint32',
    'subflow_fwd_byts': 'uint32',
    'subflow_bwd_pkts': 'uint32',
    'subflow_bwd_byts': 'uint32',
    'init_fwd_win_byts': 'int32',
    'init_bwd_win_byts': 'int32',
    'fwd_act_data_pkts': 'uint32',
    'fwd_seg_size_min': 'uint8',
    'active_mean': 'float32',
    'active_std': 'float32',
    'active_max': 'uint32',
    'active_min': 'uint32',
    'idle_mean': 'float32',
    'idle_std': 'float32',
    'idle_max': 'uint64',
    'idle_min': 'uint64',
    'label': 'category'
}

def replace_infinity_with_mean(df):
    inf_columns = [c for c in df.columns if df[df[c] == np.inf][c].count() > 0]
    for col in inf_columns:
        df[col].replace([np.inf, -np.inf], np.nan, inplace=True)
        mean = df[col].mean()
        df[col].fillna(mean, inplace=True)
    return df


def replace_negative_values_with_mean(df):
    numeric_cols = df.select_dtypes(include=[np.number]).columns.values
    
    columns = [c for c in numeric_cols if df[df[c] < 0][c].count() > 0]
    for col in columns:
        mask = df[col] < 0
        df.loc[mask, col] = np.nan
        mean = df[col].mean()
        df[col].fillna(mean, inplace=True)
    return df


def load_dataset(files, dtypes, cols=None):
    df = pd.concat((pd.read_csv(f, dtype=dtypes, usecols=cols) for f in files))
    
    df = replace_infinity_with_mean(df)
    df = replace_negative_values_with_mean(df)
        
    ##df['label_cat'] = df.label.astype('category').cat.codes
    df['label_is_attack'] = (df.label != 'Benign').astype('int')
    return df

In [3]:
#join all csv
#split data jadi-2 X dan Y
csv_files = glob.glob(os.path.join(dataset_base_path, 'dos-*.csv'))

df = load_dataset(csv_files, types)
X = df.drop(columns=['label', 'label_is_attack','timestamp', 'dst_port'])
y = df[['label_is_attack']]


In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097149 entries, 0 to 1048574
Data columns (total 77 columns):
 #   Column             Dtype  
---  ------             -----  
 0   protocol           uint8  
 1   flow_duration      int64  
 2   tot_fwd_pkts       uint32 
 3   tot_bwd_pkts       uint32 
 4   totlen_fwd_pkts    uint32 
 5   totlen_bwd_pkts    uint32 
 6   fwd_pkt_len_max    uint16 
 7   fwd_pkt_len_min    uint16 
 8   fwd_pkt_len_mean   float32
 9   fwd_pkt_len_std    float32
 10  bwd_pkt_len_max    uint16 
 11  bwd_pkt_len_min    uint16 
 12  bwd_pkt_len_mean   float32
 13  bwd_pkt_len_std    float32
 14  flow_byts_s        float64
 15  flow_pkts_s        float64
 16  flow_iat_mean      float32
 17  flow_iat_std       float32
 18  flow_iat_max       int64  
 19  flow_iat_min       int64  
 20  fwd_iat_tot        int64  
 21  fwd_iat_mean       float32
 22  fwd_iat_std        float32
 23  fwd_iat_max        int64  
 24  fwd_iat_min        int64  
 25  bwd_iat_tot       

In [4]:
df["label_is_attack"].value_counts(normalize=True)

0    0.688005
1    0.311995
Name: label_is_attack, dtype: float64

In [5]:
df["label"].value_counts(normalize=True)

Benign                      0.688005
DoS attacks-Hulk            0.220257
DoS attacks-SlowHTTPTest    0.066705
DoS attacks-GoldenEye       0.019793
DoS attacks-Slowloris       0.005240
Name: label, dtype: float64

In [8]:
df.head(5)
## fitur "label", "time-stamp", "dst-port" dihapus saja karena bentuk tidak numerik

Unnamed: 0,dst_port,protocol,timestamp,flow_duration,tot_fwd_pkts,tot_bwd_pkts,totlen_fwd_pkts,totlen_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,...,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label,label_is_attack
0,0,0,16/02/2018 08:27:23,112640768,3,0,0,0,0,0,...,0.0,0.0,0,0,56300000.0,138.592926,56300000,56300000,Benign,0
1,0,0,16/02/2018 08:30:12,112641773,3,0,0,0,0,0,...,0.0,0.0,0,0,56300000.0,263.750824,56300000,56300000,Benign,0
2,35605,6,16/02/2018 08:26:55,20784143,23,44,2416,1344,240,64,...,2624734.0,0.0,2624734,2624734,9058214.0,0.0,9058214,9058214,Benign,0
3,0,0,16/02/2018 08:33:01,112640836,3,0,0,0,0,0,...,0.0,0.0,0,0,56300000.0,82.024384,56300000,56300000,Benign,0
4,23,6,16/02/2018 08:27:59,20,1,1,0,0,0,0,...,0.0,0.0,0,0,0.0,0.0,0,0,Benign,0


In [9]:
### normalisasi 
from sklearn import preprocessing

X_normalized = preprocessing.normalize(X, norm='l2')


In [10]:
X_normalized

array([[0.00000000e+00, 4.84799829e-01, 1.29118392e-08, ...,
        5.96496527e-07, 2.42312183e-01, 2.42312183e-01],
       [0.00000000e+00, 4.84803138e-01, 1.29118122e-08, ...,
        1.13516703e-06, 2.42311675e-01, 2.42311675e-01],
       [1.40968028e-07, 4.88316608e-01, 5.40377440e-07, ...,
        0.00000000e+00, 2.12819761e-01, 2.12819761e-01],
       ...,
       [4.08162327e-08, 3.65183467e-01, 3.40135273e-08, ...,
        0.00000000e+00, 3.63412471e-01, 3.63412471e-01],
       [4.00118407e-08, 3.64663675e-01, 3.33432006e-08, ...,
        0.00000000e+00, 3.63598219e-01, 3.63598219e-01],
       [2.40411300e-08, 4.68229700e-01, 7.21233900e-08, ...,
        7.34141045e-04, 2.33246398e-01, 2.32208166e-01]])

In [11]:
normalizer = preprocessing.Normalizer().fit(X)

In [12]:
aaa=normalizer.transform(X)  

In [14]:
newdata=pd.DataFrame(aaa, columns= X.columns)
newdata.head()

Unnamed: 0,protocol,flow_duration,tot_fwd_pkts,tot_bwd_pkts,totlen_fwd_pkts,totlen_bwd_pkts,fwd_pkt_len_max,fwd_pkt_len_min,fwd_pkt_len_mean,fwd_pkt_len_std,...,fwd_act_data_pkts,fwd_seg_size_min,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min
0,0.0,0.4848,1.291184e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.242312,5.964965e-07,0.242312,0.242312
1,0.0,0.484803,1.291181e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.242312,1.135167e-06,0.242312,0.242312
2,1.40968e-07,0.488317,5.403774e-07,1e-06,5.7e-05,3.2e-05,6e-06,2e-06,2e-06,1e-06,...,5.168828e-07,4.698934e-07,0.061667,0.0,0.061667,0.061667,0.21282,0.0,0.21282,0.21282
3,0.0,0.4848,1.291184e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.242312,3.530285e-07,0.242312,0.242312
4,4.415582e-05,0.000147,7.359304e-06,7e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0001471861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:

#Melakukan concat
newdata2= pd.concat([newdata,y], axis = 1)

#Eksport to CSV
##newdata2test.to_csv('normalisasi_chi2.csv')

In [15]:
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(aaa, y)
print(model.feature_importances_)

  This is separate from the ipykernel package so we can avoid doing imports until


[0.03180115 0.01779871 0.01634458 0.00634504 0.00227642 0.00253078
 0.0040306  0.00253998 0.00756627 0.00370576 0.00979475 0.00547126
 0.0039137  0.00674321 0.02215166 0.03392352 0.05435493 0.00840805
 0.01601105 0.05025154 0.0234698  0.05348939 0.01889416 0.01512744
 0.08512857 0.00977254 0.00755148 0.00792729 0.00575236 0.00348013
 0.00045774 0.         0.         0.         0.07101921 0.00736143
 0.03658921 0.04703655 0.00395625 0.00498231 0.00547696 0.00411896
 0.00296804 0.00106984 0.00027867 0.0001453  0.01040827 0.05356146
 0.01153829 0.         0.00012979 0.00667238 0.00378177 0.00693691
 0.00733859 0.         0.         0.         0.         0.
 0.         0.01799803 0.00372053 0.00545193 0.00234545 0.01771264
 0.03618732 0.00328817 0.07098365 0.0006244  0.00017022 0.00053718
 0.00068473 0.0054399  0.00109153 0.00601738 0.00536289]


In [1]:
## 2. cara kedua tanpa proses normalisasi
num_feats = 30

X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm,y)
xx= chi_selector.transform(X_norm)
scores = xx.scores_

chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()


NameError: name 'MinMaxScaler' is not defined

In [6]:
scores

array([2.23046760e+04, 3.68126587e+04, 1.36536001e+02, 1.47841843e+02,
       3.44618426e+01, 8.23290272e+01, 3.17190069e+03, 4.45701016e+03,
       2.60138030e+03, 4.40561622e+03, 1.37297874e+04, 1.36083800e+04,
       2.64534105e+04, 3.29912718e+04, 1.39717100e+02, 2.70388623e+04,
       5.55534239e+03, 3.04355156e+03, 1.47368949e+04, 5.51988869e+03,
       3.66040342e+04, 4.81108063e+03, 8.35533107e+03, 1.43731686e+04,
       1.90426661e+03, 2.05700566e+04, 1.24573201e+02, 1.52050950e+03,
       2.50459288e+03, 2.00556403e+03, 1.51137906e+04,            nan,
                  nan,            nan, 5.40164633e+01, 1.61322835e+02,
       7.38923265e+03, 5.02288378e+04, 4.49845046e+03, 3.59189274e+03,
       1.37957052e+04, 7.00056416e+03, 2.27964391e+02, 5.33440515e+02,
       1.51137906e+04, 2.90008127e+04, 3.87744834e+03, 3.36050654e+04,
       1.17280845e+04,            nan, 2.89999057e+04, 2.11441437e+02,
       1.59432243e+04, 2.60138030e+03, 2.64534105e+04,            nan,
      

In [7]:
chi_feature

['protocol',
 'flow_duration',
 'bwd_pkt_len_max',
 'bwd_pkt_len_min',
 'bwd_pkt_len_mean',
 'bwd_pkt_len_std',
 'flow_pkts_s',
 'flow_iat_mean',
 'flow_iat_max',
 'flow_iat_min',
 'fwd_iat_tot',
 'fwd_iat_std',
 'fwd_iat_max',
 'bwd_iat_tot',
 'fwd_psh_flags',
 'fwd_pkts_s',
 'bwd_pkts_s',
 'pkt_len_mean',
 'pkt_len_std',
 'syn_flag_cnt',
 'rst_flag_cnt',
 'ack_flag_cnt',
 'urg_flag_cnt',
 'ece_flag_cnt',
 'pkt_size_avg',
 'bwd_seg_size_avg',
 'init_fwd_win_byts',
 'init_bwd_win_byts',
 'fwd_seg_size_min',
 'idle_min']

In [117]:
num_feats = 30

##X_norm = MinMaxScaler().fit_transform(X)
chi_selector = SelectKBest(chi2, k=num_feats)
##c2= chi_selector.fit(X,y)
chi_selector.fit_transform(X,y)

print (chi_selector.scores_)
##chi_support = c2.get_support()
##chi_feature = X.loc[:,chi_support].columns.tolist()


[3.79179492e+05 4.41751905e+12 9.36976067e+05 2.83575439e+06
 3.01103940e+08 2.29741077e+09 2.04397281e+08 6.50723483e+06
 4.29990330e+07 8.10703082e+07 1.19284393e+08 1.60034549e+07
 5.17419086e+07 4.38573007e+07 1.81422655e+11 1.08155423e+11
 6.66593978e+11 2.56603902e+11 1.76830247e+12 6.62339857e+11
 4.39248041e+12 5.77288878e+11 7.05100975e+11 1.72465841e+12
 2.28495852e+11 2.46840293e+12 1.49379562e+10 1.28486872e+11
 3.00333449e+11 2.40493362e+11 1.51137906e+04            nan
            nan            nan 1.10690537e+07 6.18886018e+07
 2.95569306e+10 1.00457676e+11 6.56773767e+06 2.31461568e+08
 4.58982238e+07 7.40646357e+07 2.55165923e+10 5.33440515e+02
 1.51137906e+04 2.90008127e+04 3.87744834e+03 3.36050654e+04
 1.17280845e+04            nan 2.89999057e+04 2.66416210e+04
 5.30638148e+07 4.29990330e+07 5.17419086e+07            nan
            nan            nan            nan            nan
            nan 9.36976067e+05 3.01103940e+08 2.83575439e+06
 2.29741077e+09 8.276189

In [110]:
from scipy import stats
chi2_stat, p_val, dof, ex = stats.chi2_contingency(scores)
print("===Chi2 Stat===")
print(chi2_stat)
print("\n")
print("===Degrees of Freedom===")
print(dof)
print("\n")
print("===P-Value===")
print(p_val)
print("\n")
print("===Contingency Table===")
print(ex)

===Chi2 Stat===
0.0


===Degrees of Freedom===
0


===P-Value===
1.0


===Contingency Table===
[2.23046760e+04 3.68126587e+04 1.36536001e+02 1.47841843e+02
 3.44618426e+01 8.23290272e+01 3.17190069e+03 4.45701016e+03
 2.60138030e+03 4.40561622e+03 1.37297874e+04 1.36083800e+04
 2.64534105e+04 3.29912718e+04 1.39717100e+02 2.70388623e+04
 5.55534239e+03 3.04355156e+03 1.47368949e+04 5.51988869e+03
 3.66040342e+04 4.81108063e+03 8.35533107e+03 1.43731686e+04
 1.90426661e+03 2.05700566e+04 1.24573201e+02 1.52050950e+03
 2.50459288e+03 2.00556403e+03 1.51137906e+04            nan
            nan            nan 5.40164633e+01 1.61322835e+02
 7.38923265e+03 5.02288378e+04 4.49845046e+03 3.59189274e+03
 1.37957052e+04 7.00056416e+03 2.27964391e+02 5.33440515e+02
 1.51137906e+04 2.90008127e+04 3.87744834e+03 3.36050654e+04
 1.17280845e+04            nan 2.89999057e+04 2.11441437e+02
 1.59432243e+04 2.60138030e+03 2.64534105e+04            nan
            nan            nan            nan      

  if np.any(observed < 0):
