In [213]:
import matplotlib.pyplot as plt 
import pandas as pd
import seaborn as sns
import numpy as np

from matplotlib import pyplot
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics.pairwise import cosine_similarity
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.seasonal import seasonal_decompose

from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [252]:
window_size = 350.0
protocols = ['all', 'dns', 'ftp', 'ftp-data', 'http', 'pop3', 'smtp', 'ssh']
protocol = protocols[1]

In [253]:
df = pd.read_csv('processed_dataset/{0}/{1}/{1}_time_series.csv'.format(window_size, protocol))
df.drop(columns=['time_sum', 'Label_sum'], inplace=True)

In [254]:
df.head()

Unnamed: 0,sbytes_sum,dbytes_sum,sttl_sum,dttl_sum,sloss_sum,dloss_sum,Sload_sum,Dload_sum,Spkts_sum,Dpkts_sum,...,tc_flw_http_mthd_sum,is_ftp_login_sum,ct_srv_src_sum,ct_srv_dst_sum,ct_dst_ltm_sum,ct_src_ltm_sum,ct_src_dport_ltm_sum,ct_dst_sport_ltm_sum,ct_dst_src_ltm_sum,connections_sum
0,196706,241762,43648,40832,0,0,736774612,905632184,2816,2816,...,0,0,4406,4372,4252,5113,2002,1488,1844,1408
1,207316,252252,46642,42514,0,0,2368500639,949837464,2946,2932,...,0,0,3836,3799,4519,6047,2065,1549,2034,1473
2,274412,276076,51031,46817,4,2,894039819,1035900670,3470,3208,...,0,0,4385,4499,5210,6748,2221,1665,2189,1603
3,236524,283494,48537,44595,0,0,1113895962,1006504085,3268,3256,...,0,0,4189,4064,4302,4894,2152,1606,1935,1534
4,202622,248108,44429,41325,0,0,799474775,928763180,2852,2850,...,0,0,4210,4140,4036,4560,2055,1511,1819,1426


In [248]:
#Remove columns if it contains less than 2% unique values
percent = 2

for i, v in enumerate(df.nunique()):
    p = float(v) / df.shape[0] * 100
    if p < percent:
        print('{:.2f}'.format(p), df.columns[i])
        df.drop(columns=df.columns[i], inplace=True)

0.40 trans_depth_sum
0.40 synack_sum
0.40 ct_state_ttl_sum
0.40 ct_dst_sport_ltm_sum


In [249]:
max_similarity = 0.98

sim_cols = set()
for i in range(df.shape[1]):
    col = df.iloc[:, i]
    for j in range(i + 1, df.shape[1]):
        otherCol = df.iloc[:, j]
        cs = cosine_similarity(col.values.reshape(1, -1), otherCol.values.reshape(1, -1))
        if cs > max_similarity:
            #print('\t', cs, df.columns[i], '\t', df.columns[j])
            sim_cols.add(df.columns.values[j])
            
print('Remove (cosine) similar columns to another one:', len(sim_cols))
df.drop(columns=sim_cols, inplace=True)

Remove (cosine) similar columns to another one: 8


In [250]:
print(len(df.columns))
print(df.columns)

# predtym 37 stlpcov
# potom 18 stlpcov

8
Index(['sbytes_sum', 'dbytes_sum', 'sttl_sum', 'dloss_sum', 'stcpb_sum',
       'Sjit_sum', 'tcprtt_sum', 'ackdat_sum'],
      dtype='object')


In [251]:
#Augmented Dickey-Fuller (ADF) unit root test
failed_columns = []
for col in df.columns:
    dftest = adfuller(df[col], autolag='AIC')
    if (dftest[1] > 0.05):
        failed_columns.append(col)
        print(col, dftest[1])
    if (dftest[1] < 0.05):
        print(col)
        #print("Statistic Test : " , dftest[0])
        #print('p-value: ', dftest[1], '\nNum of lags: ', dftest[2])
        #for key,value in dftest[4].items():
        #    print(f" critical value {key} : {value}")
        #print("--------------------------------------------------")
    #else:
    #    print(col)
    #    print("Statistic Test : " , dftest[0])
    #    print('p-value: ', dftest[1], '\nNum of lags: ', dftest[2])
    #    for key,value in dftest[4].items():
    #        print(f" critical value {key} : {value}")
    #print("--------------------------------------------------")
    
df.drop(columns=failed_columns, inplace=True)

sbytes_sum 0.7271643308998651
dbytes_sum
sttl_sum 0.7337257457911974
dloss_sum 0.4236810177240871
stcpb_sum 0.38646887270523983
Sjit_sum 0.33563319194915936


  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2
  llf = -nobs2*np.log(2*np.pi) - nobs2*np.log(ssr / nobs) - nobs2


In [221]:
for col in df.columns:
    res = acorr_ljungbox(df[col], lags=[3, 6, 24], return_df=True)
    if all(p > 0.05 for p in res.lb_pvalue): # if true data are random
        print(col, '\t', res.lb_pvalue)

In [222]:
df.columns

Index(['sbytes_sum', 'dbytes_sum', 'dttl_sum', 'Dload_sum', 'smeansz_sum',
       'trans_depth_sum', 'res_bdy_len_sum', 'Sjit_sum',
       'tc_flw_http_mthd_sum', 'is_ftp_login_sum'],
      dtype='object')

In [223]:
#odseknutie hodnot ktore su prilis vysoke
percent = 0.05
df.clip(lower=df.quantile(0), upper=df.quantile(1-percent), axis=1, inplace=True)

In [224]:
#fig, ax = pyplot.subplots(figsize=(8, 6))
#sns.heatmap(df.corr(), ax=ax, annot=True, fmt=".2f", cmap="YlGnBu")

corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find and drop features with correlation greater than 0.95
df.drop([column for column in upper.columns if any(upper[column] > 0.95)], axis=1, inplace=True)

In [225]:
'''cols=df.columns
df_result = pd.DataFrame()
for col in cols:
    column = df[col]
    result = seasonal_decompose(column, period=7, model='additive')
    print(result)
    
    plt.rcParams["figure.figsize"] = (15, 12)
    result.plot()
    plt.show()'''

'cols=df.columns\ndf_result = pd.DataFrame()\nfor col in cols:\n    column = df[col]\n    result = seasonal_decompose(column, period=7, model=\'additive\')\n    print(result)\n    \n    plt.rcParams["figure.figsize"] = (15, 12)\n    result.plot()\n    plt.show()'

In [226]:
scaler = MinMaxScaler(feature_range=(0, 1))
data_trans = scaler.fit_transform(df)
#df.to_numpy()

In [227]:
train_size = int(len(data_trans) * 0.80)
test_size = len(data_trans) - train_size
train, test = data_trans[0:train_size], data_trans[train_size:len(data_trans)]

In [228]:
n_input = 3
train_data_gen = TimeseriesGenerator(train, 
                                     train,
                                     length=n_input, 
                                     sampling_rate=1,
                                     stride=1,
                                     batch_size=1
                                    )

test_data_gen = TimeseriesGenerator(test, 
                                    test,
                                    length=n_input, 
                                    sampling_rate=1,
                                    stride=1,
                                    batch_size=1
                                   )

In [229]:
print('Samples: %d' % len(train_data_gen))
for i in range(5):
    x, y = train_data_gen[i]
    print('%s => %s' % (x, y))
    print("---------------------------------------------------------------------------------------------------------------")


Samples: 197
[[[0.44891016 0.6797037  0.70657345 0.57277059 0.45808687 0.57013946
   0.32281835 1.         0.51821862]
  [0.64024733 0.69002511 1.         0.77600153 0.57503923 0.67350287
   0.67989855 0.88242363 0.65991903]
  [0.89956772 0.8339905  1.         0.82990853 0.60711919 0.83921247
   0.6916288  1.         0.53036437]]] => [[1.         0.98313605 0.89397535 0.63953884 0.56903637 0.70057424
  0.89734371 0.95492972 0.66396761]]
---------------------------------------------------------------------------------------------------------------
[[[0.64024733 0.69002511 1.         0.77600153 0.57503923 0.67350287
   0.67989855 0.88242363 0.65991903]
  [0.89956772 0.8339905  1.         0.82990853 0.60711919 0.83921247
   0.6916288  1.         0.53036437]
  [1.         0.98313605 0.89397535 0.63953884 0.56903637 0.70057424
   0.89734371 0.95492972 0.66396761]]] => [[0.51841571 0.81288086 0.76163439 0.71751305 0.46553091 0.73995078
  0.87863716 1.         0.4534413 ]]
-------------------