https://github.com/RadionBik/Statistical-estimator-of-network-traffic

This notebook is under active development. DO NOT expect anything meaningful here.

The idea is to use an RNN as a model for predicting network traffic time series (e.g. packet length and inter-arrival time, IAT).

Currently, a many-to-many architecture is being tested.

A many-to-one architecture is the next step.

## Load and normalize original traffic

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import sys
#sys.path.append("..")
from stat_estimator import *
from packet_transceiver import *
from helper_functions import *
from hmm_helpers import *

# Use a 10x5 default figure size for the plots in this notebook
# (assumes `plt` is matplotlib.pyplot, provided by one of the star imports).
plt.rcParams['figure.figsize'] = [10, 5]
# Packet capture that the traffic features are extracted from.
pcapfile = 'traffic_dumps/skypeLANhome.pcap'
# Only the first element of what getTrafficFeatures() returns is kept.
# NOTE(review): the meaning of the positional arguments ('all', 'flow',
# (1,99), 100) is defined in stat_estimator and is not visible here - confirm.
traffic_dfs = getTrafficFeatures(pcapfile,'all','flow',(1,99),100)[0]
# Keep the scalers as well: they are used later to map generated samples back
# to the original scale via inverse_transform().
norm_traffic, scalers = normalize_dfs(traffic_dfs, std_scaler=False)

## Prepare training and test data for RNN

2 approaches:

1. Windowed slicing of time-series with step=1 to allow for more testing data.
2. Batched slicing of data, where we simply split all the data into non-overlapping chunks.

In [None]:
def get_windowed_training_set(df, window_size, shift=1):
    """Slice a time series into overlapping windows taken with a stride of 1.

    Args:
        df: 2-D DataFrame, one row per time step and one column per feature.
        window_size: number of consecutive rows in every window.
        shift: number of rows the target series is shifted by, passed straight
            to ``DataFrame.shift``. Rows that have no source value after the
            shift are filled with 0.

    Returns:
        Tuple ``(X, y)`` of float arrays, both shaped
        ``(rows - window_size + 1, window_size, features)``. ``X[i]`` holds rows
        ``i .. i + window_size - 1`` of ``df``; ``y[i]`` holds the same rows of
        the shifted series.

    Raises:
        ValueError: if ``window_size`` exceeds the number of rows by more than
            one, which would require a negative number of windows.
    """
    # NOTE(review): with the default shift=1 the target at step t is the input
    # at step t-1; pass a negative shift if "next value" targets are intended.
    sample_number, feature_number = df.shape
    window_count = sample_number - window_size + 1
    if window_count < 0:
        raise ValueError(
            "window_size ({}) is larger than the number of samples ({})".format(
                window_size, sample_number))

    inputs = df.values
    # Shift once up front; redoing it for every window made the loop quadratic.
    targets = df.shift(shift).fillna(0).values

    X = np.zeros((window_count, window_size, feature_number))
    y = np.zeros((window_count, window_size, feature_number))
    for start in range(window_count):
        stop = start + window_size
        X[start, :, :] = inputs[start:stop, :]
        y[start, :, :] = targets[start:stop, :]
    return X, y


def get_batched_training_set(df, window_size, shift=1):
    """Split a time series into consecutive, non-overlapping windows.

    The series is padded at the end with all-zero rows so that its length is a
    multiple of ``window_size``, then reshaped into windows.

    Args:
        df: 2-D DataFrame, one row per time step and one column per feature.
        window_size: number of rows in every window.
        shift: number of rows the target series is shifted by, passed straight
            to ``DataFrame.shift``. Rows that have no source value after the
            shift are filled with 0.

    Returns:
        Tuple ``(X, y)`` of arrays shaped
        ``(batches - 1, window_size, features)``: the first window is dropped
        from both arrays.
    """
    sample_number, feature_number = df.shape
    batch_num = int(np.ceil(sample_number / window_size))
    pad_rows = batch_num * window_size - sample_number

    # Pad with NumPy rather than DataFrame.append, which pandas 2.x removed.
    padded_values = np.vstack([df.values, np.zeros((pad_rows, feature_number))])
    padded = pd.DataFrame(padded_values, columns=df.columns)

    # Take the feature count from the data instead of hard-coding 2 columns.
    batched_shape = (batch_num, window_size, feature_number)
    X = np.reshape(padded.values, batched_shape)
    y = np.reshape(padded.shift(shift).fillna(0).values, batched_shape)

    # The first window is skipped: with a positive shift it is the one whose
    # targets start with zero-filled rows.
    # NOTE(review): the zero padding sits in the *last* window, which is still
    # returned - confirm this is the intended "non-zero batches" filter.
    return X[1:, :, :], y[1:, :, :]

# Pick the flow to train on: the loop exists only to leave `df` bound to the
# last DataFrame yielded by iterate_dfs(). The targets are built further down
# by get_windowed_training_set(), so nothing is computed per flow here.
for df in iterate_dfs(norm_traffic):
    #df.plot(subplots=True, layout = (2,1))
    pass

#prepare data

# Number of consecutive time steps fed to the network per training sample.
window_size = 100

feature_number = df.shape[1]
sample_number = df.shape[0]


X, y = get_windowed_training_set(df, window_size)
#X, y = get_batched_training_set(df, window_size)
print(X.shape)

In [None]:
#https://machinelearningmastery.com/text-generation-lstm-recurrent-neural-networks-python-keras/
    
#https://chunml.github.io/ChunML.github.io/project/Creating-Text-Generator-Using-Recurrent-Neural-Network/ 

#https://stackoverflow.com/questions/38714959/understanding-keras-lstms?rq=1

#http://colah.github.io/posts/2015-08-Understanding-LSTMs/
#help(pd.DataFrame.append)

In [None]:
import tensorflow as tf

from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, GRU

def get_random_model_parameters(input_size):
    """Draw a random network configuration.

    Args:
        input_size: width of the first layer; the other widths are drawn from
            half, equal to, or double this value.

    Returns:
        Dict with keys ``'numb_of_layers'`` (drawn from 1..4),
        ``'layer_nodes'`` (``input_size`` followed by three drawn widths) and
        ``'dropouts'`` (four rates drawn from 0.05, 0.1, 0.2, 0.3).
    """
    layer_count = np.random.choice([1, 2, 3, 4])

    width_options = [0.5*input_size, input_size, 2*input_size]
    drawn_widths = []
    for _ in range(3):
        drawn_widths.append(int(np.random.choice(width_options)))

    rate_options = [0.05, 0.1, 0.2, 0.3]
    drawn_rates = []
    for _ in range(4):
        drawn_rates.append(np.random.choice(rate_options))

    return {
        'numb_of_layers': layer_count,
        'layer_nodes': [input_size] + drawn_widths,
        'dropouts': drawn_rates,
    }

def set_model_parameters(numb_of_layers, layer_nodes, dropouts):
    """Pack an explicit network configuration into a parameter dict.

    The result has the same keys as the dict produced by
    ``get_random_model_parameters``; the arguments are stored as given,
    without copying.
    """
    parameters = {}
    parameters['numb_of_layers'] = numb_of_layers
    parameters['layer_nodes'] = layer_nodes
    parameters['dropouts'] = dropouts
    return parameters

def _build_recurrent_model(layer_cls, model_parameters, input_size, feature_number,
                           output_activation, loss, optimizer):
    """Build and compile a stacked many-to-many model from `layer_cls` layers."""
    layer_nodes = model_parameters['layer_nodes']
    dropouts = model_parameters['dropouts']

    model = tf.keras.Sequential()

    # The first recurrent layer is always present and fixes the input shape.
    model.add(layer_cls(units=layer_nodes[0],
                        input_shape=(input_size, feature_number),
                        return_sequences=True,
                        dropout=dropouts[0]))

    # Every layer returns full sequences so the next one (and the final
    # per-timestep Dense) receives one vector per time step.
    for layer in range(1, model_parameters['numb_of_layers']):
        model.add(layer_cls(units=layer_nodes[layer],
                            return_sequences=True,
                            dropout=dropouts[layer]))

    model.add(TimeDistributed(Dense(feature_number, activation=output_activation)))
    model.compile(loss=loss, optimizer=optimizer)
    model.summary()

    return model


def build_gru_model(model_parameters, input_size, feature_number,
                    output_activation='softmax', loss='categorical_crossentropy',
                    optimizer='rmsprop'):
    """Build, compile and print a summary of a stacked GRU sequence model.

    Args:
        model_parameters: dict with keys ``'numb_of_layers'``,
            ``'layer_nodes'`` and ``'dropouts'``; entry ``i`` of the two lists
            configures recurrent layer ``i``.
        input_size: number of time steps in every input window.
        feature_number: number of features per time step, used both for the
            input shape and for the width of the output layer.
        output_activation: activation of the per-timestep output Dense layer.
        loss: loss passed to ``model.compile``.
        optimizer: optimizer passed to ``model.compile``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # NOTE(review): the defaults (softmax + categorical cross-entropy) are a
    # classification setup; confirm they are intended for the continuous
    # traffic features, or pass e.g. a regression loss explicitly.
    return _build_recurrent_model(GRU, model_parameters, input_size, feature_number,
                                  output_activation, loss, optimizer)


def build_lstm_model(model_parameters, input_size, feature_number,
                     output_activation='softmax', loss='categorical_crossentropy',
                     optimizer='rmsprop'):
    """Build, compile and print a summary of a stacked LSTM sequence model.

    Args:
        model_parameters: dict with keys ``'numb_of_layers'``,
            ``'layer_nodes'`` and ``'dropouts'``; entry ``i`` of the two lists
            configures recurrent layer ``i``.
        input_size: number of time steps in every input window.
        feature_number: number of features per time step, used both for the
            input shape and for the width of the output layer.
        output_activation: activation of the per-timestep output Dense layer.
        loss: loss passed to ``model.compile``.
        optimizer: optimizer passed to ``model.compile``.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    # NOTE(review): the defaults (softmax + categorical cross-entropy) are a
    # classification setup; confirm they are intended for the continuous
    # traffic features, or pass e.g. a regression loss explicitly.
    return _build_recurrent_model(LSTM, model_parameters, input_size, feature_number,
                                  output_activation, loss, optimizer)

#for i in range(8):

params = get_random_model_parameters(window_size)

#model = build_gru_model(params, window_size, feature_number)
model = build_lstm_model(params, window_size, feature_number)


# define the checkpoint
# The file name lists the width and dropout of the layers that are actually
# built (the first layer is always built). Passing the full lists into fixed
# "{}x{}x{}" slots put the 4th layer width into the "drops" field.
built_layers = max(1, params['numb_of_layers'])
nodes_tag = "x".join(str(nodes) for nodes in params['layer_nodes'][:built_layers])
drops_tag = "x".join(str(rate) for rate in params['dropouts'][:built_layers])

# {loss} and {epoch} stay unformatted on purpose: ModelCheckpoint fills them
# in when it writes a file. Only the second literal is formatted here.
checkpoint_path = ("rnn_models/lstm-loss-{loss:.4f}-epochs-{epoch:02d}"
                   + "-{}layers-{}-drops-{}-batched.hdf5".format(params['numb_of_layers'],
                                                                  nodes_tag,
                                                                  drops_tag))

# NOTE(review): with period=20 and epochs=10 below it looks like no checkpoint
# is ever written during this run - confirm the intended interval.
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 #save_weights_only=True,
                                                 verbose=1,
                                                 #save_best_only=True,
                                                 monitor='loss',
                                                 period = 20)


# fit the model
model.fit(X, y, epochs=10, callbacks=[cp_callback])

In [None]:
#model = tf.keras.models.load_model('rnn_models/gru-loss-0.4782-epochs-200-2layers-100x50x50-drops-50x0.05x0.05-batched.hdf5')

In [None]:
# The file name lists the width and dropout of the layers that are actually
# built (the first layer is always built). Passing the full lists into fixed
# "{}x{}x{}" slots put the 4th layer width into the "drops" field.
# NOTE(review): the loss and epoch count in the name are hard-coded; update
# them by hand to match the run being saved.
model.save("rnn_models/lstm-loss-0.4560-epochs-10-{}layers-{}-drops-{}-batched.hdf5".format(
    params['numb_of_layers'],
    "x".join(str(nodes) for nodes in params['layer_nodes'][:max(1, params['numb_of_layers'])]),
    "x".join(str(rate) for rate in params['dropouts'][:max(1, params['numb_of_layers'])])))

In [None]:
# Number of samples (time steps) requested from the generate_* helpers below.
sample_number_to_gener = 6000

def generate_batched_samples(model, init_X, sample_number_to_gener, window_size, scalers, feature_number=2):
    """Generate traffic window by window, feeding each output back as input.

    Args:
        model: object exposing ``predict``; called once per generated window.
        init_X: seed input for the first ``predict`` call.
        sample_number_to_gener: requested number of samples; rounded up to a
            whole number of windows.
        window_size: number of time steps per window.
        scalers: container walked with ``iterate_traffic_dict``; the scaler of
            its last entry is used for the inverse transform.
        feature_number: number of features per time step.

    Returns:
        DataFrame with ``ceil(sample_number_to_gener / window_size) *
        window_size`` rows of inverse-transformed generated samples.
    """
    #when shift=window size
    window_count = int(np.ceil(sample_number_to_gener/window_size))
    generated = np.zeros((window_count, window_size, feature_number))

    # Walk the whole dict only to leave `scaler` bound to its last entry.
    for _device, _direction, scaler in iterate_traffic_dict(scalers):
        continue

    model_input = init_X
    for index in range(window_count):
        generated[index, :, :] = model.predict(model_input)
        # Keep the leading batch axis so the slice can be fed back to predict().
        model_input = generated[index:index+1, :, :]

    flattened = generated.reshape(-1, generated.shape[-1])
    restored = scaler.inverse_transform(flattened)
    return pd.DataFrame(restored)

def generate_windowed_samples(model, init_X, sample_number_to_gener, window_size, scalers, shift=1, feature_number=2):
    """Generate exactly `sample_number_to_gener` samples, one window at a time.

    Each prediction is fed back as the input of the next ``predict`` call.

    Args:
        model: object exposing ``predict``; called once per generated window.
        init_X: seed input for the first ``predict`` call.
        sample_number_to_gener: number of rows in the result. It does not have
            to be a multiple of ``window_size``; the last window is truncated.
        window_size: number of time steps produced by one ``predict`` call.
        scalers: container walked with ``iterate_traffic_dict``; the scaler of
            its last entry is used for the inverse transform.
        shift: unused; kept for interface compatibility.
        feature_number: number of features per time step.

    Returns:
        DataFrame of inverse-transformed generated samples.

    Raises:
        ValueError: if ``scalers`` yields no entries.
    """
    gen_X = np.zeros((sample_number_to_gener, feature_number))

    # Walk the whole dict only to pick up the scaler of its last entry.
    scaler = None
    for _device, _direction, scaler in iterate_traffic_dict(scalers):
        pass
    if scaler is None:
        # Fail before the expensive predictions rather than after them.
        raise ValueError("scalers contains no scaler to inverse-transform with")

    X_to_predict = init_X
    for start in range(0, sample_number_to_gener, window_size):
        X_to_predict = model.predict(X_to_predict)
        stop = min(start + window_size, sample_number_to_gener)
        # Assumes predict() returns a single-window batch shaped
        # (1, window_size, feature_number) - TODO confirm. The final window is
        # cut down to the rows that are still missing.
        gen_X[start:stop, :] = X_to_predict[0, :stop - start, :]

    return pd.DataFrame(scaler.inverse_transform(gen_X))

# `scaler` is only ever bound inside the generate_* helpers, so bind it at
# notebook scope as well (last entry of `scalers`, the same choice the helpers
# make) before using it for the original data below.
for device, direction, scaler in iterate_traffic_dict(scalers):
    pass

# Histograms of: windowed generation, batched generation, original traffic.
gener_df = generate_windowed_samples(model, X[100:101:,:,:], sample_number_to_gener, window_size, scalers)
gener_df.hist(bins=30)
generate_batched_samples(model, X[0:1:,:,:], sample_number_to_gener, window_size, scalers).hist(bins=30)
#gener_df.hist(bins=30)
pd.DataFrame(scaler.inverse_transform(df)).hist(bins=30)

In [None]:
# `direction` is only ever bound inside the generate_* helpers, so bind it at
# notebook scope (last entry of `scalers`) before using it as the plot label.
for device, direction, scaler in iterate_traffic_dict(scalers):
    pass

# Turn the running sum of column 0 into timestamps
# (presumably column 0 is the inter-arrival time in seconds - confirm).
gener_df.index = pd.to_datetime(gener_df.iloc[:,0].cumsum(), unit='s')
# Sum every column over one-second bins.
goodput_dfs = gener_df.resample('1S').sum()
#plt.figure()
# Column 1 is presumably the packet length in bytes, so this plots KiB per
# second - confirm against the feature order of the training data.
ax = (goodput_dfs[1]/1024).plot(grid=True, label=direction, lw=3)


In [None]:
#background process

import numpy as np

def spatial_correlation_factor(distance, alpha):
    """Return the exponential decay factor exp(-alpha * distance).

    Both arguments are passed through NumPy arithmetic, so scalars and arrays
    are accepted.
    """
    exponent = -alpha * distance
    return np.exp(exponent)

spatial_correlation_factor(30, 1)