# <b>Task02 - Darknet Traffic Analysis</b>

___
## Table of Content

* Load raw features
* Train the multi-modal autoencoder
* Train the raw features models
* Validate the models
* k-nearest-neighbors class probability

In [18]:
# type: ignore

from mltoolbox.classification import DeepClassifier, KnnClassifier
from mltoolbox.representation import MultimodalAE
from sklearn.metrics import classification_report
from mltoolbox.metrics import k_class_proba_report
from src.utils import get_datasets
from keras import layers
import pandas as pd
import numpy as np
import joblib

In [17]:
task = 'task02' # Define the task. This variable is used mainly for savings
K = 0 # Specify the considered fold (Stratified k fold validation)

## Load raw features



In [10]:
# Load ports word2vec embeddings - entity
ports=pd.read_csv(f'../data/{task}/features/ports.csv', index_col=[0])
# Load statistics features - quantity
statistics=pd.read_csv(f'../data/{task}/features/statistics.csv', index_col=[0])
# Load ip address word2vec embeddings - entity
ipaddress=pd.read_csv(f'../data/{task}/features/ipaddress.csv', index_col=[0])
# Merge the features as raw concatenation
concat = ports.reset_index().drop(columns=['label'])\
     .merge(statistics.reset_index().drop(columns=['label']), 
            on='src_ip', how='inner')\
     .merge(ipaddress.reset_index(), on='src_ip', how='inner')\
     .set_index('src_ip')
# Load stratified k folds
kfolds = joblib.load(f'../data/{task}/skfolds/folds.save')

## Train the multi-modal autoencoder

In [9]:
feature, fname = concat, 'mae'
# Get the features size. Last column is the label one
P,S,I = ports.shape[1]-1, statistics.shape[1]-1, ipaddress.shape[1]-1

# Retrieve the training and validation samples from the k-folds order
X_train, X_val, y_train, y_val = get_datasets(kfolds, K, feature)

In [10]:
# Define the classifier architecture
inputs = layers.Input(X_train.shape[1],)

# Encoder branch of modality 1 - ports embeddings
hidden1 = layers.Lambda(lambda x: x[:, :P])(inputs)
hidden1 = layers.Dense(32, activation='relu')(hidden1)
# Encoder branch of modality 2 - statistics
hidden2 = layers.Lambda(lambda x: x[:, P:S])(inputs)
hidden2 = layers.Dense(32, activation='relu')(hidden2)
# Encoder branch of modality 3 - ip address embeddings
hidden3 = layers.Lambda(lambda x: x[:, P+S:P+S+I])(inputs)
hidden3 = layers.Dense(32, activation='relu')(hidden3)

# Concatenate
hidden = layers.Concatenate()([hidden1, hidden2, hidden3])
# Common encoder
hidden = layers.Dense(512, activation='relu')(hidden)
hidden = layers.Dense(256, activation='relu')(hidden)
# Bottleneck
hidden = layers.Dense(64, activation='relu', name='Coded')(hidden)
# Common decoder
hidden = layers.Dense(256, activation='relu')(hidden)
hidden = layers.Dense(512, activation='relu')(hidden)
hidden = layers.Dense(32*3, activation='relu')(hidden)

hidden1 = layers.Dense(32, activation='relu')(hidden)
output1 = layers.Dense(P, activation='linear', name='ports')(hidden1)

hidden2 = layers.Dense(32, activation='relu')(hidden)
output2 = layers.Dense(S, activation='linear', name='statistics')(hidden2)

hidden3 = layers.Dense(32, activation='relu')(hidden)
output3 = layers.Dense(I, activation='linear', name='ipaddress')(hidden3)

outputs = [output1, output2, output3]

# Mean Squared Errors
loss = {'ports':'mse', 'statistics':'mse', 'ipaddress':'mse'} 
weights = {'ports':1/P, 'statistics':1/S, 'ipaddress':1/I} # Balance losses

2022-12-07 13:19:21.163988: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/anaconda3/envs/bigdatalab/lib/libfabric:/usr/local/cuda/lib64
2022-12-07 13:19:21.164054: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-07 13:19:21.164081: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (vm5-bigdata): /proc/driver/nvidia/version does not exist
2022-12-07 13:19:21.164411: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Initialize the classifier
mae = MultimodalAE(model_path=f'../data/{task}/mae/{fname}_k{K}',
                   io=(inputs, outputs), losses=loss, weights=weights)
# Fit the multi-modal autoencoder
mae.fit(training_data=(X_train, X_train), y_sizes=[P, S, I], batch_size=256, 
        scale_data=True, epochs=3, validation_data=(X_val, X_val), save=True, 
        verbose=0)
embeddings = np.vstack([mae.transform(X_train), mae.transform(X_val)])

In [None]:
# Save the embeddings
embeddings = pd.DataFrame(embeddings, index=np.hstack(kfolds[K][:2]))
embeddings[['label']] = np.hstack(kfolds[K][2:])
embeddings.to_csv(f'../data/{task}/embeddings/mae_embeddings_k{K}.csv')

## Train the raw-features-models

In [13]:
embeddings=pd.read_csv(f'../data/{task}/embeddings/mae_embeddings_k{K}.csv', 
                       index_col=[0])
n_classes = ipaddress.value_counts('label').shape[0]

In [None]:
features = [ports, statistics, ipaddress, concat, embeddings]
feature_names = ['ports', 'statistics', 'ipaddress', 'rawcat', 'mae']

for feature, fname in zip(features, feature_names):
    print(f'\nTraining {fname} deep classifier')
    
    # Retrieve the training and validation samples from the k-folds order
    X_train, X_val, y_train, y_val = get_datasets(kfolds, K, feature)
    # Define the classifier architecture
    inputs = layers.Input(X_train.shape[1],)
    hidden = layers.Dense(512, activation='relu')(inputs)
    hidden = layers.Dropout(.3)(hidden)
    hidden = layers.Dense(256, activation='relu')(hidden)
    hidden = layers.Dropout(.3)(hidden)
    outputs = layers.Dense(n_classes, activation='softmax')(hidden)
    # Initialize the classifier
    mpath = f'../data/{task}/classifiers/{fname}_k{K}'
    classifier = DeepClassifier(io=(inputs, outputs), model_path=mpath)
    # Train the classifier for 3 epochs. Standardize data before training
    classifier.fit(training_data=(X_train, y_train), 
                   validation_data=(X_val, y_val), 
                   scale_data=True, batch_size=256, epochs=3, save=True)

## Validate the models

In [103]:
features = [ports, statistics, ipaddress, concat, embeddings]
feature_names = ['ports', 'statistics', 'ipaddress', 'rawcat', 'mae']

for feature, fname in zip(features, feature_names):
    # Retrieve the training and validation samples from the k-folds order
    X_train, X_val, y_train, y_val = get_datasets(kfolds, K, feature)
    mpath = f'../data/{task}/classifiers/{fname}_k{K}'
    classifier = DeepClassifier(_load_model=True, model_path=mpath)
    y_pred = classifier.predict(X_val, scale_data=True)
    report = classification_report(y_val, y_pred, labels=np.unique(y_val), 
                                   output_dict=True)

    f1 = report['macro avg']['f1-score']
    
    pd.DataFrame(report).T.to_csv(f'../data/interim/{fname}_deep_k{K}.csv')
    print(f'\nValidating {fname} deep classifier:\n\tMacro avg f1:{f1}')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Validating ports deep classifier:
	Macro avg f1:0.7537346661897472

Validating statistics deep classifier:
	Macro avg f1:0.8506145522148253

Validating ipaddress deep classifier:
	Macro avg f1:0.8903880259645202

Validating rawcat deep classifier:
	Macro avg f1:0.9147246398146704

Validating mae deep classifier:
	Macro avg f1:0.8215870555870808


## k-nearest-neighbors class probability</b>

In [None]:
features = [concat, embeddings]
feature_names = ['rawcat', 'mae']
for feature, fname in zip(features, feature_names):
    for k in range(1, 20):
        # Retrieve the training and validation samples from the k-folds order
        X_train, X_val, y_train, y_val = get_datasets(kfolds, K, feature)
        X, y = np.vstack([X_train, X_val]), np.hstack([y_train, y_val])
        
        knn = KnnClassifier(n_neighbors=k, metric='cosine')
        knn.fit(X, y, scale_data=True)

        to_keep = np.where(y!='unknown')[0].reshape(-1, 1)
        pcs = knn.predict_proba(to_keep)
        
        y_true = y[np.ravel(to_keep)]
        report = k_class_proba_report(y_true, pcs, output_dict=True)
        
        kpc = report['macro avg']['kpc']

        pd.DataFrame(report).T.to_csv(f'../data/interim/{fname}_{k}pc_k{K}.csv')
        print(f'\nMacro avg {k}Pc - {fname}: {kpc}')