In [None]:
import pandas as pd
import numpy as np
import pickle as pkl
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# Single scaler instance shared by the notebook; the normalisation cell calls
# fit_transform on it once per subset, so it is refit on every call.
scaler = MinMaxScaler()

In [None]:
# Load the names of the selected features and read only those columns of the dataset.
# NOTE(security): unpickling can execute arbitrary code — only load a trusted
# selected_features.pkl.
# The context manager guarantees the pickle file handle is closed after loading.
with open("selected_features.pkl", 'rb') as features_file:
    important_features = pkl.load(features_file)
data = pd.read_csv('./Master.csv', usecols = important_features)

In [None]:
# Split the rows by label ('actiontype' 0 = benign, 1 = malicious) and min-max
# normalise each subset.
benign_rows = data.loc[data['actiontype'] == 0]
malicious_rows = data.loc[data['actiontype'] == 1]

# pd.read_csv(usecols=...) keeps the column order of the CSV file, not the order of
# `important_features`, so the scaled arrays are relabelled with the frame's own
# columns to avoid attaching the wrong name to a column.
# NOTE(review): the scaler is refit for each subset, so benign and malicious values
# are each scaled by their own min/max — confirm this is the intended comparison.
benign_df = pd.DataFrame(scaler.fit_transform(benign_rows), columns = benign_rows.columns)
malicious_df = pd.DataFrame(scaler.fit_transform(malicious_rows), columns = malicious_rows.columns)

In [None]:
# For every feature, draw the KDE of the benign and the malicious samples on one
# axes and save the figure as ./Visual_<feature>.png.
# The last entry of important_features is skipped (presumably the 'actiontype'
# label — confirm it really is the last element).
for col in important_features[:-1]:
    # One explicit figure per feature so curves never pile up on a stale axes.
    fig, g = plt.subplots()
    # `shade` is kept for consistency with the rest of the notebook;
    # seaborn >= 0.11 prefers the equivalent `fill` argument.
    sns.kdeplot(benign_df[col], shade = True, ax = g, label = 'Benign')
    sns.kdeplot(malicious_df[col], shade = True, ax = g, label = 'Malicious')
    g.legend(title = col, loc = 'upper right')
    # Save before showing: plt.show() does not take an Axes argument, and the
    # figure may be released once it has been displayed.
    fig.savefig('./Visual_' + col + '.png')
    plt.show()
    # Free the figure so a long feature list does not accumulate open figures.
    plt.close(fig)


### Plotting the distribution plots for the most important features

In [None]:
# Draw a 2x2 grid of benign-vs-malicious KDE plots for four selected features,
# with a single figure-level legend.
sns.set(style="white", palette="muted", color_codes=True)
rs = np.random.RandomState(10)  # not used in this cell; kept in case later cells rely on it

# Set up the matplotlib figure
f, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=False)
sns.despine(left=True)

# Features to plot, in row-major panel order: [0, 0], [0, 1], [1, 0], [1, 1].
top_features = [
    'totalmemory_used_size',
    'num_threads_SherLock',
    'totalmemory_freesize',
    'otherpss_SherLock',
]

# Use the normalised frames built earlier in the notebook (benign_df / malicious_df);
# the previously referenced B_df / M_df are never defined.
for panel, feature in zip(axes.flat, top_features):
    sns.kdeplot(benign_df[feature], shade=True, ax=panel, legend=False, label='Benign')
    sns.kdeplot(malicious_df[feature], shade=True, ax=panel, legend=False, label='Malicious')

# Build the shared legend from the artists actually drawn on the first panel
# instead of from an axes object that is only created in a later cell.
legend_handles, legend_labels = axes[0, 0].get_legend_handles_labels()
f.legend(legend_handles,                       # The plotted artists
         legend_labels,                        # 'Benign' / 'Malicious'
         loc = "upper left",                   # Position of legend
         title = "Data distribution"           # Title for the legend
         )

### Plotting the convergence plots for malicious detection and target classification

In [None]:
# Convergence curve for malicious detection: plots the '1_f1-score' column
# against the 'feature' column and writes the figure to ./malicious.png.
convergence_df = pd.read_csv('./action_convergence.csv')
ax = sns.lineplot(data = convergence_df, x = 'feature', y = '1_f1-score')
ax.set(
    title = 'Malicious Classification',
    xlabel = 'Number Of Features',
    ylabel = 'Malicious F1-score',
)
fig = ax.figure
fig.savefig('./malicious.png')

In [None]:
# Convergence curve for target classification: plots the 'macro_avg_f1-score'
# column against the 'feature' column and writes ./target_convergence.png.
convergence_df = pd.read_csv('./service_convergence.csv')
ax = sns.lineplot(x = 'feature', y = 'macro_avg_f1-score', data = convergence_df)
ax.set_title('Target Classification')
ax.set(xlabel='Number Of Features', ylabel = 'Macro average F1-score')
plt.grid()
# pad_inches only takes effect together with bbox_inches='tight', and a value of
# 100 would mean 100 inches of padding. Use a tight bounding box with the default
# padding so titles and labels are not clipped.
ax.figure.savefig('./target_convergence.png', bbox_inches = 'tight')