In [1]:
import sys
import os
from glob import iglob
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from keras.models import model_from_yaml
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Activation
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
devices = {}

In [3]:
def read_file(f, devs, c):
    d = pd.read_csv(f)
    #print(f)
    dev = f[8: f.find('/', 8)]
    if dev not in devs:
        devs[dev] = {}
    if c not in devs[dev]:
        devs[dev][c] = 0
    devs[dev][c] += d.shape[0]
    #print(d.shape)
    return d

In [4]:
df_gafgyt = pd.concat((read_file(f, devices, 'gafgyt') for f in iglob('../**/gafgyt_attacks*/*.csv', recursive=True)), ignore_index=True)
print('Loaded, shape: ')
print(df_gafgyt.shape)
df_gafgyt['class'] = 'attack'
print('Loading mirai data')
df_mirai = pd.concat((read_file(f, devices, 'mirai') for f in iglob('../**/mirai_attacks/*.csv', recursive=True)), ignore_index=True)
print('Loaded, shape: ')
print(df_mirai.shape)
df_mirai['class'] = 'attack'
print('Loading benign data')
df_benign = pd.concat((read_file(f, devices, 'benign') for f in iglob('../**/benign_traffic.csv', recursive=True)), ignore_index=True)
print('Loaded, shape: ')
print(df_benign.shape)
df_benign['class'] = 'benign'
df = df_benign.append(df_gafgyt.sample(frac=1, random_state=17)).append(df_mirai.sample(frac=1, random_state=17))

Loaded, shape: 
(2838272, 115)
Loading mirai data
Loaded, shape: 
(1029720, 115)
Loading benign data
Loaded, shape: 
(58628, 115)


In [6]:
devices

{'ining\\gafgyt_attacks_02WHT_SCam\\combo.cs': {'gafgyt': 54283},
 'ining\\gafgyt_attacks_02WHT_SCam\\junk.cs': {'gafgyt': 28579},
 'ining\\gafgyt_attacks_02WHT_SCam\\scan.cs': {'gafgyt': 27825},
 'ining\\gafgyt_attacks_02WHT_SCam\\tcp.cs': {'gafgyt': 88816},
 'ining\\gafgyt_attacks_02WHT_SCam\\udp.cs': {'gafgyt': 103720},
 'ining\\gafgyt_attacks_03WHT_seCam\\combo.cs': {'gafgyt': 59398},
 'ining\\gafgyt_attacks_03WHT_seCam\\junk.cs': {'gafgyt': 27413},
 'ining\\gafgyt_attacks_03WHT_seCam\\scan.cs': {'gafgyt': 28572},
 'ining\\gafgyt_attacks_03WHT_seCam\\tcp.cs': {'gafgyt': 98075},
 'ining\\gafgyt_attacks_03WHT_seCam\\udp.cs': {'gafgyt': 102980},
 'ining\\gafgyt_attacks_737_SeCam\\combo.cs': {'gafgyt': 61380},
 'ining\\gafgyt_attacks_737_SeCam\\junk.cs': {'gafgyt': 30898},
 'ining\\gafgyt_attacks_737_SeCam\\scan.cs': {'gafgyt': 29297},
 'ining\\gafgyt_attacks_737_SeCam\\tcp.cs': {'gafgyt': 104510},
 'ining\\gafgyt_attacks_737_SeCam\\udp.cs': {'gafgyt': 104011},
 'ining\\gafgyt_attacks_

In [7]:

classes = ['benign', 'attack']

In [8]:
scored = {}
indices = {}
shps = {}
for cl in classes:
    indices[cl] = df['class'] == cl
    shps[cl] =  df[indices[cl]].shape[0]
        
for col in df.columns:
    if col == 'class':
        continue
    num = 0
    den = 0
    m = df[col].mean()
    
    for cl in classes:
        num += (shps[cl] / df.shape[0]) * (m - df[indices[cl]][col].mean())**2
        den += (shps[cl] / df.shape[0]) * df[indices[cl]][col].var()
    scored[col] = num / den
    print(col + ' scored ' + str(scored[col]))

MI_dir_L5_weight scored 0.012721747765951162
MI_dir_L5_mean scored 3.6267553990405083e-06
MI_dir_L5_variance scored 0.0022387140771065453
MI_dir_L3_weight scored 0.012863706265386892
MI_dir_L3_mean scored 5.784736053644087e-06
MI_dir_L3_variance scored 0.002576714954217538
MI_dir_L1_weight scored 0.012841131473530968
MI_dir_L1_mean scored 1.5536454741999974e-06
MI_dir_L1_variance scored 0.0025442149585559073
MI_dir_L0.1_weight scored 0.012624133902831517
MI_dir_L0.1_mean scored 4.707084734088189e-05
MI_dir_L0.1_variance scored 0.0005503651345051451
MI_dir_L0.01_weight scored 0.009173645250282548
MI_dir_L0.01_mean scored 4.241941816651436e-06
MI_dir_L0.01_variance scored 0.000153057210632806
H_L5_weight scored 0.012721747765951162
H_L5_mean scored 3.6267553990405083e-06
H_L5_variance scored 0.0022387140771065453
H_L3_weight scored 0.012863706265386892
H_L3_mean scored 5.784736053644087e-06
H_L3_variance scored 0.002576714954217538
H_L1_weight scored 0.012841131473226792
H_L1_mean scored

In [9]:
scored_list = [{'feature': f, 'score': s} for f, s in scored.items()]

In [10]:
scored_list.sort(key=lambda x: x['score'], reverse=True)

In [11]:
scored_list[:15]

[{'feature': 'HpHp_L0.01_pcc', 'score': 0.08933636893587561},
 {'feature': 'HH_L0.01_std', 'score': 0.08408110147744634},
 {'feature': 'HpHp_L0.01_std', 'score': 0.047529754709429085},
 {'feature': 'HH_L0.1_std', 'score': 0.036466232846590556},
 {'feature': 'HH_jit_L0.01_mean', 'score': 0.021986309784961806},
 {'feature': 'HH_jit_L0.1_mean', 'score': 0.02175891287495958},
 {'feature': 'HH_jit_L1_mean', 'score': 0.020174772676528813},
 {'feature': 'HH_jit_L3_mean', 'score': 0.019848051439293912},
 {'feature': 'HH_jit_L5_mean', 'score': 0.019847645633758074},
 {'feature': 'HpHp_L0.1_std', 'score': 0.01582254331200187},
 {'feature': 'HH_L0.01_pcc', 'score': 0.015011270061476862},
 {'feature': 'HpHp_L1_std', 'score': 0.014839158161944069},
 {'feature': 'HH_L1_std', 'score': 0.01407736615306436},
 {'feature': 'MI_dir_L3_weight', 'score': 0.012863706265386892},
 {'feature': 'H_L3_weight', 'score': 0.012863706265386892}]