# Pipeline
PParser: Feature extraction module

Input: pcap_file
```

=> PCAP 
    
    pcap2flows(): chunks of up to 600s
    
    flows2subflows(): split by intervals or quantiles

-> get_features(): from the paper

-> label_flows()
```

Output: '.dat' files to store the features

## iotlab_fridgedataset inspection
* Process: get to know more about the dataset, following Kun's idea
* Findings:
    * Previously we were mixing all packets together, but each capture actually contains distinct flows identified by 5-tuples (ip.source, ip.destination, source_port, destination_port, flag_TCP_UDP). The last field is the IP protocol number: 6 stands for TCP, 17 stands for UDP. The number printed after each tuple is the number of packets in that flow.
    * Pcaps for fridge_door opening
        * Each door file has 12 ~ 14 flows
        * Each flow has 2 ~ 41 packets
        * Fridge_door3.pcap might be a mistake
        
    * Pcaps for fridge_idle
        * Each idle file has 0 ~ 5 flows
        * Each flow has 2 packets

* Conclusion:
    * The door-opening and idle pcaps can be easily distinguished just by looking at the number of flows per file and the number of packets per flow

In [40]:
import pparser.parser
import numpy as np

# Common path prefix of the door-opening captures (fridge_door1.pcap .. fridge_door6.pcap).
pcap_file = 'iotlab_fridgedataset/fridge_door'

# Print the number of flows found in each capture.
for file_idx in range(1, 7):
    pp = pparser.parser._pcap2flows(f'{pcap_file}{file_idx}.pcap')
    print(len(pp))

# `pp` now holds only the flows of the last capture parsed above.
# For each of them print the 5-tuple and the number of packets.
for flow in pp:
    print(flow[0], len(flow[1]))

pcap_file: iotlab_fridgedataset/fridge_door1.pcap
12
pcap_file: iotlab_fridgedataset/fridge_door2.pcap
12
pcap_file: iotlab_fridgedataset/fridge_door3.pcap
2
pcap_file: iotlab_fridgedataset/fridge_door4.pcap
13
pcap_file: iotlab_fridgedataset/fridge_door5.pcap
14
pcap_file: iotlab_fridgedataset/fridge_door6.pcap
13
('192.168.143.43', '52.196.247.117', 58496, 5223, 6) 2
('192.168.143.43', '52.14.6.98', 43824, 443, 6) 6
('52.14.6.98', '192.168.143.43', 443, 43824, 6) 6
('192.168.143.43', '52.10.220.236', 58433, 443, 6) 33
('52.10.220.236', '192.168.143.43', 443, 58433, 6) 41
('192.168.143.43', '52.10.220.236', 58434, 443, 6) 29
('52.10.220.236', '192.168.143.43', 443, 58434, 6) 37
('192.168.143.43', '52.10.220.236', 58435, 443, 6) 34
('52.10.220.236', '192.168.143.43', 443, 58435, 6) 41
('192.168.143.43', '52.10.220.236', 58436, 443, 6) 20
('52.10.220.236', '192.168.143.43', 443, 58436, 6) 15
('192.168.143.43', '18.200.190.216', 41507, 443, 6) 2
('18.200.190.216', '192.168.143.43', 443, 

In [41]:
import pparser.parser
import numpy as np

# Common path prefix of the idle captures (fridge_idle1.pcap .. fridge_idle10.pcap).
pcap_file = 'iotlab_fridgedataset/fridge_idle'

# Print the number of flows found in each capture.
for file_idx in range(1, 11):
    pp = pparser.parser._pcap2flows(f'{pcap_file}{file_idx}.pcap')
    print(len(pp))

# `pp` now holds only the flows of the last capture parsed above.
# For each of them print the 5-tuple and the number of packets.
for flow in pp:
    print(flow[0], len(flow[1]))

pcap_file: iotlab_fridgedataset/fridge_idle1.pcap
2
pcap_file: iotlab_fridgedataset/fridge_idle2.pcap
3
pcap_file: iotlab_fridgedataset/fridge_idle3.pcap
5
pcap_file: iotlab_fridgedataset/fridge_idle4.pcap
3
pcap_file: iotlab_fridgedataset/fridge_idle5.pcap
0
pcap_file: iotlab_fridgedataset/fridge_idle6.pcap
2
pcap_file: iotlab_fridgedataset/fridge_idle7.pcap
5
pcap_file: iotlab_fridgedataset/fridge_idle8.pcap
5
pcap_file: iotlab_fridgedataset/fridge_idle9.pcap
3
pcap_file: iotlab_fridgedataset/fridge_idle10.pcap
3
('192.168.143.43', '52.196.247.117', 58496, 5223, 6) 2
('192.168.143.43', '52.14.6.98', 43824, 443, 6) 2
('192.168.143.43', '224.0.0.251', 5353, 5353, 17) 2


In [90]:
import os

from pparser.parser import PCAP
from utils.tool import dump_data

RANDOM_STATE = 42

def save_feat(pcap_file, feat_type, label, file_num):
    """Extract one feature set from a pcap file and dump it to disk.

    Args:
        pcap_file: path of the capture handed to ``PCAP``.
        feat_type: feature name handed to ``flow2features``; a name that
            starts with 'FFT' switches the ``fft`` flag on.
        label: label handed to ``label_flows``; also part of the output name.
        file_num: capture index, used only in the output file name.

    The result is written to
    ``out/<dirname of pcap_file>/<feat_type>_<label>_<file_num>.dat``.
    """
    parser = PCAP(pcap_file, flow_ptks_thres=2, verbose=0, random_state=RANDOM_STATE)

    # pcap -> flows -> labelled flows -> features
    parser.pcap2flows(q_interval=0.9)
    parser.label_flows(label_file = '', label = label)
    use_fft = feat_type.startswith('FFT')
    parser.flow2features(feat_type, fft=use_fft, header=False)

    # dump (features, labels) to disk
    features, labels = parser.features, parser.labels
    out_dir = os.path.join('out', os.path.dirname(pcap_file))
    dump_data((features, labels), out_file=f'{out_dir}/{feat_type}_{label}_{file_num!s}.dat')

    # feature matrix shape, then the timings recorded for the two main steps
    print(parser.features.shape, parser.pcap2flows.tot_time, parser.flow2features.tot_time)

In [91]:
# Sanity check: the first three characters of an FFT feature name are 'FFT'.
if 'FFT-SIZE'.startswith('FFT'):
    print('yes')

yes


In [92]:
# Extract every feature type from each door-opening capture; these get label 1.
pcap_file = 'iotlab_fridgedataset/fridge_door'
feat_types = ['IAT', 'SIZE', 'IAT_SIZE', 'STATS', 'FFT-IAT', 'FFT-IAT_SIZE', 'FFT-SIZE']
for file_num in range(1, 7):
    door_pcap = f'{pcap_file}{file_num}.pcap'
    for feat_type in feat_types:
        save_feat(door_pcap, feat_type, label=1, file_num=file_num)

'_pcap2flows()' starts at 2020-07-07 14:03:30
'_pcap2flows()' ends at 2020-07-07 14:03:30 and takes 0.0022 mins.
'_label_flows()' starts at 2020-07-07 14:03:30
'_label_flows()' ends at 2020-07-07 14:03:30 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:30
'_flow2features()' ends at 2020-07-07 14:03:30 and takes 0.0 mins.
(12, 2) 0.0022 0.0
'_pcap2flows()' starts at 2020-07-07 14:03:30
'_pcap2flows()' ends at 2020-07-07 14:03:30 and takes 0.0018 mins.
'_label_flows()' starts at 2020-07-07 14:03:30
'_label_flows()' ends at 2020-07-07 14:03:30 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:30
'_flow2features()' ends at 2020-07-07 14:03:30 and takes 0.0001 mins.
(12, 2) 0.0018 0.0001
'_pcap2flows()' starts at 2020-07-07 14:03:30
'_pcap2flows()' ends at 2020-07-07 14:03:31 and takes 0.0019 mins.
'_label_flows()' starts at 2020-07-07 14:03:31
'_label_flows()' ends at 2020-07-07 14:03:31 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:31


'_pcap2flows()' ends at 2020-07-07 14:03:32 and takes 0.0009 mins.
'_label_flows()' starts at 2020-07-07 14:03:32
'_label_flows()' ends at 2020-07-07 14:03:32 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:32
'_flow2features()' ends at 2020-07-07 14:03:32 and takes 0.0 mins.
(11, 2) 0.0009 0.0
'_pcap2flows()' starts at 2020-07-07 14:03:32
'_pcap2flows()' ends at 2020-07-07 14:03:32 and takes 0.0011 mins.
'_label_flows()' starts at 2020-07-07 14:03:32
'_label_flows()' ends at 2020-07-07 14:03:32 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:32
'_flow2features()' ends at 2020-07-07 14:03:32 and takes 0.0001 mins.
(11, 2) 0.0011 0.0001
'_pcap2flows()' starts at 2020-07-07 14:03:32
'_pcap2flows()' ends at 2020-07-07 14:03:32 and takes 0.001 mins.
'_label_flows()' starts at 2020-07-07 14:03:32
'_label_flows()' ends at 2020-07-07 14:03:32 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:32
'_flow2features()' ends at 2020-07-07 14:03:32 

In [93]:
# Extract every feature type from each idle capture; these get label 0.
pcap_file = 'iotlab_fridgedataset/fridge_idle'
feat_types = ['IAT', 'SIZE', 'IAT_SIZE', 'STATS', 'FFT-IAT', 'FFT-IAT_SIZE', 'FFT-SIZE']
for file_num in range(1, 11):
    if file_num == 5:
        # fridge_idle5.pcap reported 0 flows in the inspection output, so skip it.
        continue
    idle_pcap = f'{pcap_file}{file_num}.pcap'
    for feat_type in feat_types:
        save_feat(idle_pcap, feat_type, label=0, file_num=file_num)

'_pcap2flows()' starts at 2020-07-07 14:03:53
'_pcap2flows()' ends at 2020-07-07 14:03:53 and takes 0.0002 mins.
'_label_flows()' starts at 2020-07-07 14:03:53
'_label_flows()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:53
'_flow2features()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
(1, 2) 0.0002 0.0
'_pcap2flows()' starts at 2020-07-07 14:03:53
'_pcap2flows()' ends at 2020-07-07 14:03:53 and takes 0.0001 mins.
'_label_flows()' starts at 2020-07-07 14:03:53
'_label_flows()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:53
'_flow2features()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
(1, 2) 0.0001 0.0
'_pcap2flows()' starts at 2020-07-07 14:03:53
'_pcap2flows()' ends at 2020-07-07 14:03:53 and takes 0.0001 mins.
'_label_flows()' starts at 2020-07-07 14:03:53
'_label_flows()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:53
'_flow2f

'_pcap2flows()' ends at 2020-07-07 14:03:53 and takes 0.0002 mins.
'_label_flows()' starts at 2020-07-07 14:03:53
'_label_flows()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:53
'_flow2features()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
(1, 2) 0.0002 0.0
'_pcap2flows()' starts at 2020-07-07 14:03:53
'_pcap2flows()' ends at 2020-07-07 14:03:53 and takes 0.0001 mins.
'_label_flows()' starts at 2020-07-07 14:03:53
'_label_flows()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:53
'_flow2features()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
(1, 2) 0.0001 0.0
'_pcap2flows()' starts at 2020-07-07 14:03:53
'_pcap2flows()' ends at 2020-07-07 14:03:53 and takes 0.0001 mins.
'_label_flows()' starts at 2020-07-07 14:03:53
'_label_flows()' ends at 2020-07-07 14:03:53 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:53
'_flow2features()' ends at 2020-07-07 14:03:53 and tak

'_pcap2flows()' ends at 2020-07-07 14:03:54 and takes 0.0002 mins.
'_label_flows()' starts at 2020-07-07 14:03:54
'_label_flows()' ends at 2020-07-07 14:03:54 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:54
'_flow2features()' ends at 2020-07-07 14:03:54 and takes 0.0 mins.
(2, 2) 0.0002 0.0
'_pcap2flows()' starts at 2020-07-07 14:03:54
'_pcap2flows()' ends at 2020-07-07 14:03:54 and takes 0.0002 mins.
'_label_flows()' starts at 2020-07-07 14:03:54
'_label_flows()' ends at 2020-07-07 14:03:54 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:54
'_flow2features()' ends at 2020-07-07 14:03:54 and takes 0.0 mins.
(2, 2) 0.0002 0.0
'_pcap2flows()' starts at 2020-07-07 14:03:54
'_pcap2flows()' ends at 2020-07-07 14:03:54 and takes 0.0002 mins.
'_label_flows()' starts at 2020-07-07 14:03:54
'_label_flows()' ends at 2020-07-07 14:03:54 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-07 14:03:54
'_flow2features()' ends at 2020-07-07 14:03:54 and tak

# Next steps
* Test the classifiers based on the fridge dataset and larger iot_data dataset.
* Try to integrate it within the DeepLens code.

## ndm on iot_dataset

In [77]:
import os

from sklearn.model_selection import train_test_split

from ndm.model import MODEL
from ndm.ocsvm import OCSVM
from utils.tool import dump_data, load_data

RANDOM_STATE = 42

def load_dataset(feat_type):
    """Load and stack the dumped fridge features for one feature type.

    Reads ``out/iotlab_fridgedataset/<feat_type>_<label>_<index>.dat`` for
    the idle captures (label 0, indices 1..10 except 5) followed by the
    door captures (label 1, indices 1..6 except 3).

    Returns:
        A pair ``(X, y)``: the per-file arrays concatenated along axis 0,
        in the load order described above.
    """
    prefix = 'out/iotlab_fridgedataset/' + feat_type

    # (label, file indices) in the order the files are stacked
    idle_indices = [idx for idx in range(1, 11) if idx != 5]
    door_indices = [idx for idx in range(1, 7) if idx != 3]
    groups = ((0, idle_indices), (1, door_indices))

    feature_parts = []
    label_parts = []
    for label, indices in groups:
        for idx in indices:
            part_X, part_y = load_data(f'{prefix}_{label}_{idx}.dat')
            feature_parts.append(part_X)
            label_parts.append(part_y)

    return np.concatenate(feature_parts, axis=0), np.concatenate(label_parts, axis=0)


def eval_ndm(feat_type):
    """Train and test an OCSVM-based detection model on one feature type.

    Loads the data via ``load_dataset``, splits it 67/33 with a fixed
    random state, trains and tests the model, dumps ``(model, ndm.history)``
    to disk and prints the train time, test time and score.

    Args:
        feat_type: feature name handed to ``load_dataset``.
    """
    # load data
    X, y = load_dataset(feat_type)
    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=RANDOM_STATE)

    # create detection model
    model = OCSVM(kernel='rbf', nu=0.5, random_state=RANDOM_STATE)
    model.name = 'OCSVM'
    ndm = MODEL(model, score_metric='auc', verbose=10, random_state=RANDOM_STATE)

    # learn the model from the train set
    ndm.train(X_train, y_train)

    # evaluate the learned model
    ndm.test(X_test, y_test)

    # dump data to disk
    # `data_file` used to be read here without being defined in this function
    # (it is a local of load_dataset), so this only worked when a stale global
    # was left in the notebook. Rebuild the same path load_dataset reads from.
    data_file = 'out/iotlab_fridgedataset/' + feat_type
    out_dir = os.path.dirname(data_file)
    # NOTE(review): the file name does not include feat_type; if ndm.model_name
    # is the same on every call, each feature type overwrites the previous
    # results file - confirm whether that is intended.
    dump_data((model, ndm.history), out_file=f'{out_dir}/{ndm.model_name}-results.dat')

    print(ndm.train.tot_time, ndm.test.tot_time, ndm.score)

In [94]:
# Run the detection-model evaluation once per feature type.
feat_types = [
    'IAT', 'SIZE', 'IAT_SIZE', 'STATS',
    'FFT-IAT', 'FFT-IAT_SIZE', 'FFT-SIZE',
]
for feat_type in feat_types:
    eval_ndm(feat_type)

'_train()' starts at 2020-07-07 14:04:07
'_train()' ends at 2020-07-07 14:04:07 and takes 0.0 mins.
'_test()' starts at 2020-07-07 14:04:07
'_test()' ends at 2020-07-07 14:04:07 and takes 0.0 mins.
0.0 0.0 0.4375
'_train()' starts at 2020-07-07 14:04:07
'_train()' ends at 2020-07-07 14:04:07 and takes 0.0 mins.
'_test()' starts at 2020-07-07 14:04:07
'_test()' ends at 2020-07-07 14:04:07 and takes 0.0 mins.
0.0 0.0 0.5
'_train()' starts at 2020-07-07 14:04:07
'_train()' ends at 2020-07-07 14:04:07 and takes 0.0 mins.
'_test()' starts at 2020-07-07 14:04:07
'_test()' ends at 2020-07-07 14:04:07 and takes 0.0 mins.
0.0 0.0 0.75625
'_train()' starts at 2020-07-07 14:04:07
'_train()' ends at 2020-07-07 14:04:07 and takes 0.0 mins.
'_test()' starts at 2020-07-07 14:04:07
'_test()' ends at 2020-07-07 14:04:07 and takes 0.0 mins.
0.0 0.0 0.22499999999999998
'_train()' starts at 2020-07-07 14:04:07
'_train()' ends at 2020-07-07 14:04:07 and takes 0.0 mins.
'_test()' starts at 2020-07-07 14:04:

In [84]:
import pandas as pd
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
import math

def nan2zero(vec):
    """Return a new list with every NaN entry of *vec* replaced by 0."""
    cleaned = []
    for value in vec:
        if math.isnan(value):
            cleaned.append(0)
        else:
            cleaned.append(value)
    return cleaned

def get_auc(test, pred):
    """Return the area under the ROC curve of *pred* against *test*, as a string."""
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(test, pred)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    return str(area)

def eval_sl(feat_type):
    """Evaluate four supervised classifiers on one feature type.

    Loads the data via ``load_dataset``, makes a 67/33 train/test split
    (no fixed random state, so the split differs between calls), then fits
    logistic regression, an SVM, a random forest and an MLP in that order.
    The AUC of each is computed by ``get_auc`` from the hard class
    predictions returned by ``predict``.

    Returns:
        The printed CSV line ``<feat_type>,<LR>,<SVM>,<RF>,<NN>``.
    """
    # load data
    X, y = load_dataset(feat_type)
    fields = [feat_type]
    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # Order matters: it fixes the column order of the CSV line.
    classifiers = (
        LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial'),
        svm.SVC(decision_function_shape="ovo"),
        RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0),
        MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(150, 10), random_state=1),
    )
    for clf in classifiers:
        fitted = clf.fit(X_train, y_train)
        y_pred = fitted.predict(X_test)
        fields.append(get_auc(y_test, y_pred))

    result = ','.join(fields)
    print(result)
    return result

In [95]:
# Suppress every FutureWarning for the repeated runs below.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# The feature list is the same for every repetition, so build it once.
feat_types = ['IAT', 'SIZE', 'IAT_SIZE', 'STATS', 'FFT-IAT', 'FFT-IAT_SIZE', 'FFT-SIZE']
for run_idx in range(10):
    print(run_idx)
    for feat_type in feat_types:
        eval_sl(feat_type)

0
IAT,0.5,0.5,0.9565217391304348,0.5
SIZE,0.5606060606060606,0.7272727272727273,0.7272727272727273,0.5227272727272727
IAT_SIZE,0.5,0.8125,0.825,0.5
STATS,0.6212121212121212,0.643939393939394,0.7575757575757575,0.8863636363636364
FFT-IAT,0.625,0.5,0.75,0.875
FFT-IAT_SIZE,0.5,0.787878787878788,0.8712121212121213,0.5
FFT-SIZE,0.47619047619047616,0.8809523809523809,0.8809523809523809,0.8095238095238095
1
IAT,0.5,0.5,0.8347826086956522,0.6782608695652175
SIZE,0.5,0.85,0.85,0.525
IAT_SIZE,0.5,0.7619047619047619,0.738095238095238,0.5
STATS,0.7291666666666666,0.75,0.5416666666666667,0.6875
FFT-IAT,0.5,0.5,0.6428571428571429,0.6904761904761905
FFT-IAT_SIZE,0.4411764705882353,0.713903743315508,0.713903743315508,0.5
FFT-SIZE,0.5,0.6111111111111112,0.8070175438596492,0.7836257309941521
2
IAT,0.5,0.5,0.6153846153846154,0.9230769230769231
SIZE,0.7708333333333334,0.875,0.8333333333333334,0.5416666666666666
IAT_SIZE,0.5,0.8541666666666666,0.8333333333333334,0.5
STATS,0.6212121212121212,0.6439393939393

In [96]:
# Suppress every FutureWarning for the repeated runs below.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# The feature list is the same for every repetition, so build it once.
feat_types = ['IAT', 'SIZE', 'IAT_SIZE', 'STATS', 'FFT-IAT', 'FFT-IAT_SIZE', 'FFT-SIZE']
for run_idx in range(20):
    print(run_idx)
    for feat_type in feat_types:
        eval_sl(feat_type)

0
IAT,0.5,0.5,0.7196969696969697,0.4772727272727273
SIZE,0.591304347826087,0.9782608695652174,0.9347826086956521,0.5434782608695652
IAT_SIZE,0.5555555555555556,0.7514619883040935,0.7514619883040935,0.5
STATS,0.5,0.6222222222222222,0.7166666666666668,0.6222222222222222
FFT-IAT,0.5,0.5,0.6190476190476191,0.5
FFT-IAT_SIZE,0.5,0.7272727272727273,0.75,0.5
FFT-SIZE,0.475,0.85,0.8875,0.8375
1
IAT,0.5,0.5,0.8484848484848486,0.5833333333333333
SIZE,0.5,0.787878787878788,0.8712121212121213,0.5
IAT_SIZE,0.5,0.7565217391304349,0.7565217391304349,0.5
STATS,0.4583333333333333,0.5833333333333333,0.6875,0.4583333333333333
FFT-IAT,0.5,0.5,0.6666666666666667,0.5722222222222222
FFT-IAT_SIZE,0.43478260869565216,0.7347826086956522,0.6130434782608696,0.5
FFT-SIZE,0.5,0.8712121212121213,0.9545454545454546,0.803030303030303
2
IAT,0.5,0.5,0.7857142857142858,0.4761904761904762
SIZE,0.6212121212121212,0.8712121212121213,0.9090909090909092,0.5454545454545454
IAT_SIZE,0.5714285714285714,0.8571428571428572,0.880952