# Pipeline
PParser: Feature extraction module

Input: pcap_file
```

=> PCAP 
    
    pcap2flows(): chunks of up to 600s
    
    flows2subflows(): split by intervals or quantiles

-> get_features(): from the paper

-> label_flows()
```

Output: '.dat' files to store the features

## iotlab_fridgedataset inspection
* Process: get to know the dataset better, following Kun's idea
* Findings:
    * Previously we were mixing all packets together, but each pcap actually contains separate flows, each identified by a 5-tuple (ip.source, ip.destination, source_port, destination_port, protocol). Protocol number 6 stands for TCP and 17 stands for UDP; the number printed after the tuple is the number of packets in that flow.
    * Pcaps for fridge_door opening
        * Each door file has 12 ~ 14 flows
        * Each flow has 2 ~ 41 packets
        * fridge_door3.pcap might be a mistake: it yields only 2 flows
        
    * Pcaps for fridge_idle
        * Each idle file has 0 ~ 5 flows
        * Each flow has 2 packets

* Conclusion:
    * The door-opening and idle pcaps can be easily distinguished just by looking at the number of flows and the number of packets within each flow

In [40]:
import pparser.parser
import numpy as np

# Report how many flows each of the six door-opening captures contains.
pcap_file = 'iotlab_fridgedataset/fridge_door'

for i in range(1, 7):
    pp = pparser.parser._pcap2flows(f'{pcap_file}{i}.pcap')
    print(len(pp))

# `pp` still refers to the flows of the last capture parsed above, so only
# that capture is detailed here: one line per flow.
for flow in pp:
    print(flow[0], len(flow[1]))  # 5-tuple, number of packets in the flow

pcap_file: iotlab_fridgedataset/fridge_door1.pcap
12
pcap_file: iotlab_fridgedataset/fridge_door2.pcap
12
pcap_file: iotlab_fridgedataset/fridge_door3.pcap
2
pcap_file: iotlab_fridgedataset/fridge_door4.pcap
13
pcap_file: iotlab_fridgedataset/fridge_door5.pcap
14
pcap_file: iotlab_fridgedataset/fridge_door6.pcap
13
('192.168.143.43', '52.196.247.117', 58496, 5223, 6) 2
('192.168.143.43', '52.14.6.98', 43824, 443, 6) 6
('52.14.6.98', '192.168.143.43', 443, 43824, 6) 6
('192.168.143.43', '52.10.220.236', 58433, 443, 6) 33
('52.10.220.236', '192.168.143.43', 443, 58433, 6) 41
('192.168.143.43', '52.10.220.236', 58434, 443, 6) 29
('52.10.220.236', '192.168.143.43', 443, 58434, 6) 37
('192.168.143.43', '52.10.220.236', 58435, 443, 6) 34
('52.10.220.236', '192.168.143.43', 443, 58435, 6) 41
('192.168.143.43', '52.10.220.236', 58436, 443, 6) 20
('52.10.220.236', '192.168.143.43', 443, 58436, 6) 15
('192.168.143.43', '18.200.190.216', 41507, 443, 6) 2
('18.200.190.216', '192.168.143.43', 443, 

In [41]:
import pparser.parser
import numpy as np

# Report how many flows each of the ten idle captures contains.
pcap_file = 'iotlab_fridgedataset/fridge_idle'

for i in range(1, 11):
    pp = pparser.parser._pcap2flows(f'{pcap_file}{i}.pcap')
    print(len(pp))

# `pp` still refers to the flows of the last capture parsed above, so only
# that capture is detailed here: one line per flow.
for flow in pp:
    print(flow[0], len(flow[1]))  # 5-tuple, number of packets in the flow

pcap_file: iotlab_fridgedataset/fridge_idle1.pcap
2
pcap_file: iotlab_fridgedataset/fridge_idle2.pcap
3
pcap_file: iotlab_fridgedataset/fridge_idle3.pcap
5
pcap_file: iotlab_fridgedataset/fridge_idle4.pcap
3
pcap_file: iotlab_fridgedataset/fridge_idle5.pcap
0
pcap_file: iotlab_fridgedataset/fridge_idle6.pcap
2
pcap_file: iotlab_fridgedataset/fridge_idle7.pcap
5
pcap_file: iotlab_fridgedataset/fridge_idle8.pcap
5
pcap_file: iotlab_fridgedataset/fridge_idle9.pcap
3
pcap_file: iotlab_fridgedataset/fridge_idle10.pcap
3
('192.168.143.43', '52.196.247.117', 58496, 5223, 6) 2
('192.168.143.43', '52.14.6.98', 43824, 443, 6) 2
('192.168.143.43', '224.0.0.251', 5353, 5353, 17) 2


In [55]:
import os

from pparser.parser import PCAP
from utils.tool import dump_data

RANDOM_STATE = 42


def save_feat(pcap_file, feat_type, label, file_num):
    """Extract one feature type from a pcap and dump it to a '.dat' file.

    The pcap is split into flows, every flow is given the same ``label``,
    ``feat_type`` features are computed for each flow, and the resulting
    ``(X, y)`` pair is written to
    ``out/<dirname(pcap_file)>/<feat_type>_<label>_<file_num>.dat``.
    The feature matrix shape and the timings recorded on ``pcap2flows`` and
    ``flow2features`` are printed afterwards.

    Args:
        pcap_file: Path of the pcap to parse, e.g.
            'iotlab_fridgedataset/fridge_door1.pcap'.
        feat_type: Feature type forwarded to ``PCAP.flow2features`` (the
            notebook uses 'IAT', 'SIZE', 'IAT_SIZE' and 'STATS').
        label: Label assigned to every flow of this pcap (the notebook uses
            1 for door-opening captures and 0 for idle captures).
        file_num: Index of the pcap; only used in the output filename.

    Returns:
        None. The features are written to disk via ``dump_data``.
    """
    # NOTE(review): flow_ptks_thres=2 presumably drops flows with fewer than
    # two packets -- confirm against the PCAP implementation.
    pp = PCAP(pcap_file, flow_ptks_thres=2, verbose=0, random_state=RANDOM_STATE)

    # Extract flows from the pcap.
    # NOTE(review): q_interval=0.9 looks like the quantile used to split
    # flows into subflows -- confirm.
    pp.pcap2flows(q_interval=0.9)

    # No label file is supplied, so every flow receives `label`.
    pp.label_flows(label_file='', label=label)

    # Extract features from each flow for the requested feat_type.
    pp.flow2features(feat_type, fft=False, header=False)

    # Dump data to disk. The output tree mirrors the pcap's directory under
    # 'out/'. NOTE: for an absolute pcap_file, os.path.join discards the
    # 'out' prefix and the file lands next to the pcap's directory instead.
    X, y = pp.features, pp.labels
    out_dir = os.path.join('out', os.path.dirname(pcap_file))
    # Make sure the target directory exists so the dump does not depend on
    # a previous run (or on dump_data) having created it.
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(out_dir, f'{feat_type}_{label}_{file_num}.dat')
    dump_data((X, y), out_file=out_file)

    print(X.shape, pp.pcap2flows.tot_time, pp.flow2features.tot_time)

In [56]:
# Dump every feature type for each of the six door-opening captures (label 1).
pcap_file = 'iotlab_fridgedataset/fridge_door'
feat_types = ['IAT', 'SIZE', 'IAT_SIZE', 'STATS']
for file_num in range(1, 7):
    pcap_path = f'{pcap_file}{file_num}.pcap'
    for feat_type in feat_types:
        save_feat(pcap_path, feat_type, 1, file_num)

'_pcap2flows()' starts at 2020-07-02 17:08:32
'_pcap2flows()' ends at 2020-07-02 17:08:32 and takes 0.002 mins.
'_label_flows()' starts at 2020-07-02 17:08:32
'_label_flows()' ends at 2020-07-02 17:08:32 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:32
'_flow2features()' ends at 2020-07-02 17:08:32 and takes 0.0 mins.
(12, 2) 0.002 0.0
'_pcap2flows()' starts at 2020-07-02 17:08:32
'_pcap2flows()' ends at 2020-07-02 17:08:32 and takes 0.0017 mins.
'_label_flows()' starts at 2020-07-02 17:08:32
'_label_flows()' ends at 2020-07-02 17:08:32 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:32
'_flow2features()' ends at 2020-07-02 17:08:32 and takes 0.0001 mins.
(12, 2) 0.0017 0.0001
'_pcap2flows()' starts at 2020-07-02 17:08:32
'_pcap2flows()' ends at 2020-07-02 17:08:32 and takes 0.0019 mins.
'_label_flows()' starts at 2020-07-02 17:08:32
'_label_flows()' ends at 2020-07-02 17:08:32 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:32
'_

'_pcap2flows()' ends at 2020-07-02 17:08:34 and takes 0.0021 mins.
'_label_flows()' starts at 2020-07-02 17:08:34
'_label_flows()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:34
'_flow2features()' ends at 2020-07-02 17:08:34 and takes 0.0002 mins.
(13, 2) 0.0021 0.0002


In [57]:
# Dump every feature type for the idle captures (label 0).
pcap_file = 'iotlab_fridgedataset/fridge_idle'
feat_types = ['IAT', 'SIZE', 'IAT_SIZE', 'STATS']
for file_num in range(1, 11):
    # fridge_idle5.pcap is skipped -- presumably because it yielded 0 flows
    # in the inspection cell earlier in this notebook; confirm.
    if file_num == 5:
        continue
    pcap_path = f'{pcap_file}{file_num}.pcap'
    for feat_type in feat_types:
        save_feat(pcap_path, feat_type, 0, file_num)

'_pcap2flows()' starts at 2020-07-02 17:08:34
'_pcap2flows()' ends at 2020-07-02 17:08:34 and takes 0.0001 mins.
'_label_flows()' starts at 2020-07-02 17:08:34
'_label_flows()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:34
'_flow2features()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
(1, 2) 0.0001 0.0
'_pcap2flows()' starts at 2020-07-02 17:08:34
'_pcap2flows()' ends at 2020-07-02 17:08:34 and takes 0.0001 mins.
'_label_flows()' starts at 2020-07-02 17:08:34
'_label_flows()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:34
'_flow2features()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
(1, 2) 0.0001 0.0
'_pcap2flows()' starts at 2020-07-02 17:08:34
'_pcap2flows()' ends at 2020-07-02 17:08:34 and takes 0.0002 mins.
'_label_flows()' starts at 2020-07-02 17:08:34
'_label_flows()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:34
'_flow2f

'_pcap2flows()' ends at 2020-07-02 17:08:34 and takes 0.0005 mins.
'_label_flows()' starts at 2020-07-02 17:08:34
'_label_flows()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:34
'_flow2features()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
(4, 2) 0.0005 0.0
'_pcap2flows()' starts at 2020-07-02 17:08:34
'_pcap2flows()' ends at 2020-07-02 17:08:34 and takes 0.0002 mins.
'_label_flows()' starts at 2020-07-02 17:08:34
'_label_flows()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:34
'_flow2features()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
(4, 2) 0.0002 0.0
'_pcap2flows()' starts at 2020-07-02 17:08:34
'_pcap2flows()' ends at 2020-07-02 17:08:34 and takes 0.0003 mins.
'_label_flows()' starts at 2020-07-02 17:08:34
'_label_flows()' ends at 2020-07-02 17:08:34 and takes 0.0 mins.
'_flow2features()' starts at 2020-07-02 17:08:34
'_flow2features()' ends at 2020-07-02 17:08:34 and tak

# Next steps
* Test the classifiers on the fridge dataset and on the larger iot_data dataset.
* Try to integrate it within the DeepLens code.