## Setup

In [1]:
from scapy.all import rdpcap, IP, TCP, UDP, Raw
import plotly.graph_objects as go
import plotly.express as px
import networkx as nx
from pyvis.network import Network
import pandas as pd 
import numpy as np 

%matplotlib inline

In [2]:
amazon_ring_cap = rdpcap("data/AmazonRing/ring_merged.pcap")
pcap = amazon_ring_cap[IP]

# # Working Set
# pcap = pcap[50:80]

pcap

<IP from ring_merged.pcap: TCP:73052 UDP:73317 ICMP:155 Other:67>

In [3]:
pcap_len_summary = {'All': len(amazon_ring_cap), 'IP': len(pcap[IP]), 'TCP': len(pcap[TCP]), 'UDP': len(pcap[UDP])}
pcap_len_summary['IP_Other'] = pcap_len_summary['IP']-(pcap_len_summary['TCP'] + pcap_len_summary['UDP'])
pcap_len_summary['Other'] = pcap_len_summary['All']-pcap_len_summary['IP']
pcap_len_summary

{'All': 211325,
 'IP': 146591,
 'TCP': 73052,
 'UDP': 73317,
 'IP_Other': 222,
 'Other': 64734}

##### Dump Graphics

In [130]:
# from scapy.all import hexdump

# hexdump(pkt)

In [131]:
# pkt.svgdump(layer_shift=10)

## Build DF

In [4]:
num_pkts = len(pcap)

df_dict = {
  **{'time': None},
  **{'payload': None},
  **{"IP_" + field.name: None for field in IP.fields_desc}, 
  **{"TCP_" + field.name: None for field in TCP.fields_desc}, 
  **{"UDP_" + field.name: None for field in UDP.fields_desc},
  **{"Raw_" + field.name: None for field in Raw.fields_desc},
  }

In [5]:
# Set custom dtypes for the data
layer_dtypes_list = ['datetime64[s]', 'category', 'category', 'UInt8', 'UInt8', 'UInt16', 'UInt16',
       'UInt8', 'UInt16', 'UInt8', 'UInt8', 'UInt16', 'string',
       'string', 'object', 'UInt16', 'UInt16', 'UInt32', 'UInt32',
       'UInt8', 'UInt8', 'UInt16', 'UInt16', 'UInt16',
       'UInt16', 'object', 'UInt16', 'UInt16', 'UInt16',
       'UInt16', 'object']
layer_dtypes = dict(zip(df_dict.keys(), layer_dtypes_list))

# columns that need formatting before setting dtype to avoid conversion errors
layer_dtypes['time'] = 'float64'
layer_dtypes['payload'] = 'string'
layer_dtypes['IP_version'] = 'UInt8'
layer_dtypes['IP_flags'] = 'object'
layer_dtypes['TCP_flags'] = 'object'
layer_dtypes['IP_options'] = 'object'
layer_dtypes['TCP_options'] = 'object'
layer_dtypes['Raw_load'] = 'object'

In [6]:
df_dict = {k: pd.array(np.full(num_pkts, np.nan), dtype=layer_dtypes[k]) for k in df_dict}

In [7]:
layer_strings = ["IP", "TCP", "UDP", "Raw"]

for i, pkt in enumerate(pcap):
  df_dict['time'][i] = float(pkt.time)
  # Loop through payloads until lowest layer, ignoring ethernet frame
  while hasattr(pkt, 'payload'):
    pkt = pkt.payload
    layer = type(pkt)
    
    # Interesting layers for feature set
    if layer.__name__ in layer_strings:
      # Inserting pkt variables in df_dict
      for field in pkt.fields:
        field_name = layer.__name__ + "_" + field
        # print(f"df_dict[{field_name}][{i}] = pkt.{field}")
        exec(f"df_dict[field_name][i] = pkt.{field}")
      
      # Stop when reaching Raw layer
      if layer == Raw:
        df_dict['payload'][i] = layer.__name__
        break

    # Less important layers for feature set saved as 'payload'
    else:
      df_dict['payload'][i] = layer.__name__
      break

In [8]:
df = pd.DataFrame(df_dict)
df.head(1)

Unnamed: 0,time,payload,IP_version,IP_ihl,IP_tos,IP_len,IP_id,IP_flags,IP_frag,IP_ttl,...,TCP_flags,TCP_window,TCP_chksum,TCP_urgptr,TCP_options,UDP_sport,UDP_dport,UDP_len,UDP_chksum,Raw_load
0,1617818000.0,BOOTP,4,5,0,328,41358,DF,0,16,...,,,,,,68,67,308,44787,


In [9]:
df.sample(10)

Unnamed: 0,time,payload,IP_version,IP_ihl,IP_tos,IP_len,IP_id,IP_flags,IP_frag,IP_ttl,...,TCP_flags,TCP_window,TCP_chksum,TCP_urgptr,TCP_options,UDP_sport,UDP_dport,UDP_len,UDP_chksum,Raw_load
34117,1631688000.0,Raw,4,5,0,1480,57506,,0,249,...,PA,133.0,1854.0,0.0,"[(NOP, None), (NOP, None), (Timestamp, (422402...",,,,,b'\x9a\x89\x87\x19%\xf8\xed`\x16\x9eWb\xb6\xbd...
107610,1631688000.0,Raw,4,5,0,1177,54171,DF,0,64,...,,,,,,40153.0,64583.0,1157.0,36808.0,b'\x80\xe0C\x158\x08\x93\xf6\xefs\x16\xdd\x91\...
24943,1625918000.0,NoPayload,4,5,0,52,28003,,0,60,...,A,283.0,35013.0,0.0,"[(NOP, None), (NOP, None), (Timestamp, (419253...",,,,,
34434,1631688000.0,Raw,4,5,0,1480,57791,,0,249,...,PA,133.0,7461.0,0.0,"[(NOP, None), (NOP, None), (Timestamp, (422402...",,,,,b'\xf1^d\x7f\xc2\xdd\x8c\x87\xefP\xdcq\xac\xda...
98291,1631688000.0,Raw,4,5,0,1228,33106,DF,0,64,...,,,,,,55384.0,6164.0,1208.0,47245.0,b'\x00\x00\x00=\x00\x06\x99\x17\x00\x000\x83\x...
74331,1631688000.0,Raw,4,5,0,1492,39707,DF,0,64,...,A,913.0,6702.0,0.0,"[(NOP, None), (NOP, None), (Timestamp, (429494...",,,,,b'\xc9w\xff\x89\xa11\x01p\x1dE\xfdf~\xcb\xe7kH...
48697,1631688000.0,Raw,4,5,0,1480,5281,,0,249,...,A,133.0,41986.0,0.0,"[(NOP, None), (NOP, None), (Timestamp, (422403...",,,,,b'\xf0\x90\xd2\xb5\xc6\xa5\xa9\x86\xa7\x18\xca...
30043,1631688000.0,NoPayload,4,5,0,60,55586,DF,0,64,...,S,65535.0,27222.0,0.0,"[(MSS, 1460), (SAckOK, b''), (Timestamp, (1596...",,,,,
62568,1631688000.0,Raw,4,5,0,1480,19582,,0,249,...,PA,133.0,4890.0,0.0,"[(NOP, None), (NOP, None), (Timestamp, (422404...",,,,,b'\xa0\x13H>Am\xa2\xb3\x1f\xa6\xadCB\xf0\xc4\x...
89045,1631688000.0,Raw,4,5,0,1228,23848,DF,0,64,...,,,,,,55384.0,6164.0,1208.0,39931.0,b'\x00\x00\x007\x00\x07\xd9\xcc\x00\x00\x0ce\x...


### Clean up

In [10]:
# Time
df['time'] = pd.to_datetime(df['time'], unit='s')
# Payload category
df['payload'] = df['payload'].astype("category")
# IP Version category
df['IP_version'] = df['IP_version'].astype("category")
# IP Addresses category
df['IP_src'] = df['IP_src'].astype("category")
df['IP_dst'] = df['IP_dst'].astype("category")
# TCP ports category
df['TCP_sport'] = df['TCP_sport'].astype("string").astype("category")
df['TCP_dport'] = df['TCP_dport'].astype("string").astype("category")
# UDP ports category
df['UDP_sport'] = df['UDP_sport'].astype("string").astype("category")
df['UDP_dport'] = df['UDP_dport'].astype("string").astype("category")
# Flags
df['IP_flags'] = df['IP_flags'].apply(int).astype('UInt8')
# df['TCP_flags'] = df['TCP_flags'].fillna(0)
df['TCP_flags'] = df[df['TCP_flags'].notnull()]['TCP_flags'].apply(int).astype('UInt8')
# Options
df.drop("IP_options", axis=1, inplace=True)
df.drop("TCP_options", axis=1, inplace=True)
# Raw payload
df['Raw_load'] = df['Raw_load'].apply(lambda x: len(x.hex())//2).astype('UInt16') # Payload size in bytes

In [11]:
df.sample(3)

Unnamed: 0,time,payload,IP_version,IP_ihl,IP_tos,IP_len,IP_id,IP_flags,IP_frag,IP_ttl,...,TCP_reserved,TCP_flags,TCP_window,TCP_chksum,TCP_urgptr,UDP_sport,UDP_dport,UDP_len,UDP_chksum,Raw_load
26747,2021-07-10 11:57:31.764592384,NoPayload,4,5,0,64,28092,2,0,64,...,0.0,16.0,421.0,49530.0,0.0,,,,,1
33783,2021-09-15 06:42:25.926117888,Raw,4,5,0,1480,57203,0,0,249,...,0.0,24.0,133.0,10939.0,0.0,,,,,1428
89222,2021-09-15 06:45:11.991298048,Raw,4,5,0,1228,24026,2,0,64,...,,,,,,55384.0,6164.0,1208.0,56956.0,1200


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146591 entries, 0 to 146590
Data columns (total 29 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   time          146591 non-null  datetime64[ns]
 1   payload       146591 non-null  category      
 2   IP_version    146591 non-null  category      
 3   IP_ihl        146591 non-null  UInt8         
 4   IP_tos        146591 non-null  UInt8         
 5   IP_len        146591 non-null  UInt16        
 6   IP_id         146591 non-null  UInt16        
 7   IP_flags      146591 non-null  UInt8         
 8   IP_frag       146591 non-null  UInt16        
 9   IP_ttl        146591 non-null  UInt8         
 10  IP_proto      146591 non-null  UInt8         
 11  IP_chksum     146591 non-null  UInt16        
 12  IP_src        146591 non-null  category      
 13  IP_dst        146591 non-null  category      
 14  TCP_sport     73052 non-null   category      
 15  TCP_dport     730

## Graphs

#### Network Graph

In [13]:
# Get unique connections
connections = df[['IP_src', 'IP_dst']].groupby(['IP_src','IP_dst']).size().rename("count").reset_index()
connections = connections[connections['count'] != 0]

# Remove 0.0.0.0 and 255.255.255.255 connections
zero = np.logical_or(connections['IP_src'] == '0.0.0.0', connections['IP_dst'] == '0.0.0.0')
broad = np.logical_or(connections['IP_src'] == '255.255.255.255', connections['IP_dst'] == '255.255.255.255')
connections = connections[np.invert(np.logical_or(zero, broad))]
max_count = connections['count'].max()
connections

Unnamed: 0,IP_src,IP_dst,count
274,104.96.36.136,192.168.24.228,160
478,107.23.53.142,192.168.24.228,23
682,108.177.15.188,192.168.24.228,1
886,13.107.253.45,192.168.24.228,5
1086,13.32.121.20,192.168.10.237,8
...,...,...,...
41278,80.158.41.35,192.168.24.228,8
41478,80.158.43.38,192.168.10.237,9
41682,80.158.61.141,192.168.10.237,9
41890,82.165.229.138,192.168.24.228,91


In [14]:
connections = connections.itertuples(index=False, name=None)

In [15]:
G = nx.DiGraph()
G.add_weighted_edges_from(connections)

In [16]:
net = Network(directed=True, notebook=True)
net.from_nx(G)


In [17]:
for edge in net.edges:
  # edge['width'] = width_from_weight(edge['weight'], max_count=max_count)
  edge['width'] = (edge['weight']/max_count)*5
  edge['label'] = edge['weight']

In [18]:
# net.show_buttons(filter_=['physics', 'nodes', 'edges'])
options = """
var options = {
  "nodes": {
    "font": {
      "background": "rgba(255,125,104,0.77)"
    }
  },
  "edges": {
    "color": {
      "inherit": true
    },
    "scaling": {
      "max": 100
    },
    "font": {
      "size": 9,
      "background": "rgba(255,255,255,0.90)"
    },
    "smooth": {
      "forceDirection": "none"
    }
  },
  "physics": {
    "minVelocity": 0.75,
    "solver": "repulsion"
  }
}
"""
net.set_options(options)
net.show('network.html')

#### IP Addresses

##### Payload size

In [19]:
source_addresses = df.groupby("IP_src")['Raw_load'].sum().to_frame(name = 'size').reset_index()
source_addresses = source_addresses.sort_values("size", ascending=False)
source_addresses['size'] = source_addresses['size'].apply(lambda x: x/2**20)
source_addresses

Unnamed: 0,IP_src,size
67,192.168.10.161,6.827312e+01
7,13.32.99.37,4.203535e+01
79,192.168.24.221,4.783455e+00
106,3.250.253.255,3.019287e+00
120,34.249.149.250,1.721482e+00
...,...,...
38,142.251.36.238,9.536743e-07
87,213.239.239.166,9.536743e-07
85,2.22.144.242,9.536743e-07
44,172.217.16.132,9.536743e-07


In [20]:
fig = px.bar(
  source_addresses, 
  x="size", 
  y="IP_src", 
  # log_x=True,
  # color="size", 
  template="plotly", 
  title="Source addresses: summed payload sizes (Bytes sent)",
  orientation='h',
  labels={
          "IP_src": "IP address",
          "size": "Aggregated payload size (MB)",
        },
  )
fig.show()

In [21]:
destination_addresses = df.groupby("IP_dst")['Raw_load'].sum().to_frame(name='size').reset_index()
destination_addresses = destination_addresses.sort_values("size", ascending=False)
destination_addresses['size'] = destination_addresses['size'].apply(lambda x: x/2**20)
destination_addresses

Unnamed: 0,IP_dst,size
65,192.168.10.161,4.218008e+01
103,3.69.171.62,1.788102e+01
129,46.137.48.202,1.439988e+01
102,3.250.253.255,1.367944e+01
127,44.197.209.144,1.155018e+01
...,...,...
130,5.9.57.158,9.536743e-07
62,192.168.10.1,9.536743e-07
200,80.158.44.94,9.536743e-07
77,213.239.239.166,9.536743e-07


In [22]:
fig = px.bar(
  destination_addresses, 
  x="size", 
  y="IP_dst", 
  # log_x=True,
  # color="size", 
  template="plotly", 
  title="Destination addresses: summed payload sizes (Bytes Received)",
  orientation='h',
  labels={
        "IP_dst": "IP address",
        "size": "Aggregated payload size (MB)",
      },
  )
fig.show()

##### Distribution of payload sizes over time

In [23]:
top_src_addr = source_addresses.iloc[0]['IP_src']
df_top_src_addr = df[df['IP_src'] == top_src_addr][['time', 'IP_src', 'IP_dst', 'Raw_load']]

In [24]:
scatter_plots = []
for address in df_top_src_addr['IP_dst'].unique():
  sub_df = df_top_src_addr[df_top_src_addr['IP_dst'] == address]
  scatter_plots.append(
    go.Scatter(x=sub_df['time'], y=sub_df['Raw_load'], name=address)
    )

In [25]:
layout = go.Layout(
    title=f"IP packet sizes sent from {top_src_addr} over time",
    xaxis_title="Time",
    yaxis_title="Payload size (Bytes)",
    legend_title_text='Destination addresses'
)

fig = go.Figure(
  data=scatter_plots,
  layout=layout,
  )
fig.show()

#### TCP Ports

In [26]:
source_ports = df.groupby("TCP_sport")['Raw_load'].sum().to_frame(name='size').reset_index()
source_ports = source_ports.sort_values("size", ascending=False)
source_ports['size'] = source_ports['size'].apply(lambda x: x/2**10)
source_ports

Unnamed: 0,TCP_sport,size
271,443,47654.935547
30,33699,11826.601562
63,35470,4177.538086
168,38843,1442.998047
38,34153,966.997070
...,...,...
308,44772,0.000977
396,48608,0.000977
165,38712,0.000977
406,49800,0.000977


In [27]:
import plotly.express as px

fig = px.bar(
  source_ports, 
  x="size", 
  y="TCP_sport", 
  # log_x=True,
  # color="size", 
  template="plotly", 
  title="Source ports: summed payload sizes (Bytes sent)",
  orientation='h',
  labels={
          "TCP_sport": "TCP Port",
          "size": "Aggregated payload size (kB)",
        },
  )
fig.show()

In [28]:
destination_ports = df.groupby("TCP_dport")['Raw_load'].sum().to_frame(name='size').reset_index()
destination_ports = destination_ports.sort_values("size", ascending=False)
destination_ports['size'] = destination_ports['size'].apply(lambda x: x/2**10)
destination_ports

Unnamed: 0,TCP_dport,size
582,58682,43044.195312
623,6505,11827.387695
266,443,4358.990234
465,5201,4177.538086
524,54640,558.167969
...,...,...
384,48520,0.000977
31,33756,0.000977
565,57776,0.000977
303,44772,0.000977


In [29]:
fig = px.bar(
  destination_ports, 
  x="size", 
  y="TCP_dport", 
  # log_x=True,
  # color="size", 
  template="plotly", 
  title="Destination ports: summed payload sizes (Bytes Received)",
  orientation='h',
  labels={
        "TCP_dport": "TCP port",
        "size": "Aggregated payload size (kB)",
      },
  )
fig.show()

##### Distribution of TCP payload sizes over time

In [30]:
top_src_port = source_ports.iloc[0]['TCP_sport']
df_top_src_port = df[df['TCP_sport'] == top_src_port][['time', 'TCP_sport', 'TCP_dport', 'Raw_load']]

In [31]:
scatter_plots = []
for port in df_top_src_port['TCP_dport'].unique():
  sub_df = df_top_src_port[df_top_src_port['TCP_dport'] == port]
  scatter_plots.append(
    go.Scatter(x=sub_df['time'], y=sub_df['Raw_load'], name=port)
    )

In [32]:
# Using graph_objects
import plotly.graph_objects as go

layout = go.Layout(
    title=f"TCP packet sizes sent from {top_src_port} over time",
    xaxis_title="Time",
    yaxis_title="Payload size (Bytes)",
    legend_title_text='Destination ports'
)

fig = go.Figure(
  data=scatter_plots,
  layout=layout,
  )
fig.show()

#### UDP Ports

In [33]:
source_ports = df.groupby("UDP_sport")['Raw_load'].sum().to_frame(name='size').reset_index()
source_ports = source_ports.sort_values("size", ascending=False)
source_ports['size'] = source_ports['size'].apply(lambda x: x/2**10)
source_ports

Unnamed: 0,UDP_sport,size
200,55384,18309.378906
80,40153,14742.888672
97,42084,13959.504883
163,51504,4249.308594
125,45890,3030.221680
...,...,...
38,35604,0.000977
35,35241,0.000977
218,58228,0.000977
29,34842,0.000977


In [34]:
import plotly.express as px

fig = px.bar(
  source_ports, 
  x="size", 
  y="UDP_sport", 
  # log_x=True,
  # color="size", 
  template="plotly", 
  title="Source ports: summed payload sizes (Bytes sent)",
  orientation='h',
  labels={
          "UDP_sport": "UDP Port",
          "size": "Aggregated payload size (kB)",
        },
  )
fig.show()

In [35]:
destination_ports = df.groupby("UDP_dport")['Raw_load'].sum().to_frame(name='size').reset_index()
destination_ports = destination_ports.sort_values("size", ascending=False)
destination_ports['size'] = destination_ports['size'].apply(lambda x: x/2**10)
destination_ports

Unnamed: 0,UDP_dport,size
111,6164,18309.378906
113,64583,14742.888672
60,47352,13959.504883
119,9078,5999.103516
103,59304,2919.017578
...,...,...
30,36586,0.000977
56,45733,0.000977
27,35777,0.000977
24,35604,0.000977


In [36]:
fig = px.bar(
  destination_ports, 
  x="size", 
  y="UDP_dport", 
  # log_x=True,
  # color="size", 
  template="plotly", 
  title="Destination ports: summed payload sizes (Bytes Received)",
  orientation='h',
  labels={
        "UDP_dport": "UDP port",
        "size": "Aggregated payload size (kB)",
      },
  )
fig.show()

##### Distribution of TCP payload sizes over time

In [37]:
top_src_port = source_ports.iloc[0]['UDP_sport']
df_top_src_port = df[df['UDP_sport'] == top_src_port][['time', 'UDP_sport', 'UDP_dport', 'Raw_load']]

In [38]:
scatter_plots = []
for port in df_top_src_port['UDP_dport'].unique():
  sub_df = df_top_src_port[df_top_src_port['UDP_dport'] == port]
  scatter_plots.append(
    go.Scatter(x=sub_df['time'], y=sub_df['Raw_load'], name=port)
    )

In [39]:
layout = go.Layout(
    title=f"UDP packet sizes sent from {top_src_port} over time",
    xaxis_title="Time",
    yaxis_title="Payload size (Bytes)",
    legend_title_text='Destination ports'
)

fig = go.Figure(
  data=scatter_plots,
  layout=layout,
  )
fig.show()

#### Inter-arrival/Inter-departure time

In [40]:
df[['time', 'IP_src', 'IP_dst']]

Unnamed: 0,time,IP_src,IP_dst
0,2021-04-07 17:53:49.903643648,0.0.0.0,255.255.255.255
1,2021-04-07 17:53:50.053728768,0.0.0.0,255.255.255.255
2,2021-04-07 17:53:50.118687744,0.0.0.0,255.255.255.255
3,2021-04-07 17:53:51.814193408,0.0.0.0,255.255.255.255
4,2021-04-07 17:53:51.958051840,0.0.0.0,255.255.255.255
...,...,...,...
146586,2021-09-15 06:50:14.162480128,192.168.10.161,54.220.219.16
146587,2021-09-15 06:50:14.162530816,192.168.10.161,54.220.219.16
146588,2021-09-15 06:50:14.162576896,192.168.10.161,54.220.219.16
146589,2021-09-15 06:50:14.162621952,192.168.10.161,54.220.219.16


In [41]:
# Inter-departure time
frames = []
for addr in (set(df['IP_src'].unique()) | set(df['IP_dst'].unique())):
  tmp_df = df[df["IP_src"] == addr].sort_values("time", ascending=True)
  # Create series and calculate time until the next packet departs
  tmp_df['IP_int_dpt_time'] = tmp_df['time'].diff()
  frames.append(tmp_df)
df = pd.concat(frames).sort_index()

# Inter-arrival time
frames = []
for addr in (set(df['IP_src'].unique()) | set(df['IP_dst'].unique())):
  tmp_df = df[df["IP_dst"] == addr].sort_values("time", ascending=True)
  # Create series and calculate time until the next packet departs
  tmp_df['IP_int_arr_time'] = tmp_df['time'].diff()
  frames.append(tmp_df)
df = pd.concat(frames).sort_index()
df[['time', 'IP_src', 'IP_dst', 'IP_int_dpt_time', 'IP_int_arr_time']]


Unnamed: 0,time,IP_src,IP_dst,IP_int_dpt_time,IP_int_arr_time
0,2021-04-07 17:53:49.903643648,0.0.0.0,255.255.255.255,NaT,NaT
1,2021-04-07 17:53:50.053728768,0.0.0.0,255.255.255.255,0 days 00:00:00.150085120,0 days 00:00:00.150085120
2,2021-04-07 17:53:50.118687744,0.0.0.0,255.255.255.255,0 days 00:00:00.064958976,0 days 00:00:00.064958976
3,2021-04-07 17:53:51.814193408,0.0.0.0,255.255.255.255,0 days 00:00:01.695505664,0 days 00:00:01.695505664
4,2021-04-07 17:53:51.958051840,0.0.0.0,255.255.255.255,0 days 00:00:00.143858432,0 days 00:00:00.143858432
...,...,...,...,...,...
146586,2021-09-15 06:50:14.162480128,192.168.10.161,54.220.219.16,0 days 00:00:00.001092096,0 days 00:00:00.001092096
146587,2021-09-15 06:50:14.162530816,192.168.10.161,54.220.219.16,0 days 00:00:00.000050688,0 days 00:00:00.000050688
146588,2021-09-15 06:50:14.162576896,192.168.10.161,54.220.219.16,0 days 00:00:00.000046080,0 days 00:00:00.000046080
146589,2021-09-15 06:50:14.162621952,192.168.10.161,54.220.219.16,0 days 00:00:00.000045056,0 days 00:00:00.000045056


Retroactive analysis would allow for analyses of streams -> burst/burstiness

In [42]:
# Stream analysis of top source address with corresponding destination
top_src_addr, top_dst_addr = df[df['IP_src'] == top_src_addr][['IP_src', 'IP_dst']].describe().loc['top']
top_src_addr, top_dst_addr 

('192.168.10.161', '46.137.48.202')

In [43]:
top_stream_df = df[df['IP_src'] == top_src_addr]
top_stream_df = top_stream_df[top_stream_df['IP_dst'] == top_dst_addr]
top_stream_df = top_stream_df[['time', 'payload', 'IP_src', 'IP_dst', 'IP_int_arr_time', 'IP_int_dpt_time', 'Raw_load']]
top_stream_df

Unnamed: 0,time,payload,IP_src,IP_dst,IP_int_arr_time,IP_int_dpt_time,Raw_load
101725,2021-09-15 06:47:53.466924032,NoPayload,192.168.10.161,46.137.48.202,NaT,0 days 00:00:00.033704192,1
101727,2021-09-15 06:47:53.496035072,NoPayload,192.168.10.161,46.137.48.202,0 days 00:00:00.029111040,0 days 00:00:00.029111040,1
101728,2021-09-15 06:47:53.500571904,Raw,192.168.10.161,46.137.48.202,0 days 00:00:00.004536832,0 days 00:00:00.004536832,517
101734,2021-09-15 06:47:53.530958848,NoPayload,192.168.10.161,46.137.48.202,0 days 00:00:00.030386944,0 days 00:00:00.030386944,1
101735,2021-09-15 06:47:53.532646912,NoPayload,192.168.10.161,46.137.48.202,0 days 00:00:00.001688064,0 days 00:00:00.001688064,1
...,...,...,...,...,...,...,...
118322,2021-09-15 06:48:53.682923008,Raw,192.168.10.161,46.137.48.202,0 days 00:00:00.003336192,0 days 00:00:00.003336192,1194
118323,2021-09-15 06:48:53.682967040,Raw,192.168.10.161,46.137.48.202,0 days 00:00:00.000044032,0 days 00:00:00.000044032,1194
118329,2021-09-15 06:48:53.714829056,NoPayload,192.168.10.161,46.137.48.202,0 days 00:00:00.031862016,0 days 00:00:00.031862016,1
118330,2021-09-15 06:48:53.719194880,NoPayload,192.168.10.161,46.137.48.202,0 days 00:00:00.004365824,0 days 00:00:00.004365824,1


In [44]:
layout = go.Layout(
    title=f"Packet sizes of most frequent communication partners from {top_src_addr} to {top_dst_addr} over time",
    xaxis_title="Time",
    yaxis_title="Payload size (Bytes)",
    # legend_title_text=''
)

fig = go.Figure(
  data=[go.Scatter(x=top_stream_df['time'], y=top_stream_df['Raw_load'])],
  layout=layout,
  )
fig.show()

Fur our purposes we define the parameters that comprise a burst as:

In [45]:
BURST_TIME_THRESHOLD = 0.3 # Only packets sent within this time interval will be considered for the current burst
BURST_SIZE_THRESHOLD = 300 # Only packets that have at least this many bytes will be considered for the current burst

In [46]:
row = df.iloc[2]

df.head(3)

Unnamed: 0,time,payload,IP_version,IP_ihl,IP_tos,IP_len,IP_id,IP_flags,IP_frag,IP_ttl,...,TCP_window,TCP_chksum,TCP_urgptr,UDP_sport,UDP_dport,UDP_len,UDP_chksum,Raw_load,IP_int_dpt_time,IP_int_arr_time
0,2021-04-07 17:53:49.903643648,BOOTP,4,5,0,328,41358,2,0,16,...,,,,68,67,308,44787,1,NaT,NaT
1,2021-04-07 17:53:50.053728768,BOOTP,4,5,0,328,41513,2,0,16,...,,,,68,67,308,4975,1,0 days 00:00:00.150085120,0 days 00:00:00.150085120
2,2021-04-07 17:53:50.118687744,BOOTP,4,5,0,328,41598,2,0,16,...,,,,68,67,308,64361,1,0 days 00:00:00.064958976,0 days 00:00:00.064958976


In [47]:
df['IP_burst_ix'] = pd.array(np.full(len(df), np.nan), dtype='int32')

burst_ix = 0
i = df[df['Raw_load'].gt(BURST_SIZE_THRESHOLD)].index[0] # first row with relevant payload
pred_row = df.iloc[i-1]
row = df.iloc[i]
succ_row = df.iloc[i+1]
