In [2]:
%%javascript

IPython.tab_as_tab_everywhere = function(use_tabs) {
    if (use_tabs === undefined) {
        use_tabs = true; 
    }

    // apply setting to all current CodeMirror instances
    IPython.notebook.get_cells().map(
        function(c) {  return c.code_mirror.options.indentWithTabs=use_tabs;  }
    );
    // make sure new CodeMirror instances created in the future also use this setting
    CodeMirror.defaults.indentWithTabs=use_tabs;

    };

IPython.tab_as_tab_everywhere()

<IPython.core.display.Javascript object>

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import re

# Sample input text
input_text = """
Dataset loaded from pickle at Wyze_camera.pkl
stddev_ports = 7839.19839 Excellent randomization (stdev 7839.20 bits 14.73)
stddev_id    = 188.10117 Poor randomization (stdev 188.10 bits 9.35)
Dataset loaded from pickle at alexa_swan_kettle.pkl
stddev_ports = 16288.95114 Excellent randomization (stdev 16288.95 bits 15.78)
stddev_id    = 17311.11043 Excellent randomization (stdev 17311.11 bits 15.87)
Dataset loaded from pickle at arlo_camera_pro4.pkl
stddev_ports = 127.04026 Poor randomization (stdev 127.04 bits 8.78)
stddev_id    = 0.04335 No randomness
Dataset loaded from pickle at arlo_chime.pkl
stddev_ports = 19419.84077 Excellent randomization (stdev 19419.84 bits 16.04)
stddev_id    = 19359.94119 Excellent randomization (stdev 19359.94 bits 16.03)
Dataset loaded from pickle at belkin_plug.pkl
stddev_ports = 1115.52662 Good randomization (stdev 1115.53 bits 11.91)
stddev_id    = 3643.14526 Good randomization (stdev 3643.15 bits 13.62)
Dataset loaded from pickle at blurams_security_camera.pkl
stddev_ports = 8949.05047 Excellent randomization (stdev 8949.05 bits 14.92)
stddev_id    = 5010.94474 Excellent randomization (stdev 5010.94 bits 14.08)
Dataset loaded from pickle at bose_speaker.pkl
stddev_ports = 5373.70860 Excellent randomization (stdev 5373.71 bits 14.18)
stddev_id    = 17632.52664 Excellent randomization (stdev 17632.53 bits 15.90)
Dataset loaded from pickle at eufy_chime.pkl
stddev_ports = 8674.98227 Excellent randomization (stdev 8674.98 bits 14.87)
stddev_id    = 14.68451 Poor randomization (stdev 14.68 bits 5.67)
Dataset loaded from pickle at furbo_dog_camera.pkl
stddev_ports = 8359.88165 Excellent randomization (stdev 8359.88 bits 14.82)
stddev_id    = 19464.92892 Excellent randomization (stdev 19464.93 bits 16.04)
Dataset loaded from pickle at ring_chime_pro.pkl
stddev_ports = 8968.04164 Excellent randomization (stdev 8968.04 bits 14.92)
stddev_id    = 8217.52478 Excellent randomization (stdev 8217.52 bits 14.80)
Dataset loaded from pickle at simplisafe_doorbell.pkl
stddev_ports = 7412.65275 Excellent randomization (stdev 7412.65 bits 14.65)
stddev_id    = 18032.76627 Excellent randomization (stdev 18032.77 bits 15.93)
Dataset loaded from pickle at sonos_speaker.pkl
stddev_ports = 6283.85530 Excellent randomization (stdev 6283.86 bits 14.41)
stddev_id    = 20528.24133 Excellent randomization (stdev 20528.24 bits 16.12)
Dataset loaded from pickle at tapo_plug110_33.pkl
stddev_ports = 17455.52603 Excellent randomization (stdev 17455.53 bits 15.88)
stddev_id    = 19151.35361 Excellent randomization (stdev 19151.35 bits 16.02)
Dataset loaded from pickle at tapo_plug110_8.pkl
stddev_ports = 19322.39006 Excellent randomization (stdev 19322.39 bits 16.03)
stddev_id    = 12711.07402 Excellent randomization (stdev 12711.07 bits 15.42)
Dataset loaded from pickle at vtech_baby_camera.pkl
stddev_ports = 8055.71711 Excellent randomization (stdev 8055.72 bits 14.77)
stddev_id    = 438.00457 Good randomization (stdev 438.00 bits 10.57)
Dataset loaded from pickle at wyze_cam_pan_v2.pkl
stddev_ports = 8051.83485 Excellent randomization (stdev 8051.83 bits 14.77)
stddev_id    = 206.38829 Poor randomization (stdev 206.39 bits 9.48)
"""

# Parsing the text to extract device names, standard deviations, and categories
device_data = []
lines = input_text.strip().split("\n")

for i in range(0, len(lines), 3):
	device_name = re.search(r'at (.+?)\.pkl', lines[i]).group(1)
	stddev_ports = float(re.search(r'stddev_ports = ([\d.]+)', lines[i+1]).group(1))
	category_ports = re.search(r'(Excellent|Good|Poor) randomization', lines[i+1]).group(1)
	stddev_id = float(re.search(r'stddev_id\s+=\s+([\d.]+)', lines[i+2]).group(1))
	category_id = re.search(r'(Excellent|Good|Poor|No randomness)', lines[i+2]).group(1)

	device_data.append((device_name, stddev_ports, category_ports, stddev_id, category_id))

# Convert to DataFrame for plotting
import pandas as pd
df = pd.DataFrame(device_data, columns=['Device', 'StdDev Ports', 'Port Category', 'StdDev ID', 'ID Category'])

# Define colors based on categories
color_map = {
	"Excellent": "green",
	"Good": "blue",
	"Poor": "red",
	"No randomness": "gray"
}

df['Port Color'] = df['Port Category'].map(color_map)
df['ID Color'] = df['ID Category'].map(color_map)

print(df)

# # Plot standard deviation of source ports
# plt.figure(figsize=(12, 6))
# plt.bar(df['Device'], df['StdDev Ports'], color=df['Port Color'])
# plt.xlabel('Device Name')
# plt.ylabel('Standard Deviation of Source Ports')
# # plt.title('Standard Deviation of Source Ports per Device')
# plt.xticks(rotation=45, ha='right')
# plt.legend(handles=[plt.Rectangle((0,0),1,1, color=color_map[c]) for c in color_map.keys()], labels=color_map.keys())
# plt.tight_layout()
# plt.savefig("stddev_ports_plot.pdf")
# plt.close()

# # Plot standard deviation of DNS transaction IDs
# plt.figure(figsize=(12, 6))
# plt.bar(df['Device'], df['StdDev ID'], color=df['ID Color'])
# plt.xlabel('Device Name')
# # plt.ylabel('Standard Deviation of DNS Transaction IDs')
# # plt.title('Standard Deviation of DNS Transaction IDs per Device')
# plt.xticks(rotation=45, ha='right')
# plt.legend(handles=[plt.Rectangle((0,0),1,1, color=color_map[c]) for c in color_map.keys()], labels=color_map.keys())
# plt.tight_layout()
# plt.savefig("stddev_id_plot.pdf")
# plt.close()

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

# Settings
bar_width_inches = 3
bar_height_inches = 2.2

# Hatch and color setup
hatches = ['/', '\\', '|', '-', '+', 'x', 'o', 'O', '.', '*']
colors = cm.get_cmap('tab10', 10)  # Grayscale color map

# === Source Ports Plot ===
plt.figure(figsize=(bar_width_inches, bar_height_inches))
df2 = df[df["Port Category"] != "Excellent"]
bars = plt.bar(df2['Device'], df2['StdDev Ports'],
			   edgecolor='black')

# Apply hatch + color
for i, bar in enumerate(bars):
	bar.set_hatch(hatches[i % len(hatches)])
	bar.set_facecolor(colors(i % 10))  # Cycle through grayscale

plt.xlabel('Device Name', fontsize=10)
plt.ylabel(r'$\sigma$ (Source Ports)', fontsize=10)
plt.xticks(rotation=60, ha='right', fontsize=10)
plt.yticks(fontsize=10)
# plt.tight_layout(pad=0.5)
plt.savefig("stddev_ports_plot.pdf", bbox_inches='tight', dpi=300)
plt.close()

# === Transaction IDs Plot ===
plt.figure(figsize=(bar_width_inches, bar_height_inches))
df2 = df[df["ID Category"] != "Excellent"]
bars = plt.bar(df2['Device'], df2['StdDev ID'],
			   edgecolor='black')

# Apply hatch + color
for i, bar in enumerate(bars):
	bar.set_hatch(hatches[i % len(hatches)])
	bar.set_facecolor(colors(i % 10))

plt.xlabel('Device Name', fontsize=10)
plt.ylabel(r'$\sigma$ (Transaction IDs)', fontsize=10)
plt.xticks(rotation=60, ha='right', fontsize=10)
plt.yticks(fontsize=10)
# plt.tight_layout(pad=0.5)
plt.savefig("stddev_id_plot.pdf", bbox_inches='tight', dpi=300)
plt.close()

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
  colors = cm.get_cmap('tab10', 10)  # Grayscale color map


                     Device  StdDev Ports Port Category    StdDev ID  \
0               Wyze_camera    7839.19839     Excellent    188.10117   
1         alexa_swan_kettle   16288.95114     Excellent  17311.11043   
2          arlo_camera_pro4     127.04026          Poor      0.04335   
3                arlo_chime   19419.84077     Excellent  19359.94119   
4               belkin_plug    1115.52662          Good   3643.14526   
5   blurams_security_camera    8949.05047     Excellent   5010.94474   
6              bose_speaker    5373.70860     Excellent  17632.52664   
7                eufy_chime    8674.98227     Excellent     14.68451   
8          furbo_dog_camera    8359.88165     Excellent  19464.92892   
9            ring_chime_pro    8968.04164     Excellent   8217.52478   
10      simplisafe_doorbell    7412.65275     Excellent  18032.76627   
11            sonos_speaker    6283.85530     Excellent  20528.24133   
12          tapo_plug110_33   17455.52603     Excellent  19151.3

In [4]:
import os
import pandas as pd
import pickle
import subprocess
import json

class IoTPcapReader:
	def __init__(self, dataset_folder: str):
		"""
		Initialize the IoTPcapReader with the folder path.

		Args:
			dataset_folder (str): The path to the dataset folder containing device subfolders.
		"""
		self.dataset_folder = dataset_folder
		self.global_dataframe = pd.DataFrame()
		self.dns_dataframe = pd.DataFrame()

	def _run_tshark(self, file_path: str) -> list:
		"""
		Run tshark command to extract full packet details in JSON format.

		Args:
			file_path (str): Path to the pcap file.

		Returns:
			list: A list of complete packet details as dictionaries.
		"""
		try:
			cmd = ["tshark", "-r", file_path, "-T", "json"]
			result = subprocess.run(cmd, capture_output=True, text=True, check=True)
			packets = json.loads(result.stdout)
			return packets
		except Exception as e:
			print(f"Error running tshark on {file_path}: {e}")
			return []

	def _parse_packets(self, packets: list, device_name: str):
		"""
		Parse full packet JSON data and add it to the dataframe.

		Args:
			packets (list): List of packet details as dictionaries.
			device_name (str): Name of the IoT device.
		"""
		data = []

		for packet in packets:
			if "_source" in packet and "layers" in packet["_source"]:
				row = {
					'Device Name': device_name,
					'Packet JSON': json.dumps(packet)  # Store the full JSON as a string
				}
				data.append(row)

		temp_df = pd.DataFrame(data)
		self.global_dataframe = pd.concat([self.global_dataframe, temp_df], ignore_index=True)

	def _read_pcap_file(self, file_path: str, device_name: str):
		"""
		Read the contents of a pcap file and extract frame information using tshark.

		Args:
			file_path (str): Path to the pcap file.
			device_name (str): Name of the IoT device associated with the pcap file.
		"""
		packets = self._run_tshark(file_path)
		if packets:
			self._parse_packets(packets, device_name)

	def read_all_pcap_files(self):
		"""
		Read all pcap files from all device subfolders and store the data in a global dataframe.
		"""
		for device_folder in os.listdir(self.dataset_folder):
			device_path = os.path.join(self.dataset_folder, device_folder)
			if os.path.isdir(device_path):
				for file_name in os.listdir(device_path):
					if file_name.endswith('.pcap'):
						file_path = os.path.join(device_path, file_name)
						print(f"Reading file: {file_path}")
						self._read_pcap_file(file_path, device_folder)

	def save_as_pickle(self, output_file: str):
		"""
		Save the global dataframe as a pickle file.

		Args:
			output_file (str): The output pickle file path.
		"""
		with open(output_file, 'wb') as f:
			pickle.dump(self.global_dataframe, f)
		print(f"Dataset saved as pickle at {output_file}")

	def load_pickle(self, pickle_file: str):
		"""
		Load the dataset from a pickle file.

		Args:
			pickle_file (str): The path to the pickle file.
		"""
		with open(pickle_file, 'rb') as f:
			self.global_dataframe = pickle.load(f)
		print(f"Dataset loaded from pickle at {pickle_file}")

if __name__ == '__main__':
	# Example usage:
	dataset_folder_path = '../../baseline'
	output_pickle_path = '../../iot_data_baseline.pkl'
	dns_pickle_path = '../../dns_data_baseline.pkl' 

	# Create an object of IoTPcapReader
	iot_reader = IoTPcapReader(dataset_folder_path)

# 	# Read all pcap files and store data in the dataframe
# 	iot_reader.read_all_pcap_files()

# 	# Save the dataframe as a pickle file
# 	iot_reader.save_as_pickle(output_pickle_path)

	# Load and check the pickle file
	iot_reader.load_pickle(output_pickle_path)
	print(iot_reader.global_dataframe.head())

Dataset loaded from pickle at ../../iot_data_baseline.pkl
        Device Name                                        Packet JSON
0  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...
1  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...
2  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...
3  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...
4  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...


In [5]:
print(iot_reader.global_dataframe["Packet JSON"][0])

{"_index": "packets-2025-01-05", "_type": "pcap_file", "_score": null, "_source": {"layers": {"frame": {"frame.encap_type": "1", "frame.time": "Nov 22, 2024 10:38:41.949893000 CET", "frame.offset_shift": "0.000000000", "frame.time_epoch": "1732268321.949893000", "frame.time_delta": "0.000000000", "frame.time_delta_displayed": "0.000000000", "frame.time_relative": "0.000000000", "frame.number": "1", "frame.len": "88", "frame.cap_len": "88", "frame.marked": "0", "frame.ignored": "0", "frame.protocols": "eth:ethertype:ip:udp:dns"}, "eth": {"eth.dst": "28:e3:47:8e:cf:a8", "eth.dst_tree": {"eth.dst_resolved": "LiteonTe_8e:cf:a8", "eth.addr": "28:e3:47:8e:cf:a8", "eth.addr_resolved": "LiteonTe_8e:cf:a8", "eth.lg": "0", "eth.ig": "0"}, "eth.src": "fc:9c:98:11:71:a8", "eth.src_tree": {"eth.src_resolved": "fc:9c:98:11:71:a8", "eth.addr": "fc:9c:98:11:71:a8", "eth.addr_resolved": "fc:9c:98:11:71:a8", "eth.lg": "0", "eth.ig": "0"}, "eth.type": "0x00000800"}, "ip": {"ip.version": "4", "ip.hdr_len"

In [6]:
print(iot_reader.global_dataframe)

                 Device Name  \
0           arlo_camera_pro4   
1           arlo_camera_pro4   
2           arlo_camera_pro4   
3           arlo_camera_pro4   
4           arlo_camera_pro4   
...                      ...   
712977  switchbot_hub_mini_2   
712978  switchbot_hub_mini_2   
712979  switchbot_hub_mini_2   
712980  switchbot_hub_mini_2   
712981  switchbot_hub_mini_2   

                                              Packet JSON  
0       {"_index": "packets-2025-01-05", "_type": "pca...  
1       {"_index": "packets-2025-01-05", "_type": "pca...  
2       {"_index": "packets-2025-01-05", "_type": "pca...  
3       {"_index": "packets-2025-01-05", "_type": "pca...  
4       {"_index": "packets-2025-01-05", "_type": "pca...  
...                                                   ...  
712977  {"_index": "packets-2025-01-05", "_type": "pca...  
712978  {"_index": "packets-2025-01-05", "_type": "pca...  
712979  {"_index": "packets-2025-01-05", "_type": "pca...  
712980  {"_inde

In [7]:
# import json
# import pandas as pd

# def expand_packet_json(global_dataframe: pd.DataFrame) -> pd.DataFrame:
# 	"""
# 	Expand the 'Packet JSON' column in the global dataframe into individual columns.

# 	Args:
# 		global_dataframe (pd.DataFrame): The dataframe containing the 'Packet JSON' column.

# 	Returns:
# 		pd.DataFrame: A new dataframe with individual fields from the JSON.
# 	"""
# 	expanded_data = []

# 	for _, row in global_dataframe.iterrows():
# 		packet_json = json.loads(row.get('Packet JSON', '{}'))
# 		layers = packet_json.get("_source", {}).get("layers", {})

# 		# Flattening the nested structure into individual columns
# 		expanded_row = {
# 			"Device Name": row.get("Device Name", ""),
# 			"Frame Time": layers.get("frame", {}).get("frame.time", ""),
# 			"Frame Number": layers.get("frame", {}).get("frame.number", ""),
# 			"Frame Length": layers.get("frame", {}).get("frame.len", ""),
# 			"Source MAC": layers.get("eth", {}).get("eth.src", ""),
# 			"Destination MAC": layers.get("eth", {}).get("eth.dst", ""),
# 			"Source IP": layers.get("ip", {}).get("ip.src", ""),
# 			"Destination IP": layers.get("ip", {}).get("ip.dst", ""),
# 			"Protocol": layers.get("frame", {}).get("frame.protocols", ""),
# 			"TCP Sequence Number": layers.get("tcp", {}).get("tcp.seq", ""),
# 			"UDP Length": layers.get("udp", {}).get("udp.length", ""),
# 			"DNS Transaction ID": layers.get("dns", {}).get("dns.id", ""),
# 			"DNS Flags": layers.get("dns", {}).get("dns.flags", ""),
# 			"DNS Query Name": layers.get("dns.qry", {}).get("dns.qry.name", ""),
# 			"DNS Answer Name": layers.get("dns.resp", {}).get("dns.resp.name", ""),
# 			"DNS Answer Address": layers.get("dns.resp", {}).get("dns.a", "")
# 		}

# 		expanded_data.append(expanded_row)

# 	expanded_df = pd.DataFrame(expanded_data)
# 	return expanded_df

In [8]:
# # Assuming you have a global_dataframe with the "Packet JSON" column:
# expanded_df = expand_packet_json(iot_reader.global_dataframe)
# print(expanded_df.head())

In [18]:
import os
import json
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil import parser
from collections import defaultdict

class DNSPacketAnalyzer:
	def __init__(self, global_dataframe: pd.DataFrame, output_folder: str = './dns_analysis_plots'):
		self.df = global_dataframe
		self.output_folder = output_folder
		os.makedirs(self.output_folder, exist_ok=True)
		self.preprocessed_df = None

	def preprocess_dns_packets(self):
		dns_records = []

		for _, row in self.df.iterrows():
			packet_json = json.loads(row.get('Packet JSON', '{}'))
			source = packet_json.get('_source', {})
			layers = source.get('layers', {})
			frame = layers.get('frame', {})
			dns = layers.get('dns', {})
			dns_flags = dns.get('dns.flags_tree', {})

			protocols = frame.get('frame.protocols', ' ')
			is_dns = 'dns' in protocols

			if is_dns:
				record = {
					'Device Name': row['Device Name'].replace('_', ''),
					'frame_time': frame.get('frame.time', ''),
					'frame_len': int(frame.get('frame.len', 0)) if frame.get('frame.len', '').isdigit() else 0,
					'protocols': protocols,
					'is_response': dns_flags.get('dns.flags.response', '') == '1',
					'queries': dns.get('Queries', {}),
					'answers': dns.get('Answers', {}),
					'edns': dns.get('dns.opt', ''),
				}

				record['ttl_list'] = [int(a.get('dns.resp.ttl', 0)) for a in record['answers'].values()] if record['answers'] else []
				record['query_types'] = [q.get('dns.qry.type', '') for q in record['queries'].values()] if record['queries'] else []
				record['query_names'] = [q.get('dns.qry.name', '') for q in record['queries'].values()] if record['queries'] else []

				dns_records.append(record)

		self.preprocessed_df = pd.DataFrame(dns_records)
		print(f"Preprocessed {len(self.preprocessed_df)} DNS packets.")

	def plot_bar(self, df, x, y, title, ylabel, filename, log_scale=False):
		plt.figure(figsize=(3.33, 2.2))
		plt.bar(df[x], df[y], width=0.6)
		if log_scale:
			plt.yscale('log')
		plt.xlabel(x, fontsize=7)
		plt.ylabel(ylabel, fontsize=7)
# 		plt.title(title, fontsize=7)
		plt.xticks(rotation=45, ha='right', fontsize=5.5)
		plt.yticks(fontsize=6)
		plt.tight_layout()
		plt.savefig(os.path.join(self.output_folder, filename), bbox_inches='tight', dpi=300)
		plt.close()
		
		# Save DataFrame + metadata as pickle
		pickle_data = {
			"df": df,
			"x": x,
			"y": y
		}
		pickle_path = os.path.join(self.output_folder, filename) + '.pkl'
		with open(pickle_path, 'wb') as f:
			pickle.dump(pickle_data, f)

	def plot_dns_query_counts(self):
		summary = self.preprocessed_df.groupby('Device Name').size().reset_index(name='Number of Queries')
		self.plot_bar(summary, 'Device Name', 'Number of Queries', 'DNS Queries per Device', 'Queries', 'dns_query_counts.pdf')

	def plot_average_ttl(self):
		exploded = self.preprocessed_df.explode('ttl_list')
		exploded = exploded[exploded['ttl_list'] > 0]
		summary = exploded.groupby('Device Name')['ttl_list'].mean().reset_index()
		self.plot_bar(summary, 'Device Name', 'ttl_list', 'Average TTL per Device', 'Avg. TTL (log, s)', 'average_ttl_log.pdf', log_scale=True)

	def plot_dns_answer_counts(self):
		summary = self.preprocessed_df.explode('answers').groupby('Device Name').size().reset_index(name='Number of Answers')
		self.plot_bar(summary, 'Device Name', 'Number of Answers', 'DNS Answers per Device', 'DNS Answers', 'dns_answer_counts.pdf')

	def plot_dns_query_types(self):
		query_types = self.preprocessed_df.explode('query_names')
		counts = query_types.groupby(['Device Name', 'query_names']).size().reset_index(name='Count')
		pivot = counts.pivot(index='Device Name', columns='query_names', values='Count').fillna(0)
		ax = pivot.plot(kind='bar', stacked=True, figsize=(3.33, 2.5), width=0.6)
		plt.xlabel('Device Name', fontsize=7)
		plt.ylabel('Query Count', fontsize=7)
# 		plt.title('DNS Query Types per Device', fontsize=7)
		plt.xticks(rotation=45, ha='right', fontsize=6)
		plt.yticks(fontsize=6)
		plt.legend(fontsize=5, loc='upper right', frameon=False)
		plt.tight_layout()
		plt.savefig(os.path.join(self.output_folder, 'dns_query_types.pdf'), bbox_inches='tight', dpi=300)
		plt.close()

	def plot_avg_time_between_queries(self):
		times = self.preprocessed_df[~self.preprocessed_df['is_response']]
		times['parsed_time'] = pd.to_datetime(times['frame_time'], errors='coerce')
		times = times.dropna(subset=['parsed_time'])
		avg_times = times.sort_values('parsed_time').groupby('Device Name')['parsed_time'].apply(lambda x: x.diff().mean().total_seconds() if len(x) > 1 else 0).reset_index(name='Avg Time Between Queries')
		self.plot_bar(avg_times, 'Device Name', 'Avg Time Between Queries', 'Avg. Time Between DNS Queries', 'Avg. Time (log, s)', 'avg_time_between_queries_log.pdf', log_scale=True)

	def plot_distinct_addresses(self):
		addr_df = self.preprocessed_df.explode('answers')
		addr_df['dns_a'] = addr_df['answers'].apply(lambda a: a.get('dns.a') if isinstance(a, dict) else None)
		distinct_counts = addr_df.dropna(subset=['dns_a']).groupby('Device Name')['dns_a'].nunique().reset_index(name='Distinct Addresses')
		self.plot_bar(distinct_counts, 'Device Name', 'Distinct Addresses', 'Distinct DNS Addresses per Device', 'Distinct Addr.', 'distinct_addresses.pdf')

	def plot_avg_answers_per_frame(self):
		ans_df = self.preprocessed_df.copy()
		ans_df['answer_count'] = ans_df['answers'].apply(lambda a: len(a) if isinstance(a, dict) else 0)
		avg_ans = ans_df.groupby('Device Name')['answer_count'].mean().reset_index(name='Answers per Frame')
		self.plot_bar(avg_ans, 'Device Name', 'Answers per Frame', 'Avg. DNS Answers per Frame', 'Avg. Answers / Frame', 'average_answers_per_frame.pdf')

	def calculate_ipv6_query_percentage(self):
		ipv6_df = self.preprocessed_df.explode('query_types')
		total_counts = ipv6_df.groupby('Device Name').size()
		ipv6_counts = ipv6_df[ipv6_df['query_types'] == '28'].groupby('Device Name').size()
		percent_df = (ipv6_counts / total_counts * 100).fillna(0).reset_index(name='IPv6 Query Percentage')
		self.plot_bar(percent_df, 'Device Name', 'IPv6 Query Percentage', 'IPv6 Queries per Device', 'IPv6 Query %', 'ipv6_query_percentage.pdf')

	def calculate_average_retries(self):
		retries_df = self.preprocessed_df.explode('query_names')
		retries_count = retries_df.groupby(['Device Name', 'query_names']).size().reset_index(name='count')
		avg_retries = retries_count[retries_count['count'] > 1].groupby('Device Name')['count'].mean().reset_index(name='Average Retries')
		self.plot_bar(avg_retries, 'Device Name', 'Average Retries', 'Avg. DNS Query Retries per Device', 'Avg. Retries', 'average_dns_retries.pdf')

	def plot_query_rate(self):
		times = self.preprocessed_df[~self.preprocessed_df['is_response']]
		times['parsed_time'] = pd.to_datetime(times['frame_time'], errors='coerce')
		rate_df = times.dropna(subset=['parsed_time']).groupby('Device Name').apply(lambda x: len(x) / (x['parsed_time'].max() - x['parsed_time'].min()).total_seconds() if len(x) > 1 else 0).reset_index(name='Query Rate (queries/sec)')
		self.plot_bar(rate_df, 'Device Name', 'Query Rate (queries/sec)', 'DNS Query Rate per Device', 'Queries/sec', 'query_rate.pdf')

	def plot_protocol_distribution(self):
		proto_counts = defaultdict(lambda: defaultdict(int))
		for _, row in self.df.iterrows():
			device = row['Device Name']
			packet_json = json.loads(row.get('Packet JSON', '{}'))
			protocols = packet_json.get('_source', {}).get('layers', {}).get('frame', {}).get('frame.protocols', '')
			for proto in protocols.split(':'):
				proto_counts[device][proto] += 1
		proto_df = pd.DataFrame(proto_counts).fillna(0).T
		ax = proto_df.plot(kind='bar', stacked=True, figsize=(3.33, 2.5), width=0.6)
		plt.xlabel('Device Name', fontsize=7)
		plt.ylabel('Packet Count', fontsize=7)
		plt.yscale('log')
# 		plt.title('Protocol Distribution per Device', fontsize=7)
		plt.xticks(rotation=45, ha='right', fontsize=6)
		plt.yticks(fontsize=6)
		plt.legend(title='Protocols', fontsize=5, title_fontsize=6, bbox_to_anchor=(1.05, 1), ncols=4, frameon=False)
		plt.tight_layout()
		plt.savefig(os.path.join(self.output_folder, 'protocol_distribution.pdf'), bbox_inches='tight', dpi=300)
		plt.close()

	def plot_mdns_count(self):
		mdns_df = self.preprocessed_df[self.preprocessed_df['protocols'].str.lower().str.contains('mdns')]
		summary = mdns_df.groupby('Device Name').size().reset_index(name='MDNS Count')
		self.plot_bar(summary, 'Device Name', 'MDNS Count', 'MDNS Packet Count per Device', 'MDNS Packets', 'mdns_count.pdf')

	def analyze_dns_query_context(self, time_window=5):
		self.df['Device Name'] = self.df['Device Name'].str.replace('_', ' ', regex=False)
		self.preprocessed_df['Device Name'] = self.preprocessed_df['Device Name'].str.replace('_', ' ', regex=False)

		self.df['Timestamp'] = self.df['Packet JSON'].apply(
			lambda x: parser.parse(json.loads(x).get('_source', {}).get('layers', {}).get('frame', {}).get('frame.time', '')) if x else None)
		self.preprocessed_df['Timestamp'] = self.preprocessed_df['frame_time'].apply(
			lambda x: parser.parse(x) if x else None)

		query_contexts = []
		traffic_context_count = {}

		for _, dns_row in self.preprocessed_df.iterrows():
			dns_time = dns_row['Timestamp']
			device_name = dns_row['Device Name']
			dns_query = dns_row['queries']

			if dns_query:
				dns_query_name = list(dns_query.values())[0].get('dns.qry.name', '')

				related_packets = self.df[
					(self.df['Device Name'] == device_name) &
					(self.df['Timestamp'] >= dns_time - pd.Timedelta(seconds=time_window)) &
					(self.df['Timestamp'] <= dns_time + pd.Timedelta(seconds=time_window))
				]

				reasons = []

				for _, packet in related_packets.iterrows():
					packet_json = json.loads(packet['Packet JSON'])
					protocols = packet_json.get('_source', {}).get('layers', {}).get('frame', {}).get('frame.protocols', '')

					if 'tcp' in protocols and 'http' in protocols:
						reasons.append("HTTP Request after DNS")
					elif 'tcp' in protocols and 'tcp.analysis.retransmission' in packet_json.get('_source', {}).get('layers', {}).get('tcp', {}):
						reasons.append("TCP Retransmission before DNS")
					elif 'tcp' in protocols and 'tcp.flags.syn' in packet_json.get('_source', {}).get('layers', {}).get('tcp', {}):
						reasons.append("TCP SYN before DNS")
					elif 'dhcp' in protocols:
						reasons.append("DHCP before DNS")
					elif 'icmp' in protocols:
						reasons.append("ICMP before DNS")
					elif 'quic' in protocols:
						reasons.append("QUIC traffic near DNS")

				reasons = list(set(reasons))  # Remove duplicates

				query_contexts.append({
					'Device Name': device_name,
					'DNS Query': dns_query_name,
					'Query Time': dns_time,
					'Traffic Context': ', '.join(reasons) if reasons else "Unknown"
				})

				for reason in reasons:
					if device_name not in traffic_context_count:
						traffic_context_count[device_name] = {}
					if reason not in traffic_context_count[device_name]:
						traffic_context_count[device_name][reason] = 0
					traffic_context_count[device_name][reason] += 1

		query_context_df = pd.DataFrame(query_contexts)
		query_context_df.to_csv(os.path.join(self.output_folder, 'dns_query_context.csv'), index=False)
		print("DNS query context analysis completed. Results saved in 'dns_query_context.csv'.")

		self.plot_traffic_context_distribution(traffic_context_count)

		return query_context_df

	def plot_traffic_context_distribution(self, traffic_context_count):
		matplotlib.rcParams.update({'font.size': 7})  # ACM small font
		data = []
		for device, reasons in traffic_context_count.items():
			for reason, count in reasons.items():
				data.append({'Device Name': device, 'Traffic Context': reason, 'Count': count})

		df = pd.DataFrame(data)
		pivot_df = df.pivot(index='Device Name', columns='Traffic Context', values='Count').fillna(0)
		ax = pivot_df.plot(kind='bar', stacked=True, figsize=(3.33, 2.5), width=0.6)
		plt.xlabel('Device Name', fontsize=7)
		plt.ylabel('Context Count', fontsize=7)
# 		plt.title('Traffic Contexts Around DNS Queries', fontsize=7)
		plt.xticks(rotation=45, ha='right', fontsize=5.5)
		plt.yticks(fontsize=6)
		plt.legend(fontsize=5, bbox_to_anchor=(0.5, 1.30), ncols=2, frameon=False)
		plt.tight_layout()
		plt.savefig(os.path.join(self.output_folder, 'traffic_context_distribution.pdf'), bbox_inches='tight', dpi=300)
		plt.close()
		print("Traffic context distribution plot saved.")

	def plot_edns0_usage(self):
		edns_df = self.preprocessed_df[self.preprocessed_df['edns'] != '']
		summary = edns_df.groupby('Device Name').size().reset_index(name='EDNS(0) Count')
		self.plot_bar(summary, 'Device Name', 'EDNS(0) Count', 'EDNS(0) Usage per Device', 'EDNS(0) Count', 'edns0_usage.pdf')

	def analyze(self):
		self.preprocess_dns_packets()
		if not self.preprocessed_df.empty:
			self.plot_dns_query_counts()
			self.plot_average_ttl()
			self.plot_dns_answer_counts()
			self.plot_dns_query_types()
# 			self.plot_avg_time_between_queries()
			self.plot_distinct_addresses()
			self.plot_avg_answers_per_frame()
			self.calculate_ipv6_query_percentage()
			self.calculate_average_retries()
# 			self.plot_query_rate()
			self.analyze_dns_query_context()
			self.plot_protocol_distribution()
			self.plot_mdns_count()
# 			self.plot_edns0_usage()
			print(f"All plots saved to {self.output_folder}")
		else:
			print("No DNS packets found.")

In [19]:
# Assuming iot_reader.global_dataframe is already populated
dns_analyzer = DNSPacketAnalyzer(iot_reader.global_dataframe)

# Perform DNS analysis and generate/save plots
dns_analyzer.analyze()

Preprocessed 6024 DNS packets.


  plt.tight_layout()


DNS query context analysis completed. Results saved in 'dns_query_context.csv'.
Traffic context distribution plot saved.


  plt.tight_layout()


All plots saved to ./dns_analysis_plots


In [20]:
if __name__ == '__main__':
	# Example usage:
	dataset_folder_path = '../../DOH'
	output_pickle_path = '../../iot_data_doh.pkl'
	dns_pickle_path = '../../dns_data_doh.pkl' 

	# Create an object of IoTPcapReader
	iot_reader2 = IoTPcapReader(dataset_folder_path)

# 	# Read all pcap files and store data in the dataframe
# 	iot_reader.read_all_pcap_files()

# 	# Save the dataframe as a pickle file
# 	iot_reader.save_as_pickle(output_pickle_path)

	# Load and check the pickle file
	iot_reader2.load_pickle(output_pickle_path)
	print(iot_reader2.global_dataframe.head())

Dataset loaded from pickle at ../../iot_data_doh.pkl
        Device Name                                        Packet JSON
0  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...
1  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...
2  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...
3  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...
4  arlo_camera_pro4  {"_index": "packets-2025-01-05", "_type": "pca...


In [21]:
# Assuming iot_reader.global_dataframe is already populated
dns_analyzer2 = DNSPacketAnalyzer(iot_reader2.global_dataframe, output_folder = './dns_analysis_plots_doh')

# Perform DNS analysis and generate/save plots
dns_analyzer2.analyze()

Preprocessed 35563 DNS packets.


  plt.tight_layout()


DNS query context analysis completed. Results saved in 'dns_query_context.csv'.


KeyError: 'Device Name'