In [2]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# ---------------- CONFIGURATION ----------------
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

base_path = "/Users/tejasmacipad/Desktop/final_CN_project/CN_Project/Report_2_new"
save_dir = "./plots_final_json"
os.makedirs(save_dir, exist_ok=True)

# ---------------- FILE PATHS ----------------
paths = {
    "Do53": [f"{base_path}/do53_log_top50.json", f"{base_path}/do53_log_top30003050.json"],
    "DoH": [f"{base_path}/doh_log_top50.json", f"{base_path}/doh_log_top30003050.json"],
    "DoT": [f"{base_path}/dot_log_top50.json", f"{base_path}/dot_log_top30003050.json"],
}

# ---------------- LOADING FUNCTION ----------------
def load_json_file(path, protocol):
    """Load JSON file and return DataFrame with protocol tag."""
    with open(path, "r") as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    df["protocol"] = protocol
    return df

# ---------------- LOAD ALL FILES ----------------
dfs = []
for proto, files in paths.items():
    for fpath in files:
        dfs.append(load_json_file(fpath, proto))

df = pd.concat(dfs, ignore_index=True)

# ---------------- CLEANING + NORMALIZATION ----------------
# Map columns to consistent names
df = df.rename(
    columns={
        "timestamp": "timestamp_utc",
        "bytes_sent": "bytes_out",
        "bytes_recv": "bytes_in",
        "tcp_handshake_ms": "tcp_connect_ms",
        "query_time_ms": "query_rtt_ms",
    }
)

# Ensure numeric columns
for col in ["tcp_connect_ms", "tls_handshake_ms", "query_rtt_ms", "bytes_out", "bytes_in", "total_time_ms"]:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Convert timestamp
df["timestamp_utc"] = pd.to_datetime(df["timestamp_utc"], errors="coerce")

# ---------------- ADD QUALITY LABEL ----------------
qualities = ["Top50", "Bottom50"]
df["quality"] = df.groupby("protocol").cumcount().apply(lambda x: qualities[x // 50] if x // 50 < 2 else "Extra")

# ✅ Remove rows labeled "Extra"
df = df[df["quality"] != "Extra"]

# ---------------- 1️⃣ LATENCY DISTRIBUTION ----------------
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x="protocol", y="total_time_ms", hue="quality", palette="Set2")
plt.title("Total DNS Resolution Time by Protocol and Website Group")
plt.ylabel("Total Time (ms)")
plt.xlabel("Protocol")
plt.savefig(f"{save_dir}/1_latency_boxplot.png", dpi=300)
plt.close()

# ---------------- 2️⃣ BANDWIDTH ANALYSIS ----------------
bw = (
    df.groupby(["protocol", "quality"])[["bytes_out", "bytes_in"]]
    .mean()
    .reset_index()
)
bw["total_bytes"] = bw["bytes_out"] + bw["bytes_in"]

plt.figure(figsize=(10, 6))
bw_melt = bw.melt(
    id_vars=["protocol", "quality"],
    value_vars=["bytes_out", "bytes_in", "total_bytes"],
    var_name="Type",
    value_name="Bytes",
)
sns.barplot(data=bw_melt, x="protocol", y="Bytes", hue="Type", palette="coolwarm")
plt.title("Average Bandwidth Usage per Query (Top vs Bottom)")
plt.ylabel("Bytes")
plt.savefig(f"{save_dir}/2_bandwidth_bar.png", dpi=300)
plt.close()

# ---------------- 3️⃣ SUCCESS RATE ----------------
success_counts = (
    df.groupby(["protocol", "quality", "status"]).size().reset_index(name="count")
)

plt.figure(figsize=(10, 6))
sns.barplot(
    data=success_counts,
    x="protocol",
    y="count",
    hue="status",
    palette="Paired",
)
plt.title("Query Success vs Failure Rate (Top vs Bottom)")
plt.ylabel("Number of Queries")
plt.savefig(f"{save_dir}/3_success_rate.png", dpi=300)
plt.close()

# ---------------- 4️⃣ TEMPORAL TREND ----------------
plt.figure(figsize=(10, 6))
for proto in df["protocol"].unique():
    sub = df[df["protocol"] == proto].sort_values("timestamp_utc")
    plt.plot(sub["timestamp_utc"], sub["total_time_ms"], label=proto, alpha=0.8)
plt.title("DNS Latency Over Time")
plt.xlabel("Time")
plt.ylabel("Total Time (ms)")
plt.legend()
plt.savefig(f"{save_dir}/4_temporal_trend.png", dpi=300)
plt.close()

# ---------------- 5️⃣ SUMMARY TABLE ----------------
summary = (
    df.groupby(["protocol", "quality"])
    .agg(
        avg_total_time_ms=("total_time_ms", "mean"),
        success_count=("status", lambda x: (x == "SUCCESS").sum()),
        total_queries=("status", "count"),
    )
    .reset_index()
)
summary["success_rate_%"] = (summary["success_count"] / summary["total_queries"]) * 100

print("\n===== Summary Table =====")
print(summary.to_string(index=False))
summary.to_csv(f"{save_dir}/summary_stats.csv", index=False)

# ---------------- 6️⃣ CDF PLOT ----------------
plt.figure(figsize=(10, 6))
for (proto, qual), sub in df.groupby(["protocol", "quality"]):
    subset_sorted = sub["total_time_ms"].dropna().sort_values().values
    if len(subset_sorted) == 0:
        continue
    yvals = np.arange(len(subset_sorted)) / len(subset_sorted)
    plt.plot(subset_sorted, yvals, label=f"{proto}-{qual}", linewidth=2)

plt.xlabel("Total Time (ms)")
plt.ylabel("Cumulative Probability")
plt.title("CDF of DNS Lookup Latency (Top vs Bottom)")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.5)
plt.savefig(f"{save_dir}/6_latency_cdf.png", dpi=300)
plt.close()

print(f"\n✅ All plots saved in: {os.path.abspath(save_dir)}")


===== Summary Table =====
protocol  quality  avg_total_time_ms  success_count  total_queries  success_rate_%
    Do53 Bottom50         975.701565             46             50            92.0
    Do53    Top50         753.991446             50             50           100.0
     DoH Bottom50        1343.934565             50             50           100.0
     DoH    Top50         926.084037             50             50           100.0
     DoT Bottom50        1164.095105             49             49           100.0
     DoT    Top50         843.089805             50             50           100.0

✅ All plots saved in: /Users/tejasmacipad/Desktop/final_CN_project/CN_Project/Report_2_new/plots_final_json


In [None]:
import json
import pandas as pd
import os

# ---------------- CONFIGURATION ----------------
base_path = "/Users/tejasmacipad/Desktop/final_CN_project/CN_Project/Report_2_new"

# ---------------- FILE PATHS ----------------
paths = {
    "Do53": [f"{base_path}/do53_log_top50.json", f"{base_path}/do53_log_top30003050.json"],
    "DoH": [f"{base_path}/doh_log_top50.json", f"{base_path}/doh_log_top30003050.json"],
    "DoT": [f"{base_path}/dot_log_top50.json", f"{base_path}/dot_log_top30003050.json"],
}

# ---------------- LOAD FILES ----------------
def load_json_file(path, protocol, quality):
    with open(path, "r") as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    df["protocol"] = protocol
    df["quality"] = quality
    # Only first 50 queries
    df = df.head(50)
    return df

dfs = []
for proto, files in paths.items():
    dfs.append(load_json_file(files[0], proto, "Top50"))
    dfs.append(load_json_file(files[1], proto, "Bottom50"))

df = pd.concat(dfs, ignore_index=True)

# ---------------- ENSURE NUMERIC FIELDS ----------------
numeric_cols = [
    "tcp_handshake_ms",
    "tls_handshake_ms",
    "query_time_ms",
    "total_time_ms",
    "bytes_sent",
    "bytes_recv",
    "query_size_bytes",
    "response_size_bytes",
]

for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Compute total bytes if not already present
if "total_bytes" not in df.columns:
    df["total_bytes"] = df["bytes_sent"] + df["bytes_recv"]

# ---------------- COMPUTE BASIC METRICS ----------------
summary = (
    df.groupby(["protocol", "quality"])
    .agg({
        "tcp_handshake_ms": "mean",
        "tls_handshake_ms": "mean",
        "query_time_ms": "mean",
        "total_time_ms": "mean",
        "bytes_sent": "mean",
        "bytes_recv": "mean",
        "total_bytes": "mean",
        "query_size_bytes": "mean",
        "response_size_bytes": "mean"
    })
    .reset_index()
)

# Round for neatness
summary = summary.round(2)

# ---------------- DISPLAY ----------------
print("\n===== Average Metrics per Protocol and Quality =====\n")
print(summary.to_string(index=False))


===== Average Metrics per Protocol and Quality =====

protocol  quality  tcp_handshake_ms  tls_handshake_ms  query_time_ms  total_time_ms  bytes_sent  bytes_recv  total_bytes  query_size_bytes  response_size_bytes
    Do53 Bottom50              0.00              0.00         997.06         997.06       33.70       89.04       122.74             33.70                89.04
    Do53    Top50              0.00              0.00         753.99         753.99       34.76       87.48       122.24             34.76                87.48
     DoH Bottom50             25.43             40.31        1300.13        1366.27     1261.40     2202.04      3463.44            224.70               191.52
     DoH    Top50             69.33             34.41         821.87         926.08     1263.52     2188.36      3451.88            225.76               184.68
     DoT Bottom50             51.01             41.05        1071.65        1164.10      879.10     2013.29      2892.39             33.55       

In [6]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

# ---------------- CONFIG ----------------
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (8, 5)

base_path = "/Users/tejasmacipad/Desktop/final_CN_project/CN_Project/Report_2_new"
save_dir = "./plots_final_json"
os.makedirs(save_dir, exist_ok=True)

paths = {
    "Do53": [f"{base_path}/do53_log_top50.json", f"{base_path}/do53_log_top30003050.json"],
    "DoH":  [f"{base_path}/doh_log_top50.json",  f"{base_path}/doh_log_top30003050.json"],
    "DoT":  [f"{base_path}/dot_log_top50.json",  f"{base_path}/dot_log_top30003050.json"],
}

# ---------------- LOAD (first 50 per file only) ----------------
def load_json_file(path, protocol, quality):
    with open(path, "r") as f:
        data = json.load(f)
    df = pd.DataFrame(data)
    df["protocol"] = protocol
    df["quality"] = quality
    # take at most first 50 entries (ignore extras)
    return df.head(50)

dfs = []
for proto, files in paths.items():
    dfs.append(load_json_file(files[0], proto, "Top50"))
    dfs.append(load_json_file(files[1], proto, "Bottom50"))

df = pd.concat(dfs, ignore_index=True)

# ---------------- NORMALIZE NAMES & NUMERIC ----------------
# ensure total_bytes exists
if "total_bytes" not in df.columns:
    # prefer bytes_sent/bytes_recv or bytes_out/bytes_in if present
    if "bytes_sent" in df.columns and "bytes_recv" in df.columns:
        df["total_bytes"] = pd.to_numeric(df["bytes_sent"], errors="coerce") + pd.to_numeric(df["bytes_recv"], errors="coerce")
    elif "bytes_out" in df.columns and "bytes_in" in df.columns:
        df["total_bytes"] = pd.to_numeric(df["bytes_out"], errors="coerce") + pd.to_numeric(df["bytes_in"], errors="coerce")
    else:
        df["total_bytes"] = np.nan

# ensure total_time_ms exists (use total_time_ms or compute from pieces)
if "total_time_ms" not in df.columns:
    # fallbacks (not expected) - set NaN
    df["total_time_ms"] = np.nan

# coerce numeric
df["total_bytes"]    = pd.to_numeric(df["total_bytes"], errors="coerce")
df["total_time_ms"]  = pd.to_numeric(df["total_time_ms"], errors="coerce")

# drop rows with invalid or zero time or bytes (can't compute bandwidth)
df = df.dropna(subset=["total_bytes", "total_time_ms"])
df = df[df["total_time_ms"] > 0]
if df.empty:
    raise SystemExit("No valid rows (total_bytes / total_time_ms) to compute bandwidth.")

# ---------------- PER-QUERY BANDWIDTH (kbps) ----------------
# bytes/sec = total_bytes / (total_time_ms / 1000)
# kilobits/sec = (bytes/sec * 8) / 1000 = total_bytes * 8 / total_time_ms
df["bandwidth_kbps"] = (df["total_bytes"] * 8.0) / df["total_time_ms"]

# ---------------- GROUP & AVERAGE ----------------
agg = df.groupby(["protocol", "quality"])["bandwidth_kbps"].mean().reset_index()
agg["bandwidth_kbps"] = agg["bandwidth_kbps"].round(2)

# ---------------- PLOT ----------------
plt.figure(figsize=(8,5))
sns.barplot(data=agg, x="protocol", y="bandwidth_kbps", hue="quality", palette="Set2")
plt.ylabel("Average Bandwidth (kbps)")
plt.xlabel("Protocol")
plt.title("Average load (kilobits/sec) per Query — mean over samples")
plt.grid(axis="y", linestyle="--", alpha=0.5)
plt.legend(title="Website group")
plt.tight_layout()
outpath = os.path.join(save_dir, "avg_bandwidth_kbps.png")
plt.savefig(outpath, dpi=300)
plt.close()

# ---------------- PRINT TABLE ----------------
print("\nAverage bandwidth (kbps) per protocol & quality:\n")
print(agg.to_string(index=False))

print(f"\nSaved plot to: {os.path.abspath(outpath)}")


Average bandwidth (kbps) per protocol & quality:

protocol  quality  bandwidth_kbps
    Do53 Bottom50            1.49
    Do53    Top50            1.94
     DoH Bottom50           34.44
     DoH    Top50           44.19
     DoT Bottom50           30.00
     DoT    Top50           39.80

Saved plot to: /Users/tejasmacipad/Desktop/final_CN_project/CN_Project/Report_2_new/plots_final_json/avg_bandwidth_kbps.png
