In [None]:
import json
import re
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# Load the aggregated data: a dict keyed by top-level name (used as categories below).
with open('aggregated_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Drop every top-level key whose name mentions "gpt-5".
# The substring test also removes any "gpt-5-thinking" key, so the separate
# second filtering pass for that name was a no-op and has been folded in here.
cleaned = {k: v for k, v in data.items() if "gpt-5" not in k}

# Overwrite the original data variable so the following cells use the filtered dict.
data = cleaned

# abstain = data['factual']

# save = []
# for e in abstain:
#     if e[:23] in save:
#         print(e)
#     save.append(e[:23])
#     # print(e[:23])
#     # break

In [None]:
def compute_accessed_and_cited_per_query_and_per_search(data):
    """
    Summarise accessed / cited URL counts per query (HAR) and per search.

    `data` is expected to be a dict of categories -> dicts of HARs -> HAR objects.
    Category values that are not dicts are skipped. A HAR object that is not a
    dict is still counted in `har_count` but contributes 0 to every total.
    For dict HAR objects these keys are read (missing or non-list values count as 0):
      - "search_string": list; its length is the HAR's search count
      - "urls_from_prompt": list; its length is the accessed count
      - "urls_cited": list; its length is the cited count

    Only HARs with at least one search feed the distributions:
      - "per query" values are the HAR's raw accessed / cited counts
      - "per search" values are those counts divided by the HAR's search count
        (one value per HAR)

    Returns a dict with:
      - har_count, total_searches, total_accessed, total_cited
      - accessed_per_query_mean, accessed_per_query_iqr
      - accessed_per_search_mean, accessed_per_search_iqr
      - cited_per_query_mean, cited_per_query_iqr
      - cited_per_search_mean, cited_per_search_iqr
    Every *_iqr entry is a tuple (q25, q75, q75 - q25); empty distributions
    yield a mean of 0.0 and an IQR tuple of (0.0, 0.0, 0.0).
    """

    def _list_len(record, key):
        """Length of record[key] when it is a list, otherwise 0."""
        value = record.get(key)
        return len(value) if isinstance(value, list) else 0

    def _average(values):
        """Arithmetic mean, or 0.0 for an empty list."""
        return (sum(values) / len(values)) if values else 0.0

    def _quantile(ordered, pct):
        """Linear-interpolation percentile of an already sorted list (0 <= pct <= 100)."""
        size = len(ordered)
        if size == 0:
            return 0.0
        if size == 1:
            return float(ordered[0])
        position = (pct / 100.0) * (size - 1)
        lower = int(position)
        upper = lower + 1
        if upper >= size:
            return float(ordered[-1])
        weight = position - lower
        return float(ordered[lower] * (1 - weight) + ordered[upper] * weight)

    def _quartiles(values):
        """Return (q25, q75, q75 - q25) for the given values."""
        if not values:
            return (0.0, 0.0, 0.0)
        ordered = sorted(values)
        first = _quantile(ordered, 25)
        third = _quantile(ordered, 75)
        return (first, third, third - first)

    n_hars = 0
    searches_sum = 0
    accessed_sum = 0
    cited_sum = 0

    # Distributions, one entry per HAR that issued at least one search.
    accessed_by_query = []
    cited_by_query = []
    accessed_by_search = []
    cited_by_search = []

    for hars in data.values():
        if not isinstance(hars, dict):
            continue
        for har in hars.values():
            n_hars += 1
            if isinstance(har, dict):
                n_searches = _list_len(har, "search_string")
                n_accessed = _list_len(har, "urls_from_prompt")
                n_cited = _list_len(har, "urls_cited")
            else:
                n_searches = n_accessed = n_cited = 0

            searches_sum += n_searches
            accessed_sum += n_accessed
            cited_sum += n_cited

            # HARs without any search are left out of all four distributions.
            if n_searches <= 0:
                continue

            accessed_by_query.append(n_accessed)
            cited_by_query.append(n_cited)
            # Spread the HAR's counts evenly over its searches.
            accessed_by_search.append(n_accessed / n_searches)
            cited_by_search.append(n_cited / n_searches)

    summary = {
        "har_count": n_hars,
        "total_searches": searches_sum,
        "total_accessed": accessed_sum,
        "total_cited": cited_sum,
    }
    summary["accessed_per_query_mean"] = _average(accessed_by_query)
    summary["accessed_per_query_iqr"] = _quartiles(accessed_by_query)   # (q25, q75, iqr)
    summary["accessed_per_search_mean"] = _average(accessed_by_search)
    summary["accessed_per_search_iqr"] = _quartiles(accessed_by_search)
    summary["cited_per_query_mean"] = _average(cited_by_query)
    summary["cited_per_query_iqr"] = _quartiles(cited_by_query)
    summary["cited_per_search_mean"] = _average(cited_by_search)
    summary["cited_per_search_iqr"] = _quartiles(cited_by_search)
    return summary


# Summarise the filtered dataset; as the last expression in the cell, the returned dict is displayed.
compute_accessed_and_cited_per_query_and_per_search(data=data)

In [None]:
# helper
def strip_utm(url):
    """Remove the ``utm_source=chatgpt.com`` tracking parameter from ``url``.

    The parameter is removed whether it follows ``?`` or ``&``. When it is the
    first of several query parameters the ``?`` is kept, so the remaining
    parameters still form a valid query string instead of being glued onto
    the path with ``&``.

    NOTE: this helper is defined in more than one cell of this notebook; the
    definition executed last is the one in effect, so keep the copies in sync.
    """
    # "?utm_source=chatgpt.com&rest" -> "?rest"
    url = re.sub(r"\?utm_source=chatgpt\.com&", "?", url)
    # Any remaining occurrence is either the only parameter or a later one.
    return re.sub(r"[?&]utm_source=chatgpt\.com", "", url)

In [None]:
# helper
def strip_utm(url):
    """Remove the ``utm_source=chatgpt.com`` tracking parameter from ``url``.

    The parameter is removed whether it follows ``?`` or ``&``. When it is the
    first of several query parameters the ``?`` is kept, so the remaining
    parameters still form a valid query string instead of being glued onto
    the path with ``&``.

    NOTE: this helper is defined in more than one cell of this notebook; the
    definition executed last is the one in effect, so keep the copies in sync.
    """
    # "?utm_source=chatgpt.com&rest" -> "?rest"
    url = re.sub(r"\?utm_source=chatgpt\.com&", "?", url)
    # Any remaining occurrence is either the only parameter or a later one.
    return re.sub(r"[?&]utm_source=chatgpt\.com", "", url)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Data preparation: number of HAR records (queries) per category, sorted by category name
har_counts = {cat: len(data[cat]) for cat in data}
series = pd.Series(har_counts).sort_index()

# Display-only label fix: instramental -> instrumental.
# Labels are derived from the data index rather than read back from the axis,
# so they do not depend on the tick labels having been populated already.
display_labels = ["instrumental" if cat == "instramental" else cat for cat in series.index]

# Plot setup
fig, ax = plt.subplots(figsize=(10, 6))

# Soft pastel colors, plus a category -> color lookup
colors = plt.get_cmap('Pastel1')(np.arange(len(series)))
color_map = dict(zip(series.index, colors))

# Bars are drawn once, at explicit integer positions, so tick locations and
# tick labels can be fixed together below.
positions = np.arange(len(series))
bars = ax.bar(
    positions,
    series.values,
    width=0.6,
    edgecolor='gray',
    linewidth=1.2,
    color=[color_map[cat] for cat in series.index],
    zorder=3
)

# Zero baseline
ax.set_ylim(bottom=0)

# Titles & labels
# ax.set_title('Queries per Category', fontsize=16, pad=15)
ax.set_ylabel('Number of Queries', fontsize=14, labelpad=10)
ax.set_xticks(positions)
ax.set_xticklabels(display_labels)
ax.tick_params(axis='x', rotation=0, labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# Grid (zorder below the bars)
ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

# Annotate bar values just above each bar
for bar in bars:
    h = bar.get_height()
    ax.annotate(
        f'{h}',
        xy=(bar.get_x() + bar.get_width() / 2, h),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontsize=11
    )

plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Keep only HARs that have results from BOTH engines: a record is dropped when
# its 'google_urls' or its 'bing_urls' is empty.
# NOTE: this rebinds the entries of `data` in place, so every later cell sees
# the filtered records.
for cat in list(data):
    data[cat] = {
        hid: rec
        for hid, rec in data[cat].items()
        if rec['google_urls'] and rec['bing_urls']
    }

# Additionally filter out HAR IDs 87–110 for navigational and 160–175 for factual
# Assuming HAR keys like 'network-logs-prompt-<ID>_<timestamp>'
# if 'navigational' in data:
#     filtered = {}
#     for hid, rec in data['navigational'].items():
#         try:
#             num = int(hid.split('-')[3].split('_')[0])
#         except Exception:
#             filtered[hid] = rec
#             continue
#         if not (87 <= num <= 110):
#             filtered[hid] = rec
#     data['navigational'] = filtered
# if 'factual' in data:
#     filtered = {}
#     for hid, rec in data['factual'].items():
#         try:
#             num = int(hid.split('-')[3].split('_')[0])
#         except Exception:
#             filtered[hid] = rec
#             continue
#         if not (160 <= num <= 175):
#             filtered[hid] = rec
#     data['factual'] = filtered

# 1) Per-HAR hit rate: share of prompt URLs that contain (or equal) at least one
#    search-result URL from either engine, then averaged per category.
records = []
for cat, har_dict in data.items():
    for har_id, rec in har_dict.items():
        prompts = rec['urls_from_prompt']
        search_urls = [e['url'] for e in rec['bing_urls'] + rec['google_urls']]
        total = len(prompts)
        # `s in p` already covers the s == p case, so no separate equality test is needed
        hits = sum(1 for p in prompts if any(s in p for s in search_urls))
        records.append({
            'category': cat,
            'har_id': har_id,
            'hit_rate': hits / total if total else 0
        })

df_hr = pd.DataFrame(records)
avg_hr = df_hr.groupby('category')['hit_rate'].mean().sort_index()

# Display-only label fix: instramental -> instrumental.
# Labels come from the data index instead of being read back from the axis.
display_labels = ["instrumental" if cat == "instramental" else cat for cat in avg_hr.index]

# 2. Apply muted style and pastel palette
fig, ax = plt.subplots(figsize=(10, 6))

colors = plt.get_cmap('Pastel1')(np.arange(len(avg_hr)))

# Explicit integer positions let tick locations and labels be fixed together below
positions = np.arange(len(avg_hr))
bars = ax.bar(
    positions,
    avg_hr.values,
    width=0.6,
    edgecolor='gray',
    linewidth=1.2,
    color=colors,
    zorder=3
)

# 3. Zero-based baseline
ax.set_ylim(bottom=0)

# 4. Titles & labels
ax.set_title('Average Hit Rate (Combine Search Engines) per Category', fontsize=16, pad=15)
ax.set_ylabel('Average Hit Rate', fontsize=14, labelpad=10)
ax.set_xticks(positions)
ax.set_xticklabels(display_labels)
ax.tick_params(axis='x', rotation=0, labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# 5. Subtle horizontal gridlines
ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

# 6. Annotate each bar with its value (formatted as percentage)
for bar in bars:
    h = bar.get_height()
    ax.annotate(
        f'{h:.1%}',
        xy=(bar.get_x() + bar.get_width() / 2, h),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontsize=11
    )

plt.tight_layout()
plt.show()


In [None]:
# # # 1. Load the data
# # with open('aggregated_data.json', 'r', encoding='utf-8') as f:
# #     data = json.load(f)

# # 2. Filter out records without both Google and Bing URLs
# for cat in list(data.keys()):
#     data[cat] = {
#         hid: rec 
#         for hid, rec in data[cat].items() 
#         if rec['google_urls'] and rec['bing_urls']
#     }

# # 3. Compute google_and_bing_both vs unique hit counts per category
# records = []
# for cat, har_dict in data.items():
#     for rec in har_dict.values():
#         prompts = rec['urls_from_prompt']
#         g_urls = [e['url'] for e in rec['google_urls']]
#         b_urls = [e['url'] for e in rec['bing_urls']]
        
#         total = len(prompts)
#         # Count google_and_bing_boths and uniques
#         google_and_bing_both = sum(
#             1 
#             for p in prompts 
#             if (any(u == p or u in p for u in g_urls) 
#                 and any(u == p or u in p for u in b_urls))
#         )
#         google_only = sum(
#             1 
#             for p in prompts 
#             if (any(u == p or u in p for u in g_urls) 
#                 and not any(u == p or u in p for u in b_urls))
#         )
#         bing_only = sum(
#             1 
#             for p in prompts 
#             if (any(u == p or u in p for u in b_urls) 
#                 and not any(u == p or u in p for u in g_urls))
#         )
        
#         records.append({
#             'category': cat,
#             'google_and_bing_both': google_and_bing_both,
#             'google_only': google_only,
#             'bing_only': bing_only,
#             'total': total
#         })

# df = pd.DataFrame(records)

# # 4. Aggregate by category, add "Combined", and compute proportions
# # keep original category order from the groupby
# agg = df.groupby('category', sort=False).sum()

# # Add a "Combined" row that sums across all categories
# combined_row = agg.sum(axis=0).to_frame().T
# combined_row.index = ['Combined']

# agg = pd.concat([agg, combined_row], axis=0)

# # Compute proportions safely (avoid div-by-zero), fill NaNs with 0
# proportions = agg[['google_and_bing_both', 'google_only', 'bing_only']].div(
#     agg['total'].replace(0, np.nan), axis=0
# ).fillna(0)


# # 5. Plot stacked bar chart
# # 5. Plot: paired bars (Overall vs. Breakdown)
# segs = ['google_and_bing_both', 'google_only', 'bing_only']
# cmap = plt.get_cmap('Pastel2')
# colors = {seg: cmap(i) for i, seg in enumerate(segs)}

# # Overall hit rate per category (sum of the three proportions)
# overall = proportions[segs].sum(axis=1)

# cats = proportions.index
# x = np.arange(len(cats))
# bar_w = 0.3

# fig, ax = plt.subplots(figsize=(12, 6))

# # --- Left bar in each pair: Overall (single color) ---
# overall_color = '#FFB6C1'  # a readable single color; change if you prefer
# # overall_color = plt.get_cmap('Pastel1')(np.arange(len(overall.values)))
# overall_bars = ax.bar(
#     x - bar_w/2, overall.values, width=bar_w,
#     color=overall_color, edgecolor='gray', linewidth=1.2,
#     label='Overall Hit Rate', zorder=3, hatch="x"
# )

# # Annotate overall bars
# for bar, v in zip(overall_bars, overall.values):
#     if v > 0:
#         ax.text(
#             bar.get_x() + bar.get_width()/2, v + 0.015,
#             f'{v:.1%}', ha='center', va='bottom', fontsize=11, fontweight='bold'
#         )

# # --- Right bar in each pair: Stacked breakdown ---
# bottom = np.zeros(len(cats))
# stack_handles = []
# for seg in segs:
#     vals = proportions[seg].values
#     bars = ax.bar(
#         x + bar_w/2, vals, bottom=bottom, width=bar_w,
#         color=colors[seg], edgecolor='gray', linewidth=1.2,
#         label=seg.replace('_', ' ').title(), zorder=3
#     )
#     # annotate segment percentages
#     for i, (bar, v) in enumerate(zip(bars, vals)):
#         if v > 0.01:  # avoid clutter on tiny slivers
#             mid_y = bottom[i] + v/2
#             ax.text(
#                 bar.get_x() + bar.get_width()/2, mid_y,
#                 f'{v:.1%}' if v >= 0.095 else f'{v:.1%}',
#                 ha='center', va='center', fontsize=8, color='black',
#             )
#     bottom += vals
#     stack_handles.append(bars)

# # --- Styling & legend ---
# ax.set_ylim(0, 1.12)
# # ax.set_title('Overall vs. Breakdown of Hit Rates per Category', fontsize=16, pad=15)
# ax.set_ylabel('Proportion of Prompts Matched', fontsize=14, labelpad=10)
# ax.set_xticks(x)
# ax.set_xticklabels(
#     [str(c).replace('instramental','instrumental') for c in cats],
#     rotation=0, fontsize=12
# )

# # Group divider lines (optional, for readability)
# for xi in x:
#     ax.axvline(x=xi, ymin=0, ymax=1, color='k', alpha=0.05, linewidth=0.8, zorder=0)

# # Light grid
# ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

# # Legend: Overall + breakdown segments
# handles, labels = ax.get_legend_handles_labels()
# # Ensure "Overall Hit Rate" is first
# order = [labels.index('Overall Hit Rate')] + [i for i, lab in enumerate(labels) if lab != 'Overall Hit Rate']
# handles = [handles[i] for i in order]
# labels  = [labels[i]  for i in order]
# ax.legend(handles, labels, title='Bars in Each Pair', fontsize=11, title_fontsize=12, loc='upper right')

# plt.tight_layout()
# plt.savefig("plots/search_engine_by_cat.png",format="png")
# plt.show()

# # 1. Load the data
# with open('aggregated_data.json', 'r', encoding='utf-8') as f:
#     data = json.load(f)

from pathlib import Path  # local import: only used to make sure the output folder exists

# 2. Keep only records that have both Google and Bing URLs.
#    NOTE: this rebinds the entries of `data` in place.
for cat in list(data.keys()):
    data[cat] = {
        hid: rec
        for hid, rec in data[cat].items()
        if rec['google_urls'] and rec['bing_urls']
    }

# 3. Per record, classify every prompt URL as matched by both engines, by Google
#    only, or by Bing only. A prompt URL "matches" an engine when one of that
#    engine's result URLs equals it or is a substring of it.
records = []
for cat, har_dict in data.items():
    for rec in har_dict.values():
        prompts = rec['urls_from_prompt']
        g_urls = [e['url'] for e in rec['google_urls']]
        b_urls = [e['url'] for e in rec['bing_urls']]

        google_and_bing_both = google_only = bing_only = 0
        for p in prompts:
            # Each engine is scanned once per prompt URL; `u in p` also covers u == p
            in_google = any(u in p for u in g_urls)
            in_bing = any(u in p for u in b_urls)
            if in_google and in_bing:
                google_and_bing_both += 1
            elif in_google:
                google_only += 1
            elif in_bing:
                bing_only += 1

        records.append({
            'category': cat,
            'google_and_bing_both': google_and_bing_both,
            'google_only': google_only,
            'bing_only': bing_only,
            'total': len(prompts)
        })

df = pd.DataFrame(records)


# 4. Aggregate by category and compute proportions
agg = df.groupby('category').sum()

# Add a "Combined" row that sums across all categories
combined_row = agg.sum(axis=0).to_frame().T
combined_row.index = ['Combined']

agg = pd.concat([agg, combined_row], axis=0)

# Compute percentages safely (avoid div-by-zero), fill NaNs with 0
proportions = agg[['google_and_bing_both', 'google_only', 'bing_only']].div(
    agg['total'].replace(0, np.nan), axis=0
).mul(100).fillna(0)

# 5. Plot stacked bar chart
# Map segments to the first three colors in Pastel2
segs = ['google_and_bing_both', 'google_only', 'bing_only']
cmap = plt.get_cmap('Pastel2')
colors = {seg: cmap(i) for i, seg in enumerate(segs)}

fig, ax = plt.subplots(figsize=(10, 6))
cats   = proportions.index
x      = np.arange(len(cats))   # explicit bar positions so ticks/labels can be fixed below
bottom = np.zeros(len(cats))

# 1) Plot stacked bars
for seg in segs:
    vals = proportions[seg].values
    bars = ax.bar(
        x, vals, bottom=bottom,
        width=0.6,
        color=colors[seg],
        edgecolor='gray',
        linewidth=1.2,
        label=seg.replace('_', ' ').title(),
        zorder=3
    )
    # annotate each non-empty segment at its vertical midpoint
    for i, (bar, v) in enumerate(zip(bars, vals)):
        if v > 0:
            mid_y = bottom[i] + v/2
            ax.text(
                bar.get_x() + bar.get_width()/2, mid_y,
                f'{v:.1f}%',
                ha='center', va='center',
                fontsize=10, color='black'
            )
    bottom += vals

# 2) Add bold total-rate labels just above each bar (bottom now holds the stacked totals)
for xi, hr in zip(x, bottom):
    ax.text(
        xi, hr + 0.62,
        f'{hr:.1f}%',
        ha='center', va='bottom',
        fontsize=12, fontweight='bold'
    )

# 3) Build legend including Total; the patch carries the same label that is shown
handles, labels = ax.get_legend_handles_labels()
total_patch = mpatches.Patch(facecolor='black', edgecolor='black', label='Top text: Total Hit Rate')
handles.append(total_patch)
labels.append(total_patch.get_label())

ax.legend(handles, labels, title='URL Hit By', fontsize=11, title_fontsize=12, loc='upper right', bbox_to_anchor=(1.0, 1.13))

# 4) Final styling
ax.set_ylim(0, 100)
# ax.set_title('Hit-Type Proportions per Category', fontsize=16, pad=15)
ax.set_ylabel('Proportion of URLs Matched/%', fontsize=14, labelpad=10)
ax.set_xlabel('Categories', fontsize=14, labelpad=10)
ax.set_xticks(x)
# Display-only label fix (instramental -> instrumental), derived from the data index
ax.set_xticklabels(
    [str(c).replace('instramental', 'instrumental') for c in cats],
    rotation=0, fontsize=12
)
ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

plt.tight_layout()
# Make sure the output folder exists; savefig does not create missing directories
out_path = Path("plots/search_engine_by_cat.svg")
out_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(out_path, format="svg")
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1. Per-HAR hit rate for each engine: share of distinct prompt URLs that appear
#    verbatim (exact match) among that engine's result URLs.
b_records, g_records = [], []
for cat, har_dict in data.items():
    for rec in har_dict.values():
        prompts = set(rec['urls_from_prompt'])
        total = len(prompts)
        bing_urls = set(e['url'] for e in rec['bing_urls'])
        google_urls = set(e['url'] for e in rec['google_urls'])
        b_rate = len(prompts & bing_urls) / total if total else 0
        g_rate = len(prompts & google_urls) / total if total else 0
        b_records.append({'category': cat, 'rate_bing': b_rate})
        g_records.append({'category': cat, 'rate_google': g_rate})
df_b = pd.DataFrame(b_records)
df_g = pd.DataFrame(g_records)
avg_b = df_b.groupby('category')['rate_bing'].mean().sort_index()
avg_g = df_g.groupby('category')['rate_google'].mean().sort_index()

# Display-only label fix: instramental -> instrumental (taken from the data index)
display_labels = ["instrumental" if cat == "instramental" else cat for cat in avg_b.index]

# 2. Styling setup
fig, ax = plt.subplots(figsize=(12, 6))

n = len(avg_b)
index = np.arange(n)
bar_width = 0.4

# 3. Choose two distinct pastel colors
cmap = plt.get_cmap('Pastel1')
colors = cmap([0, 1])  # first two pastel shades

# 4. Plot grouped bars: Bing left of each tick, Google right of it
bars_b = ax.bar(
    index - bar_width/2,
    avg_b.values,
    bar_width,
    label='Bing',
    edgecolor='gray',
    linewidth=1.2,
    color=colors[0],
    zorder=3
)
bars_g = ax.bar(
    index + bar_width/2,
    avg_g.values,
    bar_width,
    label='Google',
    edgecolor='gray',
    linewidth=1.2,
    color=colors[1],
    zorder=3
)

# 5. Zero baseline & grid
ax.set_ylim(0, 1)
ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

# 6. Labels, title, legend
ax.set_title('Average Hit Rate per Category (Bing vs Google)', fontsize=16, pad=15)
ax.set_ylabel('Average Hit Rate', fontsize=14, labelpad=10)
ax.set_xticks(index)
# Unrotated labels stay centered under each bar pair (default alignment);
# right-aligning them would shift the text to the left of its group.
ax.set_xticklabels(display_labels, rotation=0, fontsize=12)
ax.tick_params(axis='y', labelsize=12)
ax.legend(fontsize=12)

# 7. Annotate bars
for bars in (bars_b, bars_g):
    for bar in bars:
        h = bar.get_height()
        ax.annotate(
            f'{h:.1%}',
            xy=(bar.get_x() + bar.get_width()/2, h),
            xytext=(0, 5),
            textcoords='offset points',
            ha='center',
            va='bottom',
            fontsize=11
        )

plt.tight_layout()
plt.show()


In [None]:
# helper
def strip_utm(url):
    """Remove the ``utm_source=chatgpt.com`` tracking parameter from ``url``.

    The parameter is removed whether it follows ``?`` or ``&``. When it is the
    first of several query parameters the ``?`` is kept, so the remaining
    parameters still form a valid query string instead of being glued onto
    the path with ``&``.

    NOTE: this helper is defined in more than one cell of this notebook; the
    definition executed last is the one in effect, so keep the copies in sync.
    """
    # "?utm_source=chatgpt.com&rest" -> "?rest"
    url = re.sub(r"\?utm_source=chatgpt\.com&", "?", url)
    # Any remaining occurrence is either the only parameter or a later one.
    return re.sub(r"[?&]utm_source=chatgpt\.com", "", url)

In [None]:
# helper
def strip_utm(url):
    """Remove the ``utm_source=chatgpt.com`` tracking parameter from ``url``.

    The parameter is removed whether it follows ``?`` or ``&``. When it is the
    first of several query parameters the ``?`` is kept, so the remaining
    parameters still form a valid query string instead of being glued onto
    the path with ``&``.

    NOTE: this helper is defined in more than one cell of this notebook; the
    definition executed last is the one in effect, so keep the copies in sync.
    """
    # "?utm_source=chatgpt.com&rest" -> "?rest"
    url = re.sub(r"\?utm_source=chatgpt\.com&", "?", url)
    # Any remaining occurrence is either the only parameter or a later one.
    return re.sub(r"[?&]utm_source=chatgpt\.com", "", url)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1. Compute overall hit rates, pooled over every record in every category.
#    Matching is exact: a prompt URL is a hit when it appears verbatim among an
#    engine's result URLs. 'both' counts prompt URLs found by either engine
#    (the union of the Bing and Google result sets).
totals = {'bing': 0, 'google': 0, 'both': 0, 'prompts': 0}
for recs in data.values():
    for rec in recs.values():
        p = set(rec['urls_from_prompt'])
        totals['prompts'] += len(p)
        b = set(e['url'] for e in rec['bing_urls'])
        g = set(e['url'] for e in rec['google_urls'])
        totals['bing']   += len(p & b)
        totals['google'] += len(p & g)
        totals['both']   += len(p & (b | g))

# Guard against an empty dataset (e.g. every record filtered out): report 0 instead
# of dividing by zero, matching the per-record `if total else 0` guards used in
# the per-category cells.
n_prompts = totals['prompts']
rates = {
    k: (totals[k] / n_prompts if n_prompts else 0.0)
    for k in ('bing', 'google', 'both')
}

# 2. Styling setup
fig, ax = plt.subplots(figsize=(8, 6))

series = pd.Series(rates).reindex(['bing','google','both'])
n = len(series)
index = np.arange(n)
bar_width = 0.6

# 3. Pastel1 palette for three bars
colors = plt.get_cmap('Pastel1')(np.arange(n))

bars = ax.bar(
    index,
    series.values,
    bar_width,
    edgecolor='gray',
    linewidth=1.2,
    color=colors,
    zorder=3
)

# 4. Zero baseline
ax.set_ylim(bottom=0)

# 5. Titles & labels
ax.set_title('Overall Hit Rates', fontsize=16, pad=15)
ax.set_ylabel('Hit Rate', fontsize=14, labelpad=10)
ax.set_xticks(index)
ax.set_xticklabels(series.index, rotation=0, fontsize=12)
ax.tick_params(axis='y', labelsize=12)

# 6. Light dashed gridlines
ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

# 7. Annotate bars with percentages
for bar in bars:
    h = bar.get_height()
    ax.annotate(
        f'{h:.1%}',
        xy=(bar.get_x() + bar.get_width() / 2, h),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontsize=11
    )

plt.tight_layout()
plt.show()

# 8. Print the numeric values
print('Overall rates:', {k: f"{v:.2%}" for k, v in rates.items()})


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# 1. Helper functions
def compute_stats(vals):
    """Compute (mean, median, mode) for a list of values.

    The values are wrapped in a pandas Series. When several values tie for the
    mode, the first entry of ``Series.mode()`` is returned; when there is no
    mode at all (e.g. an empty input) the mode is ``None``.
    """
    series = pd.Series(vals)
    modes = series.mode()
    most_common = None if modes.empty else modes.iloc[0]
    return series.mean(), series.median(), most_common

def styled_hist(ax, data, bins, stats, title):
    """
    Draw a histogram on ax with pastel fill, dashed gridlines,
    and vertical lines for mean/median/mode.

    Parameters (passed positionally by the callers in this notebook):
      ax    -- matplotlib Axes to draw on
      data  -- values to histogram
      bins  -- bin edges forwarded to ``ax.hist``
      stats -- (mean, median, mode) tuple, e.g. from ``compute_stats``
      title -- axes title

    A statistic that is ``None`` or NaN (which is what ``compute_stats``
    yields for an empty input) is skipped instead of raising while its label
    is formatted; the legend is only drawn when at least one line exists.
    """
    mean, median, mode = stats
    # Soft pastel fill: the color at the midpoint of the Pastel1 colormap
    fill = plt.get_cmap('Pastel1')(0.5)
    ax.hist(data, bins=bins, align='left',
            edgecolor='gray', linewidth=1.2,
            color=fill, zorder=3)
    ax.set_ylim(bottom=0)  # zero baseline
    ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

    # Add mean/median/mode lines: (value, line style, label template, color)
    markers = [
        (mean, '--', "Mean {:.1f}", 'red'),
        (median, ':', "Median {:.1f}", 'blue'),
        (mode, '-.', "Mode {:.0f}", 'green'),
    ]
    drew_marker = False
    for val, ls, template, col in markers:
        if pd.isna(val):
            # Nothing sensible to draw or label for a missing statistic
            continue
        ax.axvline(val, color=col, linestyle=ls, label=template.format(val))
        drew_marker = True

    ax.set_title(title, fontsize=14, pad=10)
    ax.set_xlabel('Rank', fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    if drew_marker:
        ax.legend(fontsize=10)


# 3. Prepare rank data
def _engine_ranks(records, engine):
    """Ranks of `engine` results whose URL is exactly one of the record's prompt URLs."""
    ranks = []
    for rec in records:
        prompts = set(rec['urls_from_prompt'])  # built once per record, not once per result
        ranks.extend(e['rank'] for e in rec[f'{engine}_urls'] if e['url'] in prompts)
    return ranks


def _combined_ranks(records):
    """Ranks of Bing + Google hits, followed by one 0 per prompt URL found by neither engine."""
    ranks, misses = [], 0
    for rec in records:
        prompts = set(rec['urls_from_prompt'])
        results = rec['bing_urls'] + rec['google_urls']
        ranks.extend(e['rank'] for e in results if e['url'] in prompts)
        misses += len(prompts - {e['url'] for e in results})
    # misses are represented with rank=0
    # NOTE(review): this presumes real ranks start at 1 -- confirm against the data
    return ranks + [0] * misses


all_records = [rec for recs in data.values() for rec in recs.values()]

# 5.1 Combined Engines
all_ranks = _combined_ranks(all_records)
if all_ranks:
    stats_all = compute_stats(all_ranks)
    fig, ax = plt.subplots(figsize=(8, 5))
    styled_hist(
        ax,
        all_ranks,
        range(0, max(all_ranks) + 2),
        stats_all,
        'Combined Engines Rank Distribution'
    )
    plt.tight_layout()
    plt.show()
else:
    # max() of an empty list would raise; there is nothing to plot
    print('5.1: no prompt URLs found, skipping combined rank distribution')

# 5.2 Per-Engine Distributions
for engine, title in [
    ('bing', 'Bing Rank Distribution'),
    ('google', 'Google Rank Distribution')
]:
    ranks = _engine_ranks(all_records, engine)
    if not ranks:
        # same guard as 5.4: an engine without hits has no histogram
        continue
    stats = compute_stats(ranks)
    fig, ax = plt.subplots(figsize=(8, 5))
    styled_hist(
        ax,
        ranks,
        range(1, max(ranks) + 2),
        stats,
        title
    )
    plt.tight_layout()
    plt.show()

# 5.3 Per-Category Combined (3 columns; rows grow with the number of categories,
#     which gives the original 2x3 grid for up to six categories)
categories = list(data.keys())
n_cols = 3
n_rows = max(2, -(-len(categories) // n_cols))  # ceil division, at least 2 rows
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
for ax, cat in zip(axes.flatten(), categories):
    combined = _combined_ranks(data[cat].values())
    if not combined:
        # No prompt URLs in this category: keep the panel but draw nothing
        ax.set_title(f'{cat} Combined (no data)', fontsize=14, pad=10)
        continue
    stats_cat = compute_stats(combined)
    styled_hist(
        ax,
        combined,
        range(0, max(combined) + 2),
        stats_cat,
        f'{cat} Combined'
    )
# turn off any extra subplots
for ax in axes.flatten()[len(categories):]:
    ax.axis('off')
fig.suptitle('5.3 Rank Distribution by Category (Combined)', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()

# 5.4 Per-Category, Per-Engine
for cat in categories:
    for engine in ('bing', 'google'):
        ranks = _engine_ranks(data[cat].values(), engine)
        if not ranks:
            continue
        stats_engine = compute_stats(ranks)
        fig, ax = plt.subplots(figsize=(8, 5))
        styled_hist(
            ax,
            ranks,
            range(1, max(ranks) + 2),
            stats_engine,
            f'5.4 {engine.title()} Rank Distribution for {cat}'
        )
        plt.tight_layout()
        plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# --- compute average cited hit rate per category (+ overall) ---
# Per record: share of distinct cited URLs (tracking parameter stripped) that also
# appear among the Bing/Google result URLs. Records without cited URLs are skipped.
# Per-category bars are the mean of these per-record rates; "Overall" is the pooled
# rate (total hits / total cited URLs), so it is not the mean of the category bars.
cited_records = []
overall_cited = {'hits': 0, 'total': 0}

for cat, har_dict in data.items():
    for rec in har_dict.values():
        cited = {strip_utm(u) for u in rec.get('urls_cited', [])}
        found_urls = {strip_utm(e['url']) for e in rec.get('bing_urls', []) + rec.get('google_urls', [])}
        hits_c = len(cited & found_urls)
        total_c = len(cited)
        overall_cited['hits'] += hits_c
        overall_cited['total'] += total_c
        if total_c > 0:
            cited_records.append({'category': cat, 'rate_cited': hits_c / total_c})

if not cited_records:
    raise ValueError("No cited records with non-zero totals found.")

df_cr = pd.DataFrame(cited_records)
avg_cr = df_cr.groupby('category')['rate_cited'].mean().sort_index()

overall_rate = (overall_cited['hits'] / overall_cited['total']) if overall_cited['total'] else 0.0
avg_cr_with_overall = pd.concat([avg_cr, pd.Series({'Overall': overall_rate})])

# Display-only label fix: instramental -> instrumental (taken from the data index)
display_labels = ["instrumental" if k == "instramental" else k for k in avg_cr_with_overall.index]

# --- style & plot ---
fig, ax = plt.subplots(figsize=(10, 6))

# Soft pastel colors (one per category + Overall), plus a name -> color lookup
colors = plt.get_cmap('Pastel1')(np.arange(len(avg_cr_with_overall)))
color_map = dict(zip(avg_cr_with_overall.index, colors))

# Bars are drawn once, in percent, at explicit integer positions.
# (Drawing them a second time with the raw 0-1 rates would put tiny extra bars
# at the bottom of the 0-100 axis.)
positions = np.arange(len(avg_cr_with_overall))
bars = ax.bar(
    positions,
    avg_cr_with_overall.values * 100,
    width=0.6,
    edgecolor='gray',
    linewidth=1.2,
    color=[color_map[k] for k in avg_cr_with_overall.index],
    zorder=3
)

# Baseline & limits (0-100 since the bars are plotted in percent)
ax.set_ylim(bottom=0, top=100)

# Titles & labels
ax.set_title('Presented URL Hit Rate per Category + Overall', fontsize=16, pad=15)
ax.set_ylabel('Average Presented Hit Rate', fontsize=14, labelpad=10)
ax.set_xticks(positions)
ax.set_xticklabels(display_labels)
ax.tick_params(axis='x', rotation=0, labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# Grid
ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

# Annotate bar values (percent)
for bar, val in zip(bars, avg_cr_with_overall.values):
    h = bar.get_height()
    ax.annotate(
        f'{val*100:.1f}%',
        xy=(bar.get_x() + bar.get_width() / 2, h),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='bottom',
        fontsize=11
    )

plt.tight_layout()
plt.show()

print('Overall cited hit rate:', f"{overall_rate:.2%}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# -----------------
# Input data
# -----------------
# Hard-coded per-category averages. The metric names are listed once and each
# category contributes one row of values in that same order.
METRIC_NAMES = (
    "Average time (s)",
    "Average transfer size (B)",
    "Average URLs accessed",
    "Average URLs presented/given",
)
CATEGORY_VALUES = (
    ("navigational", (39.29, 111461.11, 14.12, 7.53)),
    ("abstain", (40.51, 125345.01, 15.99, 8.62)),
    ("factual", (44.91, 104996.06, 14.67, 5.96)),
    ("instrumental", (38.94, 125396.86, 18.49, 9.54)),
    ("transactional", (38.42, 114060.88, 18.35, 7.77)),
)

# category -> {metric name -> value}
data = {
    category: dict(zip(METRIC_NAMES, values))
    for category, values in CATEGORY_VALUES
}

# One row per category (alphabetical, for consistent ordering), one column per metric
df = pd.DataFrame(data).transpose().sort_index()

# Helper to plot a single metric chart in the requested style
def plot_metric(metric_name, title, ylabel, fmt="{:.2f}"):
    """Show one annotated bar chart for a single column of the notebook-level ``df``.

    Parameters
    ----------
    metric_name : column label looked up in the global ``df``; one bar is drawn
        per index entry of that column.
    title : text used as the chart title.
    ylabel : text used as the y-axis label.
    fmt : ``str.format`` template applied to each value for the label placed
        above its bar.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    values = df[metric_name]
    labels = values.index.tolist()

    # One Pastel1 entry per bar
    palette = plt.get_cmap('Pastel1')(np.arange(len(values)))

    fig, ax = plt.subplots(figsize=(10, 6))
    rects = ax.bar(
        labels,
        values.values,
        width=0.6,
        edgecolor='gray',
        linewidth=1.2,
        color=palette,
        zorder=3,  # keep bars above the grid lines (grid uses zorder=0)
    )

    # Bars start from a zero baseline
    ax.set_ylim(bottom=0)

    # Dashed horizontal grid behind the bars
    ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

    # Title, axis label and tick styling
    ax.set_title(title, fontsize=16, pad=15)
    ax.set_ylabel(ylabel, fontsize=14, labelpad=10)
    for axis_name in ('x', 'y'):
        if axis_name == 'x':
            ax.tick_params(axis='x', rotation=0, labelsize=12)
        else:
            ax.tick_params(axis='y', labelsize=12)

    # Write each formatted value 5 points above the top-centre of its bar
    for rect, value in zip(rects, values.values):
        top_centre = (rect.get_x() + rect.get_width() / 2, rect.get_height())
        ax.annotate(
            fmt.format(value),
            xy=top_centre,
            xytext=(0, 5),
            textcoords='offset points',
            ha='center',
            va='bottom',
            fontsize=11,
        )

    plt.tight_layout()
    plt.show()


# One chart per metric, rendered in this order:
#   1) average time, 2) average transfer size,
#   3) average URLs accessed, 4) average URLs presented/given.
# Each entry is (df column, chart title, y-axis label, value label format).
metric_charts = [
    ("Average time (s)", "Average Time per Category", "Seconds", "{:.2f}"),
    ("Average transfer size (B)", "Average Transfer Size per Category", "Bytes", "{:,.2f}"),
    ("Average URLs accessed", "Average URLs Accessed per Category", "Count", "{:.2f}"),
    ("Average URLs presented/given", "Average URLs Presented/Given per Category", "Count", "{:.2f}"),
]

for column, chart_title, axis_label, value_fmt in metric_charts:
    plot_metric(column, chart_title, axis_label, value_fmt)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Recreate the dataframe after reset
# Same hard-coded per-category averages, laid out column-wise: each metric lists
# its values in the category order given by `category_order`.
category_order = ["navigational", "abstain", "factual", "instrumental", "transactional"]
metric_columns = {
    "Average time (s)": [39.29, 40.51, 44.91, 38.94, 38.42],
    "Average transfer size (B)": [111461.11, 125345.01, 104996.06, 125396.86, 114060.88],
    "Average URLs accessed": [14.12, 15.99, 14.67, 18.49, 18.35],
    "Average URLs presented/given": [7.53, 8.62, 5.96, 9.54, 7.77],
}

# category -> {metric name -> value}
data = {}
for position, category in enumerate(category_order):
    data[category] = {
        metric: column_values[position]
        for metric, column_values in metric_columns.items()
    }

# Rows are categories in alphabetical order, columns are metrics
df = pd.DataFrame(data).transpose().sort_index()

# Side-by-side bar chart for URLs accessed vs presented
categories = df.index.tolist()
series_accessed = df["Average URLs accessed"]
series_presented = df["Average URLs presented/given"]

width = 0.35  # bar width
x = np.arange(len(categories))  # category positions

fig, ax = plt.subplots(figsize=(10, 6))

# Styling shared by both bar groups; only position, data, label and color differ
pastel2 = plt.get_cmap('Pastel2')
bar_style = dict(edgecolor='gray', linewidth=1.2, zorder=3)

# "Accessed" bars sit half a bar-width left of each category position,
# "presented" bars half a bar-width right of it
bars1 = ax.bar(
    x - width/2, series_accessed.values, width,
    label='URLs Accessed', color=pastel2(0), **bar_style
)
bars2 = ax.bar(
    x + width/2, series_presented.values, width,
    label='URLs Presented/Given', color=pastel2(1), **bar_style
)

# Titles & labels
ax.set_title("Average URLs Accessed vs Presented per Category", fontsize=16, pad=15)
ax.set_ylabel("Count", fontsize=14, labelpad=10)
ax.set_xticks(x)
ax.set_xticklabels(categories, fontsize=12)
ax.tick_params(axis='y', labelsize=12)

# Dashed horizontal grid behind the bars
ax.yaxis.grid(True, linestyle='--', alpha=0.7, zorder=0)

# Value label 5 points above the top-centre of every bar, for both groups
label_style = dict(
    xytext=(0, 5),
    textcoords="offset points",
    ha='center',
    va='bottom',
    fontsize=11,
)
for bars, series in ((bars1, series_accessed), (bars2, series_presented)):
    for bar, val in zip(bars, series.values):
        h = bar.get_height()
        ax.annotate(f"{val:.2f}", xy=(bar.get_x() + bar.get_width() / 2, h), **label_style)

ax.legend(fontsize=12)
plt.tight_layout()
plt.show()
