In [None]:
import pandas as pd

# Load the six repository listings; every frame is tagged with the resource kind
# ("peer" / "request") and the API version it was collected for.
def _read_tagged(path, kind, version):
    """Read one parquet listing and attach constant `kind` and `version` columns."""
    return pd.read_parquet(path).assign(kind=kind, version=version)


df_peer_auth_v1 = _read_tagged("../data/repositoryinfo/peer_auth_v1_repos.parquet", "peer", "v1")
df_req_auth_v1 = _read_tagged("../data/repositoryinfo/req_auth_v1_repos.parquet", "request", "v1")
df_peer_auth_v1beta1 = _read_tagged("../data/repositoryinfo/peer_auth_v1beta1_repos.parquet", "peer", "v1beta1")
df_req_auth_v1beta1 = _read_tagged("../data/repositoryinfo/req_auth_v1beta1_repos.parquet", "request", "v1beta1")
df_peer_auth_v1alpha1 = _read_tagged("../data/repositoryinfo/peer_auth_v1alpha1_repos.parquet", "peer", "v1alpha1")
df_req_auth_v1alpha1 = _read_tagged("../data/repositoryinfo/req_auth_v1alpha1_repos.parquet", "request", "v1alpha1")

# Stack all six listings into one frame with a fresh 0..n-1 index.
df_all_auth = pd.concat(
    [
        df_peer_auth_v1,
        df_req_auth_v1,
        df_peer_auth_v1beta1,
        df_req_auth_v1beta1,
        df_peer_auth_v1alpha1,
        df_req_auth_v1alpha1,
    ],
    ignore_index=True,
)

# Repository identifiers per group; repositories are identified by the 'full_name' column.
repo_sets = {
    label: set(frame["full_name"])
    for label, frame in [
        ("peer_v1", df_peer_auth_v1),
        ("req_v1", df_req_auth_v1),
        ("peer_v1beta1", df_peer_auth_v1beta1),
        ("req_v1beta1", df_req_auth_v1beta1),
        ("peer_v1alpha1", df_peer_auth_v1alpha1),
        ("req_v1alpha1", df_req_auth_v1alpha1),
    ]
}

# For each group keep only the repositories that appear in none of the other five groups.
disjoint_sets = {
    key: repos.difference(*(other for name, other in repo_sets.items() if name != key))
    for key, repos in repo_sets.items()
}

# One summary row per group: how many repositories are exclusive to it.
summary_df = pd.DataFrame(
    [(group, len(exclusive)) for group, exclusive in disjoint_sets.items()],
    columns=["group", "unique_repos_count"],
)


summary_df


In [None]:
import os
import matplotlib.font_manager as font_manager

def saveFigure(fig,fileName):
    """Write `fig` to `fileName` at 100 dpi, tightly cropped and with zero padding."""
    export_options = {"bbox_inches": 'tight', "dpi": 100, "pad_inches": 0}
    fig.savefig(fileName, **export_options)


def get_font_properties(routePath=None):
    """Return 16-point FontProperties for the NimbusSanL-Reg.otf file under `routePath`.

    Parameters
    ----------
    routePath : str or None
        Directory that contains the font file. When omitted, the current working
        directory is looked up at call time. (A default of ``os.getcwd()`` in the
        signature would be frozen when the function is defined and would ignore
        any later change of directory.)

    Returns
    -------
    matplotlib.font_manager.FontProperties
    """
    if routePath is None:
        routePath = os.getcwd()

    fontpath = f"{routePath}/NimbusSanL-Reg.otf"
    return font_manager.FontProperties(fname=fontpath, size=16)

def get_props_as_dict():
    """Return the sans font file path (under the current working directory) and size 26 as a dict."""
    font_file = f"{os.getcwd()}/NimbusSanL-Reg.otf"
    props = {"name": font_file, "size": 26}
    return props

def get_alt_font_properties(type="san",size=15):
    """Return FontProperties for a Nimbus font file located in the current working directory.

    `type="rom"` selects NimbusRomNo9L-Reg.otf; any other value selects
    NimbusSanL-Reg.otf. `size` is passed through as the font size.
    """
    # Only "rom" switches to the roman face; everything else uses the sans face.
    font_file = '/NimbusRomNo9L-Reg.otf' if type == "rom" else '/NimbusSanL-Reg.otf'
    return font_manager.FontProperties(fname=os.getcwd() + font_file, size=size)

def save_fig(ax, title):
    """Save the figure that owns `ax` to path `title` as a 300-dpi PDF, tightly cropped, no padding."""
    owning_figure = ax.figure
    owning_figure.savefig(title, format="pdf", dpi=300, bbox_inches='tight', pad_inches=0)


# Shared typography for the figures: one base size, the legend uses the same size,
# and the "small" variant is one point smaller.
fontsize = 17
smallfontsize = fontsize - 1
legend_fontsize = fontsize

# Roman-face font properties, built in the order: regular, small, legend.
prop, prop_small, prop_legend = (
    get_alt_font_properties(type="rom", size=points)
    for points in (fontsize, smallfontsize, legend_fontsize)
)

In [None]:
def _stack(*frames):
    """Row-bind the given frames, in order, under a fresh 0..n-1 index."""
    return pd.concat(list(frames), ignore_index=True)


# Pool the six tagged listings along each axis of interest. The order matters:
# drop_duplicates below keeps the first row it meets for every repository.
df_all = _stack(
    df_peer_auth_v1, df_req_auth_v1,
    df_peer_auth_v1beta1, df_req_auth_v1beta1,
    df_peer_auth_v1alpha1, df_req_auth_v1alpha1,
)
df_req = _stack(df_req_auth_v1, df_req_auth_v1beta1, df_req_auth_v1alpha1)
df_peer = _stack(df_peer_auth_v1, df_peer_auth_v1beta1, df_peer_auth_v1alpha1)
df_v1 = _stack(df_peer_auth_v1, df_req_auth_v1)
df_v1beta1 = _stack(df_peer_auth_v1beta1, df_req_auth_v1beta1)
df_v1alpha1 = _stack(df_peer_auth_v1alpha1, df_req_auth_v1alpha1)

# One row per repository (first occurrence wins) within each pool.
df_all_unique = df_all.drop_duplicates(subset="full_name")
df_req_auth_unique = df_req.drop_duplicates(subset="full_name")
df_peer_auth_unique = df_peer.drop_duplicates(subset="full_name")
df_v1_unique = df_v1.drop_duplicates(subset="full_name")
df_v1beta1_unique = df_v1beta1.drop_duplicates(subset="full_name")
df_v1alpha1_unique = df_v1alpha1.drop_duplicates(subset="full_name")

# NOTE(review): the "v1" figure is printed as the v1 count minus the v1beta1 count,
# while later cells use len(df_v1_unique) directly -- confirm which one is intended.
for label, repo_count in [
    ("All:", len(df_all_unique)),
    ("PeerAuthentication:", len(df_peer_auth_unique)),
    ("RequestAuthentication:", len(df_req_auth_unique)),
    ("v1:", len(df_v1_unique)-len(df_v1beta1_unique)),
    ("v1beta1:", len(df_v1beta1_unique)),
    ("v1alpha1:", len(df_v1alpha1_unique)),
]:
    print(label, repo_count)

In [None]:

def _with_ten_or_more_stars(frame):
    """Rows of `frame` whose 'stargazers_count' is at least 10."""
    return frame.loc[frame["stargazers_count"] >= 10]


# Apply the star threshold to every de-duplicated pool.
df_all_star = _with_ten_or_more_stars(df_all_unique)
df_req_auth_star = _with_ten_or_more_stars(df_req_auth_unique)
df_peer_auth_star = _with_ten_or_more_stars(df_peer_auth_unique)
df_v1_star = _with_ten_or_more_stars(df_v1_unique)
df_v1beta1_star = _with_ten_or_more_stars(df_v1beta1_unique)
df_v1alpha1_star = _with_ten_or_more_stars(df_v1alpha1_unique)

# NOTE(review): as in the unfiltered summary, "v1" is printed as the v1 count minus
# the v1beta1 count -- confirm that this subtraction is intended.
for label, repo_count in [
    ("All:", len(df_all_star)),
    ("PeerAuthentication:", len(df_peer_auth_star)),
    ("RequestAuthentication:", len(df_req_auth_star)),
    ("v1:", len(df_v1_star) - len(df_v1beta1_star)),
    ("v1beta1:", len(df_v1beta1_star)),
    ("v1alpha1:", len(df_v1alpha1_star)),
]:
    print(label, repo_count)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Per category: (number of unique repositories, number of those with >= 10 stars).
counts = {
    "all": (len(df_all_unique), len(df_all_star)),
    "req_auth": (len(df_req_auth_unique), len(df_req_auth_star)),
    "peer_auth": (len(df_peer_auth_unique), len(df_peer_auth_star)),
    "v1": (len(df_v1_unique), len(df_v1_star)),
    "v1beta1": (len(df_v1beta1_unique), len(df_v1beta1_star)),
    "v1alpha1": (len(df_v1alpha1_unique), len(df_v1alpha1_star)),
}

# Long format (group, type, count): first all "unique" rows, then all "star" rows.
plot_df = pd.DataFrame(
    [(k, "unique", v[0]) for k, v in counts.items()] +
    [(k, "star", v[1]) for k, v in counts.items()],
    columns=["group", "type", "count"]
)

# Grouped bar chart: one group of bars per category, one bar per type.
# Tick labels are rotated once here instead of setting rot=0 and overriding it afterwards.
fig, ax = plt.subplots(figsize=(10,6))
plot_df.pivot(index="group", columns="type", values="count").plot(
    kind="bar", ax=ax, rot=45
)

ax.set_title("Unique vs Star (>=10 stars) Repositories per Category")
# The label previously read "full_namesitory Count" (a search/replace artefact).
ax.set_ylabel("Repository Count")
ax.set_xlabel("Category")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Repository identifiers seen for each API version.
set_v1, set_v1beta1, set_v1alpha1 = (
    set(frame["full_name"])
    for frame in (df_v1_unique, df_v1beta1_unique, df_v1alpha1_unique)
)

# Size of every version set and of every pairwise / three-way intersection.
counts = {
    "v1": len(set_v1),
    "v1beta1": len(set_v1beta1),
    "v1alpha1": len(set_v1alpha1),
    "v1 ∩ v1beta1": len(set_v1.intersection(set_v1beta1)),
    "v1 ∩ v1alpha1": len(set_v1.intersection(set_v1alpha1)),
    "v1beta1 ∩ v1alpha1": len(set_v1beta1.intersection(set_v1alpha1)),
    "v1 ∩ v1beta1 ∩ v1alpha1": len(set_v1.intersection(set_v1beta1, set_v1alpha1)),
}

# Two-column frame (group, count) in the insertion order of `counts`.
plot_df = pd.DataFrame(list(counts.items()), columns=["group", "count"])

# Horizontal bars, one per group.
fig, ax = plt.subplots(figsize=(10,6))
bars = ax.barh(plot_df["group"], plot_df["count"], color="#1f77b4")

# Write each bar's value just to the right of its end, vertically centred on the bar.
for rect in bars:
    value = rect.get_width()
    mid_y = rect.get_y() + rect.get_height()/2
    ax.text(value + 2, mid_y, str(value), ha='left', va='center')

ax.set_xlabel("Number of Unique Repositories")
ax.set_title("Unique Repositories per Version and Overlaps")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from matplotlib.patches import Patch

# Sizes of the seven mutually exclusive Venn regions (every repository falls in exactly one).
only_v1 = len(set_v1 - set_v1beta1 - set_v1alpha1)
only_v1beta1 = len(set_v1beta1 - set_v1 - set_v1alpha1)
only_v1alpha1 = len(set_v1alpha1 - set_v1 - set_v1beta1)

v1_and_v1beta1 = len((set_v1 & set_v1beta1) - set_v1alpha1)
v1_and_v1alpha1 = len((set_v1 & set_v1alpha1) - set_v1beta1)
v1beta1_and_v1alpha1 = len((set_v1beta1 & set_v1alpha1) - set_v1)
all_three = len(set_v1 & set_v1beta1 & set_v1alpha1)

# Colors
colors = ('#1f77b4', '#ff7f0e', '#2ca02c')

# Create Venn diagram; region areas are proportional to the raw counts.
# Region order passed to venn3: A only, B only, A&B, C only, A&C, B&C, A&B&C
# with A = v1, B = v1beta1, C = v1alpha1.
plt.figure(figsize=(8,8))
v = venn3(subsets=(only_v1, only_v1beta1, v1_and_v1beta1,
                   only_v1alpha1, v1_and_v1alpha1,
                   v1beta1_and_v1alpha1, all_three),
          set_labels=('', '', ''),
          set_colors=colors,
          alpha=0.5)

# Legend (the circles themselves carry no set labels)
legend_elements = [Patch(facecolor=colors[0], alpha=0.5, label='v1'),
                   Patch(facecolor=colors[1], alpha=0.5, label='beta'),
                   Patch(facecolor=colors[2], alpha=0.5, label='alpha')]
plt.legend(handles=legend_elements, loc='upper right', prop=prop)

# Caption for each region. Overlap regions hold repositories present in ALL of the
# named versions, i.e. an intersection, so they are written with '∩' (not '∪').
subset_labels = {
    '100': 'v1',
    '010': 'beta',
    '001': 'alpha',
    '110': 'v1 ∩ beta',
    '101': 'v1 ∩ alpha',
    '011': 'beta ∩ alpha',
    '111': 'v1 ∩ beta ∩ alpha'
}

for subset_id, text in subset_labels.items():
    label = v.get_label_by_id(subset_id)
    # A region may have no label object; skip it in that case.
    if label is not None:
        # Keep the count venn3 already wrote and append the caption on a second line.
        current_text = label.get_text()
        label.set_text(f"{current_text}\n{text}")
        # Apply font
        label.set_fontproperties(prop)
        # Nudge the label slightly upward
        x, y = label.get_position()
        label.set_position((x, y + 0.01))

#plt.title("Repositories including Authentication Across Versions", fontproperties=prop)
plt.tight_layout()
# savefig does not create missing directories.
os.makedirs("./figures", exist_ok=True)
plt.savefig("./figures/authentication_venn.svg", dpi=300, bbox_inches="tight", pad_inches=0)
plt.show()


In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from matplotlib.patches import Patch
import numpy as np

# Sizes of the seven mutually exclusive Venn regions (every repository falls in exactly one).
only_v1 = len(set_v1 - set_v1beta1 - set_v1alpha1)
only_v1beta1 = len(set_v1beta1 - set_v1 - set_v1alpha1)
only_v1alpha1 = len(set_v1alpha1 - set_v1 - set_v1beta1)

v1_and_v1beta1 = len((set_v1 & set_v1beta1) - set_v1alpha1)
v1_and_v1alpha1 = len((set_v1 & set_v1alpha1) - set_v1beta1)
v1beta1_and_v1alpha1 = len((set_v1beta1 & set_v1alpha1) - set_v1)
all_three = len(set_v1 & set_v1beta1 & set_v1alpha1)

# Raw count per region id; used for the text labels, since the drawn areas are log-scaled.
subset_counts = {
    '100': only_v1,
    '010': only_v1beta1,
    '110': v1_and_v1beta1,
    '001': only_v1alpha1,
    '101': v1_and_v1alpha1,
    '011': v1beta1_and_v1alpha1,
    '111': all_three
}

# Apply log scaling (add 1 to avoid log(0))
def log_scale(x):
    """Return log10(x + 1)."""
    return np.log10(x + 1)

# Region order passed to venn3: A only, B only, A&B, C only, A&C, B&C, A&B&C
# with A = v1, B = v1beta1, C = v1alpha1.
subsets = (
    log_scale(only_v1),
    log_scale(only_v1beta1),
    log_scale(v1_and_v1beta1),
    log_scale(only_v1alpha1),
    log_scale(v1_and_v1alpha1),
    log_scale(v1beta1_and_v1alpha1),
    log_scale(all_three)
)

# Colors
colors = ('#1f77b4', '#ff7f0e', '#2ca02c')

# Create Venn diagram with log-scaled sizes
plt.figure(figsize=(8,8))
v = venn3(subsets=subsets,
          set_labels=('', '', ''),
          set_colors=colors,
          alpha=0.5)

# Legend (the circles themselves carry no set labels)
legend_elements = [Patch(facecolor=colors[0], alpha=0.5, label='v1'),
                   Patch(facecolor=colors[1], alpha=0.5, label='beta'),
                   Patch(facecolor=colors[2], alpha=0.5, label='alpha')]
plt.legend(handles=legend_elements, loc='upper right', prop=prop)

# Caption for each region. Overlap regions hold repositories present in ALL of the
# named versions, i.e. an intersection, so they are written with '∩' (not '∪').
subset_labels = {
    '100': 'v1',
    '010': 'beta',
    '001': 'alpha',
    '110': 'v1 ∩ beta',
    '101': 'v1 ∩ alpha',
    '011': 'beta ∩ alpha',
    '111': 'v1 ∩ beta ∩ alpha'
}

for subset_id, text in subset_labels.items():
    label = v.get_label_by_id(subset_id)
    # A region may have no label object; skip it in that case.
    if label is not None:
        # Show the original raw count, not the log-scaled area value.
        count = subset_counts[subset_id]

        # Add count + small label
        label.set_text(f"{count}\n{text}")
        label.set_fontproperties(prop)
        # Nudge the label slightly upward
        x, y = label.get_position()
        label.set_position((x, y + 0.01))

plt.tight_layout()
# savefig does not create missing directories.
os.makedirs("./figures", exist_ok=True)
# NOTE(review): this is the same output path the linear Venn cell writes to, so running
# both cells leaves only this log-scaled version on disk -- confirm that is intended.
plt.savefig("./figures/authentication_venn.svg", dpi=300, bbox_inches="tight", pad_inches=0)
plt.show()


In [None]:
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from matplotlib.patches import Patch

# Sizes of the seven mutually exclusive Venn regions (every repository falls in exactly one).
only_v1 = len(set_v1 - set_v1beta1 - set_v1alpha1)
only_v1beta1 = len(set_v1beta1 - set_v1 - set_v1alpha1)
only_v1alpha1 = len(set_v1alpha1 - set_v1 - set_v1beta1)

v1_and_v1beta1 = len((set_v1 & set_v1beta1) - set_v1alpha1)
v1_and_v1alpha1 = len((set_v1 & set_v1alpha1) - set_v1beta1)
v1beta1_and_v1alpha1 = len((set_v1beta1 & set_v1alpha1) - set_v1)
all_three = len(set_v1 & set_v1beta1 & set_v1alpha1)

# Colors
colors = ('#1f77b4', '#ff7f0e', '#2ca02c')

# Every region is drawn with the same size; the real counts appear only in the labels.
fixed_sizes = (1, 1, 1, 1, 1, 1, 1)

# Reduce figure size but keep font size
plt.figure(figsize=(5,5))  # smaller than 8x8
v = venn3(subsets=fixed_sizes,
          set_labels=('', '', ''),
          set_colors=colors,
          alpha=0.5)

# Legend (the circles themselves carry no set labels)
legend_elements = [Patch(facecolor=colors[0], alpha=0.5, label='v1'),
                   Patch(facecolor=colors[1], alpha=0.5, label='beta'),
                   Patch(facecolor=colors[2], alpha=0.5, label='alpha')]
plt.legend(
    handles=legend_elements,
    loc='upper right',
    prop=prop,
    bbox_to_anchor=(1, 1.05)  # x, y relative to axes; <1 moves it inward
)

# Raw count per region id (A = v1, B = v1beta1, C = v1alpha1).
subset_counts = {
    '100': only_v1,
    '010': only_v1beta1,
    '001': only_v1alpha1,
    '110': v1_and_v1beta1,
    '101': v1_and_v1alpha1,
    '011': v1beta1_and_v1alpha1,
    '111': all_three
}

# Caption for each region. Overlap regions hold repositories present in ALL of the
# named versions, i.e. an intersection, so they are written with '∩' (not '∪').
subset_labels = {
    '100': 'v1',
    '010': 'beta',
    '001': 'alpha',
    '110': 'v1 ∩ beta',
    '101': 'v1 ∩ alpha',
    '011': 'beta ∩ alpha',
    '111': 'v1 ∩ beta ∩ alpha'
}

for subset_id, text in subset_labels.items():
    label = v.get_label_by_id(subset_id)
    # A region may have no label object; skip it in that case.
    if label is not None:
        count = subset_counts[subset_id]
        label.set_text(f"{count}\n{text}")
        label.set_fontproperties(prop)
        x, y = label.get_position()
        label.set_position((x, y + 0.01))  # adjust slightly upward

plt.tight_layout(pad=0)
# savefig does not create missing directories.
os.makedirs("./figures", exist_ok=True)
plt.savefig("./figures/authentication_venn_fixed_small.svg", dpi=300, bbox_inches="tight", pad_inches=0)
plt.show()


In [None]:
# You do not need to run this cell; it is included for presentation only, to show how the processed commit data was produced.
import dask.dataframe as dd
import os
import glob

# Commit-info inputs per API version.
# files_v1 mirrors files_v1beta1 entry for entry. Its first entry used to be a second
# copy of provider_authz_v1, which left the peer_auth_v1 commits out of the pipeline.
files_v1 = [
    "../data/commitinfo/peer_auth_v1_commit.parquet",
    "../data/commitinfo/req_auth_v1_commit.parquet",
    "../data/commitinfo/http_authz_v1_commit.parquet",
    "../data/commitinfo/tcp_authz_v1_commit.parquet",
    "../data/commitinfo/jwt_authz_v1_commit.parquet",
    "../data/commitinfo/provider_authz_v1_commit.parquet",
    "../data/commitinfo/ingress_authz_ip_v1_commit.parquet",
    "../data/commitinfo/ingress_authz_remote_ip_v1_commit.parquet",
    "../data/commitinfo/mtls_strict_v1_commit.parquet",
    "../data/commitinfo/mtls_permissive_v1_commit.parquet",
    "../data/commitinfo/mtls_disable_v1_commit.parquet",
]

files_v1beta1 = [
    "../data/commitinfo/peer_auth_v1beta1_commit.parquet",
    "../data/commitinfo/req_auth_v1beta1_commit.parquet",
    "../data/commitinfo/http_authz_v1beta1_commit.parquet",
    "../data/commitinfo/tcp_authz_v1beta1_commit.parquet",
    "../data/commitinfo/jwt_authz_v1beta1_commit.parquet",
    "../data/commitinfo/provider_authz_v1beta1_commit.parquet",
    "../data/commitinfo/ingress_authz_ip_v1beta1_commit.parquet",
    "../data/commitinfo/ingress_authz_remote_ip_v1beta1_commit.parquet",
    "../data/commitinfo/mtls_strict_v1beta1_commit.parquet",
    "../data/commitinfo/mtls_permissive_v1beta1_commit.parquet",
    "../data/commitinfo/mtls_disable_v1beta1_commit.parquet",
]

files_v1alpha1 = [
    "../data/commitinfo/peer_auth_v1alpha1_commit.parquet",
    "../data/commitinfo/req_auth_v1alpha1_commit.parquet",
    "../data/commitinfo/any_authz_v1alpha1_commit.parquet",
    "../data/commitinfo/mtls_strict_v1alpha1_commit.parquet",
    "../data/commitinfo/mtls_permissive_v1alpha1_commit.parquet",
]


# Where the per-file extracted datasets are written, per version.
processed_dirs = {
    "v1": "../data/processed/v1",
    "v1beta1": "../data/processed/v1beta1",
    "v1alpha1": "../data/processed/v1alpha1",
}

# Where the combined, de-duplicated dataset of each version is written.
final_dirs = {
    "v1": "../data/final/v1",
    "v1beta1": "../data/final/v1beta1",
    "v1alpha1": "../data/final/v1alpha1",
}

def _extract_commit_fields(df):
    """Return sha, author date and repository name for one partition.

    The author date is read from ``commit['author']['date']`` of every row. A new
    frame is built with ``assign`` so the partition handed in is left unmodified.
    """
    author_dates = df['commit'].apply(lambda commit: commit['author']['date'])
    return df.assign(author_date=author_dates)[['sha', 'author_date', 'repository_full_name']]


def extract_and_save(file_list, out_dir):
    """Reduce each commit parquet file to (sha, author_date, repository_full_name) and save it.

    For every path in `file_list` the result is written to
    ``<out_dir>/<file name without .parquet>_processed`` and read back to confirm
    that the 'sha' column is present. Duplicate shas are dropped within each
    partition only. A failure on one file is printed and processing continues
    with the next file.

    Parameters
    ----------
    file_list : iterable of str
        Paths of the input parquet files.
    out_dir : str
        Directory for the processed datasets; created if missing.
    """
    os.makedirs(out_dir, exist_ok=True)

    for f in file_list:
        print(f"Processing {f}...")
        try:
            ddf = dd.read_parquet(f, columns=['sha', 'commit', 'repository_full_name'], blocksize="128MB")

            ddf = ddf.map_partitions(
                _extract_commit_fields,
                meta={'sha': 'object', 'author_date': 'object', 'repository_full_name': 'object'}
            )

            # Dropping duplicates does not change the schema, so the current meta is reused.
            ddf = ddf.map_partitions(lambda df: df.drop_duplicates(subset='sha'), meta=ddf._meta)

            name = os.path.basename(f).replace(".parquet", "")
            out_path = os.path.join(out_dir, f"{name}_processed")
            ddf.to_parquet(out_path, overwrite=True)

            # Read the written dataset back and verify the key column survived.
            ddf_check = dd.read_parquet(out_path)
            if 'sha' not in ddf_check.columns:
                print(f"[MISSING] 'sha' column missing in saved file: {out_path}")
            else:
                print(f"[OK] 'sha' column exists in saved file: {out_path}")

        except Exception as e:
            # Best effort: report the failure and move on to the next input file.
            print(f"[ERROR] Failed to process {f}: {e}")

def combine_and_save(dir_path, final_out):
    """Merge every ``*_processed`` dataset under `dir_path` into one dataset at `final_out`.

    The parquet files of all processed datasets are read together, rows with a
    repeated 'sha' are dropped (first one kept) and the result is written to
    `final_out`. Nothing is written when no processed dataset or no parquet file
    is found; a message is printed instead.
    """
    processed_dirs_found = glob.glob(os.path.join(dir_path, "*_processed"))
    if len(processed_dirs_found) == 0:
        print(f"[WARNING] No processed files found in {dir_path}")
        return

    # Gather the individual parquet part files of every processed dataset.
    all_parquet_files = []
    for dataset_dir in processed_dirs_found:
        part_files = glob.glob(os.path.join(dataset_dir, "*.parquet"))
        if not part_files:
            print(f"[SKIP] No parquet files found in {dataset_dir}")
            continue
        all_parquet_files += part_files

    if len(all_parquet_files) == 0:
        print(f"[ERROR] No valid parquet files to combine in {dir_path}")
        return

    combined = dd.read_parquet(all_parquet_files, columns=['sha', 'author_date', 'repository_full_name'])

    # Index by sha (the column is kept as well), drop repeated index values inside
    # each partition, then discard the index again.
    deduplicated = (
        combined
        .set_index('sha', sorted=False, drop=False)
        .map_partitions(lambda part: part[~part.index.duplicated(keep='first')])
        .reset_index(drop=True)
    )

    os.makedirs(final_out, exist_ok=True)
    deduplicated.to_parquet(final_out, overwrite=True)
    print(f"Combined data saved to {final_out}")

# Run extraction followed by combination for each API version, in the order
# v1, v1beta1, v1alpha1.
for api_version, source_files in (
    ("v1", files_v1),
    ("v1beta1", files_v1beta1),
    ("v1alpha1", files_v1alpha1),
):
    extract_and_save(source_files, processed_dirs[api_version])
    combine_and_save(processed_dirs[api_version], final_dirs[api_version])


In [None]:
import pandas as pd

# Turn the set of v1alpha1 repositories into a named Series so it can be joined.
alpha_repos = pd.Series(list(set_v1alpha1), name="full_name")

# NOTE(review): `df_commits` is not defined in any cell visible in this notebook;
# it must already exist in the kernel (columns 'full_name' and 'last_commit_date').
# Left join so every alpha repository is kept even without commit info, then order
# by most recent last commit first.
df_alpha_commits = (
    alpha_repos
    .to_frame()
    .merge(df_commits[['full_name', 'last_commit_date']], on='full_name', how='left')
    .sort_values(by='last_commit_date', ascending=False)
)

df_alpha_commits.head()


In [None]:
!pip install matplotlib_venn

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Counts of distinct repositories per (version, kind).
# They are taken from the raw per-version frames (df_v1, ...), not from the *_unique
# frames: those keep one row per repository, and because the peer frame is
# concatenated first that row is tagged "peer" whenever a repository uses both
# kinds, so such repositories were never counted as "request".
versions = ["v1", "v1beta1", "v1alpha1"]
raw_by_version = [df_v1, df_v1beta1, df_v1alpha1]


def _count_repos_of_kind(frame, kind):
    """Number of distinct 'full_name' values having at least one row of `kind`."""
    return len(frame.loc[frame["kind"] == kind, "full_name"].drop_duplicates())


req_counts = [_count_repos_of_kind(frame, "request") for frame in raw_by_version]
peer_counts = [_count_repos_of_kind(frame, "peer") for frame in raw_by_version]

# Number of unique repositories per version; the denominator for the percentages.
version_totals = [len(df_v1_unique), len(df_v1beta1_unique), len(df_v1alpha1_unique)]

x = np.arange(len(versions))
bar_width = 0.35

# Plot: two side-by-side bars per version
fig, ax = plt.subplots(figsize=(10,6))
bars_req = ax.bar(x - bar_width/2, req_counts, bar_width, label='req_auth')
bars_peer = ax.bar(x + bar_width/2, peer_counts, bar_width, label='peer_auth')

# Annotate percentages INSIDE each bar (relative to the version's unique repositories).
# A repository using both kinds contributes to both bars, so the two percentages of
# one version can add up to more than 100%.
for i, (bar_req, bar_peer) in enumerate(zip(bars_req, bars_peer)):
    total = version_totals[i]
    if total > 0:
        pct_req = req_counts[i] / total * 100
        pct_peer = peer_counts[i] / total * 100

        # Put text at the centre of each bar
        ax.text(bar_req.get_x() + bar_req.get_width()/2, bar_req.get_height()/2,
                f"{pct_req:.1f}%", ha="center", va="center", color="white", fontsize=10, fontweight="bold")
        ax.text(bar_peer.get_x() + bar_peer.get_width()/2, bar_peer.get_height()/2,
                f"{pct_peer:.1f}%", ha="center", va="center", color="white", fontsize=10, fontweight="bold")

# Labels and legend
ax.set_xlabel("Version")
ax.set_ylabel("Number of Unique Repositories")
ax.set_title("Request vs Peer Authentication by Version")
ax.set_xticks(x)
ax.set_xticklabels(versions)
ax.legend()

plt.tight_layout()
plt.show()


In [None]:
# Generate a LaTeX table with per-kind counts and percentages for unique and starred repositories.

versions = ["v1", "v1beta1", "v1alpha1"]
kinds = ["request", "peer"]

# Raw (not de-duplicated) rows per version. Kind membership must be read from these:
# the *_unique frames keep one row per repository, and because the peer frame is
# concatenated first that row is tagged "peer" whenever a repository uses both
# kinds, which made such repositories invisible in the "request" row.
raw_dfs = {
    "v1": df_v1,
    "v1beta1": df_v1beta1,
    "v1alpha1": df_v1alpha1
}

unique_dfs = {
    "v1": df_v1_unique,
    "v1beta1": df_v1beta1_unique,
    "v1alpha1": df_v1alpha1_unique
}

star_dfs = {
    "v1": df_v1_star,
    "v1beta1": df_v1beta1_star,
    "v1alpha1": df_v1alpha1_star
}

# Initialize LaTeX table string
latex_table = r"""\begin{table}[h!]
\centering
\begin{tabular}{lccc}
\hline
 & v1 & v1beta1 & v1alpha1 \\
\hline
"""

for kind in kinds:
    row = kind.replace("_", " ").title() + " Authentication"
    for version in versions:
        df_raw = raw_dfs[version]
        df_unique = unique_dfs[version]
        df_star = star_dfs[version]

        # Distinct repositories that have at least one row of this kind.
        kind_repos = df_raw.loc[df_raw["kind"] == kind, "full_name"].drop_duplicates()

        unique_count = len(kind_repos)
        # "Starred" keeps its existing definition: membership in the starred frame.
        star_count = int(kind_repos.isin(df_star["full_name"]).sum())

        # Percentages relative to total unique/starred repositories for that version.
        # A repository using both kinds is counted in both rows, so a column's
        # percentages can add up to more than 100%.
        total_unique = len(df_unique)
        total_star = len(df_star)

        pct_unique = (unique_count / total_unique * 100) if total_unique > 0 else 0
        pct_star = (star_count / total_star * 100) if total_star > 0 else 0

        row += f" & {unique_count} ({pct_unique:.1f}\\%) / {star_count} ({pct_star:.1f}\\%)"
    row += r" \\"
    latex_table += row + "\n"

# `$\geq$` replaces the raw Unicode sign, which default pdfLaTeX does not set up.
latex_table += r"""\hline
\end{tabular}
\caption{Number of unique and starred repositories ($\geq$10 stars) using request vs peer authentication across different API versions. Format: unique (percentage) / starred (percentage).}
\label{tab:req_peer_versions_star_pct}
\end{table}"""

print(latex_table)
