In [None]:
# You do not need to run this cell; it only pre-processes the raw commit data into ../data/final.
import dask.dataframe as dd
import os
import glob

# Input parquet files for the v1 API version (consumed by extract_and_save).
# The original list named provider_authz_v1_commit.parquet twice, which only
# re-processed the same file and overwrote the same output; it is listed once here.
# NOTE(review): the v1beta1/v1alpha1 lists start with peer_auth_*; a
# peer_auth_v1_commit.parquet entry may have been intended in place of the
# duplicate -- confirm that file exists before adding it.
files_v1 = [
    "../data/commitinfo/req_auth_v1_commit.parquet",
    "../data/commitinfo/http_authz_v1_commit.parquet",
    "../data/commitinfo/tcp_authz_v1_commit.parquet",
    "../data/commitinfo/jwt_authz_v1_commit.parquet",
    "../data/commitinfo/provider_authz_v1_commit.parquet",
    "../data/commitinfo/ingress_authz_ip_v1_commit.parquet",
    "../data/commitinfo/ingress_authz_remote_ip_v1_commit.parquet",
    "../data/commitinfo/mtls_strict_v1_commit.parquet",
    "../data/commitinfo/mtls_permissive_v1_commit.parquet",
    "../data/commitinfo/mtls_disable_v1_commit.parquet",
]

# Input parquet files for the v1beta1 API version.
files_v1beta1 = [
    "../data/commitinfo/peer_auth_v1beta1_commit.parquet",
    "../data/commitinfo/req_auth_v1beta1_commit.parquet",
    "../data/commitinfo/http_authz_v1beta1_commit.parquet",
    "../data/commitinfo/tcp_authz_v1beta1_commit.parquet",
    "../data/commitinfo/jwt_authz_v1beta1_commit.parquet",
    "../data/commitinfo/provider_authz_v1beta1_commit.parquet",
    "../data/commitinfo/ingress_authz_ip_v1beta1_commit.parquet",
    "../data/commitinfo/ingress_authz_remote_ip_v1beta1_commit.parquet",
    "../data/commitinfo/mtls_strict_v1beta1_commit.parquet",
    "../data/commitinfo/mtls_permissive_v1beta1_commit.parquet",
    "../data/commitinfo/mtls_disable_v1beta1_commit.parquet",
]

# Input parquet files for the v1alpha1 API version.
files_v1alpha1 = [
    "../data/commitinfo/peer_auth_v1alpha1_commit.parquet",
    "../data/commitinfo/req_auth_v1alpha1_commit.parquet",
    "../data/commitinfo/any_authz_v1alpha1_commit.parquet",
    "../data/commitinfo/mtls_strict_v1alpha1_commit.parquet",
    "../data/commitinfo/mtls_permissive_v1alpha1_commit.parquet",
]


# Per-version directories that receive the intermediate "<name>_processed" datasets.
processed_dirs = {
    "v1": "../data/processed/v1",
    "v1beta1": "../data/processed/v1beta1",
    "v1alpha1": "../data/processed/v1alpha1",
}

# Per-version directories that receive the combined, de-duplicated dataset.
final_dirs = {
    "v1": "../data/final/v1",
    "v1beta1": "../data/final/v1beta1",
    "v1alpha1": "../data/final/v1alpha1",
}

# Column layout of the extracted frame; passed to dask as `meta` for both map steps.
_COMMIT_META = {'sha': 'object', 'author_date': 'object', 'repository_full_name': 'object'}


def _extract_author_date(df):
    """Return sha / author_date / repository_full_name for one partition.

    `author_date` is read from the nested `commit['author']['date']` value.
    The result is built with `assign`, so the input partition is left untouched.
    """
    author_date = df['commit'].apply(lambda x: x['author']['date'])
    return df.assign(author_date=author_date)[['sha', 'author_date', 'repository_full_name']]


def _drop_duplicate_shas(df):
    """Drop rows with a repeated `sha` inside one partition (first occurrence kept)."""
    return df.drop_duplicates(subset='sha')


def extract_and_save(file_list, out_dir):
    """Extract commit author dates from each parquet file and save the slimmed data.

    For every path in `file_list` the columns `sha`, `commit` and
    `repository_full_name` are read, `commit` is replaced by its nested author
    date, duplicate shas are dropped within each partition, and the result is
    written to `<out_dir>/<file name without .parquet>_processed`. The written
    dataset is then re-read to report whether the `sha` column is present.

    Parameters:
        file_list: iterable of input parquet paths.
        out_dir: directory for the processed datasets; created if missing.

    Failures are best-effort: any exception for a file is printed and the loop
    moves on to the next file.
    """
    os.makedirs(out_dir, exist_ok=True)

    for f in file_list:
        print(f"Processing {f}...")
        try:
            ddf = dd.read_parquet(f, columns=['sha', 'commit', 'repository_full_name'], blocksize="128MB")

            ddf = ddf.map_partitions(_extract_author_date, meta=_COMMIT_META)

            # De-duplication here is per partition only; the same sha can still
            # appear in two different partitions of the output.
            ddf = ddf.map_partitions(_drop_duplicate_shas, meta=_COMMIT_META)

            name = os.path.basename(f).replace(".parquet", "")
            out_path = os.path.join(out_dir, f"{name}_processed")
            ddf.to_parquet(out_path, overwrite=True)

            # Sanity check on what was actually written.
            ddf_check = dd.read_parquet(out_path)
            if 'sha' not in ddf_check.columns:
                print(f"[MISSING] 'sha' column missing in saved file: {out_path}")
            else:
                print(f"[OK] 'sha' column exists in saved file: {out_path}")

        except Exception as e:
            print(f"[ERROR] Failed to process {f}: {e}")

def combine_and_save(dir_path, final_out):
    """Merge every `*_processed` dataset under `dir_path` into one de-duplicated dataset.

    All `*.parquet` files inside the `*_processed` directories are read
    together, the frame is indexed by `sha` so duplicate index values can be
    dropped per partition (first occurrence kept), and the result is written to
    `final_out`.

    Parameters:
        dir_path: directory containing the `<name>_processed` datasets.
        final_out: output directory for the combined parquet dataset.

    Returns None. If no processed directories or no parquet files are found, a
    message is printed and nothing is written.
    """
    # glob returns paths in arbitrary order; sort so files are always read in
    # the same order from run to run.
    files = sorted(glob.glob(os.path.join(dir_path, "*_processed")))
    if not files:
        print(f"[WARNING] No processed files found in {dir_path}")
        return

    all_parquet_files = []
    for d in files:
        parquet_files = sorted(glob.glob(os.path.join(d, "*.parquet")))
        if parquet_files:
            all_parquet_files.extend(parquet_files)
        else:
            print(f"[SKIP] No parquet files found in {d}")

    if not all_parquet_files:
        print(f"[ERROR] No valid parquet files to combine in {dir_path}")
        return

    ddf = dd.read_parquet(all_parquet_files, columns=['sha', 'author_date', 'repository_full_name'])

    # Index by sha (keeping the column) so the de-duplication below can work on
    # the index, then discard the index again before writing.
    ddf = ddf.set_index('sha', sorted=False, drop=False)
    ddf = ddf.map_partitions(lambda df: df[~df.index.duplicated(keep='first')])
    ddf = ddf.reset_index(drop=True)

    os.makedirs(final_out, exist_ok=True)
    ddf.to_parquet(final_out, overwrite=True)
    print(f"Combined data saved to {final_out}")

# Run extraction and then combination for each API version, in order:
# v1, v1beta1, v1alpha1.
for api_version, commit_files in (
    ('v1', files_v1),
    ('v1beta1', files_v1beta1),
    ('v1alpha1', files_v1alpha1),
):
    extract_and_save(commit_files, processed_dirs[api_version])
    combine_and_save(processed_dirs[api_version], final_dirs[api_version])


In [None]:
import os
import matplotlib.font_manager as font_manager

def saveFigure(fig,fileName):
    """Save `fig` to `fileName` at 100 dpi with a tight bounding box and no padding."""
    save_options = {'bbox_inches': 'tight', 'dpi': 100, 'pad_inches': 0}
    fig.savefig(fileName, **save_options)


def get_font_properties(routePath=None):
    """Return FontProperties (size 16) for NimbusSanL-Reg.otf located in `routePath`.

    Parameters:
        routePath: directory containing the font file. Defaults to the current
            working directory, resolved when the function is called (a default
            of `os.getcwd()` in the signature would be frozen at definition time).
    """
    if routePath is None:
        routePath = os.getcwd()

    fontpath = routePath+'/NimbusSanL-Reg.otf'
    prop = font_manager.FontProperties(fname=fontpath, size=16)
    return prop

def get_props_as_dict():
    """Return the sans font as a dict: path under the current working directory, size 26."""
    return {
        "name": os.getcwd() + '/NimbusSanL-Reg.otf',
        "size": 26,
    }

def get_alt_font_properties(type="san",size=15):
    """Return FontProperties of the given size for a font file in the current directory.

    `type == "rom"` selects NimbusRomNo9L-Reg.otf; any other value selects
    NimbusSanL-Reg.otf.
    """
    font_file = '/NimbusRomNo9L-Reg.otf' if type == "rom" else '/NimbusSanL-Reg.otf'
    return font_manager.FontProperties(fname=os.getcwd() + font_file, size=size)

def save_fig(ax, title):
    """Save the figure owning `ax` to the path `title` as a 300 dpi PDF, tightly cropped."""
    figure = ax.figure
    figure.savefig(title, format="pdf", dpi=300, bbox_inches='tight', pad_inches=0)


# Point sizes shared by the plotting cells; the small and big sizes are derived
# from the base size.
fontsize = 17
legend_fontsize = fontsize
smallfontsize = fontsize - 1
bigfont = fontsize + 5

# One roman-font FontProperties object per size, created in the order
# regular, big, small, legend.
prop, prop_big, prop_small, prop_legend = (
    get_alt_font_properties(type="rom", size=point_size)
    for point_size in (fontsize, bigfont, smallfontsize, legend_fontsize)
)

In [None]:
import dask.dataframe as dd
import pandas as pd
import matplotlib.pyplot as plt

# Read the combined commit tables of the three API versions into pandas.
df_v1 = dd.read_parquet("../data/final/v1").compute()
df_v1beta1 = dd.read_parquet("../data/final/v1beta1").compute()
df_v1alpha1 = dd.read_parquet("../data/final/v1alpha1").compute()

# Parse the author dates in place on each table.
for df in (df_v1, df_v1beta1, df_v1alpha1):
    df['author_date'] = pd.to_datetime(df['author_date'])


def _latest_date_per_repo(commits):
    """Latest author_date per repository_full_name, ordered by that date."""
    latest = commits.groupby('repository_full_name')['author_date'].max()
    return latest.reset_index().sort_values('author_date')


df_v1_last = _latest_date_per_repo(df_v1)
df_v1beta1_last = _latest_date_per_repo(df_v1beta1)
df_v1alpha1_last = _latest_date_per_repo(df_v1alpha1)

# One scatter series per version: x is the last commit date, y is simply the
# repository's rank in date order.
plt.figure(figsize=(10,4))
for last_commits, colour, version in (
    (df_v1_last, '#1f77b4', 'v1'),
    (df_v1beta1_last, '#ff7f0e', 'v1beta1'),
    (df_v1alpha1_last, '#2ca02c', 'v1alpha1'),
):
    plt.scatter(last_commits['author_date'], range(len(last_commits)), color=colour, s=20, label=version)

plt.xlabel("Last Commit Date", fontproperties=prop)
plt.ylabel("Repositories", fontproperties=prop)
plt.title("Last Commit Date for v1, v1beta1, and v1alpha1 Repositories", fontproperties=prop)
plt.yticks([], fontproperties=prop)  # remove the y-axis ticks
plt.xticks(fontproperties=prop)  # keep the x-axis ticks, only switch their font
plt.legend(prop=prop)
plt.tight_layout()
plt.show()


In [None]:
import dask.dataframe as dd
import pandas as pd
import matplotlib.pyplot as plt

# Read the combined commit tables of the three API versions into pandas.
df_v1 = dd.read_parquet("../data/final/v1").compute()
df_v1beta1 = dd.read_parquet("../data/final/v1beta1").compute()
df_v1alpha1 = dd.read_parquet("../data/final/v1alpha1").compute()

# Parse the author dates in place on each table.
for df in (df_v1, df_v1beta1, df_v1alpha1):
    df['author_date'] = pd.to_datetime(df['author_date'])


def _latest_commit_row_per_repo(commits):
    """Full row of the latest commit per repository, ordered by author_date."""
    by_date = commits.sort_values('author_date')
    latest_rows = by_date.groupby('repository_full_name').tail(1)
    return latest_rows.sort_values('author_date')


df_v1_last = _latest_commit_row_per_repo(df_v1)
df_v1beta1_last = _latest_commit_row_per_repo(df_v1beta1)
df_v1alpha1_last = _latest_commit_row_per_repo(df_v1alpha1)

# One scatter series per version: x is the last commit date, y is simply the
# repository's rank in date order.
plt.figure(figsize=(10,4))
for last_commits, colour, version in (
    (df_v1_last, '#1f77b4', 'v1'),
    (df_v1beta1_last, '#ff7f0e', 'v1beta1'),
    (df_v1alpha1_last, '#2ca02c', 'v1alpha1'),
):
    plt.scatter(last_commits['author_date'], range(len(last_commits)), color=colour, s=20, label=version)

plt.xlabel("Last Commit Date", fontproperties=prop)
plt.ylabel("Repositories", fontproperties=prop)
plt.title("Last Commit Date for v1, v1beta1, and v1alpha1 Repositories", fontproperties=prop)
plt.yticks([], fontproperties=prop)  # remove the y-axis ticks
plt.legend(prop=prop)
plt.tight_layout()
plt.show()


In [None]:
import dask.dataframe as dd
import pandas as pd

# Load v1alpha1 commit data
df_alpha = dd.read_parquet("../data/final/v1alpha1").compute()

# Convert commit dates to datetime
df_alpha['author_date'] = pd.to_datetime(df_alpha['author_date'])

# Get last commit per repository (row with the latest author_date).
# .copy() detaches the result from df_alpha: later cells assign columns on
# df_alpha_last, which on a groupby slice can trigger SettingWithCopyWarning.
df_alpha_last = df_alpha.sort_values('author_date').groupby('repository_full_name').tail(1).copy()

# Show up to 10 samples; capping at the number of rows avoids the ValueError
# that sample() raises when asked for more rows than exist.
sample_size = min(10, len(df_alpha_last))
sample_alpha = df_alpha_last[['repository_full_name', 'author_date']].sample(sample_size, random_state=42)
print(sample_alpha)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Re-parse the dates (a no-op if they are already datetimes) and tag every
# repository with the calendar month of its last commit. Both columns are
# written onto df_alpha_last itself.
df_alpha_last['author_date'] = pd.to_datetime(df_alpha_last['author_date'])
df_alpha_last['year_month'] = df_alpha_last['author_date'].dt.to_period('M')

# Repositories per month, as a two-column frame (year_month, num_repos).
repo_counts = (
    df_alpha_last
    .groupby('year_month')
    .size()
    .reset_index(name='num_repos')
)

# Bars need real timestamps on the x axis, so turn the periods back into dates.
repo_counts['year_month'] = repo_counts['year_month'].dt.to_timestamp()

plt.figure(figsize=(10,4))
bar_width_days = 20  # bar width is expressed in days on a date axis
plt.bar(repo_counts['year_month'], repo_counts['num_repos'], color='#2ca02c', width=bar_width_days)

plt.xlabel("Month of Last Commit", fontproperties=prop)
plt.ylabel("Number of Repositories", fontproperties=prop)
plt.title("Number of v1alpha1 Repositories by Month of Last Commit", fontproperties=prop)
plt.xticks(rotation=45, ha='right', fontproperties=prop)
plt.yticks(fontproperties=prop)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Inputs come from earlier cells: df_v1_last, df_v1beta1_last and df_alpha_last,
# each with 'repository_full_name' and 'author_date' columns.

# Parse the dates in place (a no-op if they are already datetimes).
for df in (df_v1_last, df_v1beta1_last, df_alpha_last):
    df['author_date'] = pd.to_datetime(df['author_date'])


def _cumulative_fraction(count):
    """Series 1/count, 2/count, ..., 1 used as the empirical CDF heights."""
    return pd.Series(range(1, count+1)) / count


# Ascending dates are the x values of each CDF.
df_v1_sorted = df_v1_last['author_date'].sort_values()
df_beta_sorted = df_v1beta1_last['author_date'].sort_values()
df_alpha_sorted = df_alpha_last['author_date'].sort_values()

# Matching y values.
cdf_v1 = _cumulative_fraction(len(df_v1_sorted))
cdf_beta = _cumulative_fraction(len(df_beta_sorted))
cdf_alpha = _cumulative_fraction(len(df_alpha_sorted))

plt.figure(figsize=(8,4))
for dates, heights, name, colour in (
    (df_v1_sorted, cdf_v1, 'v1', '#1f77b4'),
    (df_beta_sorted, cdf_beta, 'beta', '#ff7f0e'),
    (df_alpha_sorted, cdf_alpha, 'alpha', '#2ca02c'),
):
    plt.plot(dates, heights, label=name, color=colour)

plt.xlabel("Last Commit Date")
plt.ylabel("Cumulative Fraction of Repositories")
plt.title("CDF of Last Commit Dates per API Version")
plt.legend()
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pytz
import os  # needed below to make sure the output directory exists

# NOTE: this rebinds the notebook-wide `prop` to the large font; any cell run
# after this one that uses `prop` gets the large font too.
prop = prop_big
# Ensure dates are tz-aware so they can be compared with the tz-aware cutoff below
for df in [df_v1_last, df_v1beta1_last, df_alpha_last]:
    df['author_date'] = pd.to_datetime(df['author_date'])
    if df['author_date'].dt.tz is None:
        df['author_date'] = df['author_date'].dt.tz_localize(pytz.UTC)

# Sort dates
df_v1_sorted = df_v1_last['author_date'].sort_values()
df_beta_sorted = df_v1beta1_last['author_date'].sort_values()
df_alpha_sorted = df_alpha_last['author_date'].sort_values()

# Compute survival function: share of repositories whose last commit is later
# than each date. Expressed in percent (0-100) so the values match the "(%)"
# in the y-axis label.
sf_v1 = (1 - pd.Series(range(1, len(df_v1_sorted)+1)) / len(df_v1_sorted)) * 100
sf_beta = (1 - pd.Series(range(1, len(df_beta_sorted)+1)) / len(df_beta_sorted)) * 100
sf_alpha = (1 - pd.Series(range(1, len(df_alpha_sorted)+1)) / len(df_alpha_sorted)) * 100

# Plot
plt.figure(figsize=(10,5))
plt.plot(df_v1_sorted, sf_v1, label='v1', color='#1f77b4')
plt.plot(df_beta_sorted, sf_beta, label='beta (β)', color='#ff7f0e')
plt.plot(df_alpha_sorted, sf_alpha, label='alpha (α)', color='#2ca02c')

#plt.xlabel("Date", fontproperties=prop)
plt.ylabel("Repositories Active After Date (%)",   fontproperties=prop)
#plt.title("Repository Activity Survival Function",  fontproperties=prop)
plt.grid(alpha=0.3)

# Last year reference (relative to the time the cell is run)
one_year_ago = datetime.now(pytz.UTC) - timedelta(days=365)
plt.axvline(one_year_ago, color='black', linestyle='--', alpha=0.5)
# y position is in data coordinates, i.e. percent (12 == 12 %)
plt.text(one_year_ago, 12, "One year ago", rotation=90, va='bottom', ha='right', color='black', fontproperties=prop_small)
# Create a “box” legend with percentages of repositories active in the last year
percent_last_year = {
    'v1': (df_v1_sorted > one_year_ago).mean() * 100,
    'β': (df_beta_sorted > one_year_ago).mean() * 100,
    'α': (df_alpha_sorted > one_year_ago).mean() * 100
}

# Add text box in axes coordinates
textstr = '\n'.join([f"{k}: {v:.1f}%" for k,v in percent_last_year.items()])
props = dict(boxstyle='round', facecolor='white', alpha=0.8)

# Place the text box in the top-right corner
plt.text(
    0.97, 0.97,      # x, y in axes coordinates (0-1)
    textstr,
    transform=plt.gca().transAxes,

    ha='right',       # horizontal alignment
    va='top',         # vertical alignment
    bbox=props,
    fontproperties=prop,
)
plt.legend(loc='lower left', prop=prop)
plt.xticks(fontproperties=prop)
plt.yticks(fontproperties=prop)
# Lay out only after the tick fonts are final so their size is accounted for.
plt.tight_layout()
# savefig does not create missing directories.
os.makedirs("./figures", exist_ok=True)
plt.savefig("./figures/repositoriy_activity_survival.svg", dpi=300, bbox_inches="tight", pad_inches=0)

plt.show()


In [None]:
import pandas as pd

# Define cutoff date (one year ago from "now").
# `datetime`, `timedelta` and `pytz` come from the imports of the previous cell.
one_year_ago = datetime.now(pytz.UTC) - timedelta(days=365)

# Filter each version on its per-repository last-commit table.
# The beta filter previously used df_v1beta1 (every commit), so its count was
# commits rather than repositories; it now uses df_v1beta1_last like the others.
df_v1_active = df_v1_last[df_v1_last['author_date'] > one_year_ago].copy()
df_beta_active = df_v1beta1_last[df_v1beta1_last['author_date'] > one_year_ago].copy()
df_alpha_active = df_alpha_last[df_alpha_last['author_date'] > one_year_ago].copy()

# Show sizes
print("Active v1 repositories:", len(df_v1_active))
print("Active beta repositories:", len(df_beta_active))
print("Active alpha repositories:", len(df_alpha_active))

# Peek into the results
print(df_alpha_active.head(10))
