In [None]:
import dask.dataframe as dd
import pandas as pd

# Directory of the final parquet dataset for each version, in processing order.
final_dirs = {
    version: f"../data/final/{version}"
    for version in ("v1", "v1beta1", "v1alpha1")
}

# Bounds of the observation window, both timezone-aware (UTC).
start_date, end_date = (
    pd.Timestamp(day, tz='UTC') for day in ("2024-08-01", "2025-07-31")
)

# Distinct repository names found in the v1beta1 dataset, materialised in pandas.
v1beta1_frame = dd.read_parquet(final_dirs['v1beta1'])
all_repos_v1beta1 = v1beta1_frame['repository_full_name'].drop_duplicates().compute()

# Per-version monthly averages, retained so later cells can pick a version
# explicitly; after the loop `repo_counts_per_month` holds only the last one.
repo_counts_per_month_by_version = {}

# `end_date` is midnight at the start of its day, so `<= end_date` would drop
# nearly every row from that final day. Compare against an exclusive bound one
# day later so the window covers the whole last day.
end_exclusive = end_date + pd.Timedelta(days=1)

# Number of months the start_date..end_date window spans; used to turn
# in-window totals into monthly averages. Keep in sync with those dates.
months_in_window = 12

for version, path in final_dirs.items():
    print(f"\n=== Processing {version} ===")

    ddf = dd.read_parquet(path)
    ddf['author_date'] = dd.to_datetime(ddf['author_date'], utc=True)

    # Rows whose author date falls inside the observation window.
    ddf_filtered = ddf[(ddf['author_date'] >= start_date) & (ddf['author_date'] < end_exclusive)]

    # In-window row count per repository (repositories with none are absent here).
    repo_counts = ddf_filtered.groupby('repository_full_name')['author_date'].count().compute()

    # Every repository in this dataset, regardless of date.
    all_repos = ddf['repository_full_name'].drop_duplicates().compute()

    # For v1, leave out repositories that also appear in the v1beta1 dataset.
    if version == "v1":
        repo_counts = repo_counts[~repo_counts.index.isin(all_repos_v1beta1)]
        all_repos = all_repos[~all_repos.isin(all_repos_v1beta1)]

    repo_counts_per_month = repo_counts / months_in_window
    repo_counts_per_month_by_version[version] = repo_counts_per_month

    # Repositories with no in-window rows get an explicit average of 0.
    repos_without_pr = all_repos[~all_repos.isin(repo_counts.index)]
    zero_df = pd.Series(0, index=repos_without_pr)

    # How many repositories share each monthly-average value, zeros included.
    all_counts = pd.concat([repo_counts_per_month, zero_df])
    results = all_counts.value_counts().sort_index()

    print("Repository monthly PR average histogram:")
    print(results)
    print("\nTotal repositories:", len(all_counts))


In [32]:
import os
import matplotlib.font_manager as font_manager

def saveFigure(fig,fileName):
    """Write ``fig`` to ``fileName`` at 100 dpi with a tight bounding box and zero padding."""
    save_options = {"bbox_inches": "tight", "dpi": 100, "pad_inches": 0}
    fig.savefig(fileName, **save_options)


def get_font_properties(routePath=None):
    """Return size-16 font properties for ``NimbusSanL-Reg.otf`` under ``routePath``.

    Args:
        routePath: Directory containing the font file. When omitted (``None``),
            the working directory at call time is used.

    Returns:
        A ``font_manager.FontProperties`` built from that font file, size 16.
    """
    # Resolve the default on every call: a ``routePath=os.getcwd()`` default is
    # evaluated once, when the function is defined, and goes stale if the
    # working directory changes afterwards.
    if routePath is None:
        routePath = os.getcwd()
    fontpath = routePath+'/NimbusSanL-Reg.otf'
    return font_manager.FontProperties(fname=fontpath, size=16)

def get_props_as_dict():
    """Return a ``name``/``size`` dict: the sans font path in the working directory, size 26."""
    return {
        "name": f"{os.getcwd()}/NimbusSanL-Reg.otf",
        "size": 26,
    }

def get_alt_font_properties(type="san",size=15):
    """Build font properties for one of the two Nimbus font files.

    ``type="rom"`` selects ``NimbusRomNo9L-Reg.otf``; any other value selects
    ``NimbusSanL-Reg.otf``. The file is looked up in the current working
    directory and ``size`` is passed through as the font size.
    """
    font_file = 'NimbusRomNo9L-Reg.otf' if type == "rom" else 'NimbusSanL-Reg.otf'
    return font_manager.FontProperties(
        fname=os.getcwd() + '/' + font_file,
        size=size,
    )

def save_fig(ax, title):
    """Save the figure that owns ``ax`` to path ``title`` as a 300-dpi PDF, tightly cropped with no padding."""
    figure = ax.figure
    figure.savefig(title, format="pdf", dpi=300, bbox_inches='tight', pad_inches=0)


# Base text size for figures; the legend uses the same size and the small
# variant is one unit smaller.
fontsize = 17
legend_fontsize = fontsize
smallfontsize = fontsize - 1

# Roman-face font properties at each of the three sizes.
prop, prop_small, prop_legend = (
    get_alt_font_properties(type="rom", size=font_size)
    for font_size in (fontsize, smallfontsize, legend_fontsize)
)


In [None]:

    # NOTE(review): `repo_counts_per_month` is whatever the processing loop
    # assigned last. With cells run top to bottom that is the final entry of
    # `final_dirs` ("v1alpha1"), not the "v1beta1" named in the x-axis label
    # below, and it excludes repositories with a zero average — confirm which
    # dataset is intended.
    import matplotlib.pyplot as plt

    # Frequency table of the distinct monthly averages (kept, not printed).
    distribution = repo_counts_per_month.value_counts().sort_index()

    # Split repositories into four quantile groups and report each group's size.
    quartile_labels = ["Q1 (lowest)", "Q2", "Q3", "Q4 (highest)"]
    grouped = pd.qcut(repo_counts_per_month, q=4, labels=quartile_labels)
    print(grouped.value_counts())

    # 30-bin histogram of the same monthly averages.
    fig, ax = plt.subplots(figsize=(8,4))
    repo_counts_per_month.hist(bins=30, ax=ax)
    ax.set_xlabel("Average commits per month (v1beta1)")
    ax.set_ylabel("Number of repositories")
    ax.set_title("Distribution of average commits per month")


In [None]:
# Equal-width alternative for when quantile binning with qcut fails:
# four bins of identical width across the observed range.
grouped_cut = pd.cut(repo_counts_per_month, bins=4)
equal_width_intervals = grouped_cut.cat.categories
print("Bin intervals (equal-width):", equal_width_intervals)

In [None]:
# Custom bins on the monthly average: [0, 1), [1, 11), [11, 30), [30, inf).
# `right=False` makes every bin left-closed, so the ">30" label also covers an
# average of exactly 30, and 11 falls in "11-30" rather than "1-11".
# The top edge is infinity instead of `max() + 1`: a data-derived edge that is
# not above 30 makes `pd.cut` reject the bin list.
bins = [0, 1, 11, 30, float("inf")]
labels = ["<1", "1-11", "11-30", ">30"]
grouped_custom = pd.cut(repo_counts_per_month, bins=bins, labels=labels, right=False)

print("Custom commit frequency categories:")
print(grouped_custom.value_counts())