In [None]:
import dask.dataframe as dd
import pandas as pd

# Maps each version label to the directory that holds its final dataset.
final_dirs = {
    label: f"../data/final/{label}"
    for label in ("v1", "v1beta1", "v1alpha1")
}

# Bounds of the analysis window as tz-aware UTC timestamps. Each one is
# midnight at the start of the named day.
start_date, end_date = (
    pd.Timestamp(day, tz="UTC") for day in ("2024-08-01", "2025-07-31")
)

# Divisor that turns a repository's total row count inside the window into a
# monthly average. Keep in sync with the start_date/end_date window bounds.
months_in_window = 12

# end_date is treated as the last calendar day of the window, but as a
# Timestamp it is midnight at the *start* of that day. Filtering with
# `<= end_date` would drop nearly all rows from the final day, so use an
# exclusive upper bound at the following midnight instead.
window_end = end_date.normalize() + pd.Timedelta(days=1)

for version, path in final_dirs.items():
    print(f"\n=== Processing {version} ===")

    # Only these two columns are used below; skipping the rest avoids
    # reading unneeded data from disk.
    ddf = dd.read_parquet(path, columns=['repository_full_name', 'author_date'])

    # Convert to tz-aware UTC so the column can be compared against the
    # tz-aware window bounds.
    ddf['author_date'] = dd.to_datetime(ddf['author_date'], utc=True)

    # Keep rows whose author_date lies in [start_date, window_end).
    in_window = (ddf['author_date'] >= start_date) & (ddf['author_date'] < window_end)
    ddf_filtered = ddf[in_window]

    # Rows per repository inside the window
    # (presumably one row per pull request -- confirm against the dataset).
    repo_counts = ddf_filtered.groupby('repository_full_name')['author_date'].count().compute()

    # Every repository present in the dataset, regardless of date.
    all_repos = ddf['repository_full_name'].drop_duplicates().compute()

    # Repositories with no rows inside the window never appear in the groupby
    # result, so they are added back explicitly with a count of zero.
    repos_without_pr = all_repos[~all_repos.isin(repo_counts.index)]

    repo_counts_per_month = repo_counts / months_in_window

    # The repository names (the Series values) become the index labels here.
    zero_df = pd.Series(0, index=repos_without_pr)

    all_counts = pd.concat([repo_counts_per_month, zero_df])

    # Distribution: how many repositories share each monthly-average value,
    # ordered by that value.
    results = all_counts.value_counts().sort_index()

    print("Repository monthly PR average results:")
    print(results)
    print("\nTotal repositories:", len(all_counts))