In [6]:
import pandas as pd
import requests
import time
import os

def get_all_pages(url, headers, session=None, delay=1, timeout=30):
    """Fetch every page of a paginated JSON-array endpoint and merge the items.

    Pages are followed through the ``next`` relation exposed by
    ``response.links`` until no further page is advertised.

    Args:
        url: URL of the first page.
        headers: HTTP headers sent with every request.
        session: Optional object exposing ``get(url, headers=..., timeout=...)``
            (for example a ``requests.Session``). When omitted, the
            ``requests`` module itself is used.
        delay: Seconds to sleep after each request (throttling).
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot block forever.

    Returns:
        A list holding the items of all pages, in the order received.

    Raises:
        ValueError: If a page's JSON body is not a list. Error payloads are
            JSON objects; extending the result with one would silently add
            its keys as if they were items.
        Exception: Whatever ``response.raise_for_status()`` raises for a
            4xx/5xx response.
    """
    http = requests if session is None else session
    results = []
    while url:
        response = http.get(url, headers=headers, timeout=timeout)
        # Surface HTTP errors instead of counting the error body as data.
        response.raise_for_status()
        # A 204 response has no body to decode; treat it as an empty page.
        payload = [] if response.status_code == 204 else response.json()
        if not isinstance(payload, list):
            raise ValueError(
                f'Expected a JSON list from {url}, got {type(payload).__name__}'
            )
        results.extend(payload)
        # No 'next' link means this was the last page.
        url = response.links.get('next', {}).get('url')
        time.sleep(delay)  # To avoid hitting rate limits
    return results

def get_repo_info(repo, token):
    """Collect summary statistics for one GitHub repository.

    Args:
        repo: Repository slug in ``owner/name`` form; it is appended to
            ``https://api.github.com/repos/``.
        token: GitHub API token, sent as ``Authorization: token <token>``.

    Returns:
        A dict with the keys ``CreationDate``, ``Language``, ``Contributors``,
        ``OpenIssues``, ``ClosedIssues``, ``Commits``, ``OpenPullRequest``,
        ``ClosedPullRequest``, ``Releases``, ``Forks``, ``Stars`` and
        ``Watchers``.

    Raises:
        requests.HTTPError: If the repository metadata request returns a
            4xx/5xx status. Without this check an error body would be read
            as metadata and produce ``None``/0 statistics.
    """
    headers = {
        'Authorization': f'token {token}'
    }

    base_url = f'https://api.github.com/repos/{repo}'

    # Basic repository metadata; fail loudly rather than record bogus values.
    repo_response = requests.get(base_url, headers=headers, timeout=30)
    repo_response.raise_for_status()
    repo_info = repo_response.json()

    def count_items(path):
        """Return how many items the listing at ``base_url/path`` contains."""
        return len(get_all_pages(f'{base_url}/{path}', headers))

    contributors_count = count_items('contributors')

    # The issues listings are later reduced by the pull-request counts,
    # because GitHub's issues endpoint also returns pull requests.
    open_issues_count = count_items('issues?state=open')
    closed_issues_count = count_items('issues?state=closed')
    open_pulls_count = count_items('pulls?state=open')
    closed_pulls_count = count_items('pulls?state=closed')

    releases_count = count_items('releases')

    # The commits listing is paginated like the others: every page is
    # fetched and the returned items are counted.
    commits_count = count_items('commits')

    # Compile all information into a dictionary
    return {
        'CreationDate': repo_info.get('created_at'),
        'Language': repo_info.get('language'),
        'Contributors': contributors_count,
        'OpenIssues': open_issues_count - open_pulls_count,
        'ClosedIssues': closed_issues_count - closed_pulls_count,
        'Commits': commits_count,
        'OpenPullRequest': open_pulls_count,
        'ClosedPullRequest': closed_pulls_count,
        'Releases': releases_count,
        'Forks': repo_info.get('forks_count', 0),
        'Stars': repo_info.get('stargazers_count', 0),
        # 'Watchers' is taken from subscribers_count, not watchers_count.
        'Watchers': repo_info.get('subscribers_count', 0)
    }

# The API token comes from the environment so it never appears in the notebook.
token = os.environ['GITHUB_TOKEN']
df = pd.read_csv('CycloneNSpdxTools.csv')

# Statistics already fetched, keyed by repository slug, so a repository that
# appears on several rows is only queried once.
repo_statistics = {}

for index, row in df.iterrows():
    # Keep only what follows the GitHub host prefix in the link.
    repo = row['link'].split('https://github.com/')[-1]

    stats = repo_statistics.get(repo)
    if stats is None:
        stats = repo_statistics[repo] = get_repo_info(repo, token)

    # Copy each statistic into the matching column of the current row.
    for key in stats:
        df.at[index, key] = stats[key]

    # Write the file back after every row so progress is kept on disk.
    df.to_csv('CycloneNSpdxTools.csv', index=False)

In [11]:
import pickle
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import cliffs_delta as cd 

from collections import Counter
from scipy.stats import mannwhitneyu
from plotly.subplots import make_subplots

# Commit counts per repository, previously pickled to disk.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open('commit_counts_by_repo.pkl', 'rb') as f:
    commit_count_repos = pickle.load(f)

# Tool table; the 'Format' and 'Repo' columns are read below.
df = pd.read_csv('CycloneNSpdxTools.csv')

# One tally per format: key = a commit count, value = how often it occurs
# among the values stored for that format's repositories.
spdx_counter = Counter()
cyclonedx_counter = Counter()
counter_by_format = {'SPDX': spdx_counter, 'CycloneDx': cyclonedx_counter}

# Rows with any other format are ignored.
for index, row in df.iterrows():
    target = counter_by_format.get(row['Format'])
    if target is not None:
        target.update(commit_count_repos[row['Repo']].values())

# Filter counts into specified ranges
def filter_counts(counter, min_value, max_value=100000000):
    """Expand a Counter and keep the elements in ``[min_value, max_value)``.

    Each key is repeated as many times as its count (``Counter.elements()``);
    the lower bound is inclusive and the upper bound exclusive.
    """
    selected = []
    for count in counter.elements():
        if min_value <= count < max_value:
            selected.append(count)
    return selected

def _split_by_commit_range(counter):
    """Bucket a counter's elements as (1-10, 10-100, 100-1000, 1000+, all).

    Upper bounds are exclusive; the last two buckets rely on the default
    upper bound of ``filter_counts``.
    """
    return (
        filter_counts(counter, 1, 10),
        filter_counts(counter, 10, 100),
        filter_counts(counter, 100, 1000),
        filter_counts(counter, 1000),
        filter_counts(counter, 1),
    )


(spdx_1_10, spdx_10_100, spdx_100_1000,
 spdx_1000_more, spdx_all) = _split_by_commit_range(spdx_counter)

(cyclonedx_1_10, cyclonedx_10_100, cyclonedx_100_1000,
 cyclonedx_1000_more, cyclonedx_all) = _split_by_commit_range(cyclonedx_counter)

# One row of four panels, one per commit-count range, each with its own y-axis.
fig = make_subplots(
    rows=1,
    cols=4,
    shared_yaxes=False,
    subplot_titles=('1-10', '10-100', '100-1K', '1K+'),
)

# colors[0] is used for SPDX, colors[1] for CycloneDx.
colors = ['#1f77b4', '#ff7f0e']

# (SPDX values, CycloneDx values) for panels 1..4, left to right:
# 1-10, 10-100, 100-1000 and 1000 or more.
panel_data = (
    (spdx_1_10, cyclonedx_1_10),
    (spdx_10_100, cyclonedx_10_100),
    (spdx_100_1000, cyclonedx_100_1000),
    (spdx_1000_more, cyclonedx_1000_more),
)

for panel_col, (spdx_values, cyclonedx_values) in enumerate(panel_data, start=1):
    fig.add_trace(
        go.Box(y=spdx_values, name='SPDX', boxmean=True, marker_color=colors[0]),
        row=1, col=panel_col,
    )
    fig.add_trace(
        go.Box(y=cyclonedx_values, name='CycloneDx', boxmean=True, marker_color=colors[1]),
        row=1, col=panel_col,
    )

fig.update_layout(
    # Figure title intentionally disabled:
    # title=dict(text='Groups by Number of Commits', x=0.5, xanchor='center', yanchor='top'),
    yaxis_title='Number of Commits',
    margin={'l': 10, 'r': 10, 't': 20, 'b': 10},
    showlegend=False,
)

# Same axis-title font size on every panel.
fig.update_yaxes(title_font={'size': 12})

# Render inline, then export the same figure as a PDF.
pio.show(fig)

pio.write_image(fig, 'returning_commit_count_distribution.pdf')

# Compare SPDX against CycloneDx in every commit-count range, plus overall.
groups = ['1-10', '10-100', '100-1k', '1k+', 'all']
spdx_groups = [spdx_1_10, spdx_10_100, spdx_100_1000, spdx_1000_more, spdx_all]
cyclonedx_groups = [cyclonedx_1_10, cyclonedx_10_100, cyclonedx_100_1000, cyclonedx_1000_more, cyclonedx_all]


def _compare_formats(label, spdx_sample, cyclonedx_sample):
    """Return (label, p-value, Cliff's delta, magnitude label) for one range.

    The p-value comes from a two-sided Mann-Whitney U test; delta and its
    magnitude label come from ``cliffs_delta``.
    """
    _, p = mannwhitneyu(spdx_sample, cyclonedx_sample, alternative='two-sided')
    delta, magnitude = cd.cliffs_delta(spdx_sample, cyclonedx_sample)
    return (label, p, delta, magnitude)


results = [
    _compare_formats(label, spdx_sample, cyclonedx_sample)
    for label, spdx_sample, cyclonedx_sample in zip(groups, spdx_groups, cyclonedx_groups)
]

# Bonferroni correction: divide the significance level by the number of tests.
alpha = 0.05
bonferroni_alpha = alpha / len(results)

corrected_results = []
for group, p_value, cliff_delta, res in results:
    is_significant = p_value < bonferroni_alpha
    corrected_results.append((group, p_value, cliff_delta, res, is_significant))

# Tabulate the outcome and print it as LaTeX source.
result_columns = ['Group', 'p-value', "Cliff's delta", 'Effect size', 'Significant']
df_results = pd.DataFrame(corrected_results, columns=result_columns)
print(df_results.to_latex(index=False, float_format='%.3f'))

\begin{tabular}{lrrlr}
\toprule
Group & p-value & Cliff's delta & Effect size & Significant \\
\midrule
1-10 & 0.000 & 0.084 & negligible & True \\
10-100 & 0.423 & -0.031 & negligible & False \\
100-1k & 0.387 & -0.058 & negligible & False \\
1k+ & 0.710 & 0.102 & negligible & False \\
all & 0.000 & 0.116 & negligible & True \\
\bottomrule
\end{tabular}



In [21]:
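# NOTE: this entire cell is commented out and does not run. The disabled code
# draws a line plot of the commit-count distribution for SPDX vs. CycloneDx and
# writes it to 'commit_count_distribution.pdf'.
# import pandas as pd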
# import plotly.graph_objects as go
# import plotly.io as pio
# import pickle

# from collections import Counter
    
# with open('commit_counts_by_repo.pkl', 'rb') as f:
#     commit_count_repos = pickle.load(f)

# df = pd.read_csv('CycloneNSpdxTools.csv')

# spdx_counter = Counter()
# cyclonedx_counter = Counter()
# for index, row in df.iterrows():
#     if row['Format'] == 'SPDX':
#         spdx_counter.update(v for v in commit_count_repos[row['Repo']].values())
#     elif row['Format'] == 'CycloneDx':
#         cyclonedx_counter.update(v for v in commit_count_repos[row['Repo']].values())

# # Sort the counts for plotting
# spdx_x, spdx_y = zip(*sorted(spdx_counter.items()))
# cyclonedx_x, cyclonedx_y = zip(*sorted(cyclonedx_counter.items()))

# # Create the line plot
# fig = go.Figure()

# fig.add_trace(go.Scatter(
#     x=spdx_x,
#     y=spdx_y,
#     mode='lines+markers',
#     name='SPDX',
#     line=dict(color='blue')
# ))

# fig.add_trace(go.Scatter(
#     x=cyclonedx_x,
#     y=cyclonedx_y,
#     mode='lines+markers',
#     name='CycloneDx',
#     line=dict(color='red')
# ))

# # Update layout
# fig.update_layout(
#     # title='Commit Count Distribution for SPDX and CycloneDx',
#     # xaxis=dict(title='Number of Commits per Contributor (log scale)', type='log'),
#     xaxis=dict(title='Number of Commits per Contributor'),
#     yaxis=dict(title='Number of Contributors (log scale)', type='log'),
#     legend_title='Format',
#     margin=dict(l=0, r=0, t=0, b=0),
#     legend=dict(
#         title='Format',
#         x=0.8,
#         y=0.8,
#         bgcolor='rgba(255, 255, 255, 0.5)',
#         bordercolor='black',
#         borderwidth=1
#     )
# )

# pio.write_image(fig, 'commit_count_distribution.pdf', format='pdf')

# # Show the plot
# fig.show()

In [14]:
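# NOTE: this entire cell is commented out and does not run. The disabled code
# draws a split violin plot (log-scale y-axis) of commit counts greater than 1
# for SPDX vs. CycloneDx.
# import pandas as pd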
# import plotly.graph_objects as go
# import plotly.io as pio
# import pickle
# from collections import Counter

# # Load data from files
# with open('commit_counts_by_repo.pkl', 'rb') as f:
#     commit_count_repos = pickle.load(f)

# df = pd.read_csv('CycloneNSpdxTools.csv')

# # Initialize counters
# spdx_counter = Counter()
# cyclonedx_counter = Counter()

# # Update counters based on the dataframe
# for index, row in df.iterrows():
#     if row['Format'] == 'SPDX':
#         spdx_counter.update(v for v in commit_count_repos[row['Repo']].values() if v > 1)
#     elif row['Format'] == 'CycloneDx':
#         cyclonedx_counter.update(v for v in commit_count_repos[row['Repo']].values() if v > 1)

# # Prepare data for plotting
# spdx_data = [count for count in spdx_counter.elements()]
# cyclonedx_data = [count for count in cyclonedx_counter.elements()]

# # Create the unified violin plot
# fig = go.Figure()

# fig.add_trace(go.Violin(
#     y=spdx_data,
#     x=['Commit Counts'] * len(spdx_data),
#     box_visible=True,
#     line_color='blue',
#     spanmode='hard',
#     side='negative',  # Set to show on the left side
#     name='SPDX'
# ))

# fig.add_trace(go.Violin(
#     y=cyclonedx_data,
#     x=['Commit Counts'] * len(cyclonedx_data),
#     box_visible=True,
#     line_color='green',
#     spanmode='hard',
#     side='positive',  # Set to show on the right side
#     name='CycloneDx'
# ))

# # Update layout for log scale
# fig.update_layout(
#     title="Unified Violin Plot of Commit Counts",
#     xaxis_title="Format",
#     yaxis_title="Commit Counts",
#     yaxis_type="log",
#     violingap=0.4,  # Adjusts the gap between violins
#     violinmode='overlay',  # Overlays the violins side by side
# )

# # Show the plot
# pio.show(fig)
