# Setup cell

In [None]:
import logging
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
from dotenv import load_dotenv

# Project-local modules
import gitea_forgejo
import github
import gitlab
import platform_analysis
from git_operations import compare_git_clone_speed
from metrics import Metrics
from platform_analysis import get_most_present_owner, get_unique_owner_number
from platforms import Platforms

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load environment variables from .env file
load_dotenv()

# Load tokens from environment variables (os.getenv returns None when unset)
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
GITLAB_TOKEN = os.getenv("GITLAB_TOKEN")
GITEA_TOKEN = os.getenv("GITEA_TOKEN")
FORGEJO_TOKEN = os.getenv("FORGEJO_TOKEN")

# Warn early about missing tokens: an unset variable would otherwise be
# formatted silently into the header below as the literal text "None".
# Only a warning is emitted so that a subset of platforms can still be used.
for _token_name, _token_value in (
    ("GITHUB_TOKEN", GITHUB_TOKEN),
    ("GITLAB_TOKEN", GITLAB_TOKEN),
    ("GITEA_TOKEN", GITEA_TOKEN),
    ("FORGEJO_TOKEN", FORGEJO_TOKEN),
):
    if not _token_value:
        logger.warning(
            "%s is not set; requests for this platform will carry an invalid token.",
            _token_name,
        )

# Set up headers for API requests
GITHUB_HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}
GITEA_HEADERS = {"Authorization": f"token {GITEA_TOKEN}"}
FORGEJO_HEADERS = {"Authorization": f"token {FORGEJO_TOKEN}"}
GITLAB_HEADERS = {"Authorization": f"Bearer {GITLAB_TOKEN}"}

# Initialize platform instances (Gitea and Forgejo share one client class and
# differ only by the headers they are given)
github_instance = github.GitHub(GITHUB_HEADERS)
gitea_instance = gitea_forgejo.GiteaForgejo(GITEA_HEADERS)
forgejo_instance = gitea_forgejo.GiteaForgejo(FORGEJO_HEADERS)
gitlab_instance = gitlab.Gitlab(GITLAB_HEADERS)

# Demonstration Code

### Selecting repositories from GitHub

In [None]:
# Request 50 repositories from the GitHub client
data = github_instance.fetch_repositories(50)
file_name = "temp.csv"

# A falsy result is reported as an error; anything else is written out as CSV
if not data:
    logger.error("No repositories were fetched.")
else:
    df = pd.DataFrame(data)
    df.to_csv(file_name, index=False)
    logger.info(f"Saved repositories to '{file_name}'.")

### Randomly picking clonable repositories and adding Git metrics

In [None]:
# The selection is written back to the same file it is read from
input_file = "temp.csv"
output_file = "temp.csv"

# Load the csv file that contains the repositories.
# Only the input is read: nothing in this cell needs the previous content of
# the output file.
df = pd.read_csv(input_file)

# Select clonable repositories from the DataFrame (called with Platforms.GITHUB and 10)
df = github_instance.select_clonable_repositories(df, Platforms.GITHUB, 10)

# Save updated CSV
df.to_csv(output_file, index=False)
logger.info(f"Done! Updated file saved as {output_file}")

### Adding platform specific metrics to the dataframe

In [None]:
# Read from and write back to the same CSV
input_file = output_file = "temp.csv"

# Repositories gathered by the earlier demonstration cells
df = pd.read_csv(input_file)

# The return value of add_metric is discarded, so the call presumably
# updates `df` in place (TODO confirm against the platform class)
github_instance.add_metric(df, Metrics.ISSUE)

# Persist the table and report where it went
df.to_csv(output_file, index=False)
logger.info(f"Done! Updated file saved as {output_file}")

### Plotting the distribution of the selected metric

In [None]:
# Read the four platform tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Plot the distribution of Metrics.COMMIT for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.COMMIT
)

### Performing Propensity Score Matching

In [None]:
# Read the four platform tables (only the GitHub and GitLab ones are used below)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Run the matching between the GitHub and GitLab tables and create the plot.
# TODO (translated from French): add the creation dates — note that "created"
# already appears in the column list below.
platform_analysis.propensity_score_matching(
    df_github,
    df_gitlab,
    "GitHub",
    Metrics.COMMIT.value,
    ["#branches", "#contributors", "size", "created"],
    caliper=0.00001,
)

# Experimental Code

In [None]:
# Time a 5000-repository fetch.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; unlike time.time() it is not affected by system clock changes.
start_time = time.perf_counter()  # Record start time

github_instance.fetch_repositories(5000)

end_time = time.perf_counter()  # Record end time
duration = end_time - start_time  # Elapsed seconds

print(f"Function executed in {duration:.2f} seconds.")

In [None]:
# The benchmark reads and rewrites the same CSV
input_file = output_file = "speed_test.csv"

df = pd.read_csv(input_file)

# The return value is discarded, so the helper presumably records its
# results on `df` itself (TODO confirm in git_operations)
compare_git_clone_speed(df, Platforms.GITHUB)

# Write the table back to disk
df.to_csv(output_file, index=False)

In [None]:
# NOTE(review): this cell used to read "temp.csv", but the benchmark cell
# right above saves its table to "speed_test.csv", which is presumably where
# the clone timing columns live — confirm.
input_file = "speed_test.csv"

df = pd.read_csv(input_file)

# Fail with a clear message instead of dividing by zero on an empty table
total = len(df)
if total == 0:
    raise ValueError(f"'{input_file}' contains no rows; nothing to compare.")

# Rows where the shallow clone was strictly faster. Ties and rows with a
# missing timing compare as False and therefore count as "full clone".
faster_method = df['shallow_clone_time'] < df['full_clone_time']

shallow_faster = faster_method.sum()
full_faster = total - shallow_faster

percentages = {
    'Shallow clone': (shallow_faster / total) * 100,
    'Clone complet': (full_faster / total) * 100
}

# The figure is saved under Figures/; create the folder if it is missing
os.makedirs('Figures', exist_ok=True)

fig, ax = plt.subplots()
ax.bar(list(percentages.keys()), list(percentages.values()), color=['skyblue', 'salmon'])
ax.set_ylabel('Pourcentage de dépôts (%)')
ax.set_ylim(0, 100)
fig.savefig('Figures/clone_speed.png', dpi=300, bbox_inches='tight')
plt.show()

# Total time spent per cloning strategy
print("Somme des temps de clonage :")
print(df[['shallow_clone_time', 'full_clone_time']].sum())

# Fetching Repositories from Github

In [None]:
# Fetch a given number of GitHub repositories
data = github_instance.fetch_repositories(5000)
file_name = "Github_data/github_5k_repositories.csv"

# Save the gathered data to a CSV file
if data:
    # DataFrame.to_csv does not create missing folders; make sure the target
    # directory exists so the result of this long fetch is not lost.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    df = pd.DataFrame(data)
    df.to_csv(file_name, index=False)
    logger.info(f"Saved repositories to '{file_name}'.")
else:
    logger.error("No repositories were fetched.")

### Select Clonable Github repositories and add Git metrics to the dataframe

In [None]:
# Fetched repositories in, selected subset out
input_file = "Github_data/github_5k_repositories.csv"
output_file = "Github_data/github_repositories_metrics.csv"

# Read the fetched list and keep whatever the selection step returns
# (called with Platforms.GITHUB and 1000)
df = github_instance.select_clonable_repositories(
    pd.read_csv(input_file), Platforms.GITHUB, 1000
)

# Persist the selection and report where it went
df.to_csv(output_file, index=False)
logger.info(f"Done! Updated file saved as {output_file}")

### Add Platform specific metrics to the dataframe

In [None]:
# Read from and write back to the same CSV
input_file = output_file = "Github_data/github_repositories_metrics.csv"

# Table produced by the selection step
df = pd.read_csv(input_file)

# The return value of add_metric is discarded, so the call presumably
# updates `df` in place (TODO confirm against the platform class)
github_instance.add_metric(df, Metrics.ISSUE)

# Persist the table and report where it went
df.to_csv(output_file, index=False)
logger.info(f"Done! Updated file saved as {output_file}")

# Fetching Repositories from Gitea

In [None]:
# Fetch a given number of Gitea repositories
data = gitea_instance.fetch_repositories(1000, platform=Platforms.GITEA)
file_name = "Gitea_data/gitea_1k_repositories.csv"

# Save the gathered data to a CSV file
if data:
    # DataFrame.to_csv does not create missing folders; make sure the target
    # directory exists so the result of this fetch is not lost.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    df = pd.DataFrame(data)
    df.to_csv(file_name, index=False)
    logger.info(f"Saved repositories to '{file_name}'.")
else:
    logger.error("No repositories were fetched.")

### Select Clonable Gitea repositories and add Git metrics to the dataframe

In [None]:
# NOTE(review): the Gitea fetch cell writes "Gitea_data/gitea_1k_repositories.csv",
# not the 2k file read here — confirm which file is intended.
input_file = "Gitea_data/gitea_2k_repositories.csv"
output_file = "Gitea_data/gitea_repositories_metrics.csv"

# Load the csv file that contains the repositories.
# Only the input is read: the output file does not have to exist before the
# first run of this cell.
df = pd.read_csv(input_file)

# Select clonable repositories (called with Platforms.GITEA and 1000)
df = gitea_instance.select_clonable_repositories(df, Platforms.GITEA, 1000)

# Save updated CSV
df.to_csv(output_file, index=False)

logger.info(f"Done! Updated file saved as {output_file}")

### Add Platform specific metrics to the dataframe

In [None]:
# Read from and write back to the same CSV
input_file = output_file = "Gitea_data/gitea_repositories_metrics.csv"

# Table produced by the selection step
df = pd.read_csv(input_file)

# Two metrics are added one after the other; the return values are discarded,
# so add_metric presumably updates `df` in place (TODO confirm)
gitea_instance.add_metric(df, Platforms.GITEA, Metrics.ISSUE)
gitea_instance.add_metric(df, Platforms.GITEA, Metrics.PULL_REQUEST)

# Persist the table and report where it went
df.to_csv(output_file, index=False)
logger.info(f"Done! Updated file saved as {output_file}")

# Fetching Repositories from Forgejo

In [None]:
# Fetch a given number of Forgejo repositories
data = forgejo_instance.fetch_repositories(5000, platform=Platforms.FORGEJO)
file_name = "Forgejo_data/forgejo_5k_repositories.csv"

# Save the gathered data to a CSV file
if data:
    # DataFrame.to_csv does not create missing folders; make sure the target
    # directory exists so the result of this long fetch is not lost.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    df = pd.DataFrame(data)
    df.to_csv(file_name, index=False)
    logger.info(f"Saved repositories to '{file_name}'.")
else:
    logger.error("No repositories were fetched.")

### Select Clonable Forgejo repositories and add Git metrics to the dataframe

In [None]:
# Fetched repositories in, selected subset out
input_file = "Forgejo_data/forgejo_5k_repositories.csv"
output_file = "Forgejo_data/forgejo_repositories_metrics.csv"

# Read the fetched list and keep whatever the selection step returns
# (called with Platforms.FORGEJO and 1000)
df = forgejo_instance.select_clonable_repositories(
    pd.read_csv(input_file), Platforms.FORGEJO, 1000
)

# Persist the selection and report where it went
df.to_csv(output_file, index=False)
logger.info(f"Done! Updated file saved as {output_file}")

### Add Platform specific metrics to the dataframe

In [None]:
# Read from and write back to the same CSV
input_file = output_file = "Forgejo_data/forgejo_repositories_metrics.csv"

# Table produced by the selection step
df = pd.read_csv(input_file)

# Two metrics are added one after the other; the return values are discarded,
# so add_metric presumably updates `df` in place (TODO confirm)
forgejo_instance.add_metric(df, Platforms.FORGEJO, Metrics.ISSUE)
forgejo_instance.add_metric(df, Platforms.FORGEJO, Metrics.PULL_REQUEST)

# Persist the table and report where it went
df.to_csv(output_file, index=False)
logger.info(f"Done! Updated file saved as {output_file}")

# Fetching Repositories from Gitlab

In [None]:
# Fetch a given number of Gitlab repositories
data = gitlab_instance.fetch_repositories(5000)
file_name = "Gitlab_data/gitlab_5k_repositories.csv"

# Save the gathered data to a CSV file
if data:
    # DataFrame.to_csv does not create missing folders; make sure the target
    # directory exists so the result of this long fetch is not lost.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    df = pd.DataFrame(data)
    df.to_csv(file_name, index=False)
    logger.info(f"Saved repositories to '{file_name}'.")
else:
    logger.error("No repositories were fetched.")

### Select Clonable GitLab repositories and add Git metrics to the dataframe

In [None]:
# Fetched repositories in, selected subset out
input_file = "Gitlab_data/gitlab_5k_repositories.csv"
output_file = "Gitlab_data/gitlab_repositories_metrics.csv"

# Read the fetched list and keep whatever the selection step returns
# (called with Platforms.GITLAB and 1000)
df = gitlab_instance.select_clonable_repositories(
    pd.read_csv(input_file), Platforms.GITLAB, 1000
)

# Persist the selection and report where it went
df.to_csv(output_file, index=False)
logger.info(f"Done! Updated file saved as {output_file}")

### Adding Platform specific metrics to the dataframe

In [None]:
# Read from and write back to the same CSV
input_file = output_file = "Gitlab_data/gitlab_repositories_metrics.csv"

# Table produced by the selection step
df = pd.read_csv(input_file)

# The return value of add_metric is discarded, so the call presumably
# updates `df` in place (TODO confirm against the platform class)
gitlab_instance.add_metric(df, Metrics.ISSUE)

# Persist the table and report where it went
df.to_csv(output_file, index=False)
logger.info(f"Done! Updated file saved as {output_file}")

# Analysis of the data

In [None]:
# Read the four "30days" tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_30days_repositories.csv",
        "Gitlab_data/gitlab_30days_repositories.csv",
        "Gitea_data/gitea_30days_repositories.csv",
        "Forgejo_data/forgejo_30days_repositories.csv",
    ),
)

# Owner statistics over the four tables
get_most_present_owner(df_github, df_gitlab, df_gitea, df_forgejo)
get_unique_owner_number(df_github, df_gitlab, df_gitea, df_forgejo)

In [None]:
# Read the four "15days" tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_15days_repositories.csv",
        "Gitlab_data/gitlab_15days_repositories.csv",
        "Gitea_data/gitea_15days_repositories.csv",
        "Forgejo_data/forgejo_15days_repositories.csv",
    ),
)

# Owner statistics over the four tables
get_most_present_owner(df_github, df_gitlab, df_gitea, df_forgejo)
get_unique_owner_number(df_github, df_gitlab, df_gitea, df_forgejo)

In [None]:
# Read the four "7days" tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_7days_repositories.csv",
        "Gitlab_data/gitlab_7days_repositories.csv",
        "Gitea_data/gitea_7days_repositories.csv",
        "Forgejo_data/forgejo_7days_repositories.csv",
    ),
)

# Owner statistics over the four tables
get_most_present_owner(df_github, df_gitlab, df_gitea, df_forgejo)
get_unique_owner_number(df_github, df_gitlab, df_gitea, df_forgejo)

### Plotting the creation date of the repositories

In [None]:
# Read the four "with_date_bias" tables (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics_with_date_bias.csv",
        "Gitlab_data/gitlab_repositories_metrics_with_date_bias.csv",
        "Gitea_data/gitea_repositories_metrics_with_date_bias.csv",
        "Forgejo_data/forgejo_repositories_metrics_with_date_bias.csv",
    )
]

# Step-line plot of Metrics.CREATED for the four tables
platform_analysis.plot_step_lines(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.CREATED
)

In [None]:
# Read the four "30days" tables (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_30days_repositories.csv",
        "Gitlab_data/gitlab_30days_repositories.csv",
        "Gitea_data/gitea_30days_repositories.csv",
        "Forgejo_data/forgejo_30days_repositories.csv",
    )
]

# Step-line plot of Metrics.CREATED for the four tables
platform_analysis.plot_step_lines(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.CREATED
)

In [None]:
# Read the four "15days" tables (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_15days_repositories.csv",
        "Gitlab_data/gitlab_15days_repositories.csv",
        "Gitea_data/gitea_15days_repositories.csv",
        "Forgejo_data/forgejo_15days_repositories.csv",
    )
]

# Step-line plot of Metrics.CREATED for the four tables
platform_analysis.plot_step_lines(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.CREATED
)

In [None]:
# Read the four "7days" tables (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_7days_repositories.csv",
        "Gitlab_data/gitlab_7days_repositories.csv",
        "Gitea_data/gitea_7days_repositories.csv",
        "Forgejo_data/forgejo_7days_repositories.csv",
    )
]

# Step-line plot of Metrics.CREATED for the four tables
platform_analysis.plot_step_lines(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.CREATED
)

### Plotting the last update date of the repositories

In [None]:
# Read the four "30days" tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_30days_repositories.csv",
        "Gitlab_data/gitlab_30days_repositories.csv",
        "Gitea_data/gitea_30days_repositories.csv",
        "Forgejo_data/forgejo_30days_repositories.csv",
    ),
)

# Step-line plot of Metrics.UPDATED for the four tables
platform_analysis.plot_step_lines(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.UPDATED
)

In [None]:
# Read the four "15days" tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_15days_repositories.csv",
        "Gitlab_data/gitlab_15days_repositories.csv",
        "Gitea_data/gitea_15days_repositories.csv",
        "Forgejo_data/forgejo_15days_repositories.csv",
    ),
)

# Step-line plot of Metrics.UPDATED for the four tables
platform_analysis.plot_step_lines(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.UPDATED
)

In [None]:
# Read the four "7days" tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_7days_repositories.csv",
        "Gitlab_data/gitlab_7days_repositories.csv",
        "Gitea_data/gitea_7days_repositories.csv",
        "Forgejo_data/forgejo_7days_repositories.csv",
    ),
)

# Step-line plot of Metrics.UPDATED for the four tables
platform_analysis.plot_step_lines(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.UPDATED
)

### Checking the distribution of the alphanumeric metrics

In [None]:
# Read the four metrics tables (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Note the argument order: this plotter takes the metric first, then the tables
platform_analysis.plot_alphanumeric_distribution(
    Metrics.MAIN_LANGUAGE, df_github, df_gitlab, df_gitea, df_forgejo
)

In [None]:
# Read the four metrics tables (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Note the argument order: this plotter takes the metric first, then the tables
platform_analysis.plot_alphanumeric_distribution(
    Metrics.LANGUAGE_DISTRIBUTION, df_github, df_gitlab, df_gitea, df_forgejo
)

In [None]:
# Read the four metrics tables (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Note the argument order: this plotter takes the metric first, then the tables
platform_analysis.plot_alphanumeric_distribution(
    Metrics.LICENSE, df_github, df_gitlab, df_gitea, df_forgejo
)

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.COMMIT for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.COMMIT
)

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.SIZE for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.SIZE
)

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.CONTRIBUTOR for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.CONTRIBUTOR
)

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.BRANCH for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.BRANCH
)

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.ISSUE for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.ISSUE
)

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.PULL_REQUEST for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.PULL_REQUEST
)

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.CREATED for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.CREATED
)

# Correlation between our metrics

In [None]:
# Read the four metrics tables (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# One correlation matrix per platform, each preceded by a text label.
# The calls stay as separate top-level statements so the notebook displays
# the last one exactly as before.
print("GITHUB")
platform_analysis.create_correlation_matrix(df_github)

print("GITLAB")
platform_analysis.create_correlation_matrix(df_gitlab)

print("GITEA")
platform_analysis.create_correlation_matrix(df_gitea)

print("FORGEJO")
platform_analysis.create_correlation_matrix(df_forgejo)

### Lorenz curves to visualize how much the biggest repositories weigh in the dataset

In [None]:
# Read the four metrics tables (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# One Lorenz curve per platform, built from the '#commits' column and
# preceded by a text label. The calls stay as separate top-level statements
# so the notebook displays the last one exactly as before.
print("GITHUB")
platform_analysis.plot_lorenz_curve(df_github['#commits'])

print("GITLAB")
platform_analysis.plot_lorenz_curve(df_gitlab['#commits'])

print("GITEA")
platform_analysis.plot_lorenz_curve(df_gitea['#commits'])

print("FORGEJO")
platform_analysis.plot_lorenz_curve(df_forgejo['#commits'])

# Statistical analysis using propensity score matching

### Age comparison

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.CREATED for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.CREATED
)

In [None]:
# Read the four metrics tables (only the Gitea and Forgejo ones are used below)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Run the matching between the Gitea and Forgejo tables on Metrics.CREATED
# and create the plot
platform_analysis.propensity_score_matching(
    df_gitea,
    df_forgejo,
    "Gitea",
    Metrics.CREATED.value,
    ["#branches", "#commits", "#contributors", "size"],
    caliper=0.0001,
)

### Commit count comparison

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.COMMIT for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.COMMIT
)

In [None]:
# Load the GitLab dataset; it is the only table this cell analyses
df_gitlab = pd.read_csv("Gitlab_data/gitlab_repositories_metrics.csv")

# Keep only the rows where #commits > 10000
filtered_df = df_gitlab.loc[df_gitlab['#commits'] > 10000]

# Summary statistics of #contributors among those rows
# (all three are NaN when no row passes the filter)
contributors = filtered_df['#contributors']
max_contributors = contributors.max()
mean_contributors = contributors.mean()
median_contributors = contributors.median()

# Display the results
print(f"length of filtered_df: {len(filtered_df)}")
print(f"Max #contributors: {max_contributors}")
print(f"Mean #contributors: {mean_contributors}")
print(f"Median #contributors: {median_contributors}")


In [None]:
# Read the four metrics tables (only the GitHub and GitLab ones are used below)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Run the matching between the GitHub and GitLab tables on Metrics.COMMIT
# and create the plot
platform_analysis.propensity_score_matching(
    df_github,
    df_gitlab,
    "GitHub",
    Metrics.COMMIT.value,
    ["#branches", "#contributors", "size", "created"],
    caliper=0.00001,
)

### Contributor count comparison

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.CONTRIBUTOR for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.CONTRIBUTOR
)

In [None]:
# Read the four metrics tables (only the Gitea and Forgejo ones are used below)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Run the matching between the Gitea and Forgejo tables on Metrics.CONTRIBUTOR
# and create the plot.
# TODO (translated from French): add the creation dates — note that "created"
# already appears in the column list below.
platform_analysis.propensity_score_matching(
    df_gitea,
    df_forgejo,
    "Gitea",
    Metrics.CONTRIBUTOR.value,
    ["#branches", "#commits", "size", "created"],
    caliper=0.001,
)

### Size comparison

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.SIZE for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.SIZE
)

In [None]:
# Read the four metrics tables (only the Gitea and Forgejo ones are used below)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Run the matching between the Gitea and Forgejo tables on Metrics.SIZE
# and create the plot
platform_analysis.propensity_score_matching(
    df_gitea,
    df_forgejo,
    "Gitea",
    Metrics.SIZE.value,
    ["#branches", "#commits", "#contributors", "created"],
    caliper=0.1,
)

### Stars Comparison

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.STAR for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.STAR
)

In [None]:
# Read the four metrics tables (only the Gitea and Forgejo ones are used below)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Run the matching between the Gitea and Forgejo tables on Metrics.STAR
# and create the plot
platform_analysis.propensity_score_matching(
    df_gitea,
    df_forgejo,
    "Gitea",
    Metrics.STAR.value,
    ["#pull_requests", "#issues", "#forks"],
    caliper=0.01,
)

### Issues Comparison

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.ISSUE for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.ISSUE
)

In [None]:
# Read the four metrics tables (only the Gitea and Forgejo ones are used below)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Run the matching between the Gitea and Forgejo tables on Metrics.ISSUE
# and create the plot
platform_analysis.propensity_score_matching(
    df_gitea,
    df_forgejo,
    "Gitea",
    Metrics.ISSUE.value,
    ["#pull_requests", "#stars", "#forks"],
    caliper=0.001,
)

### Pull requests comparison

In [None]:
# Read the four metrics tables in one pass (order: GitHub, GitLab, Gitea, Forgejo)
df_github, df_gitlab, df_gitea, df_forgejo = map(
    pd.read_csv,
    (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    ),
)

# Distribution plot of Metrics.PULL_REQUEST for the four tables
platform_analysis.plot_numeric_distribution(
    df_github, df_gitlab, df_gitea, df_forgejo, Metrics.PULL_REQUEST
)

In [None]:
# Read the four metrics tables (only the Gitea and Forgejo ones are used below)
df_github, df_gitlab, df_gitea, df_forgejo = [
    pd.read_csv(csv_path)
    for csv_path in (
        "Github_data/github_repositories_metrics.csv",
        "Gitlab_data/gitlab_repositories_metrics.csv",
        "Gitea_data/gitea_repositories_metrics.csv",
        "Forgejo_data/forgejo_repositories_metrics.csv",
    )
]

# Run the matching between the Gitea and Forgejo tables on Metrics.PULL_REQUEST
# and create the plot
platform_analysis.propensity_score_matching(
    df_gitea,
    df_forgejo,
    "Gitea",
    Metrics.PULL_REQUEST.value,
    ["#issues", "#stars", "#forks"],
    caliper=0.001,
)