# Project Git Metrics for Landscape Analysis

Project git metrics for software landscape analysis related to Cytomining ecosystem.

## Setup

Set an environment variable named `LANDSCAPE_ANALYSIS_GH_TOKEN` to a [GitHub access token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens). E.g.: `export LANDSCAPE_ANALYSIS_GH_TOKEN=token_here`

In [1]:
import os
from datetime import datetime

import pandas as pd
import pytz
from box import Box
from github import Auth, Github

# set github authorization and client
github_client = Github(
    auth=Auth.Token(os.environ.get("LANDSCAPE_ANALYSIS_GH_TOKEN")), per_page=100
)
# get the current datetime
tz = pytz.timezone("UTC")
current_datetime = datetime.now(tz)

In [2]:
# gather projects data
projects = Box.from_yaml(filename="data/projects.yaml").projects

# check the number of projects
print("number of projects: ", len(projects))

number of projects:  1239


In [3]:
# show the keys available for the projects
projects[0].keys()

dict_keys(['category', 'homepage_url', 'name', 'repo_url'])

In [4]:
def try_to_detect_license(repo):
    """
    Tries to detect the license from GitHub API
    """

    try:
        return repo.get_license().license.spdx_id
    except:
        return None

In [5]:
def try_to_gather_commit_count(repo):
    """
    Tries to detect commit count of repo from GitHub API
    """

    try:
        return len(list(repo.get_commits()))
    except:
        return 0

In [6]:
def try_to_gather_most_recent_commit_date(repo):
    """
    Tries to detect most recent commit date of repo from GitHub API
    """

    try:
        return repo.pushed_at.replace(tzinfo=pytz.UTC)
    except:
        return None

In [7]:
df_projects = pd.DataFrame(
    # create a list of repo data records for a dataframe
    [
        {
            "Project Name": repo.name,
            "GitHub Repository ID": repo.id,
            "Project Homepage": repo.homepage,
            "Project Repo URL": repo.html_url,
            "Project Landscape Category": project.category,
            "GitHub Stars": repo.stargazers_count,
            "GitHub Forks": repo.forks_count,
            "GitHub Subscribers": repo.subscribers_count,
            "GitHub Open Issues": repo.get_issues(state="open").totalCount,
            "GitHub Contributors": repo.get_contributors().totalCount,
            "GitHub License Type": try_to_detect_license(repo),
            "GitHub Description": repo.description,
            "GitHub Topics": repo.topics,
            # gather org name if it exists
            "GitHub Organization": repo.organization.login
            if repo.organization
            else None,
            "GitHub Network Count": repo.network_count,
            "GitHub Detected Languages": repo.get_languages(),
            "Date Created": repo.created_at.replace(tzinfo=pytz.UTC),
            "Date Most Recent Commit": try_to_gather_most_recent_commit_date(repo),
            # placeholders for later datetime calculations
            "Duration Created to Most Recent Commit": "",
            "Duration Created to Now": "",
            "Duration Most Recent Commit to Now": "",
            "Repository Size (KB)": repo.size,
            "GitHub Repo Archived": repo.archived,
        }
        # make a request for github repo data with pygithub
        for project, repo in [
            (
                project,
                github_client.get_repo(
                    project.repo_url.replace("https://github.com/", "")
                ),
            )
            for project in projects
        ]
    ]
)

# show the result
df_projects

Following Github server redirection from /repos/theislab/scanpy to /repositories/80342493
Following Github server redirection from /repos/YosefLab/scvi-tools to /repositories/102567256
Following Github server redirection from /repos/shenorrLab/bseqsc to /repositories/62131343


Unnamed: 0,Project Name,GitHub Repository ID,Project Homepage,Project Repo URL,Project Landscape Category,GitHub Stars,GitHub Forks,GitHub Subscribers,GitHub Open Issues,GitHub Contributors,...,GitHub Organization,GitHub Network Count,GitHub Detected Languages,Date Created,Date Most Recent Commit,Duration Created to Most Recent Commit,Duration Created to Now,Duration Most Recent Commit to Now,Repository Size (KB),GitHub Repo Archived
0,pycytominer,195106954,https://pycytominer.readthedocs.io,https://github.com/cytomining/pycytominer,"[loi-focus, cytomining-ecosystem]",52,32,6,83,22,...,cytomining,32,"{'Python': 373578, 'Jupyter Notebook': 16489, ...",2019-07-03 18:22:51+00:00,2023-10-11 13:58:57+00:00,,,,721073,False
1,CytoSnake,459692818,https://cytosnake.readthedocs.io,https://github.com/WayScience/CytoSnake,"[loi-focus, cytomining-ecosystem]",3,3,0,35,3,...,WayScience,3,{'Python': 132842},2022-02-15 18:02:45+00:00,2023-09-01 23:10:17+00:00,,,,780,False
2,CytoTable,534282624,https://cytomining.github.io/CytoTable/,https://github.com/cytomining/CytoTable,"[loi-focus, cytomining-ecosystem]",3,4,4,41,4,...,cytomining,4,{'Python': 154533},2022-09-08 15:46:25+00:00,2023-10-13 13:46:51+00:00,,,,6829,False
3,IDR_stream,523111118,,https://github.com/WayScience/IDR_stream,"[loi-focus, cytomining-ecosystem]",4,2,2,2,2,...,WayScience,2,"{'Jupyter Notebook': 311010, 'Python': 88583}",2022-08-09 21:16:48+00:00,2023-02-24 22:08:54+00:00,,,,37026,False
4,pandas,858127,https://pandas.pydata.org,https://github.com/pandas-dev/pandas,[cytomining-ecosystem-relevant-open-source],40031,16810,1121,3656,411,...,pandas-dev,16810,"{'Python': 20324063, 'Cython': 1277513, 'HTML'...",2010-08-24 01:37:33+00:00,2023-10-16 16:52:20+00:00,,,,334787,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234,Vaccine-associated-enhanced-respiratory-pathol...,492803234,,https://github.com/Berlin-Hamster-Single-Cell-...,[related-tools-github-query-result],1,0,0,1,1,...,Berlin-Hamster-Single-Cell-Consortium,0,{'R': 47789},2022-05-16 11:21:45+00:00,2022-07-17 10:27:24+00:00,,,,60,False
1235,Marmoset_FetalBrain_singlecellRNAseq,484170652,,https://github.com/parulvarma123/Marmoset_Feta...,[related-tools-github-query-result],1,0,1,0,1,...,,0,{'R': 18458},2022-04-21 18:56:18+00:00,2022-04-21 22:53:17+00:00,,,,7,False
1236,Myotome-volume-Nucleus-count-and-Color-analysis,482789058,,https://github.com/peggyscshu/Myotome-volume-N...,[related-tools-github-query-result],1,0,1,6,1,...,,0,"{'ImageJ Macro': 17729, 'Python': 807}",2022-04-18 09:28:35+00:00,2022-04-27 08:42:53+00:00,,,,103,False
1237,thymus_spatial_atlas,634860333,,https://github.com/Teichlab/thymus_spatial_atlas,[related-tools-github-query-result],1,0,0,0,1,...,Teichlab,0,"{'Jupyter Notebook': 165406342, 'Python': 1268...",2023-05-01 11:53:13+00:00,2023-09-22 13:04:47+00:00,,,,395652,False


In [8]:
# calculate time deltas
df_projects["Duration Created to Most Recent Commit"] = (
    df_projects["Date Most Recent Commit"] - df_projects["Date Created"]
)
df_projects["Duration Created to Now"] = current_datetime - df_projects["Date Created"]
df_projects["Duration Most Recent Commit to Now"] = (
    current_datetime - df_projects["Date Most Recent Commit"]
)

# create a years count for project time duration
df_projects["Duration Created to Now in Years"] = (
    df_projects["Duration Created to Now"].dt.days / 365
)

# show the result
df_projects

Unnamed: 0,Project Name,GitHub Repository ID,Project Homepage,Project Repo URL,Project Landscape Category,GitHub Stars,GitHub Forks,GitHub Subscribers,GitHub Open Issues,GitHub Contributors,...,GitHub Network Count,GitHub Detected Languages,Date Created,Date Most Recent Commit,Duration Created to Most Recent Commit,Duration Created to Now,Duration Most Recent Commit to Now,Repository Size (KB),GitHub Repo Archived,Duration Created to Now in Years
0,pycytominer,195106954,https://pycytominer.readthedocs.io,https://github.com/cytomining/pycytominer,"[loi-focus, cytomining-ecosystem]",52,32,6,83,22,...,32,"{'Python': 373578, 'Jupyter Notebook': 16489, ...",2019-07-03 18:22:51+00:00,2023-10-11 13:58:57+00:00,1560 days 19:36:06,1565 days 22:59:56.559263,5 days 03:23:50.559263,721073,False,4.287671
1,CytoSnake,459692818,https://cytosnake.readthedocs.io,https://github.com/WayScience/CytoSnake,"[loi-focus, cytomining-ecosystem]",3,3,0,35,3,...,3,{'Python': 132842},2022-02-15 18:02:45+00:00,2023-09-01 23:10:17+00:00,563 days 05:07:32,607 days 23:20:02.559263,44 days 18:12:30.559263,780,False,1.663014
2,CytoTable,534282624,https://cytomining.github.io/CytoTable/,https://github.com/cytomining/CytoTable,"[loi-focus, cytomining-ecosystem]",3,4,4,41,4,...,4,{'Python': 154533},2022-09-08 15:46:25+00:00,2023-10-13 13:46:51+00:00,399 days 22:00:26,403 days 01:36:22.559263,3 days 03:35:56.559263,6829,False,1.104110
3,IDR_stream,523111118,,https://github.com/WayScience/IDR_stream,"[loi-focus, cytomining-ecosystem]",4,2,2,2,2,...,2,"{'Jupyter Notebook': 311010, 'Python': 88583}",2022-08-09 21:16:48+00:00,2023-02-24 22:08:54+00:00,199 days 00:52:06,432 days 20:05:59.559263,233 days 19:13:53.559263,37026,False,1.183562
4,pandas,858127,https://pandas.pydata.org,https://github.com/pandas-dev/pandas,[cytomining-ecosystem-relevant-open-source],40031,16810,1121,3656,411,...,16810,"{'Python': 20324063, 'Cython': 1277513, 'HTML'...",2010-08-24 01:37:33+00:00,2023-10-16 16:52:20+00:00,4801 days 15:14:47,4801 days 15:45:14.559263,0 days 00:30:27.559263,334787,False,13.153425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1234,Vaccine-associated-enhanced-respiratory-pathol...,492803234,,https://github.com/Berlin-Hamster-Single-Cell-...,[related-tools-github-query-result],1,0,0,1,1,...,0,{'R': 47789},2022-05-16 11:21:45+00:00,2022-07-17 10:27:24+00:00,61 days 23:05:39,518 days 06:01:02.559263,456 days 06:55:23.559263,60,False,1.419178
1235,Marmoset_FetalBrain_singlecellRNAseq,484170652,,https://github.com/parulvarma123/Marmoset_Feta...,[related-tools-github-query-result],1,0,1,0,1,...,0,{'R': 18458},2022-04-21 18:56:18+00:00,2022-04-21 22:53:17+00:00,0 days 03:56:59,542 days 22:26:29.559263,542 days 18:29:30.559263,7,False,1.484932
1236,Myotome-volume-Nucleus-count-and-Color-analysis,482789058,,https://github.com/peggyscshu/Myotome-volume-N...,[related-tools-github-query-result],1,0,1,6,1,...,0,"{'ImageJ Macro': 17729, 'Python': 807}",2022-04-18 09:28:35+00:00,2022-04-27 08:42:53+00:00,8 days 23:14:18,546 days 07:54:12.559263,537 days 08:39:54.559263,103,False,1.495890
1237,thymus_spatial_atlas,634860333,,https://github.com/Teichlab/thymus_spatial_atlas,[related-tools-github-query-result],1,0,0,0,1,...,0,"{'Jupyter Notebook': 165406342, 'Python': 1268...",2023-05-01 11:53:13+00:00,2023-09-22 13:04:47+00:00,144 days 01:11:34,168 days 05:29:34.559263,24 days 04:18:00.559263,395652,False,0.460274


In [9]:
# filter the results
df_projects = df_projects[
    # filter projects which are < 50 KB
    df_projects["Repository Size (KB)"]
    >= 50
    # filter projects which have been archived
    & ~df_projects["GitHub Repo Archived"]
][  # filter projects which have no detected programming languages
    df_projects["GitHub Detected Languages"].str.len()
    > 0
    # Drop duplicates based on github repository id
].drop_duplicates(
    subset="GitHub Repository ID"
)
df_projects.tail()

Unnamed: 0,Project Name,GitHub Repository ID,Project Homepage,Project Repo URL,Project Landscape Category,GitHub Stars,GitHub Forks,GitHub Subscribers,GitHub Open Issues,GitHub Contributors,...,GitHub Network Count,GitHub Detected Languages,Date Created,Date Most Recent Commit,Duration Created to Most Recent Commit,Duration Created to Now,Duration Most Recent Commit to Now,Repository Size (KB),GitHub Repo Archived,Duration Created to Now in Years
1232,GeneExpressMap,925122,,https://github.com/coreyflynn/GeneExpressMap,[related-tools-github-query-result],1,0,2,0,1,...,0,{'Objective-C': 166172},2010-09-20 14:58:40+00:00,2011-06-21 13:44:00+00:00,273 days 22:45:20,4774 days 02:24:07.559263,4500 days 03:38:47.559263,1052,False,13.079452
1234,Vaccine-associated-enhanced-respiratory-pathol...,492803234,,https://github.com/Berlin-Hamster-Single-Cell-...,[related-tools-github-query-result],1,0,0,1,1,...,0,{'R': 47789},2022-05-16 11:21:45+00:00,2022-07-17 10:27:24+00:00,61 days 23:05:39,518 days 06:01:02.559263,456 days 06:55:23.559263,60,False,1.419178
1235,Marmoset_FetalBrain_singlecellRNAseq,484170652,,https://github.com/parulvarma123/Marmoset_Feta...,[related-tools-github-query-result],1,0,1,0,1,...,0,{'R': 18458},2022-04-21 18:56:18+00:00,2022-04-21 22:53:17+00:00,0 days 03:56:59,542 days 22:26:29.559263,542 days 18:29:30.559263,7,False,1.484932
1236,Myotome-volume-Nucleus-count-and-Color-analysis,482789058,,https://github.com/peggyscshu/Myotome-volume-N...,[related-tools-github-query-result],1,0,1,6,1,...,0,"{'ImageJ Macro': 17729, 'Python': 807}",2022-04-18 09:28:35+00:00,2022-04-27 08:42:53+00:00,8 days 23:14:18,546 days 07:54:12.559263,537 days 08:39:54.559263,103,False,1.49589
1237,thymus_spatial_atlas,634860333,,https://github.com/Teichlab/thymus_spatial_atlas,[related-tools-github-query-result],1,0,0,0,1,...,0,"{'Jupyter Notebook': 165406342, 'Python': 1268...",2023-05-01 11:53:13+00:00,2023-09-22 13:04:47+00:00,144 days 01:11:34,168 days 05:29:34.559263,24 days 04:18:00.559263,395652,False,0.460274


In [10]:
# negate this duration value for sorting descendingly,
# with projects that have been more recently changed sorting to the top
df_projects["Negative Duration Most Recent Commit to Now"] = -df_projects[
    "Duration Most Recent Commit to Now"
]
df_projects = df_projects.sort_values(
    by=[
        "GitHub Stars",
        "GitHub Subscribers",
        "GitHub Contributors",
        "GitHub Forks",
        "GitHub Open Issues",
        "Negative Duration Most Recent Commit to Now",
        "Duration Created to Most Recent Commit",
    ],
    ascending=False,
)
df_projects

Unnamed: 0,Project Name,GitHub Repository ID,Project Homepage,Project Repo URL,Project Landscape Category,GitHub Stars,GitHub Forks,GitHub Subscribers,GitHub Open Issues,GitHub Contributors,...,GitHub Detected Languages,Date Created,Date Most Recent Commit,Duration Created to Most Recent Commit,Duration Created to Now,Duration Most Recent Commit to Now,Repository Size (KB),GitHub Repo Archived,Duration Created to Now in Years,Negative Duration Most Recent Commit to Now
4,pandas,858127,https://pandas.pydata.org,https://github.com/pandas-dev/pandas,[cytomining-ecosystem-relevant-open-source],40031,16810,1121,3656,411,...,"{'Python': 20324063, 'Cython': 1277513, 'HTML'...",2010-08-24 01:37:33+00:00,2023-10-16 16:52:20+00:00,4801 days 15:14:47,4801 days 15:45:14.559263,0 days 00:30:27.559263,334787,False,13.153425,-1 days +23:29:32.440737
5,numpy,908607,https://numpy.org,https://github.com/numpy/numpy,[cytomining-ecosystem-relevant-open-source],24741,8647,595,2193,435,...,"{'Python': 10458450, 'C': 6220071, 'C++': 2057...",2010-09-13 23:02:39+00:00,2023-10-16 14:25:39+00:00,4780 days 15:23:00,4780 days 18:20:08.559263,0 days 02:57:08.559263,131902,False,13.095890,-1 days +21:02:51.440737
14,arrow,51905353,https://arrow.apache.org/,https://github.com/apache/arrow,[cytomining-ecosystem-relevant-open-source],12614,3096,351,3908,367,...,"{'C++': 26864997, 'Java': 7353737, 'Go': 56198...",2016-02-17 08:00:23+00:00,2023-10-16 17:18:03+00:00,2798 days 09:17:40,2798 days 09:22:24.559263,0 days 00:04:44.559263,171101,False,7.665753,-1 days +23:55:15.440737
16,duckdb,138754790,http://www.duckdb.org,https://github.com/duckdb/duckdb,[cytomining-ecosystem-relevant-open-source],12393,1157,157,309,253,...,"{'C++': 33576510, 'C': 1761733, 'Python': 1407...",2018-06-26 15:04:45+00:00,2023-10-16 14:58:09+00:00,1937 days 23:53:24,1938 days 02:18:02.559263,0 days 02:24:38.559263,227662,False,5.309589,-1 days +21:35:21.440737
15,parquet-mr,20675636,,https://github.com/apache/parquet-mr,[cytomining-ecosystem-relevant-open-source],2179,1332,95,130,190,...,"{'Java': 5923431, 'Shell': 14860, 'Python': 14...",2014-06-10 07:00:07+00:00,2023-10-16 15:19:17+00:00,3415 days 08:19:10,3415 days 10:22:40.559263,0 days 02:03:30.559263,18475,False,9.356164,-1 days +21:56:29.440737
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151,ImagingCells,137526283,,https://github.com/jesnyder/ImagingCells,[related-tools-github-query-result],0,0,1,0,1,...,{'Jupyter Notebook': 1523},2018-06-15 19:50:00+00:00,2018-08-31 19:21:33+00:00,76 days 23:31:33,1948 days 21:32:47.559263,1871 days 22:01:14.559263,1,False,5.336986,-1872 days +01:58:45.440737
224,course-bia,119301640,,https://github.com/denzf/course-bia,[related-tools-github-query-result],0,0,1,0,1,...,{'Python': 6010},2018-01-28 21:58:13+00:00,2018-01-24 03:22:19+00:00,-5 days +05:24:06,2086 days 19:24:34.559263,2091 days 14:00:28.559263,203,False,5.715068,-2092 days +09:59:31.440737
156,Cell-virulence-Detection-using-Image-Processing,163268436,,https://github.com/arushigupta148/Cell-virulen...,[related-tools-github-query-result],0,0,0,0,1,...,{'Python': 12989},2018-12-27 08:27:06+00:00,2019-05-12 23:09:02+00:00,136 days 14:41:56,1754 days 08:55:41.559263,1617 days 18:13:45.559263,1751,False,4.805479,-1618 days +05:46:14.440737
160,Image-analysis,152904377,,https://github.com/dguin/Image-analysis,[related-tools-github-query-result],0,0,0,0,1,...,{'MATLAB': 40670},2018-10-13 18:53:42+00:00,2018-10-13 19:32:22+00:00,0 days 00:38:40,1828 days 22:29:05.559263,1828 days 21:50:25.559263,26,False,5.008219,-1829 days +02:09:34.440737


In [11]:
# export to parquet for later use
df_projects.to_parquet("data/project-github-metrics.parquet")