# Project Git Metrics for Landscape Analysis

Project git metrics for a software landscape analysis of the Cytomining ecosystem.

## Setup

Set an environment variable named `LANDSCAPE_ANALYSIS_GH_TOKEN` to a [GitHub access token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/managing-your-personal-access-tokens). E.g.: `export LANDSCAPE_ANALYSIS_GH_TOKEN=token_here`

In [1]:
import os
from datetime import datetime

import pandas as pd
import pytz
from box import Box
from github import Auth, Github

# set github authorization and client
# read the token up front and fail with an actionable message when it is
# missing, instead of handing None to Auth.Token
gh_token = os.environ.get("LANDSCAPE_ANALYSIS_GH_TOKEN")
if not gh_token:
    raise EnvironmentError(
        "Environment variable LANDSCAPE_ANALYSIS_GH_TOKEN must be set "
        "to a GitHub access token."
    )
github_client = Github(auth=Auth.Token(gh_token), per_page=100)
# get the current datetime (timezone-aware, UTC) for later duration calculations
tz = pytz.timezone("UTC")
current_datetime = datetime.now(tz)

In [2]:
# load the project records from the landscape YAML file
projects_config = Box.from_yaml(filename="data/projects.yaml")
projects = projects_config.projects

# report how many projects were loaded and what they are named
project_count = len(projects)
project_names = [entry["name"] for entry in projects]
print("number of projects: ", project_count)
print("project names: ", project_names)

number of projects:  66
project names:  ['pycytominer', 'cyosnake', 'cytotable', 'single-cell-classifier', 'CellTypeClassification', 'scDINO', 'CellSegmenter', 'paper-bray2017', 'LIBPB-1390-Image3C', 'u-shape3D', 'CellWalker', 'CellularComposition', 'CellQuant', 'CellMorphology', 'Image-segmentation-and-feature-extraction-', 'CellEmbeddings', 'pheno-ml', 'leafnet', 'analyzing-leaf-morphology', 'cellwalker', 'Aging-Cell-Morphology-Cell-transformations-and-image-processing', 'Image-Analysis-of-Cellulose-Nanocrystals', 'segmorph', 'ImagingCells', 'Cell-Classifier', 'CellWalker_old', 'CV-processing-cells-in-image', 'G_CELLIQ_IMAGE_PROCESSING_TOOL', 'Table-recognition', 'Objects-counting-on-images', 'SynapseMech', 'vampire-analysis', 'Cell-segmentation-and-feature-extraction', 'Cell-virulence-Detection-using-Image-Processing', 'NeuroGAN', 'Nuclei-Detection-and-Counting', 'drug-induced_morphology_changes_quantification', 'Blood-Cancer-Detection-from-Image-dataset-', 'SpotAnalizer', 'Image-an

In [3]:
# inspect which fields a project record provides,
# using the first project as the example
first_project = projects[0]
first_project.keys()

dict_keys(['homepage_url', 'name', 'repo_url', 'tags'])

In [4]:
def try_to_detect_license(repo):
    """
    Tries to detect the license from GitHub API

    Args:
        repo: Repository object providing `get_license()`, whose result
            exposes `.license.spdx_id`.

    Returns:
        The SPDX identifier of the detected license, or None when the
        lookup fails with an ordinary error.
    """

    try:
        return repo.get_license().license.spdx_id
    # best-effort lookup: any ordinary error means "no license detected",
    # while KeyboardInterrupt / SystemExit are allowed to propagate
    except Exception:
        return None

In [5]:
def _collect_repo_metrics(repo):
    """
    Builds one dataframe record of GitHub metrics for a single repository.

    Args:
        repo: Repository object as returned by `github_client.get_repo`.

    Returns:
        A dict mapping column names to values for this repository.
    """

    # request the commit listing once and reuse it for both the commit
    # count and the first listed commit
    commits = repo.get_commits()

    return {
        "Project Name": repo.name,
        "Project Homepage": repo.homepage,
        "Project Repo URL": repo.html_url,
        # ask the paginated listing for its total rather than materializing
        # every commit into a list just to count them
        "Commit Count": commits.totalCount,
        "GitHub Stars": repo.stargazers_count,
        "GitHub Forks": repo.forks_count,
        "GitHub Watchers": repo.subscribers_count,
        "GitHub Open Issues": repo.get_issues(state="open").totalCount,
        "GitHub Contributors": repo.get_contributors().totalCount,
        "GitHub License Type": try_to_detect_license(repo),
        "GitHub Detected Languages": repo.get_languages(),
        "Date Created": repo.created_at.replace(tzinfo=pytz.UTC),
        # the first listed commit is treated as the most recent one
        "Date Most Recent Commit": commits[0].commit.author.date.replace(
            tzinfo=pytz.UTC
        ),
        # placeholders which fix the column positions; the real values are
        # calculated once the dataframe exists
        "Duration Created to Most Recent Commit": "",
        "Duration Most Recent Commit to Now": "",
        "Repository Size (KB)": repo.size,
        "GitHub Repo Archived": repo.archived,
    }


df_projects = pd.DataFrame(
    # create a list of repo data records for a dataframe
    [
        _collect_repo_metrics(
            # make a request for github repo data with pygithub
            github_client.get_repo(project.repo_url.replace("https://github.com/", ""))
        )
        for project in projects
    ]
)

# calculate time deltas
df_projects["Duration Created to Most Recent Commit"] = (
    df_projects["Date Most Recent Commit"] - df_projects["Date Created"]
)
df_projects["Duration Most Recent Commit to Now"] = (
    current_datetime - df_projects["Date Most Recent Commit"]
)

# show the result
df_projects

Unnamed: 0,Project Name,Project Homepage,Project Repo URL,Commit Count,GitHub Stars,GitHub Forks,GitHub Watchers,GitHub Open Issues,GitHub Contributors,GitHub License Type,GitHub Detected Languages,Date Created,Date Most Recent Commit,Duration Created to Most Recent Commit,Duration Most Recent Commit to Now,Repository Size (KB),GitHub Repo Archived
0,pycytominer,https://pycytominer.readthedocs.io,https://github.com/cytomining/pycytominer,598,52,32,6,85,22,BSD-3-Clause,"{'Python': 373578, 'Jupyter Notebook': 16489, ...",2019-07-03 18:22:51+00:00,2023-10-03 17:40:10+00:00,1552 days 23:17:19,7 days 05:25:37.082138,720941,False
1,CytoSnake,https://cytosnake.readthedocs.io,https://github.com/WayScience/CytoSnake,178,3,3,0,35,3,CC-BY-4.0,{'Python': 132842},2022-02-15 18:02:45+00:00,2023-09-01 23:09:07+00:00,563 days 05:06:22,38 days 23:56:40.082138,780,False
2,CytoTable,https://cytomining.github.io/CytoTable/,https://github.com/cytomining/CytoTable,114,3,4,4,42,4,BSD-3-Clause,{'Python': 157082},2022-09-08 15:46:25+00:00,2023-10-06 14:01:20+00:00,392 days 22:14:55,4 days 09:04:27.082138,6817,False
3,single-cell-classifier,,https://github.com/cytodata/single-cell-classi...,201,6,7,14,15,4,CC-BY-4.0,"{'HTML': 886029, 'Jupyter Notebook': 77142, 'P...",2019-11-04 13:42:58+00:00,2020-02-12 19:01:09+00:00,100 days 05:18:11,1336 days 04:04:38.082138,45769,False
4,CellTypeClassification,https://github.com/sxslabjhu/CellTypeClassific...,https://github.com/sxslabjhu/CellTypeClassific...,75,16,1,1,0,1,MIT,{'MATLAB': 6099},2019-01-26 23:16:25+00:00,2019-03-28 13:45:46+00:00,60 days 14:29:21,1657 days 09:20:01.082138,2236,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,BioMorph_Space,,https://github.com/srijitseal/BioMorph_Space,15,0,0,1,0,1,MIT,{'Jupyter Notebook': 9775608},2023-07-14 15:09:09+00:00,2023-10-10 19:38:39+00:00,88 days 04:29:30,0 days 03:27:08.082138,3659,False
62,Blood-Cell-Characterization,,https://github.com/Jeevan-J/Blood-Cell-Charact...,23,0,0,1,0,1,GPL-3.0,{},2019-03-15 04:41:45+00:00,2019-03-15 09:59:50+00:00,0 days 05:18:05,1670 days 13:05:57.082138,1181,False
63,cytominer,,https://github.com/cran/cytominer,3,0,0,3,0,1,NOASSERTION,{'R': 69830},2017-09-17 19:27:03+00:00,2020-05-09 04:00:03+00:00,964 days 08:33:00,1249 days 19:05:44.082138,462,False
64,history,,https://github.com/mhowerton91/history,3,10,6,13,0,2,,{},2017-06-25 12:38:34+00:00,2021-01-02 07:15:46+00:00,1286 days 18:37:12,1011 days 15:50:01.082138,12,False


In [6]:
# filter the results
df_projects = df_projects[
    # filter projects which are < 50 KB
    df_projects["Repository Size (KB)"]
    >= 50
    # filter projects which have been archived
    & ~df_projects["GitHub Repo Archived"]
]
df_projects.tail()

Unnamed: 0,Project Name,Project Homepage,Project Repo URL,Commit Count,GitHub Stars,GitHub Forks,GitHub Watchers,GitHub Open Issues,GitHub Contributors,GitHub License Type,GitHub Detected Languages,Date Created,Date Most Recent Commit,Duration Created to Most Recent Commit,Duration Most Recent Commit to Now,Repository Size (KB),GitHub Repo Archived
61,BioMorph_Space,,https://github.com/srijitseal/BioMorph_Space,15,0,0,1,0,1,MIT,{'Jupyter Notebook': 9775608},2023-07-14 15:09:09+00:00,2023-10-10 19:38:39+00:00,88 days 04:29:30,0 days 03:27:08.082138,3659,False
62,Blood-Cell-Characterization,,https://github.com/Jeevan-J/Blood-Cell-Charact...,23,0,0,1,0,1,GPL-3.0,{},2019-03-15 04:41:45+00:00,2019-03-15 09:59:50+00:00,0 days 05:18:05,1670 days 13:05:57.082138,1181,False
63,cytominer,,https://github.com/cran/cytominer,3,0,0,3,0,1,NOASSERTION,{'R': 69830},2017-09-17 19:27:03+00:00,2020-05-09 04:00:03+00:00,964 days 08:33:00,1249 days 19:05:44.082138,462,False
64,history,,https://github.com/mhowerton91/history,3,10,6,13,0,2,,{},2017-06-25 12:38:34+00:00,2021-01-02 07:15:46+00:00,1286 days 18:37:12,1011 days 15:50:01.082138,12,False
65,A,,https://github.com/chikitang/A,1,6,3,2,1,1,,{},2022-06-08 04:21:46+00:00,2022-06-08 04:21:47+00:00,0 days 00:00:01,489 days 18:44:00.082138,44,False


In [7]:
# add a negated copy of this duration so that a single descending sort
# places projects which changed more recently nearer the top;
# `assign` returns a new frame instead of writing a column into a
# previously filtered dataframe
df_projects = df_projects.assign(
    **{
        "Negative Duration Most Recent Commit to Now": -df_projects[
            "Duration Most Recent Commit to Now"
        ]
    }
).sort_values(
    by=[
        "GitHub Stars",
        "GitHub Watchers",
        "GitHub Contributors",
        "GitHub Forks",
        "GitHub Open Issues",
        "Negative Duration Most Recent Commit to Now",
        "Duration Created to Most Recent Commit",
    ],
    ascending=False,
)
df_projects

Unnamed: 0,Project Name,Project Homepage,Project Repo URL,Commit Count,GitHub Stars,GitHub Forks,GitHub Watchers,GitHub Open Issues,GitHub Contributors,GitHub License Type,GitHub Detected Languages,Date Created,Date Most Recent Commit,Duration Created to Most Recent Commit,Duration Most Recent Commit to Now,Repository Size (KB),GitHub Repo Archived,Negative Duration Most Recent Commit to Now
0,pycytominer,https://pycytominer.readthedocs.io,https://github.com/cytomining/pycytominer,598,52,32,6,85,22,BSD-3-Clause,"{'Python': 373578, 'Jupyter Notebook': 16489, ...",2019-07-03 18:22:51+00:00,2023-10-03 17:40:10+00:00,1552 days 23:17:19,7 days 05:25:37.082138,720941,False,-8 days +18:34:22.917862
52,cytominer,https://cytomining.github.io/cytominer/,https://github.com/cytomining/cytominer,445,43,28,7,6,8,NOASSERTION,{'R': 110992},2015-07-20 22:38:45+00:00,2023-06-29 20:02:36+00:00,2900 days 21:23:51,103 days 03:03:11.082138,246366,False,-104 days +20:56:48.917862
51,2022_Haghighi_NatureMethods,,https://github.com/carpenter-singh-lab/2022_Ha...,71,40,7,3,6,3,BSD-3-Clause,"{'Jupyter Notebook': 579210, 'Python': 50652, ...",2021-06-15 19:41:17+00:00,2023-06-21 00:13:02+00:00,735 days 04:31:45,111 days 22:52:45.082138,111691,False,-112 days +01:07:14.917862
7,paper-bray2017,,https://github.com/gigascience/paper-bray2017,13,32,15,4,1,2,,{'Shell': 2969},2016-12-13 06:39:31+00:00,2023-07-05 14:18:21+00:00,2395 days 07:38:50,97 days 08:47:26.082138,865,False,-98 days +15:12:33.917862
9,u-shape3D,https://www.nature.com/articles/s41592-019-0539-z,https://github.com/DanuserLab/u-shape3D,24,23,2,3,4,3,GPL-3.0,{'MATLAB': 1853673},2019-07-05 16:48:41+00:00,2023-06-20 22:33:21+00:00,1446 days 05:44:40,112 days 00:32:26.082138,34748,False,-113 days +23:27:33.917862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,Blood-Cell-Characterization,,https://github.com/Jeevan-J/Blood-Cell-Charact...,23,0,0,1,0,1,GPL-3.0,{},2019-03-15 04:41:45+00:00,2019-03-15 09:59:50+00:00,0 days 05:18:05,1670 days 13:05:57.082138,1181,False,-1671 days +10:54:02.917862
13,CellMorphology,,https://github.com/KnightofDawn/CellMorphology,36,0,0,1,0,1,,{'Python': 65906},2018-09-11 04:25:09+00:00,2018-09-10 20:41:27+00:00,-1 days +16:16:18,1856 days 02:24:20.082138,161,False,-1857 days +21:35:39.917862
23,ImagingCells,,https://github.com/jesnyder/ImagingCells,2,0,0,1,0,1,,{'Jupyter Notebook': 1523},2018-06-15 19:50:00+00:00,2018-08-31 19:21:28+00:00,76 days 23:31:28,1866 days 03:44:19.082138,1,False,-1867 days +20:15:40.917862
33,Cell-virulence-Detection-using-Image-Processing,,https://github.com/arushigupta148/Cell-virulen...,5,0,0,0,0,1,,{'Python': 12989},2018-12-27 08:27:06+00:00,2019-05-12 23:09:00+00:00,136 days 14:41:54,1611 days 23:56:47.082138,1751,False,-1612 days +00:03:12.917862
