In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2022 Pen-Yuan Hsing
# SPDX-License-Identifier: AGPL-3.0-or-later

In [1]:
# Python Standard Library
import json

# External libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import Markdown as md # For inserting variables into Markdown
from plotly.subplots import make_subplots

In [2]:
# Mercury widget(s)

# Repository that this report is generated for, written as "<owner>/<name>".
# Further down it is appended to "https://github.com/" to build the URL used
# to select this repository's rows from every DataFrame.
# NOTE(review): presumably this value is overridden by a Mercury widget when
# the notebook is served — confirm against the Mercury configuration.
repo_name: str = "FOSSASystems/FOSSASAT-2"

In [3]:
# Read saved data

# JSON text is UTF-8 by specification, so decode it explicitly rather than
# relying on the platform's default text encoding (which is not UTF-8 on
# every system and would garble non-ASCII commit messages or issue titles).
with open("./contrib/GitHub_repos_data.json", encoding="utf-8") as json_file:
    json_data = json.load(json_file)

In [4]:
# Start every table off as its own empty Pandas DataFrame; the loading step
# below fills them in. In order, the tables hold: repository URL and platform,
# file openness/encoding counts, file type breakdown, tags, license details,
# individual commits, and individual issues.

(
    repositories_df,
    files_editability_df,
    files_info_df,
    tags_df,
    license_df,
    commits_level_df,
    issues_level_df,
) = (pd.DataFrame() for _ in range(7))

In [5]:
# Walk through the loaded `json_data` list, in which each item describes one
# repository, and collect one plain dict per output row. Each DataFrame is then
# built exactly once from its collected records, instead of being grown with
# `pd.concat()` inside the loop (which copies all earlier rows again on every
# iteration).

# Keys copied verbatim from each repository's "files_info" entry.
files_info_fields: list = [
    "total_files",
    "ecad_files",
    "mcad_files",
    "image_files",
    "data_files",
    "document_files",
    "other_files",
    "ecad_proportion",
    "mcad_proportion",
    "image_proportion",
    "data_proportion",
    "document_proportion",
    "other_proportion",
]

# Keys copied verbatim from each commit and each issue.
commit_fields: list = ["oid", "committedDate", "messageHeadline", "commitUrl"]
issue_fields: list = [
    "number", "title", "createdAt", "updatedAt", "url", "state", "closedAt"
]

# One list of row records per DataFrame.
repositories_records: list = []
files_editability_records: list = []
files_info_records: list = []
tags_records: list = []
license_records: list = []
commits_records: list = []
issues_records: list = []

for repo in json_data:

    repository: str = repo["repository"]
    requested_data: dict = repo["requested_data"]

    #
    # Repository URL and platform
    #

    repositories_records.append(
        {"repository": repository, "platform": "GitHub"}
    )

    #
    # Files count, openness, and encoding
    #

    editability: dict = requested_data["files_editability"]
    openness: dict = editability["files_openness"]
    encoding: dict = editability["files_encoding"]

    files_editability_records.append(
        {
            "repository": repository,
            "files_count": editability["files_count"],
            "open": openness["open"],
            "closed": openness["closed"],
            "openness_other": openness["other"],
            "binary": encoding["binary"],
            "text": encoding["text"],
            "encoding_other": encoding["other"],
        }
    )

    #
    # File types breakdown
    #

    files_info: dict = requested_data["files_info"]
    files_info_records.append(
        {
            "repository": repository,
            **{field: files_info[field] for field in files_info_fields},
        }
    )

    #
    # Repository tags
    #

    # One row per tag; a repository without tags gets a single "(no tags)" row
    repo_tags_list: list = [
        tag["topic"]["name"] for tag in requested_data["tags"]
    ] or ["(no tags)"]
    tags_records.extend(
        {"repository": repository, "tag": tag} for tag in repo_tags_list
    )

    #
    # Repository license
    #

    license_data = requested_data["license"]

    # Handle situation with no license
    if license_data is None:
        license_records.append(
            {
                "repository": repository,
                "name": "none detected",
                "spdxId": "NA",
                "url": "NA",
                "permissions": "NA",
                "conditions": "NA",
                "limitations": "NA",
            }
        )
    # Otherwise, add information about license. The permission/condition/
    # limitation labels are stored as the string form of a list.
    else:
        license_records.append(
            {
                "repository": repository,
                "name": license_data["name"],
                "spdxId": license_data["spdxId"],
                "url": license_data["url"],
                "permissions": str(
                    [i["label"] for i in license_data["permissions"]]
                ),
                "conditions": str(
                    [i["label"] for i in license_data["conditions"]]
                ),
                "limitations": str(
                    [i["label"] for i in license_data["limitations"]]
                ),
            }
        )

    #
    # Commits
    #

    commits_records.extend(
        {
            "repository": repository,
            **{field: commit[field] for field in commit_fields},
        }
        for commit in requested_data["commits_level"]
    )

    #
    # Issues
    #

    issues_records.extend(
        {
            "repository": repository,
            **{field: issue[field] for field in issue_fields},
        }
        for issue in requested_data["issues_level"]
    )

# Build each DataFrame in one go. Passing `columns` fixes the column order and
# keeps the columns present even when no rows were collected, so the sorting
# and dtype conversion further down still work on empty tables.
repositories_df = pd.DataFrame(
    repositories_records,
    columns=["repository", "platform"]
)
files_editability_df = pd.DataFrame(
    files_editability_records,
    columns=[
        "repository", "files_count", "open", "closed", "openness_other",
        "binary", "text", "encoding_other"
    ]
)
files_info_df = pd.DataFrame(
    files_info_records,
    columns=["repository"] + files_info_fields
)
tags_df = pd.DataFrame(
    tags_records,
    columns=["repository", "tag"]
)
license_df = pd.DataFrame(
    license_records,
    columns=[
        "repository", "name", "spdxId", "url", "permissions", "conditions",
        "limitations"
    ]
)
commits_level_df = pd.DataFrame(
    commits_records,
    columns=["repository"] + commit_fields
)
issues_level_df = pd.DataFrame(
    issues_records,
    columns=["repository"] + issue_fields
)

In [6]:
# Put the commit and issue tables into chronological order: commits by the
# date they were committed, issues by the date they were created.

commits_level_df = commits_level_df.sort_values(by="committedDate")
issues_level_df = issues_level_df.sort_values(by="createdAt")

In [7]:
#
# Cast every DataFrame column to its intended data type
#

# dtype names are the ones listed in the pandas user guide:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#basics-dtypes

repositories_df = repositories_df.astype(
    {"repository": str, "platform": "category"}
)

# Every column besides the repository URL is a whole-number count
files_editability_df = files_editability_df.astype(
    {
        "repository": str,
        **dict.fromkeys(
            [
                "files_count",
                "open",
                "closed",
                "openness_other",
                "binary",
                "text",
                "encoding_other",
            ],
            "int",
        ),
    }
)

# `*_files` columns are counts, `*_proportion` columns are fractions
files_info_df = files_info_df.astype(
    {
        "repository": str,
        **dict.fromkeys(
            [
                "total_files",
                "ecad_files",
                "mcad_files",
                "image_files",
                "data_files",
                "document_files",
                "other_files",
            ],
            "int",
        ),
        **dict.fromkeys(
            [
                "ecad_proportion",
                "mcad_proportion",
                "image_proportion",
                "data_proportion",
                "document_proportion",
                "other_proportion",
            ],
            "float",
        ),
    }
)

tags_df = tags_df.astype(dict.fromkeys(["repository", "tag"], str))

license_df = license_df.astype(
    dict.fromkeys(
        [
            "repository",
            "name",
            "spdxId",
            "url",
            "permissions",
            "conditions",
            "limitations",
        ],
        str,
    )
)

commits_level_df = commits_level_df.astype(
    {
        **dict.fromkeys(
            ["repository", "oid", "messageHeadline", "commitUrl"], str
        ),
        "committedDate": "datetime64[ns]",
    }
)

issues_level_df = issues_level_df.astype(
    {
        **dict.fromkeys(["repository", "title", "url"], str),
        "number": "int",
        "state": "category",
        **dict.fromkeys(
            ["createdAt", "updatedAt", "closedAt"], "datetime64[ns]"
        ),
    }
)

In [8]:
# Narrow every table down to the rows of the selected repository, matching on
# the full GitHub URL built from `repo_name`.

repo_url: str = "https://github.com/" + repo_name

repositories_df = repositories_df.loc[
    repositories_df["repository"] == repo_url
]
files_editability_df = files_editability_df.loc[
    files_editability_df["repository"] == repo_url
]
files_info_df = files_info_df.loc[
    files_info_df["repository"] == repo_url
]
tags_df = tags_df.loc[
    tags_df["repository"] == repo_url
]
license_df = license_df.loc[
    license_df["repository"] == repo_url
]
commits_level_df = commits_level_df.loc[
    commits_level_df["repository"] == repo_url
]
issues_level_df = issues_level_df.loc[
    issues_level_df["repository"] == repo_url
]

# Commits history

In [9]:
# Count commits per year-month. Background reading:
# https://community.plotly.com/t/line-graph-with-monthly-data-points-over-multiple-years/12969
# https://stackoverflow.com/a/66753155/186904

# Grouping by month needs a datetime index, so index the commits by their
# commit date (the previous index is kept as a regular column).
commits_by_date_df = commits_level_df.reset_index().set_index(
    pd.DatetimeIndex(commits_level_df["committedDate"])
)

# Bucket the rows by month (https://stackoverflow.com/a/26649199/186904) ...
commits_monthly_groups = commits_by_date_df.groupby(pd.Grouper(freq="M"))

# ... and count the commit ids in each bucket into a new "commits" column
# (https://stackoverflow.com/a/57669243/186904).
commits_yearmonth_df = commits_monthly_groups.agg(
    commits=pd.NamedAgg(column="oid", aggfunc="count")
).reset_index()

In [10]:
# Draw the monthly commit counts as a line chart and display it

commits_history_plot = px.line(
    data_frame=commits_yearmonth_df,
    x="committedDate",
    y="commits",
)
commits_history_plot.show()

# Issues opened/closed

In [11]:
# Summarise the number of issues created/closed by year-month


def _count_per_month(
    frame: pd.DataFrame, date_column: str, count_name: str
) -> pd.DataFrame:
    """Count the non-missing `date_column` values of `frame` per month.

    The frame is indexed by `date_column`, grouped into monthly buckets, and
    the result is returned with the month as a regular column next to a
    column called `count_name` holding the counts.
    """
    by_date = frame.reset_index().set_index(
        pd.DatetimeIndex(frame[date_column])
    )
    monthly = by_date.groupby(pd.Grouper(freq="M")).agg(
        **{count_name: pd.NamedAgg(column=date_column, aggfunc="count")}
    )
    return monthly.reset_index()


issues_created_yearmonth_df = _count_per_month(
    issues_level_df, "createdAt", "created"
)

issues_closed_yearmonth_df = _count_per_month(
    issues_level_df, "closedAt", "closed"
)

In [12]:
# Draw issues opened and issues closed per month as two lines on one figure.
# On overlaying several traces in a single graph, see:
# https://stackoverflow.com/q/60372991/186904

issues_history_plot = go.Figure()

# (monthly table, date column, count column, legend label) for each line
issues_traces = (
    (issues_created_yearmonth_df, "createdAt", "created", "Opened"),
    (issues_closed_yearmonth_df, "closedAt", "closed", "Closed"),
)

for monthly_df, date_column, count_column, legend_label in issues_traces:
    issues_history_plot.add_trace(
        go.Scatter(
            x=monthly_df[date_column],
            y=monthly_df[count_column],
            name=legend_label
        )
    )

issues_history_plot.show()

# Tags

In [13]:
# Show the repository's tags as a single comma-separated string. Joining the
# tags directly (instead of printing the list and stripping "[", "]" and "'"
# from its text form) leaves any such characters inside a tag untouched.
", ".join(tags_df["tag"])

'(no tags)'

# File type breakdown

In [14]:
# Create pie chart of the repository's file types

# Pie slice label -> `files_info_df` column holding that file type's count
file_type_columns: dict = {
    "ECAD": "ecad_files",
    "MCAD": "mcad_files",
    "image": "image_files",
    "data": "data_files",
    "document": "document_files",
    "other": "other_files"
}

files_info_plot = go.Figure(
    data=[
        go.Pie(
            labels=list(file_type_columns),
            # `files_info_df` is expected to hold exactly one row (the selected
            # repository). `Series.item()` extracts that single value and
            # raises if there is not exactly one; calling `int()` directly on
            # a Series is deprecated in pandas.
            values=[
                int(files_info_df[column].item())
                for column in file_type_columns.values()
            ],
            hole=0.3
        )
    ]
)

# NOTE(review): only four colours are listed for six slices — confirm the
# colouring of the remaining two slices is as intended.
colors: list = ['gold', 'mediumturquoise', 'darkorange', 'lightgreen']

files_info_plot.update_traces(
    hoverinfo="label+percent",
    textinfo="value",
    textfont_size=14,
    marker=dict(
        colors=colors,
        line=dict(color='#000000', width=2)
    )
)

files_info_plot.show()

# File editability breakdown

In [15]:
# Plot two donut charts side-by-side via `domain` subplots:
# https://plotly.com/python/pie-charts/
# https://stackoverflow.com/a/47591495/186904

files_editability_plot = make_subplots(
    rows=1,
    cols=2,
    specs=[[{"type": "domain"}, {"type": "domain"}]]
)

# `files_editability_df` is expected to hold exactly one row (the selected
# repository). `Series.item()` extracts that single value and raises if there
# is not exactly one; calling `int()` directly on a Series is deprecated in
# pandas.

# Left chart: open vs. closed file formats
files_editability_plot.add_trace(
    go.Pie(
        labels=["open", "closed", "other"],
        values=[
            int(files_editability_df[column].item())
            for column in ("open", "closed", "openness_other")
        ],
        name="Open/closed format"
    ),
    1, 1 # Row and column of this pie chart within the figure
)

# Right chart: binary vs. text file encoding
files_editability_plot.add_trace(
    go.Pie(
        labels=["binary", "text", "other"],
        values=[
            int(files_editability_df[column].item())
            for column in ("binary", "text", "encoding_other")
        ],
        name="Encoding"
    ),
    1, 2
)

# Shared styling for both charts
files_editability_plot.update_traces(
    hoverinfo="label+value",
    textinfo="label+percent",
    textfont_size=14,
    marker=dict(
        line=dict(color='#000000', width=2)
    ),
    hole=0.3
)

files_editability_plot.show()

# License information

In [16]:
# Pull the license name and URL out of the first row of `license_df`.
# On reading a single value from a DataFrame:
# https://stackoverflow.com/a/53577162/186904
license_name: str = license_df["name"].iat[0]
license_url: str = license_df["url"].iat[0]

# Two license names get a friendlier heading and a different link target:
# "Other" links to the repository itself, and "none detected" links to an
# explanation of what having no license means.
if license_name == "Other":
    license_name = "(other)"
    license_url = license_df["repository"].iat[0]
elif license_name == "none detected":
    license_name = "(no license detected)"
    license_url = "https://choosealicense.com/no-permission/"

# Render the result as Markdown with the variables filled in:
# https://stackoverflow.com/a/57023238/186904
license_md = f"""
## {license_name}

### Click [here]({license_url}) for more.
"""
md(license_md)


## GNU General Public License v3.0

### Click [here](http://choosealicense.com/licenses/gpl-3.0/) for more.
