## Scraping Code Quality Data from SonarQube


In [73]:
import requests
from collections import defaultdict
import json
import os
from dotenv import load_dotenv
import datetime
import subprocess as sp
import pandas as pd

In [2]:
# Load key/value pairs from a local .env file into the environment so that the
# os.getenv() lookups in the next cell can find the tokens and server URL.
load_dotenv()

True

In [3]:
# Credentials and the SonarQube endpoint are taken from the environment; each
# value is None when its variable is not set.
github_token = os.environ.get("GITHUB_ACCESS_TOKEN")
sonarqube_token = os.environ.get("SONARQUBE_ACCESS_TOKEN_REACT")
sonarqube_server_url = os.environ.get("SONARQUBE_SERVER_URL")

In [4]:
# Demo target used throughout the notebook: the GitHub owner, the repository
# name (also used as the SonarQube project key / component), and the path of
# the local clone in which the git and sonar-scanner commands are run.
demo_github_slug = "facebook"
demo_repo_name = "react"
demo_repo_path = "C:/SWE Class/Github Desktop/TestProjects/react"

In [5]:
# Sanity-check the configuration without echoing the secrets: cell outputs are
# saved with the notebook, so the raw tokens must never be displayed. A value
# that is unset (None) is shown as-is so a missing variable is still visible.
(
    "<redacted>" if github_token else github_token,
    "<redacted>" if sonarqube_token else sonarqube_token,
    sonarqube_server_url,
)

('<redacted>',
 '<redacted>',
 'http://localhost:9000')

In [6]:
def iso_time_to_unix_epoch(iso_time: str) -> int:
    """Convert a UTC timestamp such as ``"2022-06-08T12:34:56Z"`` to Unix seconds.

    Args:
        iso_time: Timestamp in the exact format ``%Y-%m-%dT%H:%M:%SZ``.

    Returns:
        Whole seconds since the Unix epoch.

    Raises:
        ValueError: If ``iso_time`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(iso_time, "%Y-%m-%dT%H:%M:%SZ")
    # The trailing "Z" is matched as a literal, so the parsed value is naive.
    # Attach UTC explicitly; otherwise timestamp() would interpret the value
    # in the machine's local timezone and shift the result.
    return int(parsed.replace(tzinfo=datetime.timezone.utc).timestamp())

## Git CLI Data Extraction

In [7]:
def get_tag_list(repo_path: str, max_output: "int | None" = 20) -> list[str]:
    """Return the tag names of the git repository at ``repo_path``.

    Tags are returned in the order ``git tag -l`` prints them. When there are
    more than ``max_output`` tags, only the last ``max_output`` are kept.

    Args:
        repo_path: Directory inside a git repository.
        max_output: Maximum number of tags to return. ``None`` disables the
            limit; zero or a negative number yields an empty list.

    Returns:
        The tag names, without blank entries.

    Raises:
        subprocess.CalledProcessError: If ``repo_path`` is not inside a git
            repository or a git command exits with a non-zero status.
    """
    # Fail early (check=True) when repo_path is not a git repository.
    sp.run(
        ["git", "rev-parse", "--git-dir"],
        cwd=repo_path,
        capture_output=True,
        check=True,
    )

    result = sp.run(
        ["git", "tag", "-l"],
        cwd=repo_path,
        capture_output=True,
        check=True,
    )

    # splitlines() copes with both "\n" and "\r\n" line endings.
    tag_list = [tag for tag in result.stdout.decode("utf-8").splitlines() if tag]

    if max_output is None:
        return tag_list
    if max_output <= 0:
        # tag_list[-0:] would be the whole list, not an empty one.
        return []
    return tag_list[-max_output:]

In [8]:
# Preview the tags found in the demo clone (limited by the default max_output).
get_tag_list(demo_repo_path)

['v16.6.3',
 'v16.7.0',
 'v16.7.0-alpha.1',
 'v16.8.0',
 'v16.8.0-alpha.1',
 'v16.8.1',
 'v16.8.2',
 'v16.8.3',
 'v16.8.4',
 'v16.8.5',
 'v16.8.6',
 'v16.9.0',
 'v16.9.0-alpha.0',
 'v16.9.0-rc.0',
 'v17.0.0',
 'v17.0.1',
 'v17.0.2',
 'v18.0.0',
 'v18.1.0',
 'v18.2.0']

In [9]:
def get_tag_and_timestamp(repo_path: str) -> list[tuple[str, str, int]]:
    """Pair every tag from ``get_tag_list`` with its commit date and epoch time.

    For each tag, ``git log -1`` is asked for the committer date in ``short``
    format (``%cd``) and the committer time in Unix seconds (``%ct``).

    Returns:
        ``(tag, date, timestamp)`` tuples in the order of ``get_tag_list``.

    Raises:
        subprocess.CalledProcessError: If a git command fails.
    """
    log_command = ["git", "log", "-1", "--format=%cd %ct", "--date=short"]

    def describe(tag: str) -> tuple[str, str, int]:
        completed = sp.run(
            [*log_command, tag],
            cwd=repo_path,
            capture_output=True,
            check=True,
            text=True,
        )
        # Output is "<date> <epoch>"; both parts are free of whitespace.
        commit_date, commit_epoch = completed.stdout.strip().split()
        return tag, commit_date, int(commit_epoch)

    return [describe(tag) for tag in get_tag_list(repo_path)]

In [10]:
# Resolve (tag, date, timestamp) for the demo repository once; the scan loops
# further down iterate over this list.
tags_and_timestamps = get_tag_and_timestamp(demo_repo_path)
tags_and_timestamps

[('v16.6.3', '2018-11-12', 1542074346),
 ('v16.7.0', '2018-12-14', 1544814203),
 ('v16.7.0-alpha.1', '2018-11-13', 1542137020),
 ('v16.8.0', '2019-02-05', 1549388061),
 ('v16.8.0-alpha.1', '2019-01-15', 1547592942),
 ('v16.8.1', '2019-02-06', 1549473399),
 ('v16.8.2', '2019-02-14', 1550164887),
 ('v16.8.3', '2019-02-21', 1550769698),
 ('v16.8.4', '2019-03-05', 1551827761),
 ('v16.8.5', '2019-03-22', 1553266152),
 ('v16.8.6', '2019-03-27', 1553756148),
 ('v16.9.0', '2019-08-05', 1565035265),
 ('v16.9.0-alpha.0', '2019-04-03', 1554308571),
 ('v16.9.0-rc.0', '2019-08-05', 1565035265),
 ('v17.0.0', '2020-10-20', 1603226024),
 ('v17.0.1', '2020-10-22', 1603367960),
 ('v17.0.2', '2021-03-22', 1616443295),
 ('v18.0.0', '2022-03-29', 1648570053),
 ('v18.1.0', '2022-04-26', 1651004928),
 ('v18.2.0', '2022-06-08', 1654732789)]

In [11]:
def checkout_to_tag(repo_path: str, tag: str):
    """Check out ``tag`` in the repository at ``repo_path`` and print git's report.

    Args:
        repo_path: Directory of the local clone.
        tag: Tag (or any other revision git accepts) to check out.

    Raises:
        subprocess.CalledProcessError: If ``git checkout`` fails.
    """
    result = sp.run(
        ["git", "checkout", tag],
        cwd=repo_path,
        capture_output=True,
        check=True,
        text=True,
    )
    # git reports the checkout status ("HEAD is now at ...") on stderr, so
    # printing stdout alone shows only blank lines. Print both streams.
    streams = (result.stdout.strip(), result.stderr.strip())
    print("\n".join(text for text in streams if text))

In [12]:
def sonar_scan(repo_path: str, tag: str):
    """Run ``sonar-scanner`` on ``repo_path`` and label the analysis with ``tag``.

    This function does not change the checked-out revision; call
    ``checkout_to_tag`` first. The project key, server URL and token are read
    from the module-level ``demo_repo_name``, ``sonarqube_server_url`` and
    ``sonarqube_token``.

    Args:
        repo_path: Directory to analyse; used both as the working directory
            and as ``sonar.projectBaseDir``.
        tag: Value reported to SonarQube as ``sonar.projectVersion``.

    Raises:
        subprocess.CalledProcessError: If the scanner exits with a non-zero
            status.
    """
    sp.run(
        [
            "sonar-scanner",
            "-D",
            # Use the repo_path argument, not the demo constant, so the
            # function analyses the repository it was asked to analyse.
            f"sonar.projectBaseDir={repo_path}",
            "-D",
            "sonar.projectVersion=" + tag,
            "-D",
            f"sonar.projectKey={demo_repo_name}",
            "-D",
            "sonar.sources=.",
            "-D",
            f"sonar.host.url={sonarqube_server_url}",
            "-D",
            "sonar.login=" + sonarqube_token,
        ],
        cwd=repo_path,
        capture_output=True,
        check=True,
        text=True,
        # On Windows the scanner is presumably a .bat launcher, which needs
        # the shell. On POSIX, shell=True with a list would run only the
        # first item and silently drop every -D option.
        shell=os.name == "nt",
    )

In [13]:
def get_metrics() -> dict:
    """Fetch the demo project's current measures from the SonarQube Web API.

    Queries ``/api/measures/component`` on ``sonarqube_server_url`` for the
    component ``demo_repo_name``. The request is sent without credentials.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request cannot be completed.
    """
    metric_keys = [
        "bugs",
        "vulnerabilities",
        "code_smells",
        "coverage",
        "duplicated_lines_density",
        "violations",
        "ncloc",
        "security_rating",
        "reliability_rating",
        "sqale_rating",
    ]
    # requests is already imported by the notebook; unlike curl.exe it works
    # on every platform and encodes the query string for us.
    response = requests.get(
        f"{sonarqube_server_url}/api/measures/component",
        params={"component": demo_repo_name, "metricKeys": ",".join(metric_keys)},
        timeout=60,
    )
    # Surface HTTP errors here instead of returning an error document that
    # later fails with a KeyError on "component".
    response.raise_for_status()
    return response.json()

'code_smells',
 'sqale_rating',
 'vulnerabilities',
 'security_rating',
 'duplicated_lines_density',
 'violations',
 'bugs',
 'reliability_rating',
 'coverage',
 'ncloc'

In [14]:
# Column-oriented accumulator: one (initially empty) list per output column.
# The first three columns identify the version; the rest hold metric values.
metrics_dict = {
    column: []
    for column in (
        "version",
        "date",
        "timestamp",
        "bugs",
        "vulnerabilities",
        "code_smells",
        "coverage",
        "duplicated_lines_density",
        "violations",
        "ncloc",
        "security_rating",
        "reliability_rating",
        "sqale_rating",
    )
}

In [15]:
def get_metrics_from_version_tags(
    repo_path: str, tag: str, date: str, timestamp: int, output_dict=metrics_dict
):
    """Check out ``tag``, scan it and append one row of metrics to ``output_dict``.

    Args:
        repo_path: Directory of the local clone.
        tag: Tag to check out and analyse.
        date: Date recorded in the ``date`` column.
        timestamp: Epoch time recorded in the ``timestamp`` column.
        output_dict: Mapping of column name to list that receives the row. It
            defaults to the shared module-level ``metrics_dict`` on purpose,
            so repeated calls accumulate rows in one place.

    Returns:
        ``output_dict``, after the row has been appended.
    """
    checkout_to_tag(repo_path, tag)
    sonar_scan(repo_path, tag)
    measures = {
        measure["metric"]: measure.get("value")
        for measure in get_metrics()["component"]["measures"]
    }

    rows_before = len(output_dict["version"])
    output_dict["version"].append(tag)
    output_dict["date"].append(date)
    output_dict["timestamp"].append(timestamp)

    # A metric seen for the first time gets a column padded for earlier rows.
    for name in measures:
        if name not in output_dict:
            output_dict[name] = [None] * rows_before

    # Append to every metric column, using None when the server did not report
    # that metric, so all columns keep the same length.
    for name, column in output_dict.items():
        if name not in ("version", "date", "timestamp"):
            column.append(measures.get(name))

    return output_dict

In [16]:
# Smoke test: confirm that sonar-scanner can be started by asking for its
# help text.
sp.run(
    ["sonar-scanner", "-h"],
    capture_output=True,
    check=True,
    text=True,
    # The shell is needed on Windows (presumably a .bat launcher). On POSIX,
    # shell=True with a list would run only the first item and drop "-h".
    shell=os.name == "nt",
)

CompletedProcess(args=['sonar-scanner', '-h'], returncode=0, stdout='INFO: \nINFO: usage: sonar-scanner [options]\nINFO: \nINFO: Options:\nINFO:  -D,--define <arg>     Define property\nINFO:  -h,--help             Display help information\nINFO:  -v,--version          Display version information\nINFO:  -X,--debug            Produce execution debug output\n', stderr='')

In [61]:
# Check out, scan and record metrics for every tag. Each call appends one row
# to metrics_dict, the default output_dict of get_metrics_from_version_tags.
for tag_and_timestamp in tags_and_timestamps:
    get_metrics_from_version_tags(demo_repo_path, *tag_and_timestamp)























In [62]:
# Inspect the accumulated columns.
metrics_dict

{'version': ['v16.6.3',
  'v16.7.0',
  'v16.7.0-alpha.1',
  'v16.8.0',
  'v16.8.0-alpha.1',
  'v16.8.1',
  'v16.8.2',
  'v16.8.3',
  'v16.8.4',
  'v16.8.5',
  'v16.8.6',
  'v16.9.0',
  'v16.9.0-alpha.0',
  'v16.9.0-rc.0',
  'v17.0.0',
  'v17.0.1',
  'v17.0.2',
  'v18.0.0',
  'v18.1.0',
  'v18.2.0'],
 'date': ['2018-11-12',
  '2018-12-14',
  '2018-11-13',
  '2019-02-05',
  '2019-01-15',
  '2019-02-06',
  '2019-02-14',
  '2019-02-21',
  '2019-03-05',
  '2019-03-22',
  '2019-03-27',
  '2019-08-05',
  '2019-04-03',
  '2019-08-05',
  '2020-10-20',
  '2020-10-22',
  '2021-03-22',
  '2022-03-29',
  '2022-04-26',
  '2022-06-08'],
 'timestamp': [1542074346,
  1544814203,
  1542137020,
  1549388061,
  1547592942,
  1549473399,
  1550164887,
  1550769698,
  1551827761,
  1553266152,
  1553756148,
  1565035265,
  1554308571,
  1565035265,
  1603226024,
  1603367960,
  1616443295,
  1648570053,
  1651004928,
  1654732789],
 'bugs': ['165',
  '175',
  '165',
  '195',
  '195',
  '195',
  '197',
  '19

In [63]:
# Turn the column lists into a table with one row per tag. The metric values
# are still the strings returned by the API (e.g. 'bugs': ['165', ...]).
metrics_dataset = pd.DataFrame(metrics_dict, index=None)
metrics_dataset

Unnamed: 0,version,date,timestamp,bugs,vulnerabilities,code_smells,coverage,duplicated_lines_density,violations,ncloc,security_rating,reliability_rating,sqale_rating
0,v16.6.3,2018-11-12,1542074346,165,5,1673,0.0,9.3,1843,113173,4.0,5.0,1.0
1,v16.7.0,2018-12-14,1544814203,175,3,1707,0.0,9.0,1885,115388,4.0,5.0,1.0
2,v16.7.0-alpha.1,2018-11-13,1542137020,165,5,1675,0.0,9.3,1845,113612,4.0,5.0,1.0
3,v16.8.0,2019-02-05,1549388061,195,3,1702,0.0,8.8,1900,117072,4.0,5.0,1.0
4,v16.8.0-alpha.1,2019-01-15,1547592942,195,3,1702,0.0,8.8,1900,117072,4.0,5.0,1.0
5,v16.8.1,2019-02-06,1549473399,195,3,1703,0.0,8.8,1901,117102,4.0,5.0,1.0
6,v16.8.2,2019-02-14,1550164887,197,3,1714,0.0,9.0,1914,118027,4.0,5.0,1.0
7,v16.8.3,2019-02-21,1550769698,197,3,1715,0.0,9.0,1915,118190,4.0,5.0,1.0
8,v16.8.4,2019-03-05,1551827761,195,3,1713,0.0,8.9,1911,118398,4.0,5.0,1.0
9,v16.8.5,2019-03-22,1553266152,195,3,1727,0.0,9.5,1925,120058,4.0,5.0,1.0


In [64]:
# Persist the per-version metrics without the row index. The path is relative
# to the notebook's working directory.
metrics_dataset.to_csv("../Datasets/react_metrics.csv", index=False)

## Issues Narrowing from SonarQube

In [120]:
def time_str_to_minutes(time: str, hours_per_day: int = 24) -> int:
    """Convert a duration such as ``"1d2h3min"`` into minutes.

    Each component (days, hours, minutes) is optional, but those present must
    appear in that order. Every number is scaled by its own unit, so ``"1d"``
    is a day and ``"2h"`` is two hours. A bare number without a unit is read
    as minutes. An empty string yields 0.

    Args:
        time: Duration string, e.g. ``"5min"``, ``"1h30min"``, ``"1d2h3min"``.
        hours_per_day: Number of hours one ``d`` stands for. Defaults to 24.
            NOTE(review): SonarQube effort values may be based on 8-hour
            working days; confirm before relying on day-sized values.

    Returns:
        The duration in minutes.

    Raises:
        ValueError: If ``time`` cannot be parsed.
    """

    def to_int(text: str) -> int:
        try:
            return int(text)
        except ValueError:
            raise ValueError("Invalid time string") from None

    remaining = time.strip()
    if not remaining:
        return 0

    total = 0
    units = (("d", hours_per_day * 60), ("h", 60), ("min", 1))
    for unit, minutes_per_unit in units:
        amount, found, rest = remaining.partition(unit)
        if found:
            total += to_int(amount) * minutes_per_unit
            remaining = rest.strip()

    if remaining:
        # Whatever is left carries no unit; accept it only as plain minutes.
        total += to_int(remaining)

    return total

In [121]:
# Values sent as the cleanCodeAttributeCategories filter of the issues search.
clean_code_attribute_categories = ["RESPONSIBLE", "ADAPTABLE", "CONSISTENT", "INTENTIONAL"]

# Severities compared against the first impact of each issue.
severities = ["LOW", "MEDIUM", "HIGH"]

# Software qualities an issue impact can refer to.
quality_attributes = ["RELIABILITY", "SECURITY", "MAINTAINABILITY"]

In [122]:
# One defaultdict(list) of column lists per clean code category. The outer
# mapping has no default factory, so an unknown category raises KeyError.
clean_code_attribute_dict = defaultdict(
    None,
    {category: defaultdict(list) for category in clean_code_attribute_categories},
)

clean_code_attribute_dict

defaultdict(None,
            {'RESPONSIBLE': defaultdict(list, {}),
             'ADAPTABLE': defaultdict(list, {}),
             'CONSISTENT': defaultdict(list, {}),
             'INTENTIONAL': defaultdict(list, {})})

In [123]:
def get_quality_issue_values_from_api(clean_code_category: str, severity: str):
    """Count code-smell issues of one category and severity, and sum their debt.

    Every result page of ``/api/issues/search`` is read for
    ``clean_code_category``. An issue is counted when its type is
    ``CODE_SMELL`` and its first impact has the requested ``severity``; it is
    attributed to the software quality of that first impact.

    Args:
        clean_code_category: Value for ``cleanCodeAttributeCategories``.
        severity: Impact severity to keep (e.g. ``"LOW"``, ``"MEDIUM"``,
            ``"HIGH"``).

    Returns:
        A tuple ``(total_security_issues, total_reliability_issues,
        total_maintainability_issues, total_debt)``; the debt is in minutes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If a request cannot be completed.
    """
    page_size = 500
    # NOTE(review): SonarQube is understood to refuse paging past 10,000
    # results; confirm for the server version in use.
    max_results = 10_000

    counts = {"SECURITY": 0, "RELIABILITY": 0, "MAINTAINABILITY": 0}
    total_debt = 0
    page = 1

    # NOTE(review): the query carries no project filter, so presumably issues
    # of every project on the server are included; verify this is intended.
    while True:
        response = requests.get(
            f"{sonarqube_server_url}/api/issues/search",
            params={
                "cleanCodeAttributeCategories": clean_code_category,
                "p": page,
                "ps": page_size,
            },
            timeout=60,
        )
        response.raise_for_status()
        issues = response.json()["issues"]

        for issue in issues:
            impacts = issue.get("impacts") or []
            if issue.get("type") != "CODE_SMELL" or not impacts:
                continue
            first_impact = impacts[0]
            if first_impact["severity"] != severity:
                continue

            quality = first_impact["softwareQuality"]
            if quality in counts:
                counts[quality] += 1
            # Issues without an effort estimate have no "debt" key; they
            # contribute zero minutes.
            total_debt += time_str_to_minutes(issue.get("debt", ""))

        # A short page is the last one; also stop at the server's paging cap.
        if len(issues) < page_size or page * page_size >= max_results:
            break
        page += 1

    return (
        counts["SECURITY"],
        counts["RELIABILITY"],
        counts["MAINTAINABILITY"],
        total_debt,
    )

In [124]:
# Spot check one category/severity pair against the analysis currently on the
# server.
get_quality_issue_values_from_api("CONSISTENT", "HIGH")

(0, 0, 2, 10)

In [125]:
# For each version, collect the security, reliability and maintainability
# issue counts plus the total debt, and store them in the dictionary of the
# corresponding clean code category.

for tag, date, timestamp in tags_and_timestamps:
    checkout_to_tag(demo_repo_path, tag)
    sonar_scan(demo_repo_path, tag)

    for clean_code_category in clean_code_attribute_categories:
        columns = clean_code_attribute_dict[clean_code_category]
        columns["version"].append(tag)
        columns["date"].append(date)
        columns["timestamp"].append(timestamp)

        for severity in severities:
            totals = get_quality_issue_values_from_api(clean_code_category, severity)
            # Same order as the tuple returned by the API helper.
            column_names = (
                f"security_issues_{severity.lower()}",
                f"reliability_issues_{severity.lower()}",
                f"maintainability_issues_{severity.lower()}",
                f"total_debt_{severity.lower()}",
            )
            for column_name, total in zip(column_names, totals):
                columns[column_name].append(total)























In [126]:
# Write one CSV per clean code category under ../Datasets/<repo name>/.
output_dir = f"../Datasets/{demo_repo_name}"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for clean_code_attribute, attribute_dict in clean_code_attribute_dict.items():
    dataframe = pd.DataFrame(attribute_dict, index=None)
    dataframe.to_csv(
        f"{output_dir}/{demo_repo_name}_{clean_code_attribute.lower()}_issues.csv",
        index=False,
    )