## Scraping Data from CodeClimate


In [1]:
import requests
import json
import os
from dotenv import load_dotenv
import datetime
import subprocess as sp
import pandas as pd

In [2]:
# Load variables from a local .env file into the process environment so the
# os.getenv() lookups below can find them.
load_dotenv()

True

In [5]:
# Access tokens and the SonarQube address are read from environment variables;
# os.getenv() yields None for any variable that is not set.
github_token = os.getenv("GITHUB_ACCESS_TOKEN")
sonarqube_token = os.getenv("SONARQUBE_ACCESS_TOKEN_VUE")
sonarqube_server_url = os.getenv("SONARQUBE_SERVER_URL")

In [6]:
# The repository analysed in this section.
# presumably the GitHub owner/organisation; not used in this section -- TODO confirm
demo_github_slug = "vuejs"
# Also used as the SonarQube project key / component name.
demo_repo_name = "vue"
# Local clone; passed as the working directory for the git and scanner commands.
demo_repo_path = "C:/SWE Class/Github Desktop/TestProjects/vue"

In [7]:
# Confirm both settings were loaded without echoing the secret token into the
# saved cell output (anything displayed here is stored with the notebook).
sonarqube_token is not None, sonarqube_server_url

(True, 'http://localhost:9000')

## Git CLI Data Extraction

In [8]:
def get_tag_list(repo_path: str, max_output: "int | None" = 20) -> list[str]:
    """Return the names of the git tags found in a repository.

    Tags are listed with ``git tag -l`` and kept in the order git prints
    them. That order is by name, not by version, so ``v2.7.10`` comes
    before ``v2.7.2``.

    Args:
        repo_path: Directory inside the git working tree to inspect.
        max_output: Maximum number of tags to return, taken from the end of
            git's listing. ``None`` returns every tag; ``0`` returns none.

    Returns:
        The selected tag names, with blank entries removed.

    Raises:
        ValueError: If ``max_output`` is negative.
        subprocess.CalledProcessError: If ``repo_path`` is not inside a git
            repository or a git command exits with an error.
    """
    if max_output is not None and max_output < 0:
        raise ValueError(f"max_output must be non-negative, got {max_output}")

    # Fail early (CalledProcessError) when repo_path is not a git repository.
    sp.run(
        ["git", "rev-parse", "--git-dir"],
        cwd=repo_path,
        capture_output=True,
        check=True,
    )

    result = sp.run(
        ["git", "tag", "-l"],
        cwd=repo_path,
        capture_output=True,
        check=True,
    )

    tag_list = [tag for tag in result.stdout.decode("utf-8").split("\n") if tag != ""]

    if max_output is None:
        return tag_list

    # tag_list[-max_output:] would return *everything* for max_output == 0
    # (because -0 == 0), so slice from an explicit start index instead.
    start = max(len(tag_list) - max_output, 0)
    return tag_list[start:]

In [9]:
# Preview the tags that will be analysed (at most the last 20 of git's listing).
get_tag_list(demo_repo_path)

['v2.7.0-beta.4',
 'v2.7.0-beta.5',
 'v2.7.0-beta.6',
 'v2.7.0-beta.7',
 'v2.7.0-beta.8',
 'v2.7.1',
 'v2.7.10',
 'v2.7.11',
 'v2.7.12',
 'v2.7.13',
 'v2.7.14',
 'v2.7.15',
 'v2.7.2',
 'v2.7.3',
 'v2.7.4',
 'v2.7.5',
 'v2.7.6',
 'v2.7.7',
 'v2.7.8',
 'v2.7.9']

In [10]:
def get_tag_and_timestamp(repo_path: str) -> list[tuple[str, str, int]]:
    """Pair every tag returned by ``get_tag_list`` with its commit date.

    For each tag, ``git log -1`` is asked for the committer date in short
    form together with the committer Unix timestamp.

    Args:
        repo_path: Directory inside the git working tree to inspect.

    Returns:
        ``(tag, date, timestamp)`` tuples, in the order of ``get_tag_list``.

    Raises:
        subprocess.CalledProcessError: If a git command exits with an error.
    """
    log_command = ["git", "log", "-1", "--format=%cd %ct", "--date=short"]
    records = []

    for tag_name in get_tag_list(repo_path):
        completed = sp.run(
            [*log_command, tag_name],
            cwd=repo_path,
            capture_output=True,
            check=True,
            text=True,
        )
        # The output is a single "<date> <unix-seconds>" line.
        fields = completed.stdout.strip().split()
        commit_date, commit_seconds = fields
        records.append((tag_name, commit_date, int(commit_seconds)))

    return records

In [11]:
# Collect (tag, date, timestamp) for the demo repository; the driver loop
# further down iterates over this list.
tags_and_timestamps = get_tag_and_timestamp(demo_repo_path)
tags_and_timestamps

[('v2.7.0-beta.4', '2022-06-21', 1655779879),
 ('v2.7.0-beta.5', '2022-06-22', 1655864569),
 ('v2.7.0-beta.6', '2022-06-26', 1656232914),
 ('v2.7.0-beta.7', '2022-06-27', 1656313230),
 ('v2.7.0-beta.8', '2022-06-28', 1656381608),
 ('v2.7.1', '2022-07-04', 1656931115),
 ('v2.7.10', '2022-08-23', 1661218182),
 ('v2.7.11', '2022-10-11', 1665486786),
 ('v2.7.12', '2022-10-12', 1665582096),
 ('v2.7.13', '2022-10-14', 1665718916),
 ('v2.7.14', '2022-11-09', 1667997592),
 ('v2.7.15', '2023-10-23', 1698047740),
 ('v2.7.2', '2022-07-05', 1656989687),
 ('v2.7.3', '2022-07-06', 1657084582),
 ('v2.7.4', '2022-07-08', 1657266381),
 ('v2.7.5', '2022-07-13', 1657681717),
 ('v2.7.6', '2022-07-15', 1657876348),
 ('v2.7.7', '2022-07-16', 1657982731),
 ('v2.7.8', '2022-07-22', 1658460279),
 ('v2.7.9', '2022-08-19', 1660883232)]

In [12]:
def checkout_to_tag(repo_path: str, tag: str) -> None:
    """Check out ``tag`` in the repository and print what git reported.

    Args:
        repo_path: Directory inside the git working tree.
        tag: Tag (or any other revision) to check out.

    Raises:
        subprocess.CalledProcessError: If ``git checkout`` exits with an error.
    """
    result = sp.run(
        ["git", "checkout", tag],
        cwd=repo_path,
        capture_output=True,
        check=True,
        text=True,
    )
    # git writes its checkout status messages (e.g. "HEAD is now at ...") to
    # stderr, so printing stdout alone shows an empty line. Report both.
    messages = [
        stream.strip()
        for stream in (result.stdout, result.stderr)
        if stream.strip()
    ]
    print("\n".join(messages))

In [13]:
def sonar_scan(repo_path: str, tag: str) -> None:
    """Run ``sonar-scanner`` on the repository as it is currently checked out.

    This function does not switch the working tree itself; ``tag`` is only
    used to label the analysis through ``sonar.projectVersion``. The project
    key, server URL and token are taken from the module-level
    ``demo_repo_name``, ``sonarqube_server_url`` and ``sonarqube_token``.

    Args:
        repo_path: Root of the working tree to analyse; used both as the
            scanner's working directory and as ``sonar.projectBaseDir``.
        tag: Version label recorded for this analysis.

    Raises:
        RuntimeError: If the server URL or the token is not configured.
        subprocess.CalledProcessError: If the scanner exits with an error.
    """
    if not sonarqube_server_url or not sonarqube_token:
        raise RuntimeError(
            "sonarqube_server_url and sonarqube_token must be set before scanning"
        )

    properties = {
        # Use the repo_path argument rather than the global demo path, so
        # the function analyses the repository it was asked to analyse.
        "sonar.projectBaseDir": repo_path,
        "sonar.projectVersion": tag,
        "sonar.projectKey": demo_repo_name,
        "sonar.sources": ".",
        "sonar.host.url": sonarqube_server_url,
        # NOTE(review): newer scanner versions may prefer "sonar.token" -- confirm.
        "sonar.login": sonarqube_token,
    }

    command = ["sonar-scanner"]
    for key, value in properties.items():
        command += ["-D", f"{key}={value}"]

    sp.run(
        command,
        cwd=repo_path,
        capture_output=True,
        check=True,
        text=True,
        # With shell=True and an argument list, POSIX hands only the first
        # item to the shell as the command, so every -D option is lost.
        # Keep the shell on Windows, where the original relied on it
        # (presumably to resolve the scanner's launcher script).
        shell=os.name == "nt",
    )

In [14]:
def get_metrics() -> dict:
    """Fetch the current measures of the demo project from SonarQube.

    Sends ``GET /api/measures/component`` to ``sonarqube_server_url`` for the
    component named by ``demo_repo_name``. Like the curl call this replaces,
    the request carries no credentials.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: If the request cannot be completed.
    """
    metric_keys = (
        "bugs",
        "vulnerabilities",
        "code_smells",
        "coverage",
        "duplicated_lines_density",
        "violations",
        "ncloc",
        "security_rating",
        "reliability_rating",
        "sqale_rating",
    )
    # requests builds and escapes the query string, and works on any
    # platform, unlike shelling out to the Windows-only "curl.exe".
    response = requests.get(
        f"{sonarqube_server_url}/api/measures/component",
        params={
            "component": demo_repo_name,
            "metricKeys": ",".join(metric_keys),
        },
        timeout=30,
    )
    # curl without --fail exits 0 on HTTP errors, so the old check=True never
    # noticed them; surface them explicitly here.
    response.raise_for_status()
    return response.json()

'code_smells',
 'sqale_rating',
 'vulnerabilities',
 'security_rating',
 'duplicated_lines_density',
 'violations',
 'bugs',
 'reliability_rating',
 'coverage',
 'ncloc'

In [15]:
# Column-oriented accumulator: one (initially empty) list per output column.
metrics_dict = {
    column: []
    for column in (
        "version",
        "date",
        "timestamp",
        "bugs",
        "vulnerabilities",
        "code_smells",
        "coverage",
        "duplicated_lines_density",
        "violations",
        "ncloc",
        "security_rating",
        "reliability_rating",
        "sqale_rating",
    )
}

In [16]:
def get_metrics_from_version_tags(
    repo_path: str, tag: str, date: str, timestamp: int, output_dict=None
):
    """Scan one tagged version and append its metrics as a new row.

    Checks out ``tag``, runs the scanner, fetches the measures and appends
    exactly one entry to every column list of ``output_dict``.

    Args:
        repo_path: Root of the git working tree to analyse.
        tag: Tag to check out and analyse; stored in the ``version`` column.
        date: Commit date of the tag; stored in the ``date`` column.
        timestamp: Commit timestamp; stored in the ``timestamp`` column.
        output_dict: Mapping of column name to list that receives the row.
            Defaults to the module-level ``metrics_dict``.

    Returns:
        The mapping that was updated.
    """
    if output_dict is None:
        # Look the accumulator up at call time: a default bound in the
        # signature keeps pointing at the old dict after metrics_dict is
        # re-created (e.g. by re-running its notebook cell).
        output_dict = metrics_dict

    checkout_to_tag(repo_path, tag)
    sonar_scan(repo_path, tag)
    # NOTE(review): the scanner may return before the server has finished
    # processing the report -- confirm the measures read here already belong
    # to this tag.
    metrics_result = get_metrics()

    measured = {
        measure["metric"]: measure.get("value")
        for measure in metrics_result["component"]["measures"]
    }

    row = {"version": tag, "date": date, "timestamp": timestamp}
    for column, value in row.items():
        output_dict[column].append(value)

    # Append to every metric column, using None when the response has no
    # value for it; otherwise the lists end up with different lengths and no
    # longer line up row by row.
    for column, values in output_dict.items():
        if column not in row:
            values.append(measured.get(column))

    return output_dict

In [15]:
# Smoke test: confirm sonar-scanner can be launched from this environment by
# asking for its help text. check=True raises if the command exits non-zero.
# NOTE(review): shell=True together with an argument list only forwards the
# arguments on Windows -- confirm before running this on another platform.
sp.run(
    ["sonar-scanner", "-h"],
    capture_output=True,
    check=True,
    text=True,
    shell=True,
)

CompletedProcess(args=['sonar-scanner', '-h'], returncode=0, stdout='INFO: \nINFO: usage: sonar-scanner [options]\nINFO: \nINFO: Options:\nINFO:  -D,--define <arg>     Define property\nINFO:  -h,--help             Display help information\nINFO:  -v,--version          Display version information\nINFO:  -X,--debug            Produce execution debug output\n', stderr='')

In [None]:
# Analyse every collected tag in turn; each call adds one row to metrics_dict.
for tag, date, timestamp in tags_and_timestamps:
    get_metrics_from_version_tags(demo_repo_path, tag, date, timestamp)

In [None]:
# Display the accumulated column lists.
metrics_dict

In [18]:
# Turn the column lists into a table; pandas assigns its default integer index.
metrics_dataset = pd.DataFrame(data=metrics_dict)
metrics_dataset

Unnamed: 0,version,date,timestamp,bugs,vulnerabilities,code_smells,coverage,duplicated_lines_density,violations,ncloc,security_rating,reliability_rating,sqale_rating
0,v2.7.0-beta.4,2022-06-21,1655779879,152,1,978,0.0,9.3,1131,60574,5.0,4.0,1.0
1,v2.7.0-beta.5,2022-06-22,1655864569,152,1,977,0.0,9.3,1130,60569,5.0,4.0,1.0
2,v2.7.0-beta.6,2022-06-26,1656232914,152,1,977,0.0,9.3,1130,60609,5.0,4.0,1.0
3,v2.7.0-beta.7,2022-06-27,1656313230,152,1,977,0.0,9.3,1130,60588,5.0,4.0,1.0
4,v2.7.0-beta.8,2022-06-28,1656381608,152,1,977,0.0,9.3,1130,60591,5.0,4.0,1.0
5,v2.7.1,2022-07-04,1656931115,153,1,977,0.0,9.3,1131,60648,5.0,4.0,1.0
6,v2.7.10,2022-08-23,1661218182,171,1,1018,0.0,9.2,1190,62723,5.0,4.0,1.0
7,v2.7.11,2022-10-11,1665486786,171,1,1019,0.0,9.2,1191,62791,5.0,4.0,1.0
8,v2.7.12,2022-10-12,1665582096,171,1,1019,0.0,9.2,1191,62768,5.0,4.0,1.0
9,v2.7.13,2022-10-14,1665718916,171,1,1019,0.0,9.2,1191,62779,5.0,4.0,1.0


In [19]:
# Name the file after the analysed project so that pointing the notebook at
# another repository does not overwrite this project's dataset.
metrics_dataset.to_csv(f"../Datasets/{demo_repo_name}_metrics.csv", index=False)

## SonarQube Data Extraction For Severity and Code Smells
