## Scraping Data from SonarQube


In [1]:
import datetime
import json
import os
import shutil
import subprocess as sp

import pandas as pd
import requests
from dotenv import load_dotenv

In [2]:
# Load settings from a local .env file into the process environment so the
# os.getenv lookups in the following cell can find them.
load_dotenv()

True

In [3]:
# Credentials and the server address come from the environment; any variable
# that is not set yields None.
github_token, sonarqube_token, sonarqube_server_url = (
    os.environ.get(variable_name)
    for variable_name in (
        "GITHUB_ACCESS_TOKEN",
        "SONARQUBE_ACCESS_TOKEN",
        "SONARQUBE_SERVER_URL",
    )
)

In [4]:
# Demo project used throughout the notebook.
# presumably the GitHub owner and repository name of the project -- confirm
demo_github_slug = "youzan"
demo_repo_name = "vant"
# Local clone that the git and sonar-scanner commands below are run against.
demo_repo_path = "C:/SWE Class/Github Desktop/TestProjects/vant"

In [5]:
# Sanity-check that the settings loaded. Only a short prefix of each token is
# shown so the secrets are never stored in the notebook output.
# NOTE(review): tokens that were displayed in full by an earlier run of this
# cell should be treated as compromised and rotated.
(
    f"{github_token[:4]}***" if github_token else None,
    f"{sonarqube_token[:4]}***" if sonarqube_token else None,
    sonarqube_server_url,
)

('ghp_***',
 'sqp_***',
 'http://localhost:9000')

In [6]:
def iso_time_to_unix_epoch(iso_time: str) -> int:
    """Convert a UTC timestamp such as ``2023-11-19T05:03:57Z`` to Unix epoch seconds.

    The trailing ``Z`` marks the value as UTC. ``strptime`` returns a naive
    datetime, and calling ``timestamp()`` on a naive datetime interprets it in
    the machine's local time zone, so the parsed value is tagged as UTC
    explicitly before the conversion.

    Args:
        iso_time: Timestamp in the exact form ``YYYY-MM-DDTHH:MM:SSZ``.

    Returns:
        Whole seconds since 1970-01-01T00:00:00Z.

    Raises:
        ValueError: If ``iso_time`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(iso_time, "%Y-%m-%dT%H:%M:%SZ")
    return int(parsed.replace(tzinfo=datetime.timezone.utc).timestamp())

## Git CLI Data Extraction

In [7]:
def get_tag_list(repo_path: str, max_output: int = 20) -> list[str]:
    """Return the last ``max_output`` tags of the git repository at ``repo_path``.

    Tags are taken in the order ``git tag -l`` prints them, so "last" refers to
    that listing order rather than to creation date.

    Args:
        repo_path: Directory inside the git working tree to inspect.
        max_output: Maximum number of tags to return, counted from the end of
            the listing. Zero or a negative value yields an empty list.

    Returns:
        The selected tag names, in listing order.

    Raises:
        subprocess.CalledProcessError: If ``repo_path`` is not inside a git
            repository or a git command fails.
    """
    # Fail early, with git's own error, when repo_path is not a repository.
    sp.run(
        ["git", "rev-parse", "--git-dir"],
        cwd=repo_path,
        capture_output=True,
        check=True,
    )

    listing = sp.run(
        ["git", "tag", "-l"],
        cwd=repo_path,
        capture_output=True,
        check=True,
    )

    # strip() drops a trailing "\r" if git emitted CRLF line endings; blank
    # entries (e.g. after the final newline) are discarded.
    tag_list = [
        line.strip()
        for line in listing.stdout.decode("utf-8").split("\n")
        if line.strip()
    ]

    # A slice of [-0:] would return the whole list, so handle the
    # "nothing requested" case explicitly.
    if max_output <= 0:
        return []

    return tag_list[-max_output:]

In [8]:
# Show the tags found in the demo repository (at most 20 with the default max_output).
get_tag_list(demo_repo_path)

['v4.3.1',
 'v4.3.2',
 'v4.4.0',
 'v4.4.1',
 'v4.5.0',
 'v4.6.0',
 'v4.6.1',
 'v4.6.2',
 'v4.6.3',
 'v4.6.4',
 'v4.6.4-beta.2',
 'v4.6.5',
 'v4.6.6',
 'v4.6.7',
 'v4.6.8',
 'v4.7.0',
 'v4.7.1',
 'v4.7.2',
 'v4.7.3',
 'v4.8.0']

In [9]:
def get_tag_and_timestamp(repo_path: str) -> list[tuple[str, str, int]]:
    """Pair each tag returned by ``get_tag_list`` with its commit date.

    For every tag, ``git log -1`` is asked for the committer date in short
    form (``%cd`` with ``--date=short``) and the committer Unix timestamp
    (``%ct``).

    Returns:
        A list of ``(tag, date, timestamp)`` tuples in the order of the tag list.
    """

    def describe(tag_name):
        log = sp.run(
            ["git", "log", "-1", "--format=%cd %ct", "--date=short", tag_name],
            cwd=repo_path,
            capture_output=True,
            check=True,
            text=True,
        )
        # Output looks like "2023-05-04 1683164209".
        commit_date, commit_epoch = log.stdout.strip().split()
        return tag_name, commit_date, int(commit_epoch)

    return [describe(tag_name) for tag_name in get_tag_list(repo_path)]

In [19]:
# Collect (tag, date, timestamp) for the demo repository; the scan loop further
# down iterates over this list.
tags_and_timestamps = get_tag_and_timestamp(demo_repo_path)
tags_and_timestamps

[('v4.3.1', '2023-05-04', 1683164209),
 ('v4.3.2', '2023-05-14', 1684072917),
 ('v4.4.0', '2023-05-21', 1684679638),
 ('v4.4.1', '2023-05-28', 1685280871),
 ('v4.5.0', '2023-06-11', 1686494535),
 ('v4.6.0', '2023-06-24', 1687574300),
 ('v4.6.1', '2023-07-02', 1688290864),
 ('v4.6.2', '2023-07-09', 1688905738),
 ('v4.6.3', '2023-07-23', 1690109073),
 ('v4.6.4', '2023-08-06', 1691326799),
 ('v4.6.4-beta.2', '2023-07-30', 1690693244),
 ('v4.6.5', '2023-08-16', 1692181559),
 ('v4.6.6', '2023-08-20', 1692519712),
 ('v4.6.7', '2023-09-04', 1693837403),
 ('v4.6.8', '2023-09-10', 1694310416),
 ('v4.7.0', '2023-09-24', 1695564091),
 ('v4.7.1', '2023-10-06', 1696560298),
 ('v4.7.2', '2023-10-15', 1697363094),
 ('v4.7.3', '2023-10-29', 1698572613),
 ('v4.8.0', '2023-11-19', 1700370237)]

In [11]:
def checkout_to_tag(repo_path: str, tag: str):
    """Check out ``tag`` in the repository at ``repo_path`` and print git's report.

    ``git checkout`` writes its progress and "HEAD is now at ..." messages to
    stderr, so stderr is merged into the captured stream; printing stdout
    alone shows an empty line.

    Args:
        repo_path: Directory of the git working tree.
        tag: Tag (or any other revision) to check out.

    Raises:
        subprocess.CalledProcessError: If the checkout fails. The merged
            output is available on the exception's ``stdout`` attribute.
    """
    result = sp.run(
        ["git", "checkout", tag],
        cwd=repo_path,
        stdout=sp.PIPE,
        stderr=sp.STDOUT,
        check=True,
        text=True,
    )
    print(result.stdout.strip())

In [21]:
def sonar_scan(repo_path: str, tag: str, project_key: str = "Vant"):
    """Run sonar-scanner on the working tree at ``repo_path``.

    The analysis is reported to the server named by ``sonarqube_server_url``,
    authenticated with ``sonarqube_token``, and labelled with ``tag`` as the
    project version. The caller is expected to have checked out the wanted
    revision beforehand; this function does not touch git.

    Args:
        repo_path: Directory to analyse. It is used both as the working
            directory and as ``sonar.projectBaseDir``.
        tag: Value stored as ``sonar.projectVersion``.
        project_key: SonarQube project key the analysis is filed under.

    Raises:
        FileNotFoundError: If no ``sonar-scanner`` executable is on PATH.
        ValueError: If the server URL or access token is not configured.
        subprocess.CalledProcessError: If the scanner exits with an error.
    """
    if not sonarqube_server_url or not sonarqube_token:
        raise ValueError(
            "SONARQUBE_SERVER_URL and SONARQUBE_ACCESS_TOKEN must be set"
        )

    # Resolve the launcher explicitly (this honours PATHEXT on Windows, where
    # it is a .bat file) so the command can run without shell=True. With
    # shell=True and an argument list, POSIX shells receive only the first
    # element and every -D option is silently dropped.
    scanner = shutil.which("sonar-scanner")
    if scanner is None:
        raise FileNotFoundError("sonar-scanner executable not found on PATH")

    properties = {
        # Absolute, because the scanner's working directory is already repo_path.
        "sonar.projectBaseDir": os.path.abspath(repo_path),
        "sonar.projectVersion": tag,
        "sonar.projectKey": project_key,
        "sonar.sources": ".",
        "sonar.host.url": sonarqube_server_url,
        "sonar.login": sonarqube_token,
    }

    sp.run(
        [scanner, *(f"-D{key}={value}" for key, value in properties.items())],
        cwd=repo_path,
        capture_output=True,
        check=True,
        text=True,
    )

In [22]:
def get_metrics(
    component: str = "Vant",
    metric_keys: tuple[str, ...] = (
        "bugs",
        "vulnerabilities",
        "code_smells",
        "coverage",
        "duplicated_lines_density",
        "violations",
        "ncloc",
        "security_rating",
        "reliability_rating",
        "sqale_rating",
    ),
    timeout: float = 30.0,
) -> dict:
    """Fetch measures for a project from the SonarQube ``api/measures/component`` endpoint.

    The request goes through ``requests`` rather than an external ``curl.exe``
    process, so it works on any platform and HTTP failures are reported
    instead of being parsed as if they were a result.

    Args:
        component: SonarQube project key to query.
        metric_keys: Metric keys to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    response = requests.get(
        f"{sonarqube_server_url}/api/measures/component",
        params={"component": component, "metricKeys": ",".join(metric_keys)},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

'code_smells',
 'sqale_rating',
 'vulnerabilities',
 'security_rating',
 'duplicated_lines_density',
 'violations',
 'bugs',
 'reliability_rating',
 'coverage',
 'ncloc'

In [25]:
# Column-oriented accumulator for the scan results: three identifying columns
# followed by one column per requested SonarQube metric. Each column gets its
# own empty list.
metrics_dict = {
    column: []
    for column in (
        "version",
        "date",
        "timestamp",
        "bugs",
        "vulnerabilities",
        "code_smells",
        "coverage",
        "duplicated_lines_density",
        "violations",
        "ncloc",
        "security_rating",
        "reliability_rating",
        "sqale_rating",
    )
}

In [26]:
def get_metrics_from_version_tags(
    repo_path: str, tag: str, date: str, timestamp: int, output_dict=None
):
    """Check out ``tag``, scan it, and append its metrics as one row to ``output_dict``.

    Args:
        repo_path: Directory of the git working tree to scan.
        tag: Tag to check out; recorded in the ``version`` column.
        date: Recorded in the ``date`` column.
        timestamp: Recorded in the ``timestamp`` column.
        output_dict: Dict of column name -> list to append to. When omitted,
            the module-level ``metrics_dict`` is looked up at call time, so
            re-creating ``metrics_dict`` takes effect without redefining this
            function.

    Returns:
        The dict that was appended to.

    NOTE(review): the measures are requested immediately after the scanner
    returns; confirm the server has finished processing the new analysis by
    then, otherwise the values may belong to the previous tag.
    """
    if output_dict is None:
        output_dict = metrics_dict

    checkout_to_tag(repo_path, tag)
    sonar_scan(repo_path, tag)
    metrics_result = get_metrics()

    # Index the response before touching output_dict so that a malformed
    # response cannot leave a half-written row behind.
    measured = {
        measure["metric"]: measure.get("value")
        for measure in metrics_result["component"]["measures"]
    }

    output_dict["version"].append(tag)
    output_dict["date"].append(date)
    output_dict["timestamp"].append(timestamp)

    # Every metric column receives exactly one entry per call (None when the
    # server returned no value for it), keeping all columns the same length.
    for column, values in output_dict.items():
        if column not in ("version", "date", "timestamp"):
            values.append(measured.get(column))

    return output_dict

In [27]:
# Check out, scan and record every collected tag in turn. The working tree is
# left on whichever tag was processed last.
for version_tag, commit_date, commit_epoch in tags_and_timestamps:
    get_metrics_from_version_tags(
        demo_repo_path, version_tag, commit_date, commit_epoch
    )























In [28]:
# Inspect the values accumulated by the scan loop.
metrics_dict

{'version': ['v4.3.1',
  'v4.3.2',
  'v4.4.0',
  'v4.4.1',
  'v4.5.0',
  'v4.6.0',
  'v4.6.1',
  'v4.6.2',
  'v4.6.3',
  'v4.6.4',
  'v4.6.4-beta.2',
  'v4.6.5',
  'v4.6.6',
  'v4.6.7',
  'v4.6.8',
  'v4.7.0',
  'v4.7.1',
  'v4.7.2',
  'v4.7.3',
  'v4.8.0'],
 'date': ['2023-05-04',
  '2023-05-14',
  '2023-05-21',
  '2023-05-28',
  '2023-06-11',
  '2023-06-24',
  '2023-07-02',
  '2023-07-09',
  '2023-07-23',
  '2023-08-06',
  '2023-07-30',
  '2023-08-16',
  '2023-08-20',
  '2023-09-04',
  '2023-09-10',
  '2023-09-24',
  '2023-10-06',
  '2023-10-15',
  '2023-10-29',
  '2023-11-19'],
 'timestamp': [1683164209,
  1684072917,
  1684679638,
  1685280871,
  1686494535,
  1687574300,
  1688290864,
  1688905738,
  1690109073,
  1691326799,
  1690693244,
  1692181559,
  1692519712,
  1693837403,
  1694310416,
  1695564091,
  1696560298,
  1697363094,
  1698572613,
  1700370237],
 'bugs': ['23',
  '23',
  '23',
  '23',
  '23',
  '23',
  '20',
  '20',
  '20',
  '20',
  '20',
  '20',
  '22',
  '22'

In [31]:
# Turn the column lists into a table: one row per scanned tag, with the
# default integer index.
metrics_dataset = pd.DataFrame(data=metrics_dict)
metrics_dataset

Unnamed: 0,version,date,timestamp,bugs,vulnerabilities,code_smells,coverage,duplicated_lines_density,violations,ncloc,security_rating,reliability_rating,sqale_rating
0,v4.3.1,2023-05-04,1683164209,23,0,220,0.0,3.8,243,64687,1.0,3.0,1.0
1,v4.3.2,2023-05-14,1684072917,23,0,221,0.0,3.8,244,64873,1.0,3.0,1.0
2,v4.4.0,2023-05-21,1684679638,23,0,222,0.0,3.8,245,65537,1.0,3.0,1.0
3,v4.4.1,2023-05-28,1685280871,23,0,222,0.0,3.8,245,65661,1.0,3.0,1.0
4,v4.5.0,2023-06-11,1686494535,23,0,227,0.0,3.8,250,66322,1.0,3.0,1.0
5,v4.6.0,2023-06-24,1687574300,23,0,233,0.0,3.8,256,67559,1.0,3.0,1.0
6,v4.6.1,2023-07-02,1688290864,20,0,231,0.0,3.8,251,67449,1.0,3.0,1.0
7,v4.6.2,2023-07-09,1688905738,20,0,234,0.0,3.8,254,67612,1.0,3.0,1.0
8,v4.6.3,2023-07-23,1690109073,20,0,235,0.0,3.8,255,67697,1.0,3.0,1.0
9,v4.6.4,2023-08-06,1691326799,20,0,236,0.0,3.9,256,67861,1.0,3.0,1.0


In [None]:
# Write the table without the index column. The target folder is created
# first because to_csv raises OSError when the directory does not exist.
output_path = "../Datasets/vant_metrics.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
metrics_dataset.to_csv(output_path, index=False)