## Scraping Data from CodeClimate


In [1]:
import requests
import json
import os
from dotenv import load_dotenv
import datetime
import subprocess as sp
import pandas as pd

In [2]:
load_dotenv()

True

In [3]:
github_token = os.getenv("GITHUB_ACCESS_TOKEN")
sonarqube_token = os.getenv("SONARQUBE_ACCESS_TOKEN_SVELTE")
sonarqube_server_url = os.getenv("SONARQUBE_SERVER_URL")

In [4]:
demo_github_slug = "sveltejs"
demo_repo_name = "svelte"
demo_repo_path = "C:/SWE Class/Github Desktop/TestProjects/svelte"
export_filename = "svelte_metrics.csv"

In [5]:
sonarqube_token, sonarqube_server_url

('sqp_a5e6cf7a744b547fa655e6c7c628c2c48cb6c27a', 'http://localhost:9000')

## Git CLI Data Extraction

In [6]:
def get_tag_list(repo_path: str, max_output: int = 20) -> list[str]:
    # Subprocess to get the git directory
    sp.run(
        ["git", "rev-parse", "--git-dir"],
        cwd=repo_path,
        capture_output=True,
        check=True,
    )

    # Subprocess to get the git tags in the repo
    result = sp.run(
        ["git", "tag", "-l"],
        cwd=repo_path,
        capture_output=True,
        check=True,
    )

    tag_list = result.stdout.decode("utf-8").split("\n")
    tag_list = [tag for tag in tag_list if tag != ""]

    if max_output is not None and len(tag_list) > max_output:
        tag_list = tag_list[-max_output:]

    return tag_list

In [7]:
get_tag_list(demo_repo_path)

['v3.59.2',
 'v3.6.0',
 'v3.6.1',
 'v3.6.10',
 'v3.6.11',
 'v3.6.2',
 'v3.6.3',
 'v3.6.4',
 'v3.6.5',
 'v3.6.6',
 'v3.6.7',
 'v3.6.8',
 'v3.6.9',
 'v3.7.0',
 'v3.7.1',
 'v3.8.0',
 'v3.8.1',
 'v3.9.0',
 'v3.9.1',
 'v3.9.2']

In [8]:
def get_tag_and_timestamp(repo_path: str) -> list[tuple[str, str, int]]:
    tag_list = get_tag_list(repo_path)
    tag_and_timestamp = []

    for tag in tag_list:
        result = sp.run(
            ["git", "log", "-1", "--format=%cd %ct", "--date=short", tag],
            cwd=repo_path,
            capture_output=True,
            check=True,
            text=True,
        )

        date, timestamp = result.stdout.strip().split()
        tag_and_timestamp.append((tag, date, int(timestamp)))
        # print(result.stdout.strip())
    return tag_and_timestamp

In [9]:
tags_and_timestamps = get_tag_and_timestamp(demo_repo_path)
tags_and_timestamps

[('v3.59.2', '2023-06-20', 1687278807),
 ('v3.6.0', '2019-06-25', 1561482399),
 ('v3.6.1', '2019-06-25', 1561488438),
 ('v3.6.10', '2019-07-30', 1564512743),
 ('v3.6.11', '2019-08-03', 1564854522),
 ('v3.6.2', '2019-06-29', 1561834889),
 ('v3.6.3', '2019-07-01', 1562030596),
 ('v3.6.4', '2019-07-03', 1562167706),
 ('v3.6.5', '2019-07-07', 1562509830),
 ('v3.6.6', '2019-07-09', 1562684987),
 ('v3.6.7', '2019-07-10', 1562762101),
 ('v3.6.8', '2019-07-23', 1563890081),
 ('v3.6.9', '2019-07-26', 1564146680),
 ('v3.7.0', '2019-08-04', 1564965986),
 ('v3.7.1', '2019-08-06', 1565096394),
 ('v3.8.0', '2019-08-12', 1565608379),
 ('v3.8.1', '2019-08-14', 1565831814),
 ('v3.9.0', '2019-08-20', 1566305769),
 ('v3.9.1', '2019-08-22', 1566478839),
 ('v3.9.2', '2019-09-03', 1567560718)]

In [10]:
# Checkout to a specific tag and show the output of the operation
def checkout_to_tag(repo_path: str, tag: str):
    result = sp.run(
        ["git", "checkout", tag],
        cwd=repo_path,
        capture_output=True,
        check=True,
        text=True,
    )
    print(result.stdout.strip())

In [11]:
# Checkout to a specific tag use sonar-scanner to scan the project
def sonar_scan(repo_path: str, tag: str):
    result = sp.run(
        [
            "sonar-scanner",
            "-D",
            f"sonar.projectBaseDir={demo_repo_path}",
            "-D",
            "sonar.projectVersion=" + tag,
            "-D",
            f"sonar.projectKey={demo_repo_name}",
            "-D",
            "sonar.sources=.",
            "-D",
            f"sonar.host.url={sonarqube_server_url}",
            "-D",
            "sonar.login=" + sonarqube_token,
        ],
        cwd=repo_path,
        capture_output=True,
        check=True,
        text=True,
        shell=True,
    )
    # print(result.stdout)

In [12]:
# Call the sonarqube api to get the metrics of the project
def get_metrics() -> dict:
    result = sp.run(
        [
            "curl.exe",
            f"{sonarqube_server_url}/api/measures/component?component={demo_repo_name}&metricKeys=bugs,vulnerabilities,code_smells,coverage,duplicated_lines_density,violations,ncloc,security_rating,reliability_rating,sqale_rating",
        ],
        capture_output=True,
        check=True,
        text=True,
        # shell=True,
    )
    return json.loads(result.stdout)

'code_smells',
 'sqale_rating',
 'vulnerabilities',
 'security_rating',
 'duplicated_lines_density',
 'violations',
 'bugs',
 'reliability_rating',
 'coverage',
 'ncloc'

In [13]:
metrics_dict = {
    "version": [],
    "date": [],
    "timestamp": [],
    "bugs": [],
    "vulnerabilities": [],
    "code_smells": [],
    "coverage": [],
    "duplicated_lines_density": [],
    "violations": [],
    "ncloc": [],
    "security_rating": [],
    "reliability_rating": [],
    "sqale_rating": [],
}

In [14]:
# Get metrics of the project
def get_metrics_from_version_tags(
    repo_path: str, tag: str, date: str, timestamp: int, output_dict=metrics_dict
):
    checkout_to_tag(repo_path, tag)
    sonar_scan(repo_path, tag)
    metrics_result = get_metrics()

    output_dict["version"].append(tag)
    output_dict["date"].append(date)
    output_dict["timestamp"].append(timestamp)

    for metric in metrics_result["component"]["measures"]:
        output_dict[metric["metric"]].append(metric["value"])

    return output_dict

In [15]:
# A mock call to test sonar-scanner
sp.run(
    ["sonar-scanner", "-h"],
    capture_output=True,
    check=True,
    text=True,
    shell=True,
)

CompletedProcess(args=['sonar-scanner', '-h'], returncode=0, stdout='INFO: \nINFO: usage: sonar-scanner [options]\nINFO: \nINFO: Options:\nINFO:  -D,--define <arg>     Define property\nINFO:  -h,--help             Display help information\nINFO:  -v,--version          Display version information\nINFO:  -X,--debug            Produce execution debug output\n', stderr='')

In [16]:
for tag_and_timestamp in tags_and_timestamps:
    get_metrics_from_version_tags(demo_repo_path, *tag_and_timestamp)























In [17]:
metrics_dict

{'version': ['v3.59.2',
  'v3.6.0',
  'v3.6.1',
  'v3.6.10',
  'v3.6.11',
  'v3.6.2',
  'v3.6.3',
  'v3.6.4',
  'v3.6.5',
  'v3.6.6',
  'v3.6.7',
  'v3.6.8',
  'v3.6.9',
  'v3.7.0',
  'v3.7.1',
  'v3.8.0',
  'v3.8.1',
  'v3.9.0',
  'v3.9.1',
  'v3.9.2'],
 'date': ['2023-06-20',
  '2019-06-25',
  '2019-06-25',
  '2019-07-30',
  '2019-08-03',
  '2019-06-29',
  '2019-07-01',
  '2019-07-03',
  '2019-07-07',
  '2019-07-09',
  '2019-07-10',
  '2019-07-23',
  '2019-07-26',
  '2019-08-04',
  '2019-08-06',
  '2019-08-12',
  '2019-08-14',
  '2019-08-20',
  '2019-08-22',
  '2019-09-03'],
 'timestamp': [1687278807,
  1561482399,
  1561488438,
  1564512743,
  1564854522,
  1561834889,
  1562030596,
  1562167706,
  1562509830,
  1562684987,
  1562762101,
  1563890081,
  1564146680,
  1564965986,
  1565096394,
  1565608379,
  1565831814,
  1566305769,
  1566478839,
  1567560718],
 'bugs': ['116',
  '21',
  '21',
  '22',
  '22',
  '21',
  '21',
  '21',
  '21',
  '22',
  '22',
  '22',
  '22',
  '22',
 

In [18]:
metrics_dataset = pd.DataFrame(metrics_dict, index=None)
metrics_dataset

Unnamed: 0,version,date,timestamp,bugs,vulnerabilities,code_smells,coverage,duplicated_lines_density,violations,ncloc,security_rating,reliability_rating,sqale_rating
0,v3.59.2,2023-06-20,1687278807,116,0,838,0.0,6.8,954,63516,1.0,5.0,1.0
1,v3.6.0,2019-06-25,1561482399,21,0,733,0.0,5.8,754,36649,1.0,4.0,1.0
2,v3.6.1,2019-06-25,1561488438,21,0,733,0.0,5.8,754,36649,1.0,4.0,1.0
3,v3.6.10,2019-07-30,1564512743,22,0,739,0.0,5.6,761,37188,1.0,4.0,1.0
4,v3.6.11,2019-08-03,1564854522,22,0,739,0.0,6.3,761,37501,1.0,4.0,1.0
5,v3.6.2,2019-06-29,1561834889,21,0,733,0.0,5.7,754,36759,1.0,4.0,1.0
6,v3.6.3,2019-07-01,1562030596,21,0,733,0.0,5.7,754,36759,1.0,4.0,1.0
7,v3.6.4,2019-07-03,1562167706,21,0,740,0.0,5.7,761,36901,1.0,4.0,1.0
8,v3.6.5,2019-07-07,1562509830,21,0,740,0.0,5.7,761,36901,1.0,4.0,1.0
9,v3.6.6,2019-07-09,1562684987,22,0,740,0.0,5.6,762,37099,1.0,4.0,1.0


In [19]:
metrics_dataset.to_csv(f"../Datasets/{export_filename}", index=False)