## Scraping Data from CodeClimate


In [37]:
import requests
import json
import os
from dotenv import load_dotenv
import datetime

In [38]:
# Load variables from a local .env file into the process environment (python-dotenv).
# The access tokens read later via os.getenv are presumably defined there — confirm.
load_dotenv()

True

In [39]:
def iso_time_to_unix_epoch(iso_time: str) -> int:
    """Convert a UTC ISO-8601 timestamp string to whole Unix epoch seconds.

    Accepts both ``2023-11-16T17:26:29Z`` and the fractional-second form
    ``2023-11-19T13:43:46.719Z``.

    Args:
        iso_time: Timestamp ending in a literal ``Z`` (UTC designator).

    Returns:
        Seconds since 1970-01-01T00:00:00Z, with any fractional part truncated
        by ``int()``.

    Raises:
        ValueError: If ``iso_time`` matches neither supported layout.
    """
    # Pick the layout that matches the input; %f consumes 1-6 fractional digits.
    layout = "%Y-%m-%dT%H:%M:%S.%fZ" if "." in iso_time else "%Y-%m-%dT%H:%M:%SZ"
    parsed = datetime.datetime.strptime(iso_time, layout)
    # strptime yields a naive datetime here and naive .timestamp() assumes the
    # machine's local zone. The trailing "Z" means UTC, so attach it explicitly.
    return int(parsed.replace(tzinfo=datetime.timezone.utc).timestamp())

In [40]:
# API credentials, taken from the process environment (None when a variable is unset).
token = os.environ.get("CODECLIMATE_ACCESS_TOKEN")
github_token = os.environ.get("GITHUB_ACCESS_TOKEN")

# Owner and repository name interpolated into the CodeClimate and GitHub URLs below.
demo_github_slug = "NotAlpha45"
demo_repository_name = "vant"

# Owner -> repository mapping; not referenced by the cells visible in this notebook.
github_users_and_repos = {"youzan": "vant"}

# Headers sent with every CodeClimate API request.
default_headers = {"Accept": "application/vnd.api+json"}
default_headers["Authorization"] = f"Token token={token}"

In [41]:
# Look up the CodeClimate repository record for the demo GitHub owner/repository.
response = requests.get(
    url=f"https://api.codeclimate.com/v1/repos?github_slug={demo_github_slug}/{demo_repository_name}",
    headers=default_headers,
    timeout=30,  # requests has no default timeout; don't let a stalled call hang the kernel
)
# Surface HTTP errors (e.g. a missing or invalid token) here rather than as a
# confusing KeyError when indexing the JSON body below.
response.raise_for_status()

matching_repositories = response.json()["data"]
if not matching_repositories:
    # Same exception type a bare [0] would raise, but with an actionable message.
    raise IndexError(
        f"CodeClimate returned no repository for {demo_github_slug}/{demo_repository_name}"
    )

# The first match is taken as the repository record.
repository_data = matching_repositories[0]
repository_data

{'id': '655a1112bfaf6e2b94a00f9d',
 'type': 'repos',
 'attributes': {'analysis_version': 77760,
  'badge_token': 'd35f358f74604fabea81',
  'branch': 'main',
  'created_at': '2023-11-19T13:43:46.719Z',
  'delegated_config_repo_id': '',
  'diff_coverage_enforced': True,
  'diff_coverage_threshold': 50,
  'enable_notifications': True,
  'github_slug': 'NotAlpha45/vant',
  'human_name': 'vant',
  'last_activity_at': '2023-11-19T13:44:23.462Z',
  'test_reporter_id': None,
  'total_coverage_enforced': True,
  'vcs_database_id': '720749938',
  'vcs_host': 'https://github.com'},
 'relationships': {'latest_default_branch_snapshot': {'data': {'id': '655a1122678a9d0001048d09',
    'type': 'snapshots'}},
  'latest_default_branch_test_report': {'data': None},
  'account': {'data': None}},
 'links': {'self': 'https://codeclimate.com/github/NotAlpha45/vant',
  'services': 'https://api.codeclimate.com/v1/repos/655a1112bfaf6e2b94a00f9d/services',
  'web_coverage': 'https://codeclimate.com/github/NotAlp

In [42]:
# CodeClimate's id for the repository; interpolated into the /repos/{id}/... URLs below.
repository_id = repository_data["id"]
repository_id

'655a1112bfaf6e2b94a00f9d'

In [43]:
# Follow the repository's "latest_default_branch_snapshot" relationship and keep
# the id of the snapshot it points to; the issues request uses it.
latest_snapshot_ref = repository_data["relationships"]["latest_default_branch_snapshot"]["data"]
latest_snapshot_id = latest_snapshot_ref["id"]
latest_snapshot_id

'655a1122678a9d0001048d09'

In [66]:
# Get time series metrics (remediation_minutes) of the repository.
# The date window is fixed in the query string (2023-08-20 .. 2023-11-19).

response = requests.get(
    url=f"https://api.codeclimate.com/v1/repos/{repository_id}/metrics/remediation_minutes?filter[from]=2023-08-20&filter[to]=2023-11-19",
    headers=default_headers,
    timeout=30,  # requests has no default timeout; don't let a stalled call hang the kernel
)
# Fail on 4xx/5xx here instead of with a KeyError on the missing "data" key.
response.raise_for_status()

response.json()["data"]

{'id': '655a1137a0367900010000b9',
 'type': 'metrics',
 'attributes': {'name': 'remediation_minutes',
  'points': [{'timestamp': 1691971200, 'value': None},
   {'timestamp': 1692576000, 'value': None},
   {'timestamp': 1693180800, 'value': None},
   {'timestamp': 1693785600, 'value': None},
   {'timestamp': 1694390400, 'value': None},
   {'timestamp': 1694995200, 'value': None},
   {'timestamp': 1695600000, 'value': None},
   {'timestamp': 1696204800, 'value': None},
   {'timestamp': 1696809600, 'value': None},
   {'timestamp': 1697414400, 'value': None},
   {'timestamp': 1698019200, 'value': None},
   {'timestamp': 1698624000, 'value': 58330.36},
   {'timestamp': 1699228800, 'value': 58330.36},
   {'timestamp': 1699833600, 'value': 58330.36}]}}

In [69]:
# Get the ref points of the repository

response = requests.get(
    url=f"https://api.codeclimate.com/v1/repos/{repository_id}/ref_points",
    headers=default_headers,
    timeout=30,  # requests has no default timeout; don't let a stalled call hang the kernel
)
# Fail on 4xx/5xx here instead of with a KeyError on the missing "data" key.
response.raise_for_status()

ref_points = response.json()["data"]
# Display the list itself; the former `[:]` only made a throwaway copy.
ref_points

[{'id': '655a1122377b2200010063ff',
  'type': 'ref_points',
  'attributes': {'analyzed': True,
   'branch': 'main',
   'commit_sha': 'f325fc862c262d662f746aea30878007a64e97f7',
   'created_at': '2023-11-19T13:44:02.499Z',
   'ref': 'refs/heads/main'},
  'relationships': {'snapshot': {'data': {'id': '655a1122678a9d0001048d09',
     'type': 'snapshots'}}}},
 {'id': '655a11229ad2c02d5966d069',
  'type': 'ref_points',
  'attributes': {'analyzed': True,
   'branch': 'main',
   'commit_sha': 'f325fc862c262d662f746aea30878007a64e97f7',
   'created_at': '2023-11-19T05:11:29.000Z',
   'ref': 'refs/heads/main'},
  'relationships': {'snapshot': {'data': {'id': '655a1122678a9d0001048d09',
     'type': 'snapshots'}}}},
 {'id': '655a11225665562c0b360029',
  'type': 'ref_points',
  'attributes': {'analyzed': True,
   'branch': 'main',
   'commit_sha': 'f72903b576160a27eb79e3e9da2a826ae2d1b691',
   'created_at': '2023-11-12T01:45:04.000Z',
   'ref': 'refs/heads/main'},
  'relationships': {'snapshot': 

In [68]:
# Get issues of the repository, as recorded in its latest default-branch snapshot

response = requests.get(
    url=f"https://api.codeclimate.com/v1/repos/{repository_id}/snapshots/{latest_snapshot_id}/issues",
    headers=default_headers,
    timeout=30,  # requests has no default timeout; don't let a stalled call hang the kernel
)
# Fail on 4xx/5xx here instead of with a KeyError on the missing "data" key.
response.raise_for_status()

issues = response.json()["data"]
# Preview only the first two issues to keep the cell output short.
issues[:2]

[{'id': '655a112fd6d271000100045c',
  'type': 'issues',
  'attributes': {'categories': ['Complexity'],
   'check_name': 'file_lines',
   'constant_name': 'packages/vant-area-data/src/index.ts',
   'content': {'body': ''},
   'description': 'File `index.ts` has 3932 lines of code (exceeds 250 allowed). Consider refactoring.',
   'engine_name': 'structure',
   'fingerprint': '0d9f0c0a931a110c14fc8421c9c533cc',
   'location': {'path': 'packages/vant-area-data/src/index.ts',
    'end_line': 3942,
    'start_line': 1},
   'other_locations': [],
   'remediation_points': 54220800,
   'severity': 'major'},
  'meta': {'permissions': {'manageable': False}}},
 {'id': '655a1136d6d27100010004e5',
  'type': 'issues',
  'attributes': {'categories': ['Duplication'],
   'check_name': 'similar-code',
   'constant_name': 'packages/vant-area-data/src/index.ts',
   'content': {'body': "## Duplicated Code\n\nDuplicated code can lead to software that is hard to understand and difficult to change. The Don't R

## GitHub API Data Extraction


In [45]:
# Headers sent with every GitHub API request in the cells below.
github_request_headers = {"Accept": "application/vnd.github.v3+json"}
github_request_headers["Authorization"] = f"token {github_token}"

In [46]:
# Fetch commits of the demo repository from the GitHub API.
# NOTE(review): GitHub list endpoints are paginated, so this is presumably only
# the first page of commits — confirm if the full history is needed.
response = requests.get(
    url=f"https://api.github.com/repos/{demo_github_slug}/{demo_repository_name}/commits",
    headers=github_request_headers,
    timeout=30,  # requests has no default timeout; don't let a stalled call hang the kernel
)
# Fail on 4xx/5xx here; an error body is a dict and would otherwise break the sort.
response.raise_for_status()

# Sort the commits by author date, oldest first. sorted() builds a new list,
# so re-running the cell never depends on a previously mutated value.
commits = sorted(
    response.json(),
    key=lambda commit: iso_time_to_unix_epoch(commit["commit"]["author"]["date"]),
)
commits[:2]

[{'sha': 'ebedef2bef4454868e719b3fd1056b7ec03b9d5d',
  'node_id': 'C_kwDOKtchNdoAKGViZWRlZjJiZWY0NDU0ODY4ZTcxOWIzZmQxMDU2YjdlYzAzYjlkNWQ',
  'commit': {'author': {'name': 'Abi Raja',
    'email': 'abimanyuraja@gmail.com',
    'date': '2023-11-16T17:26:29Z'},
   'committer': {'name': 'GitHub',
    'email': 'noreply@github.com',
    'date': '2023-11-16T17:26:29Z'},
   'message': 'Update README.md',
   'tree': {'sha': '1b68d45e1d3b93390a9be4ad1a6c6854921f01d2',
    'url': 'https://api.github.com/repos/abi/screenshot-to-code/git/trees/1b68d45e1d3b93390a9be4ad1a6c6854921f01d2'},
   'url': 'https://api.github.com/repos/abi/screenshot-to-code/git/commits/ebedef2bef4454868e719b3fd1056b7ec03b9d5d',
   'comment_count': 0,
   'verification': {'verified': True,
    'reason': 'valid',
    'signature': '-----BEGIN PGP SIGNATURE-----\n\nwsBcBAABCAAQBQJlVlDFCRBK7hj4Ov3rIwAAUrQIAExq8HE6I2vPNK+dCxKsYTpv\nZcQi9b3Qjt2/0Lb2Nmt8lyxVkpzwvZObnYUnjLot9qtKsmQPCts+LPkigh2FW7yT\nUjTvuvPyMNB7ZBvLpqGnujVZ0tu6cT2LHu

In [57]:
# Get the releases for the repository.
# The URL targets youzan/vant directly rather than the demo owner used elsewhere.

response = requests.get(
    url="https://api.github.com/repos/youzan/vant/releases",
    headers=github_request_headers,
    timeout=30,  # requests has no default timeout; don't let a stalled call hang the kernel
)
# Fail on 4xx/5xx here; an error body is a dict and would otherwise be iterated as keys.
response.raise_for_status()

# Take the url, tag_name, target_commitish, id and published_at fields.
# Releases with no publication timestamp are skipped: they cannot be ordered by
# date and would make the timestamp parser raise.
release_fields = ("url", "tag_name", "target_commitish", "id", "published_at")
trimmed_releases = [
    {field: release[field] for field in release_fields}
    for release in response.json()
    if release["published_at"] is not None
]

# Sort the releases by published date, oldest first
releases = sorted(
    trimmed_releases,
    key=lambda release: iso_time_to_unix_epoch(release["published_at"]),
)

# Plain string sort of the tags, so e.g. "v4.10.0" orders before "v4.2.0".
tag_names_in_releases = sorted(release["tag_name"] for release in releases)
# Publication timestamps in the same (chronological) order as `releases`.
release_timestamps = [release["published_at"] for release in releases]

releases, tag_names_in_releases

([{'url': 'https://api.github.com/repos/youzan/vant/releases/94548765',
   'tag_name': 'v4.1.0',
   'target_commitish': 'main',
   'id': 94548765,
   'published_at': '2023-03-05T11:47:24Z'},
  {'url': 'https://api.github.com/repos/youzan/vant/releases/96106954',
   'tag_name': 'v4.1.1',
   'target_commitish': 'main',
   'id': 96106954,
   'published_at': '2023-03-19T12:44:26Z'},
  {'url': 'https://api.github.com/repos/youzan/vant/releases/96922877',
   'tag_name': 'v4.1.2',
   'target_commitish': 'main',
   'id': 96922877,
   'published_at': '2023-03-26T04:13:32Z'},
  {'url': 'https://api.github.com/repos/youzan/vant/releases/99645904',
   'tag_name': 'v4.2.0',
   'target_commitish': 'main',
   'id': 99645904,
   'published_at': '2023-04-16T03:14:32Z'},
  {'url': 'https://api.github.com/repos/youzan/vant/releases/101405585',
   'tag_name': 'v4.2.1',
   'target_commitish': 'main',
   'id': 101405585,
   'published_at': '2023-04-30T13:17:33Z'},
  {'url': 'https://api.github.com/repos/you

In [63]:
# Show the published_at strings collected from `releases`, in that list's order.
release_timestamps

['2023-03-05T11:47:24Z',
 '2023-03-19T12:44:26Z',
 '2023-03-26T04:13:32Z',
 '2023-04-16T03:14:32Z',
 '2023-04-30T13:17:33Z',
 '2023-05-03T14:20:40Z',
 '2023-05-04T01:37:10Z',
 '2023-05-14T14:02:25Z',
 '2023-05-21T14:34:23Z',
 '2023-05-28T13:34:58Z',
 '2023-06-11T14:42:37Z',
 '2023-06-24T02:38:44Z',
 '2023-07-02T09:41:26Z',
 '2023-07-09T12:29:20Z',
 '2023-07-20T12:53:52Z',
 '2023-07-23T10:45:22Z',
 '2023-07-30T05:03:29Z',
 '2023-08-06T13:00:28Z',
 '2023-08-16T10:26:23Z',
 '2023-08-20T08:22:18Z',
 '2023-09-04T14:23:54Z',
 '2023-09-10T01:47:23Z',
 '2023-09-17T11:40:14Z',
 '2023-09-24T14:02:05Z',
 '2023-10-06T02:45:23Z',
 '2023-10-10T13:09:41Z',
 '2023-10-15T09:45:18Z',
 '2023-10-17T01:17:18Z',
 '2023-10-29T09:43:56Z',
 '2023-11-19T05:04:26Z']