## Scraping Data from CodeClimate


In [1]:
import requests
import json
import os
from dotenv import load_dotenv
import datetime

In [2]:
# Run python-dotenv's loader before the access tokens are read with os.getenv below.
# NOTE(review): presumably this picks the tokens up from a local .env file — confirm.
load_dotenv()

True

In [3]:
def iso_time_to_unix_epoch(iso_time: str) -> int:
    """Convert a UTC ISO-8601 timestamp ending in ``Z`` to Unix epoch seconds.

    Both whole-second (``2023-11-16T17:26:29Z``) and fractional-second
    (``2023-11-19T07:31:49.567Z``) forms are accepted; any fraction is
    truncated by the final ``int`` conversion.

    Args:
        iso_time: Timestamp string in ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z`` form.

    Returns:
        Seconds since 1970-01-01T00:00:00Z as an integer.

    Raises:
        ValueError: If ``iso_time`` matches neither accepted format.
        TypeError: If ``iso_time`` is not a string.
    """
    accepted_formats = ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ")
    for time_format in accepted_formats:
        try:
            parsed = datetime.datetime.strptime(iso_time, time_format)
        except ValueError:
            continue
        # strptime returns a naive datetime; without an explicit UTC tzinfo,
        # .timestamp() would interpret it in the machine's local timezone.
        return int(parsed.replace(tzinfo=datetime.timezone.utc).timestamp())
    raise ValueError(
        f"time data {iso_time!r} does not match any of the formats {accepted_formats}"
    )

In [4]:
# Owner and repository name of the demo project looked up on CodeClimate.
demo_github_slug = "NotAlpha45"
demo_repository_name = "projects_realestate"

# Mapping of GitHub owner -> repository name.
github_users_and_repos = dict(youzan="vant")

# Access tokens are read from the environment; each is None when its variable is unset.
token = os.environ.get("CODECLIMATE_ACCESS_TOKEN")
github_token = os.environ.get("GITHUB_ACCESS_TOKEN")

# Headers sent with every CodeClimate API request.
default_headers = {"Accept": "application/vnd.api+json"}
default_headers["Authorization"] = f"Token token={token}"

In [6]:
# Look up the demo repository on CodeClimate by its GitHub slug ("owner/repo").
response = requests.get(
    url=f"https://api.codeclimate.com/v1/repos?github_slug={demo_github_slug}/{demo_repository_name}",
    headers=default_headers,
    timeout=30,  # never let the notebook hang on an unresponsive server
)
# Surface HTTP errors (e.g. a missing or invalid token) here, rather than as a
# confusing KeyError when the error payload is indexed below.
response.raise_for_status()

matching_repositories = response.json()["data"]
if not matching_repositories:
    raise LookupError(
        f"CodeClimate returned no repository for {demo_github_slug}/{demo_repository_name}"
    )

# Work with the first match for the slug.
repository_data = matching_repositories[0]
repository_data

{'id': '6559b9e57b97066afb7a5cb3',
 'type': 'repos',
 'attributes': {'analysis_version': 77760,
  'badge_token': '2e25782275e47f284baf',
  'branch': 'main',
  'created_at': '2023-11-19T07:31:49.567Z',
  'delegated_config_repo_id': '',
  'diff_coverage_enforced': True,
  'diff_coverage_threshold': 50,
  'enable_notifications': True,
  'github_slug': 'NotAlpha45/projects_realestate',
  'human_name': 'projects_realestate',
  'last_activity_at': '2023-11-19T07:32:02.483Z',
  'test_reporter_id': '646a2b002d3f807414c7f67db045302a3564b68ec5f745bf7cbf80218ec86c41',
  'total_coverage_enforced': True,
  'vcs_database_id': '720665536',
  'vcs_host': 'https://github.com'},
 'relationships': {'latest_default_branch_snapshot': {'data': {'id': '6559b9e6678a9d00010464ab',
    'type': 'snapshots'}},
  'latest_default_branch_test_report': {'data': None},
  'account': {'data': None}},
 'links': {'self': 'https://codeclimate.com/github/NotAlpha45/projects_realestate',
  'services': 'https://api.codeclimat

In [7]:
# CodeClimate's own identifier for the repository; later cells use it to build
# repository-specific API URLs.
repository_id = repository_data["id"]
repository_id

'6559b9e57b97066afb7a5cb3'

In [8]:
# Walk the repository's relationships down to the snapshot recorded for the
# latest state of the default branch, then keep only that snapshot's id.
default_branch_snapshot = repository_data["relationships"]["latest_default_branch_snapshot"]
latest_snapshot_id = default_branch_snapshot["data"]["id"]
latest_snapshot_id

'6559b9e6678a9d00010464ab'

In [9]:
# Get the ref points for the repository. Each ref point pairs a git ref and
# commit SHA with the analysis snapshot taken for it.

response = requests.get(
    url=f"https://api.codeclimate.com/v1/repos/{repository_id}/ref_points",
    headers=default_headers,
    timeout=30,  # never let the notebook hang on an unresponsive server
)
# Surface HTTP errors here instead of as a KeyError on the error payload below.
response.raise_for_status()

ref_points = response.json()["data"]
ref_points

[{'id': '6559b9e69312d9000100677c',
  'type': 'ref_points',
  'attributes': {'analyzed': True,
   'branch': 'main',
   'commit_sha': '0bf2eb07d889b3b3bf286d71a7a010ec930bb71b',
   'created_at': '2023-11-19T07:31:50.796Z',
   'ref': 'refs/heads/main'},
  'relationships': {'snapshot': {'data': {'id': '6559b9e6678a9d00010464ab',
     'type': 'snapshots'}}}}]

## GitHub API Data Extraction


In [10]:
# Headers sent with every GitHub API request.
github_request_headers = {"Accept": "application/vnd.github.v3+json"}

# Only attach credentials when a token is configured: with the variable unset
# the header would carry the literal text "token None", which is not a valid credential.
if github_token:
    github_request_headers["Authorization"] = f"token {github_token}"

In [13]:
# Fetch the commits of the repository. No pagination parameters are sent, so
# only the first page returned by the API is retrieved.
response = requests.get(
    url="https://api.github.com/repos/abi/screenshot-to-code/commits",
    headers=github_request_headers,
    timeout=30,  # never let the notebook hang on an unresponsive server
)
# An error response is a JSON object, not a list; fail here with the HTTP
# status instead of with an obscure error while sorting.
response.raise_for_status()

# Sort the commits by author date, oldest first. sorted() builds a new list,
# so re-running the cell never depends on a previously mutated value.
commits = sorted(
    response.json(),
    key=lambda commit: iso_time_to_unix_epoch(commit["commit"]["author"]["date"]),
)
commits[:2]

[{'sha': 'ebedef2bef4454868e719b3fd1056b7ec03b9d5d',
  'node_id': 'C_kwDOKtchNdoAKGViZWRlZjJiZWY0NDU0ODY4ZTcxOWIzZmQxMDU2YjdlYzAzYjlkNWQ',
  'commit': {'author': {'name': 'Abi Raja',
    'email': 'abimanyuraja@gmail.com',
    'date': '2023-11-16T17:26:29Z'},
   'committer': {'name': 'GitHub',
    'email': 'noreply@github.com',
    'date': '2023-11-16T17:26:29Z'},
   'message': 'Update README.md',
   'tree': {'sha': '1b68d45e1d3b93390a9be4ad1a6c6854921f01d2',
    'url': 'https://api.github.com/repos/abi/screenshot-to-code/git/trees/1b68d45e1d3b93390a9be4ad1a6c6854921f01d2'},
   'url': 'https://api.github.com/repos/abi/screenshot-to-code/git/commits/ebedef2bef4454868e719b3fd1056b7ec03b9d5d',
   'comment_count': 0,
   'verification': {'verified': True,
    'reason': 'valid',
    'signature': '-----BEGIN PGP SIGNATURE-----\n\nwsBcBAABCAAQBQJlVlDFCRBK7hj4Ov3rIwAAUrQIAExq8HE6I2vPNK+dCxKsYTpv\nZcQi9b3Qjt2/0Lb2Nmt8lyxVkpzwvZObnYUnjLot9qtKsmQPCts+LPkigh2FW7yT\nUjTvuvPyMNB7ZBvLpqGnujVZ0tu6cT2LHu

In [14]:
# Get the releases for the repository

response = requests.get(
    url="https://api.github.com/repos/youzan/vant/releases",
    headers=github_request_headers,
    timeout=30,  # never let the notebook hang on an unresponsive server
)
# An error response is a JSON object, not a list; fail here with the HTTP
# status instead of with an obscure error while sorting.
response.raise_for_status()

# A release without a publish date cannot be placed on the timeline and would
# crash the timestamp conversion, so it is left out.
published_releases = [
    release for release in response.json() if release["published_at"] is not None
]

# Sort the releases by published date, oldest first
published_releases.sort(
    key=lambda release: iso_time_to_unix_epoch(release["published_at"])
)

# Keep only the url, tag_name, target_commitish, id and published_at fields
release_fields = ("url", "tag_name", "target_commitish", "id", "published_at")
releases = [
    {field: release[field] for field in release_fields}
    for release in published_releases
]

tag_names_in_releases = [release["tag_name"] for release in releases]

releases[:2], tag_names_in_releases

([{'url': 'https://api.github.com/repos/youzan/vant/releases/94548765',
   'tag_name': 'v4.1.0',
   'target_commitish': 'main',
   'id': 94548765,
   'published_at': '2023-03-05T11:47:24Z'},
  {'url': 'https://api.github.com/repos/youzan/vant/releases/96106954',
   'tag_name': 'v4.1.1',
   'target_commitish': 'main',
   'id': 96106954,
   'published_at': '2023-03-19T12:44:26Z'}],
 ['v4.1.0',
  'v4.1.1',
  'v4.1.2',
  'v4.2.0',
  'v4.2.1',
  'v4.3.0',
  'v4.3.1',
  'v4.3.2',
  'v4.4.0',
  'v4.4.1',
  'v4.5.0',
  'v4.6.0',
  'v4.6.1',
  'v4.6.2',
  'v3.6.12',
  'v4.6.3',
  'v4.6.4-beta.2',
  'v4.6.4',
  'v4.6.5',
  'v4.6.6',
  'v4.6.7',
  'v4.6.8',
  'v2.13.0',
  'v4.7.0',
  'v4.7.1',
  'v2.13.1',
  'v4.7.2',
  'v2.13.2',
  'v4.7.3',
  'v4.8.0'])