# git repo crawl

该范例里，我们将爬取所有语言为“systemverilog”、且代码中包含关键词“property”的repo，结果将以 `<owner>#<repo>.zip` 的形式保存在 `data/repos` 文件夹下。

steps:

1. get all repos written in language 'systemverilog';
2. filter these repos with rule: keyword 'property' must appear in code;
3. download all repos;


# github API tutorial

* [官方文档](https://docs.github.com/en/rest?apiVersion=2022-11-28)

  - [search API](https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28)

  - [设置personal token](https://github.com/settings/tokens)

* [入门blog](https://blog.csdn.net/weixin_39132520/article/details/114925354)


# 注意事项：

* GitHub's search API returns at most 1000 items per query, so we need to split the whole query into chunks to retrieve items ranked beyond 1000.

* For authenticated requests, the search API allows up to 30 requests per minute; for unauthenticated requests, up to 10 requests per minute. 在code search里，实验来看，最快只能每分钟search 10次：这是因为 code search 端点有单独的限额（必须带token才能使用，且每分钟最多10次），并不代表我们的请求被当作 unauthenticated。

* 对小数据量的repo爬取，同步执行就ok了。如要涉及到更大数据量的爬取，需要分布式，可以参考[这篇博客](https://ask.hellobi.com/blog/lrysjtu/3859)

In [1]:
import time
import json
import requests

import os
from os import path as osp

import subprocess
import wget

In [2]:
from dotenv import load_dotenv
import os

# Read variables from a local .env file (if there is one) into the process
# environment. Without this call the import above has no effect and the
# token is only found when it is already exported in the shell.
load_dotenv()
_TOKEN = os.getenv('git_token')

# Authorization header sent with every raw `requests` call in this notebook.
HEADERS = {'Authorization': f'Bearer {_TOKEN}'}
# Base directory under which the data/ folder is created.
ROOT = '.'

def test_api():
    """Smoke-test the token by requesting the /octocat endpoint.

    The response body is decoded as UTF-8 and printed; nothing is returned.
    """
    response = requests.get('https://api.github.com/octocat', headers=HEADERS)
    body = response.content.decode('utf8')
    print(body)

# Run the smoke test once; the response body is printed below the cell.
test_api()


               MMM.           .MMM
               MMMMMMMMMMMMMMMMMMM
               MMMMMMMMMMMMMMMMMMM      ____________________________
              MMMMMMMMMMMMMMMMMMMMM    |                            |
             MMMMMMMMMMMMMMMMMMMMMMM   | Practicality beats purity. |
            MMMMMMMMMMMMMMMMMMMMMMMM   |_   ________________________|
            MMMM::- -:::::::- -::MMMM    |/
             MM~:~ 00~:::::~ 00~:~MM
        .. MMMMM::.00:::+:::.00::MMMMM ..
              .MM::::: ._. :::::MM.
                 MMMM;:::::;MMMM
          -MM        MMMMMMM
          ^  M+     MMMMMMMMM
              MMMMMMM MM MM MM
                   MM MM MM MM
                   MM MM MM MM
                .~~MM~MM~MM~MM~~.
             ~~~~MM:~MM~~~MM~:MM~~~~
            ~~~~~~==~==~~~==~==~~~~~~
             ~~~~~~==~==~==~==~~~~~~
                 :~==~==~==~==~~



In [12]:
# Client from the `github` package, created with the same token as HEADERS.
from github import Github
g = Github(_TOKEN)

# Last expression of the cell: displays the raw data of the "core" rate limit.
g.get_rate_limit().core.raw_data

{'limit': 5000, 'used': 24, 'remaining': 4976, 'reset': 1712736884}

In [223]:
def init_dirs(root='.'):
    """Create the ``data`` and ``data/repos`` folders below ``root``.

    Folders that already exist are left as they are.
    """
    for sub_dir in ('data', 'data/repos'):
        os.makedirs(osp.join(root, sub_dir), exist_ok=True)

# Make sure the output folders exist under ROOT before any file is written.
init_dirs(ROOT)

In [227]:
def get_url_repo(query_kw, page_no, stars_ub, item_per_page, lang):
    """Build (and print) the repository-search URL for one result page.

    The query combines the keyword with a language qualifier and an upper
    bound on stars; results are ordered by stars, descending.

    Args:
        query_kw: search keyword.
        page_no: page number to request.
        stars_ub: inclusive upper bound for the ``stars`` qualifier.
        item_per_page: number of items per page.
        lang: value of the ``language`` qualifier.

    Returns:
        The request URL as a string.
    """
    # %20 and + are both legal connectives between query words and qualifiers
    qualifiers = f"{query_kw}+language:{lang}+stars:<={stars_ub}"
    paging = f"page={page_no}&per_page={item_per_page}"
    request_url = (
        "https://api.github.com/search/repositories"
        f"?q={qualifiers}&sort=stars&order=desc&{paging}"
    )
    print(f"request url: {request_url}")
    return request_url


def get_results(url, max_retry=5, req_every_k_secs=10, timeout=30):
    """Fetch a search URL and return the decoded JSON, retrying on failure.

    An attempt counts as failed when the request raises a ``requests``
    exception, the status code is not 200, or the payload is flagged with
    ``incomplete_results``.

    Args:
        url: full search API URL.
        max_retry: maximum number of attempts.
        req_every_k_secs: seconds to wait between two attempts.
        timeout: seconds after which a single request is abandoned.

    Returns:
        The parsed JSON dict, or None when every attempt failed. Callers
        must check for None before indexing the result.
    """
    def _get(url):
        try:
            # Without a timeout a stalled connection would block forever.
            resp = requests.get(url, headers=HEADERS, timeout=timeout)
        except requests.RequestException as err:
            # A transient network error is treated like any failed attempt
            # so that it is retried instead of aborting the whole crawl.
            print(f'request error: {err}')
            return None
        if resp.status_code == 200:
            return json.loads(resp.content.decode())
        return None

    for attempts_left in range(max_retry, 0, -1):
        results = _get(url)
        if (results is not None) and (not results.get('incomplete_results', False)):
            return results
        print(f'failed to get result from url! retry count: {attempts_left}')
        # No point in waiting once there is no attempt left.
        if attempts_left > 1:
            time.sleep(req_every_k_secs)
    return None

In [218]:
def sample(results, k=1):
    """Print the item count of a search response and its first ``k`` items.

    For each shown item the full name, homepage and star count are printed.
    """
    items = results['items']
    print(f"item count in this return: {len(items)}")
    line = "item name: {}, home_page: {}, stars: {}"
    for entry in items[:k]:
        print(line.format(entry['full_name'], entry['homepage'], entry['stargazers_count']))

def save(results, out_path):
    """Write the ``items`` list of a search response to ``out_path`` as JSON.

    The file is overwritten and the JSON is indented by four spaces.
    """
    with open(out_path, mode='w') as sink:
        items = results['items']
        json.dump(items, sink, indent=4)

def div_and_ceil(a, b):
    """Return ``a / b`` rounded up to the next whole number."""
    quotient, remainder = divmod(a, b)
    # Any non-zero remainder means one more (partially filled) unit is needed.
    return quotient + (remainder != 0)

def get_lowest_star(items):
    """Return the star count of the last item in ``items``.

    With results ordered by stars descending, the last item is the one
    with the fewest stars.
    """
    last_item = items[-1]
    return last_item['stargazers_count']

def process(url, out_path):
    """Fetch ``url`` and, on success, store the returned items at ``out_path``.

    Returns:
        The fetched result, or None when fetching failed (nothing is
        written in that case).
    """
    fetched = get_results(url)
    if fetched is None:
        return None
    save(fetched, out_path)
    return fetched

def get_outpath(page_no):
    """Return the JSON file path under ROOT used for result page ``page_no``."""
    relative_path = f'data/res_{page_no}.json'
    return osp.join(ROOT, relative_path)

In [228]:
# Search keyword; the same string is reused as the language qualifier.
repo_query_kw = 'systemverilog'
lang = repo_query_kw

# Seconds to sleep between requests.
req_every_k_secs = 6
# Print a sample item every this many pages.
sample_every_k_pages = 5
stars_ub = 100000   # The upper bound of stars for query
item_per_page = 100
# A single search query only serves this many top-ranked items.
max_retrievable_item_limit = 1000
# Number of pages that can be requested before the query has to be changed.
max_pages_per_search = div_and_ceil(max_retrievable_item_limit, item_per_page)

# Fetch and save page 1 to learn the total number of hits.
results = process(get_url_repo(repo_query_kw, 1, stars_ub, item_per_page, lang), out_path=get_outpath(1))
# NOTE: when the first request fails, total_count and total_pages stay
# undefined, so the cells that follow cannot run.
if results is not None:
    total_count = results['total_count']
    print(f"total queried count: {total_count}")
    sample(results, k=1)
    total_pages = div_and_ceil(total_count, item_per_page)
    time.sleep(req_every_k_secs)

request url: https://api.github.com/search/repositories?q=systemverilog+language:systemverilog+stars:<=100000&sort=stars&order=desc&page=1&per_page=100
total queried count: 1455
item count in this return: 100
item name: hdl-util/hdmi, home_page: https://purisa.me/blog/hdmi-released/, stars: 897


In [229]:
# Fetch the remaining result pages. Page 1 was already fetched above.
start = 2
for real_page_no in range(start, total_pages+1):
    print(f'On page: {real_page_no}')
    # A single query only serves max_pages_per_search pages, so the global
    # page number is mapped onto 1..max_pages_per_search.
    page_no = real_page_no % max_pages_per_search
    is_last_page_of_window = page_no == 0
    if is_last_page_of_window:
        page_no = max_pages_per_search
    results = process(get_url_repo(repo_query_kw, page_no, stars_ub, item_per_page, lang), get_outpath(real_page_no))
    # process() returns None when every retry failed, and a page may come
    # back empty; neither can be inspected for stars, so stop cleanly.
    if results is None or not results['items']:
        print(f'no usable result for page {real_page_no}, stop crawling')
        break
    lowest_star = get_lowest_star(results['items'])
    if is_last_page_of_window:
        # The query is exhausted: lower the star bound so that the next
        # pages continue below the repos seen so far.
        stars_ub = lowest_star
    elif lowest_star == stars_ub:
        # if star is not reducing, it means we have reached the long flatten part of the star curve
        # there is no way to ensure a stable order of repos at the moment:
        # see https://github.com/sourcegraph/sourcegraph/issues/2562
        # we have two choices thereafter:
        # 1. stop here
        # 2. continue:
        #   a. randomly sample at this star_count till no more new repo id arrives
        #   b. skip this star_count to the next one
        # here we choose to stop
        break
    if real_page_no % sample_every_k_pages == 0:
        sample(results, k=1)
    time.sleep(3)

On page: 2
request url: https://api.github.com/search/repositories?q=systemverilog+language:systemverilog+stars:<=100000&sort=stars&order=desc&page=2&per_page=100
On page: 3
request url: https://api.github.com/search/repositories?q=systemverilog+language:systemverilog+stars:<=100000&sort=stars&order=desc&page=3&per_page=100
On page: 4
request url: https://api.github.com/search/repositories?q=systemverilog+language:systemverilog+stars:<=100000&sort=stars&order=desc&page=4&per_page=100
On page: 5
request url: https://api.github.com/search/repositories?q=systemverilog+language:systemverilog+stars:<=100000&sort=stars&order=desc&page=5&per_page=100
item count in this return: 100
item name: quangphan2405/SystemVerilog, home_page: , stars: 1
On page: 6
request url: https://api.github.com/search/repositories?q=systemverilog+language:systemverilog+stars:<=100000&sort=stars&order=desc&page=6&per_page=100
On page: 7
request url: https://api.github.com/search/repositories?q=systemverilog+language:

# filter repos by searching code inside repo with keywords 'property'

In [230]:
def load_and_save(data, out_path):
    """Append ``data`` to the JSON list stored at ``out_path``.

    When the file does not exist yet it is created with ``data`` as its
    content. The file is rewritten with four-space indentation.
    """
    accumulated = []
    already_there = osp.exists(out_path)
    if already_there:
        with open(out_path, 'r') as src:
            accumulated = json.load(src)
    accumulated.extend(data)
    with open(out_path, 'w') as dst:
        json.dump(accumulated, fp=dst, indent=4)

def get_url_code(query_kw, lang, repo):
    """Build the code-search URL for ``query_kw`` inside one repository.

    The search is limited to file contents, to language ``lang`` and to
    the repository ``repo`` (given as ``owner/name``).
    """
    query = '{}+in:file+language:{}+repo:{}'.format(query_kw, lang, repo)
    return "https://api.github.com/search/code?q=" + query

In [231]:
# Keyword that must be found in a repository's code for it to be kept.
code_query_kw = 'property'

# Collect the files under ./data whose name contains 'res_' (the per-page
# repository search results written earlier).
filenames = os.listdir('./data')
files = [osp.join('./data', f) for f in filenames if 'res_' in f]

In [232]:
# For every saved result page, keep the repositories whose code contains
# the keyword, then append them to data/filtered_repos.json.
for file_path in files:
    print(f'On file: {file_path}')
    with open(file_path, 'r') as fp:
        items = json.load(fp)

    filtered_repos = []
    # Items with an index below startfrom are skipped (0 = process all).
    startfrom = 0
    for i, item in enumerate(items):
        if i < startfrom:
            continue
        repo = item['html_url'].replace('https://github.com/', '')
        print(f'On No.{i}: {repo}')
        results = get_results(get_url_code(code_query_kw, lang, repo))
        if results is None:
            # get_results gives up with None after its retries; skip this
            # repo instead of crashing and losing the repos collected so far.
            print(f'code search failed for {repo}, skipping')
        elif results['total_count'] != 0:
            filtered_repos.append({
                'repo_id': item['id'],
                'repo_name': item['full_name'],
                'download_url': item['clone_url'],
            })
        # Stay below the request rate allowed for code search.
        time.sleep(req_every_k_secs)
    load_and_save(filtered_repos, osp.join(ROOT, 'data/filtered_repos.json'))

On file: ./data\res_1.json
On No.0: hdl-util/hdmi
On No.1: pulp-platform/axi
On No.2: trivialmips/nontrivial-mips
On No.3: pulp-platform/common_cells
On No.4: VerificationExcellence/SystemVerilogReference
On No.5: WangXuan95/USTC-RVSoC
On No.6: taichi-ishitani/tvip-axi
On No.7: tymonx/logic
On No.8: chipsalliance/sv-tests
On No.9: veripool/verilog-mode
failed to get result from url! retry count: 5
On No.10: openhwgroup/core-v-mcu
On No.11: subbdue/systemverilog.io
On No.12: taichi-ishitani/tnoc
On No.13: ijor/fx68k
On No.14: GodelMachine/AHB2
On No.15: loykylewong/FPGA-Application-Development-and-Simulation
On No.16: trivialmips/TrivialMIPS
On No.17: karthisugumar/CSE240D-Hierarchical_Mesh_NoC-Eyeriss_v2
On No.18: StefanSredojevic/Deep-Neural-Network-Hardware-Accelerator
failed to get result from url! retry count: 5
On No.19: agalimberti/NoCRouter
On No.20: VerificationExcellence/SystemVerilogAssertions
On No.21: unixb0y/SystemVerilogSHA256
On No.22: amiq-consulting/svaunit
On No.23: i

# download all repos by git clone or wget

In [233]:
def clone_repos(repos):
    """Run ``git clone`` for the ``download_url`` of every entry in ``repos``.

    No target directory is passed to git. A failing clone raises
    ``subprocess.CalledProcessError`` and stops the loop.
    """
    for entry in repos:
        subprocess.check_call(['git', 'clone', entry['download_url']])

def download_repos(repos, out_dir, branches=('master', 'main')):
    """Download each repository as a zip archive of one of its branches.

    Args:
        repos: list of dicts with the keys ``repo_name`` (``owner/name``)
            and ``download_url`` (the clone URL).
        out_dir: folder receiving the archives; each file is named after
            the repository with ``/`` replaced by ``#``.
        branches: branch names to try in order; the first archive that
            downloads successfully is kept.

    A repository for which no branch can be downloaded is reported and
    skipped, so one failure does not stop the remaining downloads.
    """
    for i, repo in enumerate(repos):
        print(f"On No.{i} repo: {repo}")
        url = repo['download_url']
        # Only strip the suffix when it really is '.git'.
        base_url = url[:-len('.git')] if url.endswith('.git') else url
        file_name = repo['repo_name'].replace("/", "#") + '.zip'
        out_path = osp.join(out_dir, file_name)
        last_error = None
        for branch in branches:
            download_url = f"{base_url}/archive/refs/heads/{branch}.zip"
            try:
                wget.download(download_url, out=out_path)
            except Exception as e:
                # Deliberately broad (best effort): remember the error and
                # fall through to the next candidate branch.
                last_error = e
            else:
                break
        else:
            # Reached only when no branch could be downloaded.
            print("Could not download file {}".format(file_name))
            print(last_error)

In [234]:
# Load the repositories that passed the code-search filter.
with open("./data/filtered_repos.json", mode='r') as fp:
    repos = json.load(fp)

In [235]:
# Download every filtered repository as a zip archive into data/repos.
download_repos(repos, out_dir=osp.join(ROOT, 'data/repos'))

On No.0 repo: {'repo_id': 129229399, 'repo_name': 'pulp-platform/axi', 'download_url': 'https://github.com/pulp-platform/axi.git'}
On No.1 repo: {'repo_id': 171665435, 'repo_name': 'trivialmips/nontrivial-mips', 'download_url': 'https://github.com/trivialmips/nontrivial-mips.git'}
On No.2 repo: {'repo_id': 118764818, 'repo_name': 'pulp-platform/common_cells', 'download_url': 'https://github.com/pulp-platform/common_cells.git'}
On No.3 repo: {'repo_id': 19295560, 'repo_name': 'VerificationExcellence/SystemVerilogReference', 'download_url': 'https://github.com/VerificationExcellence/SystemVerilogReference.git'}
On No.4 repo: {'repo_id': 108771045, 'repo_name': 'tymonx/logic', 'download_url': 'https://github.com/tymonx/logic.git'}
On No.5 repo: {'repo_id': 201299456, 'repo_name': 'chipsalliance/sv-tests', 'download_url': 'https://github.com/chipsalliance/sv-tests.git'}
On No.6 repo: {'repo_id': 8392762, 'repo_name': 'veripool/verilog-mode', 'download_url': 'https://github.com/veripool/ver