In [None]:
import os
import time

import requests


github_base_url = "https://api.github.com"
# The personal access token is read from the GITHUB_TOKEN environment variable
# so that it is never saved inside the notebook. Export it before starting the
# kernel; when it is unset the token is the empty string.
github_auth_token = os.environ.get("GITHUB_TOKEN", "")
# Only attach an Authorization header when there is a token to put in it; a
# blank "Bearer " value carries no credential.
headers = {"Authorization": f"Bearer {github_auth_token}"} if github_auth_token else {}

In [44]:
def is_rate_limit_exceeded(response):
    """Tell which header, if any, describes a rate limit on ``response``.

    Args:
        response: Object exposing ``status_code`` and a ``headers`` mapping
            (for example a ``requests.Response``).

    Returns:
        ``"Retry-After"`` when the response has status 403 or 429 and carries
        a ``Retry-After`` header; ``"X-RateLimit-Reset"`` when it has status
        403 or 429 and ``X-RateLimit-Remaining`` equals ``"0"``; ``None`` in
        every other case.
    """
    # GitHub signals rate limiting with either 403 or 429.
    if response.status_code not in (403, 429):
        return None

    if "Retry-After" in response.headers:
        return "Retry-After"

    # Use .get(): a 403 that is not a rate limit may omit the header, and
    # indexing it directly would raise KeyError instead of returning None.
    if response.headers.get("X-RateLimit-Remaining") == "0":
        return "X-RateLimit-Reset"

    return None

In [54]:
def handle_rate_limit(response, limit_header):
    """Sleep until the rate limit described by ``limit_header`` should be over.

    Args:
        response: Rate-limited response; ``response.headers[limit_header]``
            must hold an integer string.
        limit_header: ``"Retry-After"`` (the value is used directly as a
            number of seconds) or ``"X-RateLimit-Reset"`` (the value is
            treated as an epoch timestamp and the current time is subtracted).

    Raises:
        KeyError: If ``limit_header`` is absent from the response headers.
        ValueError: If the header value is not an integer string.
    """
    retry_after = int(response.headers[limit_header])
    if limit_header == "X-RateLimit-Reset":
        retry_after -= int(time.time())

    # A reset time that has already passed yields a negative delta, which
    # time.sleep() rejects. Clamp to zero, then keep the one-second margin.
    retry_after = max(retry_after, 0) + 1
    print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
    time.sleep(retry_after)

In [None]:
def get_repos(base_url, auth_token):
    """Collect every repository record returned by a fixed GitHub search.

    The search query is hard-coded below. Result pages are requested one
    after another for as long as the response's ``Link`` header advertises a
    ``rel="next"`` page. Rate-limited responses are waited out and the same
    page is requested again.

    Args:
        base_url: Root URL of the GitHub API.
        auth_token: Personal access token sent as a Bearer credential. When
            empty, no Authorization header is sent.

    Returns:
        List of the ``"items"`` entries from every page, in response order.

    Raises:
        Exception: On any non-200 response that is not a rate limit.
    """
    query = "language:python+language:javascript+stars:>=100+topic:machine-learning+topic:data-analysis"
    start_url = f"{base_url}/search/repositories?q={query}"
    # Build the headers from the argument so the token passed by the caller is
    # actually the one used, rather than a module-level global.
    request_headers = {"Authorization": f"Bearer {auth_token}"} if auth_token else {}
    page = 1
    url = start_url
    repos = []

    while True:
        response = requests.get(url, headers=request_headers)

        if response.status_code == 200:
            repos.extend(response.json()["items"])
            link_header = response.headers.get("Link")
            if not (link_header and 'rel="next"' in link_header):
                break
            page += 1
            url = f"{start_url}&page={page}"
            continue

        limit_header = is_rate_limit_exceeded(response)
        if not limit_header:
            raise Exception(f"Error: {response.status_code}")
        # Wait, then loop around to retry the same URL.
        handle_rate_limit(response, limit_header)

    return repos

# Run the search once and keep the raw API records for the cells that follow.
repos = get_repos(base_url=github_base_url, auth_token=github_auth_token)

print(repos)

[{'id': 843222, 'node_id': 'MDEwOlJlcG9zaXRvcnk4NDMyMjI=', 'name': 'scikit-learn', 'full_name': 'scikit-learn/scikit-learn', 'private': False, 'owner': {'login': 'scikit-learn', 'id': 365630, 'node_id': 'MDEyOk9yZ2FuaXphdGlvbjM2NTYzMA==', 'avatar_url': 'https://avatars.githubusercontent.com/u/365630?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/scikit-learn', 'html_url': 'https://github.com/scikit-learn', 'followers_url': 'https://api.github.com/users/scikit-learn/followers', 'following_url': 'https://api.github.com/users/scikit-learn/following{/other_user}', 'gists_url': 'https://api.github.com/users/scikit-learn/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/scikit-learn/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/scikit-learn/subscriptions', 'organizations_url': 'https://api.github.com/users/scikit-learn/orgs', 'repos_url': 'https://api.github.com/users/scikit-learn/repos', 'events_url': 'https://api.github.com/users/sc

In [47]:
def get_repo_names(repos):
    """Return the ``"full_name"`` value of each repository record, in order.

    Args:
        repos: Iterable of mappings that each contain a ``"full_name"`` key.

    Returns:
        A new list with one name per input record.
    """
    return [repo["full_name"] for repo in repos]

# Reduce the search results to their full names.
repo_names = get_repo_names(repos)

print(repo_names)

# Start one empty record per repository, keyed by its full name.
repo_data = {repo_name: {} for repo_name in repo_names}

['scikit-learn/scikit-learn', 'streamlit/streamlit', 'gradio-app/gradio', 'AMAI-GmbH/AI-Expert-Roadmap', 'ydataai/ydata-profiling', 'Yorko/mlcourse.ai', 'yzhao062/pyod', 'scikit-learn-contrib/imbalanced-learn', 'nidhaloff/igel', 'fbdesignpro/sweetviz', 'secretflow/secretflow', 'NannyML/nannyml', 'rio-labs/rio', 'hi-primus/optimus', 'sepandhaghighi/pycm', 'capitalone/DataProfiler', 'alan-turing-institute/CleverCSV', 'skrub-data/skrub', 'uxlfoundation/scikit-learn-intelex', 'nfstream/nfstream', 'ipython-books/cookbook-2nd', 'greppo-io/greppo', 'anish-lakkapragada/SeaLion', 'matousc89/padasip', 'lucasxlu/LagouJob', 'DeDolphins/DataHorse', 'GZTipDM/TipDM', 'ayush1997/visualize_ML', 'amalshehu/langchain-js-realworld', 'leanderme/sytora', 'ing-bank/probatus', 'FesonX/cn-text-classifier', 'acerbilab/pyvbmc', 'apachecn/ds100-textbook-zh', 'nla-group/classix', 'sissa-data-science/DADApy']


In [62]:
def get_commits(base_url, auth_token, repo_name):
    """Fetch the commit listing of one repository.

    One request is made per attempt and pagination is not followed, so the
    result is whatever the API returns for that single call.

    Args:
        base_url: Root URL of the GitHub API.
        auth_token: Personal access token sent as a Bearer credential. When
            empty, no Authorization header is sent.
        repo_name: Repository in ``owner/name`` form.

    Returns:
        The decoded JSON body of the successful response.

    Raises:
        Exception: On any non-200 response that is not a rate limit.
    """
    url = f"{base_url}/repos/{repo_name}/commits"
    # Use the token handed in by the caller rather than a module-level global.
    request_headers = {"Authorization": f"Bearer {auth_token}"} if auth_token else {}

    # Retry in a loop so the result obtained after a rate-limit wait is
    # actually returned to the caller.
    while True:
        response = requests.get(url, headers=request_headers)
        if response.status_code == 200:
            return response.json()

        limit_header = is_rate_limit_exceeded(response)
        if not limit_header:
            raise Exception(f"Error: {response.status_code}")
        handle_rate_limit(response, limit_header)

In [49]:
def get_readme(base_url, auth_token, repo_name):
    """Fetch the README record of one repository.

    Args:
        base_url: Root URL of the GitHub API.
        auth_token: Personal access token sent as a Bearer credential. When
            empty, no Authorization header is sent.
        repo_name: Repository in ``owner/name`` form.

    Returns:
        The decoded JSON body on success, or ``None`` when the API answers
        404.

    Raises:
        Exception: On any other response that is not a rate limit.
    """
    url = f"{base_url}/repos/{repo_name}/readme"
    # Use the token handed in by the caller rather than a module-level global.
    request_headers = {"Authorization": f"Bearer {auth_token}"} if auth_token else {}

    # Retry in a loop so the result obtained after a rate-limit wait is
    # actually returned to the caller.
    while True:
        response = requests.get(url, headers=request_headers)
        if response.status_code == 200:
            return response.json()
        if response.status_code == 404:
            return None

        limit_header = is_rate_limit_exceeded(response)
        if not limit_header:
            raise Exception(f"Error: {response.status_code}")
        handle_rate_limit(response, limit_header)

In [50]:
def get_license(base_url, auth_token, repo_name):
    """Fetch the license record of one repository.

    Args:
        base_url: Root URL of the GitHub API.
        auth_token: Personal access token sent as a Bearer credential. When
            empty, no Authorization header is sent.
        repo_name: Repository in ``owner/name`` form.

    Returns:
        The decoded JSON body on success, or ``None`` when the API answers
        404.

    Raises:
        Exception: On any other response that is not a rate limit.
    """
    url = f"{base_url}/repos/{repo_name}/license"
    # Use the token handed in by the caller rather than a module-level global.
    request_headers = {"Authorization": f"Bearer {auth_token}"} if auth_token else {}

    # Retry in a loop so the result obtained after a rate-limit wait is
    # actually returned to the caller.
    while True:
        response = requests.get(url, headers=request_headers)
        if response.status_code == 200:
            return response.json()
        if response.status_code == 404:
            return None

        limit_header = is_rate_limit_exceeded(response)
        if not limit_header:
            raise Exception(f"Error: {response.status_code}")
        handle_rate_limit(response, limit_header)

In [63]:
# Fill in each repository's record with its commits, README and license.
for repo in repo_data:
    # Commits: keep only the message and the author's name, email and date.
    commits = get_commits(github_base_url, github_auth_token, repo)
    repo_data[repo]["commits"] = []
    for element in commits:
        commit = element["commit"]
        author = commit["author"]
        repo_data[repo]["commits"].append(
            {
                "message": commit["message"],
                "author_name": author["name"],
                "author_email": author["email"],
                "date": author["date"],
            }
        )

    # README: any falsy result is stored as None.
    readme = get_readme(github_base_url, github_auth_token, repo)
    repo_data[repo]["readme"] = readme or None

    # License: same convention as the README.
    license = get_license(github_base_url, github_auth_token, repo)
    repo_data[repo]["license"] = license or None

print(repo_data)

