In [1]:
import requests
from time import sleep
import tenacity
import json
from pathlib import Path

In [None]:
@tenacity.retry(
    stop=tenacity.stop_after_attempt(5),
    wait=tenacity.wait_exponential(min=2, max=30),
)
def request_with_timeout(url: str, timeout: float = 10.0) -> dict:
    """GET ``url`` and return its decoded JSON body.

    The call is wrapped in a tenacity retry: any exception (including an HTTP
    error status raised by ``raise_for_status``) triggers another attempt, up
    to 5 attempts in total, with exponential back-off bounded to 2-30 seconds.

    Args:
        url: Fully formed URL to fetch.
        timeout: Seconds a single attempt may wait on the server before it is
            abandoned (passed straight to ``requests.get``).

    Returns:
        The parsed JSON payload of the response.

    Raises:
        tenacity.RetryError: If every attempt fails.
    """
    # Without an explicit timeout `requests` waits indefinitely on a stalled
    # connection, so the retry policy above would never get a chance to run.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


def search_github(keyword: str) -> list[dict]:
    """Page through the GitHub repository search API for a keyword.

    Pages of 100 results are fetched one after another (with a one second
    pause between requests) until every reachable result has been collected.
    The search API only serves the first 1000 results of a query, so at most
    1000 repositories are returned even when ``total_count`` is larger.

    Args:
        keyword: Search query. It is interpolated into the URL as-is, so
            characters with a meaning in URLs (``&``, ``#``, ...) must be
            percent-encoded by the caller.

    Returns:
        The ``items`` entries of every fetched page, in API order.
    """
    per_page = 100
    # Hard cap of the search API: asking for results beyond the first 1000
    # is answered with an HTTP error, which would discard everything
    # collected so far once the retries are exhausted.
    max_results = 1000

    repos = []
    page = 1
    while True:
        data = request_with_timeout(
            f"https://api.github.com/search/repositories?q={keyword}&page={page}&per_page={per_page}"
        )
        items = data['items']
        repos.extend(items)
        print(f"Retrieved {len(repos)} / {data['total_count']}")

        # Stop on a short (last) page, or as soon as everything the API is
        # willing to serve has been collected. The second condition also
        # avoids a pointless extra request when the number of matches is an
        # exact multiple of the page size.
        reachable = min(data['total_count'], max_results)
        if len(items) < per_page or len(repos) >= reachable:
            break
        page += 1
        sleep(1)  # stay polite towards the search rate limit
    return repos

In [None]:

from gh import crawl


In [None]:
# Run the crawler imported from the local `gh` module with both year bounds
# set to 2024 (presumably restricting the search to repositories created in
# that year - confirm against gh.crawl).
crawl(year_min=2024, year_max=2024)

{
  search(
    type: REPOSITORY
    query: """
    uw-madison
    created:2024-01-01..2024-12-31
    """
    first: 30
    after: "Y3Vyc29yOjk="
  ) {
    pageInfo {
      endCursor
      hasNextPage
    }
    repos: edges {
      repo: node {
        ... on Repository {
          url
          mainReadme: object(expression: "main:README.md") {
            ... on Blob {
              text
            }
          }
          masterReadme: object(expression: "master:README.md") {
            ... on Blob {
              text
            }
          }
          allIssues: issues {
            totalCount
          }
          openIssues: issues(states: OPEN) {
            totalCount
          }
          commitsCountMain: object(expression: "main") {
            ... on Commit {
              history {
                totalCount
              }
            }
          }
          commitsCountMaster: object(expression: "master") {
            ... on Commit {
              history {
                totalCount
              }
            }
          }
        }
      }
    }
  }
}

In [2]:
import requests

In [5]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment;
# GITHUB_TOKEN is later read from the environment with os.getenv.
load_dotenv()

True

In [22]:
# GraphQL query sent to the GitHub API: searches repositories matching
# "uw-madison" and asks for the first 3 hits. For each repository it requests
# the URL, the README.md text on both the `main` and `master` branches, the
# total and open issue counts, and the commit count on `main` and `master`.
# Both branch names are queried, each under its own alias, so a repository
# reports data under one alias and null under the other.
# `pageInfo` carries the cursor needed to request the following page.
graph_ql = """
{
  search(
    type: REPOSITORY
    query: "uw-madison"
    first: 3
  ) {
    pageInfo {
      endCursor
      hasNextPage
    }
    repos: edges {
      repo: node {
        ... on Repository {
          url
          mainReadme: object(expression: "main:README.md") {
            ... on Blob {
              text
            }
          }
          masterReadme: object(expression: "master:README.md") {
            ... on Blob {
              text
            }
          }
          allIssues: issues {
            totalCount
          }
          openIssues: issues(states: OPEN) {
            totalCount
          }
          commitsCountMain: object(expression: "main") {
            ... on Commit {
              history {
                totalCount
              }
            }
          }
          commitsCountMaster: object(expression: "master") {
            ... on Commit {
              history {
                totalCount
              }
            }
          }
        }
      }
    }
  }
}
"""


In [25]:
GH_GRAPHQL_URL = "https://api.github.com/graphql"

# Fail early with a clear message instead of sending the literal header
# "Bearer None" when the token is missing from the environment.
github_token = os.getenv('GITHUB_TOKEN')
if not github_token:
    raise RuntimeError("GITHUB_TOKEN is not set; export it or add it to the .env file")

headers = {
    "Authorization": f"Bearer {github_token}"
}

# POST the query as a JSON body under the "query" key. The timeout keeps the
# cell from hanging forever on a stalled connection, and raise_for_status
# surfaces HTTP-level failures (e.g. bad credentials) here rather than in a
# later cell that decodes the body.
response = requests.post(GH_GRAPHQL_URL, headers=headers, json={"query": graph_ql}, timeout=30)
response.raise_for_status()

In [26]:
# Decode the response body as JSON and display it as the cell output.
response.json()

{'data': {'search': {'pageInfo': {'endCursor': 'Y3Vyc29yOjM=',
    'hasNextPage': True},
   'repos': [{'repo': {'url': 'https://github.com/rasbt/stat453-deep-learning-ss21',
      'mainReadme': {'text': '# stat453-deep-learning-ss21\nSTAT 453: Intro to Deep Learning @ UW-Madison (Spring 2021)\n'},
      'masterReadme': None,
      'allIssues': {'totalCount': 4},
      'openIssues': {'totalCount': 3},
      'commitsCountMain': {'history': {'totalCount': 44}},
      'commitsCountMaster': None}},
    {'repo': {'url': 'https://github.com/rasbt/stat453-deep-learning-ss20',
      'mainReadme': None,
      'masterReadme': {'text': '# STAT 453: Introduction to Deep Learning and Generative Models\n\n**Course Website: http://pages.stat.wisc.edu/~sraschka/teaching/stat453-ss2020/**\n\n\n## Topics Summary (Planned)\n\nBelow is a list of the topics I am planning to cover. Note that while these topics are numerated by lectures, note that some lectures are longer or shorter than others. Also, we may 