In [23]:
import requests
from dotenv import load_dotenv
import os
import time 
import json
# Pull settings from a local .env file (python-dotenv) into the environment,
# then read the GitHub personal access token from it.
load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
if not GITHUB_TOKEN:
    # Without this check the header below silently becomes "Bearer None" and
    # the problem only shows up later as an authentication failure.
    print("Warning: GITHUB_TOKEN is not set; GitHub API requests will fail authentication.")

# Sent with every request to the GraphQL endpoint.
HEADERS = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
API_URL = "https://api.github.com/graphql"

# GraphQL document sent on every request.
# - Searches repositories matching "stars:>1000", 50 results per page.
# - `$after` is the pagination cursor: null for the first page, then the
#   previous page's `pageInfo.endCursor`.
# - For each repository it requests name, description, star/fork counts, URL,
#   last-update timestamp, open-issue count and watcher count.
query = """
query ($after: String) {
  search(query: "stars:>1000", type: REPOSITORY, first: 50, after: $after) {
    repositoryCount
    pageInfo {
      endCursor
      hasNextPage
    }
    edges {
      node {
        ... on Repository {
          nameWithOwner
          description
          stargazerCount
          forkCount
          url
          updatedAt
          issues(states: OPEN) {
            totalCount
          }
          watchers {
            totalCount
          }
        }
      }
    }
  }
}
"""


In [None]:
# One-shot (non-resumable) fetch: walk every page of the search query and
# collect the raw GraphQL responses in `return_data`.
return_data = []
cursor = None

while True:
    variables = {"after": cursor}
    response = requests.post(
        API_URL,
        json={"query": query, "variables": variables},
        headers=HEADERS,
        timeout=30,  # never hang the kernel on a stalled connection
    )
    # Surface HTTP-level failures (bad token, rate limit, server error) as a
    # clear HTTPError instead of a confusing KeyError / JSON error further down.
    response.raise_for_status()
    data = response.json()

    # Optional: basic error handling
    if "errors" in data:
        print("GraphQL error:", data["errors"])
        break

    return_data.append(data)

    search = data["data"]["search"]
    # One short progress line per page instead of dumping the whole payload.
    print(f"Fetched page {len(return_data)}: {len(search['edges'])} repositories")

    page_info = search["pageInfo"]
    if not page_info["hasNextPage"]:
        break

    cursor = page_info["endCursor"]

    # Optional: sleep to respect rate limits
    time.sleep(1)

# Write all responses to a file
with open("github_response.json", "w") as f:
    json.dump(return_data, f, indent=2)

In [25]:
# Checkpoint locations used by save_state() / load_state().
CURSOR_FILE = "github_cursor.json"   # JSON object holding the saved pagination cursor
DATA_FILE = "github_response.json"   # JSON list of the raw responses fetched so far

def _write_json_atomically(path, payload, **dump_kwargs):
    """Serialize `payload` to `path` without ever exposing a half-written file."""
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'w') as f:
        json.dump(payload, f, **dump_kwargs)
    # os.replace swaps the finished file into place in a single step, so a
    # reader sees either the previous checkpoint or the new one.
    os.replace(tmp_path, path)


def save_state(cursor, data):
    """Persist the pagination checkpoint to disk.

    Args:
        cursor: Pagination cursor to store in CURSOR_FILE (may be None).
        data: JSON-serializable list of responses to store in DATA_FILE.

    Each file is written to a temporary sibling and then renamed over the
    target, so an interruption mid-write cannot leave truncated JSON that
    would make load_state() fail. The data file is written before the cursor
    file: if the process dies between the two writes, the cursor on disk is
    the older one, so a page may be fetched again but is never skipped.
    """
    _write_json_atomically(DATA_FILE, data, indent=2)
    _write_json_atomically(CURSOR_FILE, {'cursor': cursor})

def load_state():
    """Read back the checkpoint written by save_state().

    Returns:
        A ``(cursor, data)`` tuple. When either CURSOR_FILE or DATA_FILE is
        missing, returns ``(None, [])``. Otherwise ``cursor`` is the
        ``'cursor'`` entry of CURSOR_FILE (None if the key is absent) and
        ``data`` is the parsed content of DATA_FILE.
    """
    checkpoint_present = os.path.exists(CURSOR_FILE) and os.path.exists(DATA_FILE)
    if not checkpoint_present:
        return None, []

    with open(CURSOR_FILE) as cursor_fh:
        saved_cursor = json.load(cursor_fh).get('cursor')
    with open(DATA_FILE) as data_fh:
        saved_pages = json.load(data_fh)
    return saved_cursor, saved_pages

def run_query():
    """Fetch every page of the search query, checkpointing after each page.

    Resumes from the checkpoint returned by load_state() when one exists.
    Connection errors and retryable HTTP errors are retried with exponential
    backoff (1s doubling up to 300s); HTTP 403 is treated as a rate limit and
    waits until the time given by the X-RateLimit-Reset header (at least 60s).
    A GraphQL-level error or a non-retryable client error stops the run.

    Returns:
        None. Progress is printed and results are persisted via save_state().
    """
    cursor, return_data = load_state()
    if return_data:
        # Resume from where the saved pages actually end rather than trusting
        # the cursor file alone: a cursor that lags behind the saved data
        # would make the last saved page be fetched and stored a second time.
        last_page_info = return_data[-1]['data']['search']['pageInfo']
        if not last_page_info['hasNextPage']:
            # The checkpoint is already complete; fetching again would only
            # append duplicate or empty pages.
            print('No more pages to fetch.')
            print(f'Fetched {len(return_data)} pages of data.')
            return
        cursor = last_page_info['endCursor']

    retry_delay = 1
    max_delay = 300
    request_timeout = 30  # seconds; a stalled connection becomes a retryable error

    while True:
        variables = {'after': cursor}
        try:
            response = requests.post(
                API_URL,
                json={'query': query, 'variables': variables},
                headers=HEADERS,
                timeout=request_timeout,
            )
        except requests.exceptions.RequestException as e:
            # Timeouts are RequestException subclasses, so they land here too.
            print(f'Connection error: {e}, retrying in {retry_delay}s...')
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        if response.status_code == 403:
            reset = int(response.headers.get('X-RateLimit-Reset', time.time() + 60))
            wait_time = max(reset - time.time(), 60)
            print(f'Rate limited! Sleeping for {wait_time:.2f} seconds...')
            time.sleep(wait_time)
            continue

        if 400 <= response.status_code < 500 and response.status_code not in (408, 429):
            # e.g. 401 (bad token) or 400 (malformed request): repeating the
            # identical request cannot succeed, so do not loop forever.
            print(f'Error {response.status_code}: {response.text}, not retrying.')
            break

        if response.status_code != 200:
            print(f'Error {response.status_code}: {response.text}, retrying in {retry_delay}s...')
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        data = response.json()
        if 'errors' in data:
            print('GraphQL error:', data['errors'])
            break

        page_info = data['data']['search']['pageInfo']
        return_data.append(data)

        # Advance the cursor BEFORE checkpointing so the saved cursor points
        # at the next page to fetch, not at the page that was just stored.
        cursor = page_info['endCursor']
        save_state(cursor, return_data)

        if not page_info['hasNextPage']:
            print('No more pages to fetch.')
            break

        retry_delay = 1  # reset delay after success
        time.sleep(1)    # be nice to GitHub

    print(f'Fetched {len(return_data)} pages of data.')

# Start the resumable fetch only when this code is executed directly
# (i.e. __name__ is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    run_query()


No more pages to fetch.
Fetched 20 pages of data.
