In [None]:
import requests
from dotenv import load_dotenv
import os
import time 
import json
load_dotenv()
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

HEADERS = {"Authorization": f"Bearer {GITHUB_TOKEN}"}
API_URL = "https://api.github.com/graphql"

query = """
query ($after: String) {
  search(query: "stars:>1000", type: REPOSITORY, first: 50, after: $after) {
    repositoryCount
    pageInfo {
      endCursor
      hasNextPage
    }
    edges {
      node {
        ... on Repository {
          nameWithOwner
          description
          stargazerCount
          forkCount
          url
          updatedAt
          issues(states: OPEN) {
            totalCount
          }
          watchers {
            totalCount
          }
        }
      }
    }
  }
}
"""


In [None]:
return_data = []
cursor = None

while True:
    variables = {"after": cursor}
    response = requests.post(API_URL, json={"query": query, "variables": variables}, headers=HEADERS)
    data = response.json()
    print(data)

    # Optional: basic error handling
    if "errors" in data:
        print("GraphQL error:", data["errors"])
        break

    return_data.append(data)

    page_info = data['data']['search']['pageInfo']
    if not page_info["hasNextPage"]:
        break

    cursor = page_info["endCursor"]

    # Optional: sleep to respect rate limits
    time.sleep(1)

# Write all responses to a file
with open("github_response.json", "w") as f:
    json.dump(return_data, f, indent=2)

In [None]:
CURSOR_FILE = 'github_cursor.json'
DATA_FILE = 'github_response_2.json'

def save_state(cursor, data):
    with open(CURSOR_FILE, '+a') as f:
        json.dump({'cursor': cursor}, f)
    with open(DATA_FILE, 'w') as f:
        json.dump(data, f, indent=2)

def load_state():
    if os.path.exists(CURSOR_FILE) and os.path.exists(DATA_FILE):
        with open(CURSOR_FILE) as f:
            cursor = json.load(f).get('cursor')
        with open(DATA_FILE) as f:
            data = json.load(f)
        return cursor, data
    return None, []

def run_query():
    cursor, return_data = load_state()
    retry_delay = 1
    max_delay = 300

    while True:
        variables = {'after': cursor}
        try:
            response = requests.post(API_URL, json={'query': query, 'variables': variables}, headers=HEADERS)
        except requests.exceptions.RequestException as e:
            print(f'Connection error: {e}, retrying in {retry_delay}s...')
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        if response.status_code == 403:
            reset = int(response.headers.get('X-RateLimit-Reset', time.time() + 60))
            wait_time = max(reset - time.time(), 60)
            print(f'Rate limited! Sleeping for {wait_time:.2f} seconds...')
            time.sleep(wait_time)
            continue

        if response.status_code != 200:
            print(f'Error {response.status_code}: {response.text}, retrying in {retry_delay}s...')
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        data = response.json()
        if 'errors' in data:
            print('GraphQL error:', data['errors'])
            break

        return_data.append(data)
        save_state(cursor, return_data)

        page_info = data['data']['search']['pageInfo']
        if not page_info['hasNextPage']:
            print('No more pages to fetch.')
            break

        cursor = page_info['endCursor']
        retry_delay = 1  # reset delay after success
        time.sleep(1)    # be nice to GitHub

    print(f'Fetched {len(return_data)} pages of data.')

if __name__ == '__main__':
    run_query()


No more pages to fetch.
Fetched 20 pages of data.


In [26]:
""""
FOSS project name, description, stars, forks, issues, watchers
"""

import json
import csv

# Load the JSON data from the file
with open('github_response.json', 'r') as f:
    data = json.load(f)

# Prepare the list to hold the rows for the CSV
rows = []

# Loop through the search results and extract the relevant fields
for item in data:
    for repo in item['data']['search']['edges']:
        repo_data = repo['node']
        rows.append([
            repo_data['nameWithOwner'],
            repo_data['description'],
            repo_data['stargazerCount'],
            repo_data['forkCount'],
            repo_data['issues']['totalCount'],
            repo_data['watchers']['totalCount']
        ])

# Write the data to a CSV file
with open('github_repositories.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Write the header
    writer.writerow(["FOSS project name", "description", "stars", "forks", "issues", "watchers"])
    # Write the rows
    writer.writerows(rows)

print("Data successfully written to github_repositories.csv")


Data successfully written to github_repositories.csv


In [27]:
import os
import json
import time
import requests

API_URL = "https://api.github.com/graphql"
HEADERS = {
    "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"
}
OUTPUT_DIR = "github_data"
STEP = 1000

query_template = """
query($after: String, $range: String!) {
  search(query: $range, type: REPOSITORY, first: 50, after: $after) {
    repositoryCount
    pageInfo {
      endCursor
      hasNextPage
    }
    edges {
      node {
        ... on Repository {
          nameWithOwner
          description
          stargazerCount
          forkCount
          url
          updatedAt
          issues {
            totalCount
          }
          watchers {
            totalCount
          }
        }
      }
    }
  }
}
"""

def generate_star_ranges(start=1000, end=450000, step=1000):
    ranges = []
    for s in range(start, end, step):
        e = s + step - 1
        ranges.append(f"stars:{s}..{e}")
    ranges.append(f"stars:>={end}")
    return ranges

def save_state(star_range, cursor, data):
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    prefix = star_range.replace(":", "_").replace("..", "_to_").replace(">=", "gte")
    with open(os.path.join(OUTPUT_DIR, f"{prefix}_cursor.json"), "w") as f:
        json.dump({"cursor": cursor}, f)
    with open(os.path.join(OUTPUT_DIR, f"{prefix}_data.json"), "w") as f:
        json.dump(data, f, indent=2)

def load_state(star_range):
    prefix = star_range.replace(":", "_").replace("..", "_to_").replace(">=", "gte")
    cursor_path = os.path.join(OUTPUT_DIR, f"{prefix}_cursor.json")
    data_path = os.path.join(OUTPUT_DIR, f"{prefix}_data.json")
    if os.path.exists(cursor_path) and os.path.exists(data_path):
        with open(cursor_path) as f:
            cursor = json.load(f).get("cursor")
        with open(data_path) as f:
            data = json.load(f)
        return cursor, data
    return None, []

def fetch_star_range(star_range):
    cursor, all_data = load_state(star_range)
    retry_delay = 1
    max_delay = 300
    print(f"\n▶ Fetching range: {star_range}")

    while True:
        variables = {
            "after": cursor,
            "range": f"{star_range} sort:stars-desc"
        }
        try:
            res = requests.post(API_URL, json={"query": query_template, "variables": variables}, headers=HEADERS)
        except requests.exceptions.RequestException as e:
            print(f"❌ Network error: {e}. Retrying in {retry_delay}s...")
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        if res.status_code == 403:
            reset = int(res.headers.get("X-RateLimit-Reset", time.time() + 60))
            wait_time = max(reset - time.time(), 60)
            print(f"⏳ Rate limited. Sleeping for {wait_time:.1f}s...")
            time.sleep(wait_time)
            continue

        if res.status_code != 200:
            print(f"❌ Error {res.status_code}: {res.text}")
            time.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)
            continue

        data = res.json()
        if "errors" in data:
            print(f"❌ GraphQL error: {data['errors']}")
            break

        all_data.append(data)
        page_info = data["data"]["search"]["pageInfo"]
        cursor = page_info["endCursor"]
        save_state(star_range, cursor, all_data)

        if not page_info["hasNextPage"]:
            print(f"✅ Finished range: {star_range}. Pages fetched: {len(all_data)}")
            break

        retry_delay = 1  # Reset retry delay on success
        time.sleep(1)

def run_all():
    ranges = generate_star_ranges(1000, 450000, STEP)
    for r in ranges:
        fetch_star_range(r)

if __name__ == "__main__":
    run_all()



▶ Fetching range: stars:1000..1999
⏳ Rate limited. Sleeping for 60.0s...
✅ Finished range: stars:1000..1999. Pages fetched: 20

▶ Fetching range: stars:2000..2999
⏳ Rate limited. Sleeping for 60.0s...
⏳ Rate limited. Sleeping for 60.0s...
✅ Finished range: stars:2000..2999. Pages fetched: 20

▶ Fetching range: stars:3000..3999
⏳ Rate limited. Sleeping for 60.0s...
⏳ Rate limited. Sleeping for 60.0s...
✅ Finished range: stars:3000..3999. Pages fetched: 20

▶ Fetching range: stars:4000..4999
⏳ Rate limited. Sleeping for 60.0s...
⏳ Rate limited. Sleeping for 60.0s...
✅ Finished range: stars:4000..4999. Pages fetched: 20

▶ Fetching range: stars:5000..5999
✅ Finished range: stars:5000..5999. Pages fetched: 20

▶ Fetching range: stars:6000..6999
⏳ Rate limited. Sleeping for 60.0s...
⏳ Rate limited. Sleeping for 60.0s...
✅ Finished range: stars:6000..6999. Pages fetched: 20

▶ Fetching range: stars:7000..7999
⏳ Rate limited. Sleeping for 60.0s...
✅ Finished range: stars:7000..7999. Pages fe