# GitHub Issues Downloader
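This notebook fetches all issues (open and closed, plus their comments and timeline events) from a GitHub repo with token rotation and saves them as JSON documents.
The cells below store those documents in a local MongoDB instance; a small JSON checkpoint file tracks download progress.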

In [1]:
from dotenv import load_dotenv
import os
import sys

# Load environment variables from .env file
load_dotenv()

import json
import time
import requests
from pymongo import MongoClient
from loguru import logger

## Logging

In [None]:
# Drop the handlers registered so far; re-running this cell would otherwise
# stack a second console/file handler on top of the existing ones.
logger.remove()
# Console sink: INFO and above, timestamp plus message.
logger.add(sys.stdout, level="INFO", format="<green>{time}</green> | <level>{message}</level>")
# File sink: DEBUG and above, rotated at 1 MB.
# The trailing semicolon keeps Jupyter from echoing the returned handler id
# as the cell's output.
logger.add("logs/github_downloader.log", level="DEBUG", rotation="1 MB");

2

## Token management

In [3]:
# GITHUB_TOKENS holds one or more comma-separated tokens, e.g. "tokenA,tokenB".
# "".split(",") returns [""], which is a truthy list, so blank entries are
# filtered out (and surrounding whitespace stripped); otherwise the assert
# below would never fire when the variable is unset or empty.
TOKENS = [tok.strip() for tok in os.getenv("GITHUB_TOKENS", "").split(",") if tok.strip()]
assert TOKENS, "❌ Set GITHUB_TOKENS environment variable with your tokens."
# Index into TOKENS of the token that get_next_token() hands out next.
token_index = 0

def get_next_token():
    """Return the token at the current rotation position and advance it.

    Reads the module-level TOKENS list and updates the module-level
    token_index so that successive calls walk through TOKENS and wrap
    around to the first entry after the last one.
    """
    global token_index
    position = token_index
    selected = TOKENS[position]
    # Advance with wrap-around so the next call picks the following token.
    token_index = (position + 1) % len(TOKENS)
    return selected

def get_headers():
    """Build the HTTP headers for one GitHub API request.

    Every call takes a fresh token from get_next_token(), so consecutive
    requests are authorized with consecutive tokens.
    """
    authorization = f"token {get_next_token()}"
    request_headers = dict(Authorization=authorization)
    request_headers["Accept"] = "application/vnd.github.v3+json"
    return request_headers

def handle_rate_limit(headers):
    """Pause until the rate-limit window resets when few calls remain.

    Args:
        headers: Mapping of response headers. "X-RateLimit-Remaining"
            (default 1) and "X-RateLimit-Reset" (default 0, epoch seconds)
            are read from it.

    Sleeps only when fewer than 10 calls remain and the reset time, plus a
    5 second margin, is still in the future.
    """
    calls_left = int(headers.get("X-RateLimit-Remaining", 1))
    resets_at = int(headers.get("X-RateLimit-Reset", 0))

    if calls_left >= 10:
        return

    sleep_time = resets_at - int(time.time()) + 5
    if sleep_time <= 0:
        return

    logger.warning(f"⏳ Rate limit hit. Sleeping for {sleep_time} seconds...")
    time.sleep(sleep_time)

## MongoDB

In [4]:
# Connection string comes from the environment (MONGODB_URI, e.g. set in the
# .env file) and falls back to the local default instance when it is not set.
client = MongoClient(os.getenv("MONGODB_URI", "mongodb://localhost:27017/"))
db = client["github_data"]
# One document per issue: {"issue": ..., "comments": ..., "timeline": ...}.
issues_col = db["issues"]

def save_issue(issue_data):
    """Insert or overwrite the stored document for one issue.

    Args:
        issue_data: Dict whose "issue" entry carries the issue's "number".
            The whole dict replaces any stored document with the same
            issue number; if none exists a new one is inserted.
    """
    number = issue_data["issue"]["number"]
    selector = {"issue.number": number}
    issues_col.replace_one(selector, issue_data, upsert=True)


## Checkpoint

In [5]:
CHECKPOINT_FILE = "checkpoint.json"

def save_checkpoint(issue_number):
    """Persist the number of the most recently saved issue.

    The JSON is first written to a sibling "<CHECKPOINT_FILE>.tmp" file and
    then moved over CHECKPOINT_FILE with os.replace, so an interruption in
    the middle of a write cannot leave a truncated checkpoint behind.

    Args:
        issue_number: Value stored under the "last_issue" key.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, "w") as f:
        json.dump({"last_issue": issue_number}, f)
    os.replace(tmp_path, CHECKPOINT_FILE)

def load_checkpoint():
    """Return the issue number stored in CHECKPOINT_FILE, or 0.

    Returns:
        The value under "last_issue". 0 is returned when the file does not
        exist, when the key is absent, or when the file does not contain a
        JSON object (for example an empty or truncated file); the last
        case is logged as a warning.
    """
    try:
        with open(CHECKPOINT_FILE) as f:
            state = json.load(f)
    except FileNotFoundError:
        # No checkpoint yet: first run.
        return 0
    except json.JSONDecodeError:
        state = None

    if not isinstance(state, dict):
        logger.warning(f"Checkpoint file {CHECKPOINT_FILE} is unreadable; starting from 0.")
        return 0
    return state.get("last_issue", 0)

## GitHub APIs

In [None]:
# GitHub API Calls
def get_issues(repo, page=1, timeout=30):
    """Fetch one page of a repository's issues (open and closed).

    Args:
        repo: Repository as "owner/name".
        page: Page number sent as the "page" query parameter.
        timeout: Seconds passed to requests.get so a stalled connection
            raises instead of hanging the notebook.

    Returns:
        The decoded JSON body on HTTP 200. On any other status the error is
        logged and [] is returned, so an empty result can mean either
        "no more issues" or "request failed".
    """
    url = f"https://api.github.com/repos/{repo}/issues?state=all&per_page=100&page={page}"
    response = requests.get(url, headers=get_headers(), timeout=timeout)
    handle_rate_limit(response.headers)
    if response.status_code != 200:
        # Error bodies are not guaranteed to be JSON (e.g. proxy/gateway
        # pages); fall back to the raw text rather than raising here.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        logger.error(f"Issue fetch failed: {detail}")
        return []
    return response.json()

def get_comments(repo, number, timeout=30):
    """Fetch every comment on one issue, following pagination.

    A single request returns only one page of comments (GitHub's default
    page size is 30), so this asks for 100 per page and keeps following the
    rel="next" link until there is none.

    Args:
        repo: Repository as "owner/name".
        number: Issue number.
        timeout: Seconds passed to requests.get for each page.

    Returns:
        A list with the comments of all pages fetched. If a page comes back
        with a non-200 status the error is logged and the comments gathered
        so far are returned, so an error payload is never stored as data.
    """
    url = f"https://api.github.com/repos/{repo}/issues/{number}/comments?per_page=100"
    comments = []
    while url:
        response = requests.get(url, headers=get_headers(), timeout=timeout)
        handle_rate_limit(response.headers)
        if response.status_code != 200:
            logger.error(
                f"Comment fetch failed for issue #{number}: {response.status_code} {response.text}"
            )
            break
        comments.extend(response.json())
        # requests parses the Link header into response.links.
        url = response.links.get("next", {}).get("url")
    return comments

def get_timeline(repo, number, timeout=30):
    """Fetch every timeline event of one issue, following pagination.

    A single request returns only one page of events (GitHub's default page
    size is 30), so this asks for 100 per page and keeps following the
    rel="next" link until there is none.

    Args:
        repo: Repository as "owner/name".
        number: Issue number.
        timeout: Seconds passed to requests.get for each page.

    Returns:
        A list with the events of all pages fetched. If a page comes back
        with a non-200 status the error is logged and the events gathered
        so far are returned, so an error payload is never stored as data.
    """
    url = f"https://api.github.com/repos/{repo}/issues/{number}/timeline?per_page=100"
    events = []
    while url:
        headers = get_headers()
        # This endpoint is requested with a different Accept value than the
        # one get_headers() sets.
        headers["Accept"] = "application/vnd.github+json"
        response = requests.get(url, headers=headers, timeout=timeout)
        handle_rate_limit(response.headers)
        if response.status_code != 200:
            logger.error(
                f"Timeline fetch failed for issue #{number}: {response.status_code} {response.text}"
            )
            break
        events.extend(response.json())
        # requests parses the Link header into response.links.
        url = response.links.get("next", {}).get("url")
    return events

## Script

In [8]:
# Page-based download loop with checkpoint resume.
#
# NOTE(review): the recorded output of this cell shows the request for page 341
# being rejected with HTTP 422 ("Pagination with the page parameter is not
# supported for large datasets, please use cursor based pagination").
# get_issues() returns [] for any non-200 response, so the loop then logged
# "All issues fetched" although it had stopped on an error.
REPO = "ballerina-platform/ballerina-lang" 
start_from = load_checkpoint()
# Resume heuristic: treats the last saved issue number as a position in the
# listing at 100 items per page.
# NOTE(review): pages also contain pull requests (skipped below), and this
# presumably only lines up if the API returns items in ascending number
# order — confirm against the endpoint's default sort.
page = (start_from // 100) + 1
logger.info(f"🚀 Starting from issue #{start_from}, page {page}")

while True:
    issues = get_issues(REPO, page=page)
    # An empty list ends the run; it is returned both when a page has no items
    # and when the request failed.
    if not issues:
        logger.info("✅ All issues fetched.")
        break

    for issue in issues:
        if "pull_request" in issue:
            continue  # skip PRs
        issue_number = issue["number"]
        # Skip everything at or below the checkpointed issue number.
        if issue_number <= start_from:
            continue

        comments = get_comments(REPO, issue_number)
        timeline = get_timeline(REPO, issue_number)

        save_issue({
            "issue": issue,
            "comments": comments,
            "timeline": timeline
        })

        # Checkpoint only after the document has been written to MongoDB.
        save_checkpoint(issue_number)
        logger.success(f"Issue #{issue_number} saved ✅")
        # Fixed pause between issues, on top of handle_rate_limit().
        time.sleep(0.5)

    page += 1

[32m2025-05-15T12:22:00.410361+0530[0m | [1m🚀 Starting from issue #34009, page 341[0m


[32m2025-05-15T12:22:01.055633+0530[0m | [31m[1mIssue fetch failed: {'message': 'Pagination with the page parameter is not supported for large datasets, please use cursor based pagination (after/before)', 'documentation_url': 'https://docs.github.com/rest/issues/issues#list-repository-issues', 'status': '422'}[0m
[32m2025-05-15T12:22:01.056604+0530[0m | [1m✅ All issues fetched.[0m


### Page-based → cursor-based pagination

In [13]:
GITHUB_API = "https://api.github.com"

# Same MongoDB collection that save_issue() writes to; queried here to detect
# issues stored by a previous run.
collection = db["issues"]

# Only the first URL is built by hand. Every later URL is taken verbatim from
# the rel="next" entry of the Link header, which carries the cursor.
url = f"{GITHUB_API}/repos/{REPO}/issues?state=all&per_page=100"

logger.info("🚀 Starting GitHub issue download (cursor-based)")

while url:
    logger.info(f"Fetching URL: {url}")
    # Build headers per request so the listing calls rotate tokens and send the
    # same Accept header as the other API helpers, instead of pinning one token.
    headers = get_headers()
    response = requests.get(url, headers=headers, timeout=30)
    handle_rate_limit(response.headers)

    if response.status_code != 200:
        # Use the raw body: error responses are not guaranteed to be JSON.
        logger.error(f"Issue fetch failed: {response.status_code} {response.text}")
        break

    issues = response.json()
    if not issues:
        logger.info("✅ All issues fetched.")
        break

    for issue in issues:
        if "pull_request" in issue:
            continue  # Skip PRs

        issue_id = issue["id"]
        issue_number = issue["number"]

        # Skip if already saved, which makes the cell safe to re-run after an
        # interruption without refetching comments and timelines.
        if collection.find_one({"issue.id": issue_id}):
            logger.debug(f"Issue #{issue_number} already exists, skipping.")
            continue

        # Fetch details
        comments = get_comments(REPO, issue_number)
        timeline = get_timeline(REPO, issue_number)

        # Save to DB
        save_issue({
            "issue": issue,
            "comments": comments,
            "timeline": timeline
        })

        logger.success(f"Issue #{issue_number} saved ✅")
        time.sleep(0.5)

    # requests parses the Link header into response.links; no "next" entry
    # means this was the last page.
    next_url = response.links.get("next", {}).get("url")
    url = next_url
else:
    # Reached only when the loop ended without a break, i.e. the Link header
    # ran out of "next" pages.
    logger.info("✅ All issues fetched.")


[32m2025-05-15T19:01:45.313616+0530[0m | [1m🚀 Starting GitHub issue download (cursor-based)[0m
[32m2025-05-15T19:01:45.314622+0530[0m | [1mFetching URL: https://api.github.com/repos/ballerina-platform/ballerina-lang/issues?state=all&per_page=100[0m


[32m2025-05-15T19:01:47.938321+0530[0m | [32m[1mIssue #44106 saved ✅[0m
[32m2025-05-15T19:01:48.515838+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/issues?state=all&per_page=100&after=Y3Vyc29yOnYyOpLPAAABljDAzUDOsk337w%3D%3D&page=2[0m
[32m2025-05-15T19:01:50.063294+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/issues?state=all&per_page=100&after=Y3Vyc29yOnYyOpLPAAABlX8DD_DOrTugoQ%3D%3D&page=3[0m
[32m2025-05-15T19:01:51.559326+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/issues?state=all&per_page=100&after=Y3Vyc29yOnYyOpLPAAABlNpzVLjOqPlSIA%3D%3D&page=4[0m
[32m2025-05-15T19:01:53.038392+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/issues?state=all&per_page=100&after=Y3Vyc29yOnYyOpLPAAABk6_uyJDOorHWyg%3D%3D&page=5[0m
[32m2025-05-15T19:01:54.508326+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/issues?state=all&per_page=100&after=Y3Vyc