# GitHub Issues Downloader
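This notebook downloads every issue (open and closed) from a GitHub repository, together with each issue's comments and timeline events, rotating through several API tokens to spread the rate limit.
Each issue is upserted into a local MongoDB collection (`github_data.issues_cross`), so an interrupted run can be resumed without re-downloading issues that are already stored.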

In [1]:
from dotenv import load_dotenv
import os
import sys

# Load environment variables from .env file
load_dotenv()

import json
import time
import requests
from pymongo import MongoClient
from loguru import logger

## Logging

In [2]:
# Drop whatever handlers are currently registered so that re-running this
# cell does not stack duplicate sinks.
logger.remove()

# Console sink: INFO and above, timestamp in green followed by the message.
logger.add(
    sys.stdout,
    format="<green>{time}</green> | <level>{message}</level>",
    level="INFO",
)

# File sink: everything from DEBUG up, starting a new file every 10 MB.
logger.add(
    "logs/github_downloader_cross.log",
    rotation="10 MB",
    level="DEBUG",
)

2

## Token management

In [3]:
# Parse the comma-separated GITHUB_TOKENS variable into a clean list.
# NOTE: "".split(",") yields [""], which is a truthy list, so blank entries
# must be filtered out; otherwise the guard below can never fire when the
# variable is unset, and an empty string would be sent as a token.
TOKENS = [tok.strip() for tok in os.getenv("GITHUB_TOKENS", "").split(",") if tok.strip()]
assert TOKENS, "❌ Set GITHUB_TOKENS environment variable with your tokens."
# Rotation cursor: index into TOKENS of the token that is handed out next.
token_index = 0

def get_next_token():
    """Return the token under the rotation cursor and advance the cursor.

    Tokens are handed out round-robin: once the last entry of ``TOKENS`` has
    been returned, the cursor wraps back to the first one.

    Returns
    -------
    str
        The entry of ``TOKENS`` that ``token_index`` pointed at on entry.
    """
    global token_index
    position = token_index
    chosen = TOKENS[position]
    # Step forward, wrapping around at the end of the list.
    token_index = (position + 1) % len(TOKENS)
    return chosen

def get_headers():
    """Build the request headers for one GitHub API call.

    Every call pulls a fresh token from ``get_next_token()``, so consecutive
    requests are authenticated with consecutive tokens.

    Returns
    -------
    dict
        A new dict with the ``Authorization`` and ``Accept`` headers.
    """
    auth_value = f"token {get_next_token()}"
    request_headers = {"Authorization": auth_value}
    request_headers["Accept"] = "application/vnd.github.v3+json"
    return request_headers

def handle_rate_limit(headers):
    """Pause when the response headers say the API wants us to slow down.

    Two signals are honoured, and the longer of the two waits is used:

    * ``Retry-After`` - a number of seconds to wait before retrying.
    * ``X-RateLimit-Remaining`` / ``X-RateLimit-Reset`` - when fewer than 10
      requests remain, wait until the reset time (treated as Unix epoch
      seconds) plus a 5 second safety margin.

    Parameters
    ----------
    headers : Mapping
        Response headers; only ``.get`` is used. Missing headers fall back to
        values that result in no sleep.

    Returns
    -------
    None
    """
    # Retry-After may be missing or non-numeric; treat anything that is not an
    # integer number of seconds as "no wait requested".
    try:
        sleep_time = int(headers.get("Retry-After", 0))
    except (TypeError, ValueError):
        sleep_time = 0

    remaining = int(headers.get("X-RateLimit-Remaining", 1))
    reset_time = int(headers.get("X-RateLimit-Reset", 0))
    if remaining < 10:
        sleep_time = max(sleep_time, reset_time - int(time.time()) + 5)

    if sleep_time > 0:
        logger.warning(f"⏳ Rate limit hit. Sleeping for {sleep_time} seconds...")
        time.sleep(sleep_time)

## MongoDB

In [4]:
# MongoDB connection. The URI is read from the MONGO_URI environment variable
# (which may be defined in the .env file loaded at the top of the notebook)
# and falls back to the local default server when it is not set.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017/")
client = MongoClient(MONGO_URI)
db = client["github_data"]
# Collection that receives one document per downloaded issue.
issues_col = db["issues_cross"]

def save_issue(issue_data):
    """Insert or replace one downloaded issue document in ``issues_col``.

    Parameters
    ----------
    issue_data : dict
        Document to store. ``issue_data["issue"]["id"]`` is used as the
        upsert key.

    Raises
    ------
    KeyError
        If ``issue_data`` has no ``"issue"`` entry or that entry has no ``"id"``.
    """
    # Key on the issue id rather than its number: the download loop's
    # "already saved" check also looks documents up by ``issue.id``, and issue
    # numbers are only unique within a single repository, so keying on the
    # number could overwrite an issue that belongs to a different repo.
    issue_id = issue_data["issue"]["id"]
    issues_col.replace_one({"issue.id": issue_id}, issue_data, upsert=True)


## GitHub APIs

In [None]:
# GitHub API Calls
def _fetch_all_pages(url, accept=None, max_attempts=3):
    """Download every page of a GitHub list endpoint and return all items.

    Parameters
    ----------
    url : str
        URL of the first page.
    accept : str, optional
        Value that replaces the default ``Accept`` header when given.
    max_attempts : int
        How many times one page is requested before giving up when the API
        answers 403 or 429. Must be at least 1.

    Returns
    -------
    list
        The decoded JSON items of all pages, in the order they were received.

    Raises
    ------
    requests.HTTPError
        If a page still has an error status after the last attempt.
    """
    items = []
    # Ask for the largest page size on the first request only; the "next" URL
    # reported by the API already carries its own query string.
    params = {"per_page": 100}
    while url:
        for attempt in range(1, max_attempts + 1):
            headers = get_headers()  # pulls the next token on every call
            if accept is not None:
                headers["Accept"] = accept
            response = requests.get(url, headers=headers, params=params, timeout=30)
            handle_rate_limit(response.headers)
            if response.status_code not in (403, 429) or attempt == max_attempts:
                break
            logger.warning(
                f"Request to {url} returned {response.status_code}; "
                f"retrying with the next token (attempt {attempt}/{max_attempts})"
            )
        # Never hand an error payload back to the caller: it would be stored
        # as if it were the issue's real comments/timeline.
        response.raise_for_status()
        items.extend(response.json())
        # requests exposes the parsed Link header as ``response.links``.
        url = response.links.get("next", {}).get("url")
        params = None
    return items


def get_comments(repo, number):
    """Return all comments of issue ``number`` in ``repo`` (``"owner/name"``).

    All result pages are followed, so issues with more comments than fit on
    one page are returned completely.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    url = f"https://api.github.com/repos/{repo}/issues/{number}/comments"
    return _fetch_all_pages(url)


def get_timeline(repo, number):
    """Return all timeline events of issue ``number`` in ``repo``.

    Uses the ``application/vnd.github+json`` media type for the request and
    follows all result pages.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    url = f"https://api.github.com/repos/{repo}/issues/{number}/timeline"
    return _fetch_all_pages(url, accept="application/vnd.github+json")

## Script

### Pagination: page number -> cursor

In [None]:
GITHUB_API = "https://api.github.com"
REPO = "ballerina-platform/ballerina-lang"
# Same collection that save_issue() writes to; used here for the
# "already saved" check so an interrupted run can be resumed.
collection = db["issues_cross"]

# First page of the listing. Every following page is taken from the "next"
# relation of the Link header, which carries the cursor.
url = f"{GITHUB_API}/repos/{REPO}/issues?state=all&per_page=100"

logger.info("🚀 Starting GitHub issue download (cursor-based)")

while url:
    logger.info(f"Fetching URL: {url}")
    # Rotate tokens and respect the rate limit on the listing calls as well,
    # not only on the per-issue comment/timeline calls.
    headers = get_headers()
    response = requests.get(url, headers=headers, timeout=30)
    handle_rate_limit(response.headers)

    if response.status_code != 200:
        # Log the raw body: an error response is not guaranteed to be JSON.
        logger.error(f"Issue fetch failed: {response.status_code} {response.text}")
        break

    issues = response.json()
    if not issues:
        logger.info("✅ All issues fetched.")
        break

    for issue in issues:
        if "pull_request" in issue:
            continue  # Skip PRs

        issue_id = issue["id"]
        issue_number = issue["number"]

        # Skip if already saved
        if collection.find_one({"issue.id": issue_id}):
            logger.debug(f"Issue #{issue_number} already exists, skipping.")
            continue

        # Fetch details
        comments = get_comments(REPO, issue_number)
        timeline = get_timeline(REPO, issue_number)

        # Save to DB
        save_issue({
            "issue": issue,
            "comments": comments,
            "timeline": timeline
        })

        logger.success(f"Issue #{issue_number} saved ✅")
        time.sleep(0.5)  # small pause between issues

    # requests parses the Link header into ``response.links``; when there is
    # no "next" relation this becomes None and the loop ends.
    url = response.links.get("next", {}).get("url")
else:
    # Reached only when the loop ran out of pages (not after a ``break``).
    logger.info("✅ All issues fetched.")


[32m2025-05-21T07:03:50.984657+0530[0m | [1m🚀 Starting GitHub issue download (cursor-based)[0m
[32m2025-05-21T07:03:50.984657+0530[0m | [1mFetching URL: https://api.github.com/repos/ballerina-platform/ballerina-lang/issues?state=all&per_page=100[0m
[32m2025-05-21T07:03:52.293400+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/issues?state=all&per_page=100&after=Y3Vyc29yOnYyOpLPAAABlkJtK4jOsugSGA%3D%3D&page=2[0m
[32m2025-05-21T07:03:53.494386+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/issues?state=all&per_page=100&after=Y3Vyc29yOnYyOpLPAAABlY3aCwDOrcx93Q%3D%3D&page=3[0m
[32m2025-05-21T07:03:54.627504+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/issues?state=all&per_page=100&after=Y3Vyc29yOnYyOpLPAAABlPQmHLDOqY3omA%3D%3D&page=4[0m
[32m2025-05-21T07:03:55.895064+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/issues?state=all&per_page=100&after=Y3Vyc29yOnYyOpLPAAA