# GitHub Pull Request Downloader
This notebook fetches all pull requests (open and closed) from a GitHub repo, together with their comments and timeline events, and saves them to MongoDB.

In [8]:
from dotenv import load_dotenv
import os
import sys

# Load environment variables from .env file
load_dotenv()

import json
import time
import requests
from pymongo import MongoClient
from loguru import logger

## Logging

In [9]:
# Drop any handlers that are already registered before adding the sinks below.
logger.remove()

# Console sink: INFO and above, rendered as "<time> | <message>".
logger.add(
    sys.stdout,
    format="<green>{time}</green> | <level>{message}</level>",
    level="INFO",
)

# File sink: DEBUG and above, with a 10 MB rotation setting.
logger.add(
    "logs/github_downloader_pr.log",
    rotation="10 MB",
    level="DEBUG",
)

4

## Token management

In [10]:
# Personal access token used to authenticate every GitHub API request.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")

# A missing or empty token makes the rest of the notebook pointless, so stop here.
if GITHUB_TOKEN is None or GITHUB_TOKEN == "":
    logger.error("GITHUB_TOKEN is not set in the environment variables.")
    sys.exit(1)

def get_headers():
    """Return a fresh dict of HTTP headers for authenticated GitHub API calls.

    A new dict is built on every call, so callers may modify the result
    (for example to override ``Accept``) without affecting later calls.
    """
    request_headers = {}
    request_headers["Authorization"] = "token {}".format(GITHUB_TOKEN)
    request_headers["Accept"] = "application/vnd.github.v3+json"
    return request_headers
    
def handle_rate_limit(headers):
    """Pause until the rate-limit window resets when few requests remain.

    Args:
        headers: Mapping of response headers. ``X-RateLimit-Remaining`` and
            ``X-RateLimit-Reset`` are read; when absent they default to
            1 and 0 respectively.

    Returns:
        None. Sleeps only when fewer than 10 requests remain and the reset
        time (plus a 5 second margin) is still in the future.
    """
    calls_left = int(headers.get("X-RateLimit-Remaining", 1))
    resets_at = int(headers.get("X-RateLimit-Reset", 0))

    # Plenty of quota left: nothing to do.
    if calls_left >= 10:
        return

    # Seconds until the reset timestamp, padded by 5 seconds.
    wait_seconds = resets_at - int(time.time()) + 5
    if wait_seconds <= 0:
        return

    logger.warning(f"⏳ Rate limit hit. Sleeping for {wait_seconds} seconds...")
    time.sleep(wait_seconds)

## MongoDB

In [11]:
# Connection string for MongoDB. It can be overridden through the MONGO_URI
# environment variable (for example in the .env file loaded at the top of the
# notebook); when unset, the local default instance is used.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017/")

client = MongoClient(MONGO_URI)
db = client["github_data"]
# One document per pull request: the PR payload plus its comments and timeline.
collection = db["pull_requests"]

In [12]:
def save_pr(pr_data):
    """Insert or overwrite the stored document for one pull request.

    Args:
        pr_data: Dict with at least ``pr_data["pull_request"]["number"]``;
            that number is the key used to find an existing document.
    """
    pr_number = pr_data["pull_request"]["number"]
    selector = {"pull_request.number": pr_number}
    # upsert=True: replace the matching document, or insert when none matches.
    collection.replace_one(selector, pr_data, upsert=True)

## GitHub APIs

In [13]:
# GitHub API Calls

def _fetch_all_pages(url, headers, timeout=30):
    """Collect the items from every page of a paginated GitHub list endpoint.

    Args:
        url: URL of the first page.
        headers: HTTP headers sent with every request.
        timeout: Per-request timeout in seconds.

    Returns:
        A list with the decoded JSON items of all pages, in response order.

    Raises:
        requests.HTTPError: If any page answers with a 4xx/5xx status, so an
            error payload is never returned as if it were data.
    """
    items = []
    # Ask for the largest page size on the first request only.
    params = {"per_page": 100}
    while url:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        handle_rate_limit(response.headers)
        items.extend(response.json())
        # requests parses the Link header; the "next" URL already carries the
        # full query string, so no extra params are sent from here on.
        url = response.links.get("next", {}).get("url")
        params = None
    return items


def get_comments(repo, number):
    """Return all issue comments of pull request ``number`` in ``repo``.

    Args:
        repo: Repository in ``owner/name`` form.
        number: Pull request (issue) number.

    Returns:
        A list of comment objects covering every page of results.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
    """
    url = f"https://api.github.com/repos/{repo}/issues/{number}/comments"
    return _fetch_all_pages(url, get_headers())


def get_timeline(repo, number):
    """Return all timeline events of pull request ``number`` in ``repo``.

    Args:
        repo: Repository in ``owner/name`` form.
        number: Pull request (issue) number.

    Returns:
        A list of timeline event objects covering every page of results.

    Raises:
        requests.HTTPError: If GitHub answers with an error status.
    """
    url = f"https://api.github.com/repos/{repo}/issues/{number}/timeline"
    headers = get_headers()
    # The timeline request overrides the default Accept header.
    headers["Accept"] = "application/vnd.github+json"
    return _fetch_all_pages(url, headers)

## Script

### Page -> cursor

In [21]:
GITHUB_API = "https://api.github.com"
REPO = "ballerina-platform/ballerina-lang"
# Load MongoDB collection
collection = db["pull_requests"]

# Starting URL; every following page URL is taken from the Link response header.
url = f"{GITHUB_API}/repos/{REPO}/pulls?state=all&per_page=100"

logger.info("🚀 Starting GitHub pull_request download (cursor-based)")

while url:
    logger.info(f"Fetching URL: {url}")
    response = requests.get(url, headers=get_headers(), timeout=30)
    handle_rate_limit(response.headers)

    if response.status_code != 200:
        # Log the raw body: an error response is not guaranteed to be JSON.
        logger.error(f"Pr fetch failed ({response.status_code}): {response.text}")
        break

    pulls = response.json()
    if not pulls:
        logger.info("✅ All prs fetched.")
        break

    for pull in pulls:
        pr_number = pull["number"]

        # Skip if already saved. Only _id is projected so the stored document
        # (with its comments and timeline) is not transferred for the check.
        if collection.find_one({"pull_request.number": pr_number}, {"_id": 1}):
            logger.debug(f"pull_request #{pr_number} already exists, skipping.")
            continue

        # Fetch details
        comments = get_comments(REPO, pr_number)
        timeline = get_timeline(REPO, pr_number)

        # Save to DB
        save_pr({
            "pull_request": pull,
            "comments": comments,
            "timeline": timeline
        })

        logger.success(f"pull_request #{pr_number} saved ✅")
        time.sleep(0.5)

    # Get the next cursor: requests parses the Link header into response.links.
    url = response.links.get("next", {}).get("url")
else:
    # Reached only when the loop ends without break, i.e. the last page had
    # no "next" link. The error branch above breaks and so skips this.
    logger.info("✅ All prs fetched.")


[32m2025-05-22T10:28:36.959544+0530[0m | [1m🚀 Starting GitHub pull_request download (cursor-based)[0m
[32m2025-05-22T10:28:36.959544+0530[0m | [1mFetching URL: https://api.github.com/repos/ballerina-platform/ballerina-lang/pulls?state=all&per_page=100[0m
[32m2025-05-22T10:28:36.959544+0530[0m | [1mFetching URL: https://api.github.com/repos/ballerina-platform/ballerina-lang/pulls?state=all&per_page=100[0m
[32m2025-05-22T10:28:38.289825+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/pulls?state=all&per_page=100&page=2[0m
[32m2025-05-22T10:28:38.289825+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/pulls?state=all&per_page=100&page=2[0m
[32m2025-05-22T10:28:39.719581+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/pulls?state=all&per_page=100&page=3[0m
[32m2025-05-22T10:28:39.719581+0530[0m | [1mFetching URL: https://api.github.com/repositories/73930305/pulls?state=all&per_page=100&page=3