In [1]:
#Step 1: Import important libraries  
import requests               # To send HTTP requests to the GitHub API
import pandas as pd           # To handle tabular data and CSV operations
import time                   # To add delays and manage rate limits
import os                     # To access environment variables
from tqdm import tqdm         # To show a progress bar while the script runs

In [2]:
# Step 2: Set up access to GitHub 
Token = os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN")  # Add your token here
headers = {
    "Authorization": f"Bearer {Token}",
    "Accept": "application/vnd.github+json"
}

In [3]:
#Step 3: Define search parameters 
query = "ehrQL+language:python+org:opensafely"  # Search for ehrQL in Python files within OpenSafely org
base_url = "https://api.github.com/search/code"
per_page = 100       # GitHub max results per page
max_pages = 5        # Adjust as needed
exclude_keywords = ['documentation', 'research-template', 'tutorials']  # Repos to skip

# Store unique repositories
unique_repos = set()

In [5]:
for page in range(1, max_pages + 1):
    print(f"Searching GitHub - Page {page}")
    url = f"{base_url}?q={query}&per_page={per_page}&page={page}"
    response = requests.get(url, headers=headers)
    
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        break
    
    results = response.json().get("items", [])
    if not results:
        print("No more results.")
        break

    for item in results:
        repo_name = item["repository"]["full_name"]
        if any(keyword in repo_name for keyword in exclude_keywords):
            print(f"Skipping repo: {repo_name}")
            continue
        # Add repo to set (only unique repos are kept)
        unique_repos.add(repo_name)
    
    time.sleep(1)

Searching GitHub - Page 1
Skipping repo: opensafely/documentation
Skipping repo: opensafely/documentation
Skipping repo: opensafely/research-template
Searching GitHub - Page 2
Searching GitHub - Page 3
Skipping repo: opensafely/documentation
Searching GitHub - Page 4
Searching GitHub - Page 5
No more results.


In [6]:
df = pd.DataFrame({"Repository": list(unique_repos)})
df.to_csv("opensafely_ehrql_repos.csv", index=False)
print(f"Found {len(unique_repos)} unique repositories containing ehrQL code.")
print("Results saved to opensafely_ehrql_repos.csv")

Found 65 unique repositories containing ehrQL code.
Results saved to opensafely_ehrql_repos.csv


In [7]:
df

Unnamed: 0,Repository
0,opensafely/ehrql-workflow
1,opensafely/opioids-covid-research
2,opensafely/opioids-data-curation
3,opensafely/prophy_effects_Sotro_Molnup
4,opensafely/waiting-list
...,...
60,opensafely/dataset-definition-testing-examples
61,opensafely/dfms-covid19-new
62,opensafely/ehrql-tutorial
63,opensafely/metformin_covid
