In [1]:
#Step 1: Import important libraries  
import requests               # To send HTTP requests to the GitHub API 
import pandas as pd           # To handle tabular data and CSV operations
import time                   # To add delays and manage rate limits
import logging                # To record what the script is doing at every given point (log errors, warnings, and process steps)
from tqdm import tqdm         # To show a progress bar while the script runs
from pathlib import Path      # To manage file paths
import os

In [2]:
# Step 2: Set up access to GitHub
#
# The personal access token is read from the GITHUB_PERSONAL_ACCESS_TOKEN
# environment variable so that it never has to be written into the notebook.

Token = os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN")  # None when the variable is not set
ORG = "opensafely"

HEADERS = {"Accept": "application/vnd.github+json"}
if Token:
    # Attach credentials only when a token is available: formatting a missing
    # token into the header would send the literal text "Bearer None".
    HEADERS["Authorization"] = f"Bearer {Token}"
else:
    print(
        "Warning: GITHUB_PERSONAL_ACCESS_TOKEN is not set; "
        "GitHub API requests will be sent without credentials."
    )

In [3]:
# Step 3: Search settings and result storage
#
# The GitHub code search is limited to Python files in the OpenSAFELY
# organisation that contain "ehrQL".

base_url = "https://api.github.com/search/code"
query = "+".join(["ehrQL", "language:python", "org:opensafely"])  # search qualifiers joined with "+"

# Pagination: max 100 results per page allowed by GitHub API; at most `max_pages` pages are requested.
max_pages = 5
per_page = 100

# Repositories whose lower-cased full name contains any of these substrings are skipped.
exclude_keywords = ["documentation", "research-template", "tutorials"]

all_results = []  # one dict per matching file
repo_creation_cache = {}  # repository name -> creation date, so repo info is fetched only once

In [4]:
# Step 4: Loop through the GitHub search pages
#   - Skip repositories whose name contains one of the excluded keywords defined earlier.
#   - Look up each repository's creation date (useful for sorting repos in the Streamlit app).
#   - Record one row per matching file in `all_results`.

REQUEST_TIMEOUT = 20  # seconds; without a timeout a stalled connection would hang the cell

for page in range(1, max_pages + 1):
    print(f"Searching GitHub - Page {page}")
    url = f"{base_url}?q={query}&per_page={per_page}&page={page}"
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Connection problems and timeouts end the search, like a non-200 reply does.
        print(f"Error fetching page {page}: {exc}")
        break

    if response.status_code != 200:
        print(f"Error fetching page {page}: {response.status_code}, message: {response.text}")
        break

    results = response.json().get("items", [])
    if not results:
        print("No more results.")
        break

    for item in results:
        repo_name = item["repository"]["full_name"].lower()

        # Skip repos with excluded keywords
        if any(keyword in repo_name for keyword in exclude_keywords):
            continue

        # Get repo creation date (cached so each repository is looked up only once)
        if repo_name in repo_creation_cache:
            created_on = repo_creation_cache[repo_name]
        else:
            repo_api_url = f"https://api.github.com/repos/{repo_name}"
            try:
                repo_resp = requests.get(repo_api_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                created_on = repo_resp.json().get("created_at") if repo_resp.status_code == 200 else None
            except (requests.RequestException, ValueError) as exc:
                # A failed lookup (network error or unreadable JSON) must not abort
                # the whole search; the creation date is simply left empty.
                print(f"Could not fetch repository info for {repo_name}: {exc}")
                created_on = None
            repo_creation_cache[repo_name] = created_on

        # File URLs: the raw URL is derived from the HTML URL by swapping the host and
        # dropping the "/blob/" segment. Only the first occurrence of each is replaced
        # so that a file path containing the same text is left intact.
        file_url = item["html_url"]
        raw_url = file_url.replace("github.com", "raw.githubusercontent.com", 1).replace("/blob/", "/", 1)

        # Append results in the desired column order
        all_results.append({
            "Repository": item["repository"]["full_name"],
            "Created_on": created_on,
            "File_Name": item["name"],
            "File_Path": item["path"],
            "File_URL": file_url,
            "Raw_URL": raw_url
        })

        time.sleep(1)  # pause after every recorded file before continuing


Searching GitHub - Page 1
Searching GitHub - Page 2
Searching GitHub - Page 3
Searching GitHub - Page 4
Searching GitHub - Page 5
No more results.


In [5]:
# Step 5: Build the results table and save it to CSV

RESULT_COLUMNS = ["Repository", "Created_on", "File_Name", "File_Path", "File_URL", "Raw_URL"]

# Naming the columns explicitly keeps the table's columns in place even when the
# search returned nothing, so later steps that rely on "Raw_URL" still find it.
df = pd.DataFrame(all_results, columns=RESULT_COLUMNS)

output_path = "opensafely_ehrql_code_files.csv"
df.to_csv(output_path, index=False)

print(f"Saved {len(all_results)} records to {output_path}")

Saved 301 records to opensafely_ehrql_code_files.csv


In [6]:
# Display the collected search results table.
df

Unnamed: 0,Repository,Created_on,File_Name,File_Path,File_URL,Raw_URL
0,opensafely/cis-pop-validation-ehrql,2022-09-29T15:19:10Z,codelists_ehrql.py,analysis/codelists_ehrql.py,https://github.com/opensafely/cis-pop-validati...,https://raw.githubusercontent.com/opensafely/c...
1,opensafely/dummy-data-workshop,2024-11-19T08:39:46Z,ehrql_dataset_definition.py,analysis/ehrql_dataset_definition.py,https://github.com/opensafely/dummy-data-works...,https://raw.githubusercontent.com/opensafely/d...
2,opensafely/early-inflammatory-arthritis,2022-02-22T10:53:22Z,codelists_ehrQL.py,analysis/codelists_ehrQL.py,https://github.com/opensafely/early-inflammato...,https://raw.githubusercontent.com/opensafely/e...
3,opensafely/asthma_sro,2022-02-07T12:00:24Z,ehrql_codelists_ast.py,analysis/ehrQL_code/ehrql_codelists_ast.py,https://github.com/opensafely/asthma_sro/blob/...,https://raw.githubusercontent.com/opensafely/a...
4,opensafely/end-of-life-carequality,2023-08-15T14:45:00Z,dataset_definition_ehrql_example.py,analysis/ehrql/dataset_definition_ehrql_exampl...,https://github.com/opensafely/end-of-life-care...,https://raw.githubusercontent.com/opensafely/e...
...,...,...,...,...,...,...
296,opensafely/covid-vaccine-history,2024-10-08T15:30:34Z,codelists.py,analysis/1-extract/codelists.py,https://github.com/opensafely/covid-vaccine-hi...,https://raw.githubusercontent.com/opensafely/c...
297,opensafely/asthma_sro,2022-02-07T12:00:24Z,ehrql_measures_test2.py,analysis/ehrQL_code/ehrql_measures_test2.py,https://github.com/opensafely/asthma_sro/blob/...,https://raw.githubusercontent.com/opensafely/a...
298,opensafely/pifu-data-exploration,2025-05-16T08:42:05Z,measures.py,analysis/measures.py,https://github.com/opensafely/pifu-data-explor...,https://raw.githubusercontent.com/opensafely/p...
299,opensafely/comparative-booster-ehrql-poc,2023-01-11T12:21:15Z,test_variables_lib.py,analysis/test_variables_lib.py,https://github.com/opensafely/comparative-boos...,https://raw.githubusercontent.com/opensafely/c...


In [7]:
# Step 6: Load features exactly as written
# One feature per line; surrounding whitespace is stripped and blank lines are dropped.
feature_file = Path("ehrQL_features.txt")
feature_lines = (line.strip() for line in feature_file.read_text(encoding="utf-8").splitlines())

# dict.fromkeys removes repeated entries while keeping first-seen order, so a
# feature listed twice in the file is not searched for (and counted) twice.
features_to_search = list(dict.fromkeys(line for line in feature_lines if line))

In [8]:
# Step 7: Set up logging, prepare the download directory and de-duplicate the DataFrame

# Log messages go to a file, each prefixed with a timestamp and its level.
_log_settings = {
    "filename": 'feature_search.log',
    "level": logging.INFO,
    "format": '%(asctime)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**_log_settings)

# Folder that receives the downloaded files; created on first use.
download_dir = Path("downloaded_files")
download_dir.mkdir(exist_ok=True)

# Keep a single row per raw file URL, then renumber the rows from 0.
df = df.drop_duplicates(subset=["Raw_URL"])
df = df.reset_index(drop=True)

In [9]:
# Step 6: Initialise counters and repo map
# Every feature starts with a total of zero occurrences ...
feature_counts = dict.fromkeys(features_to_search, 0)
# ... and with its own, initially empty, set of repository names.
feature_repo_map = {name: set() for name in features_to_search}

In [10]:
# Step 7: Download raw files
# Each file is stored as <download_dir>/file_<row index>.txt. A file that is
# already on disk is not fetched again.
# NOTE(review): the stored name depends only on the row index, so files left over from an
# earlier run with a different row order would be reused as-is — confirm the download
# folder is cleared (or still matches `df`) before re-running.
logging.info("Starting file downloads...")

for idx, raw_url in tqdm(df["Raw_URL"].items(), total=len(df), desc="Downloading files"):
    file_path = download_dir / f"file_{idx}.txt"
    if file_path.exists():
        continue

    try:
        resp = requests.get(raw_url, timeout=20)
        if resp.status_code != 200:
            logging.warning(f"Failed to fetch file (status {resp.status_code}) | URL: {raw_url}")
        else:
            file_path.write_text(resp.text, encoding="utf-8")
    except Exception as e:
        # Log the failure and move on to the next file.
        logging.error(f"Error fetching {raw_url} | Reason: {e}")

    # Short pause after every download attempt, successful or not.
    time.sleep(0.4)

logging.info("All files downloaded.")


Downloading files: 100%|██████████| 301/301 [4:23:31<00:00, 52.53s/it]      


In [11]:
# Step 8: Parse files and count feature occurrences (full scan)
#
# Matching is case-insensitive and uses only the part of each feature before its
# first "(". A feature is therefore counted wherever that name appears in a file,
# whether or not parentheses and arguments follow it.
#
# Possible extension (not implemented): after a match, skip any whitespace and, when
# "(" follows, treat everything up to the next ")" as one complete call before
# continuing the scan. This is the place to add handling of parentheses and arguments.

logging.info("Starting feature parsing...")

# The search term of a feature does not depend on the file, so derive it once.
search_terms = []
for feature in features_to_search:
    base = feature.casefold().split("(")[0]  # part before arguments
    if base:
        search_terms.append((feature, base))
    else:
        # An empty search term matches at every position, so a scan that advances
        # by the term's length would never move forward; skip such entries.
        logging.warning(f"Skipping feature with no name before '(': {feature!r}")

for idx, row in tqdm(df.iterrows(), total=len(df), desc="Parsing files"):
    repo_name = row["Repository"]
    file_path = download_dir / f"file_{idx}.txt"

    try:
        try:
            file_content = file_path.read_text(encoding="utf-8")
        except UnicodeDecodeError:
            # Fall back to a single-byte encoding for files that are not valid UTF-8.
            file_content = file_path.read_text(encoding="latin-1", errors="ignore")
    except Exception as e:
        # Covers files that were never downloaded; the row is skipped.
        logging.error(f"Error reading {file_path} | Reason: {e}")
        continue

    content_lower = file_content.casefold()

    for feature, base in search_terms:
        # str.count tallies non-overlapping occurrences, scanning left to right.
        count = content_lower.count(base)

        if count > 0:
            feature_counts[feature] += count
            feature_repo_map[feature].add(repo_name)


Parsing files: 100%|██████████| 301/301 [00:11<00:00, 26.44it/s]


In [12]:
# Step 9: Save counts
# Total number of occurrences per feature, in the order the features were loaded.
df_counts = pd.DataFrame(
    [{"Feature": feat, "Count": feature_counts[feat]} for feat in features_to_search],
    columns=["Feature", "Count"]
)
df_counts.to_csv("ehrQL_feature_counts.csv", index=False)

# One row per (feature, repository) pair, repositories in alphabetical order.
# feature_repo_map records repository names only, so no per-file URL is exported:
# there is no URL stored for a given feature/repository pair.
repo_rows = [
    {"Feature": feat, "Repository": repo}
    for feat, repos in feature_repo_map.items()
    for repo in sorted(repos)
]

# Each column is named separately so that the "Repository" values are kept in the output.
df_repos = pd.DataFrame(repo_rows, columns=["Feature", "Repository"])
df_repos.to_csv("feature-repo_map.csv", index=False)

logging.info("Feature counts and repo map exported.")
print("Files saved: ehrQL_feature_counts.csv, feature-repo_map.csv")


Files saved: ehrQL_feature_counts.csv, feature-repo_map.csv
