In [1]:
#Step 1: Import important libraries  

import requests               # To send HTTP requests to the GitHub API 
import pandas as pd           # To handle tabular data and CSV operations
import time                   # To add delays and manage rate limits
import re                     # To search text using patterns (regular expressions)
import logging                # To record what the script is doing at every given point (log errors, warnings, and process steps)
from tqdm import tqdm         # To show a progress bar while the script runs
from pathlib import Path      # To manage file paths 
import os

In [2]:
# Step 2: Set up access to GitHub and define search parameters 

Token = os.getenv("GITHUB_PERSONAL_ACCESS_TOKEN")  #This is where you'll add your token  

headers = {
    "Authorization": f"Bearer {Token}",
    "Accept": "application/vnd.github+json"
}

query = "ehrQL+language:python + org:opensafely"  # Search for code files containing ehrQL in Python within OpenSafely org  
base_url = "https://api.github.com/search/code"      

per_page = 100   # Pagination: max 100 results per page allowed by GitHub API
max_pages = 5  
all_results = []

exclude_keywords = ['documentation', 'research-template', 'tutorials'] # Define keywords to skip

#loop through each page 1-5 and search, sending the search request to GitHub each time.
#For each code file found, you collect:its name, where it is in the project, the project it's from the and the link to view it online 
#and the raw URL to download its contents directly. 
#Pause for 1 second after each page, so GitHub doesn't block the requests for being too fast.


for page in range(1, max_pages + 1): 
    print(f"Searching GitHub - Page {page}")
    
    url = f"{base_url}?q={query}&per_page={per_page}&page={page}"
    response = requests.get(url, headers=headers)

    results = response.json().get("items", [])
    
    if not results:
        print("No more results.")
        break

    for item in results:
        repo_name = item["repository"]["full_name"].lower()  # Lowercase for case-insensitive matching
        
        # Skip repos containing any of the excluded keywords
        
        if any(keyword in repo_name for keyword in exclude_keywords):
            print(f"Skipping repo: {repo_name}")
            continue  # Skip this repo and move to next item
        
        file_url = item["html_url"]
        raw_url = file_url.replace("github.com", "raw.githubusercontent.com").replace("/blob/", "/")

        all_results.append({
            "File_Name": item["name"],
            "File_Path": item["path"],
            "Repository": item["repository"]["full_name"],
            "File_URL": file_url,
            "Raw_URL": raw_url
        })

    time.sleep(1)  # Respect GitHub API rate limits

Searching GitHub - Page 1
Skipping repo: opensafely/documentation
Skipping repo: opensafely/documentation
Skipping repo: opensafely/research-template
Searching GitHub - Page 2
Searching GitHub - Page 3
Skipping repo: opensafely/documentation
Searching GitHub - Page 4
Searching GitHub - Page 5
No more results.


In [None]:
#Step 3: Export result to text file 

df = pd.DataFrame(all_results)

df.to_csv("ehrql_search_results.csv", index=False)

print("Results saved")

Results saved


In [None]:
df

Unnamed: 0,File_Name,File_Path,File_URL,Raw_URL,Repository
0,codelists_ehrql.py,analysis/codelists_ehrql.py,https://github.com/opensafely/cis-pop-validati...,https://raw.githubusercontent.com/opensafely/c...,opensafely/cis-pop-validation-ehrql
1,ehrql_dataset_definition.py,analysis/ehrql_dataset_definition.py,https://github.com/opensafely/dummy-data-works...,https://raw.githubusercontent.com/opensafely/d...,opensafely/dummy-data-workshop
2,codelists_ehrQL.py,analysis/codelists_ehrQL.py,https://github.com/opensafely/early-inflammato...,https://raw.githubusercontent.com/opensafely/e...,opensafely/early-inflammatory-arthritis
3,ehrql_codelists_ast.py,analysis/ehrQL_code/ehrql_codelists_ast.py,https://github.com/opensafely/asthma_sro/blob/...,https://raw.githubusercontent.com/opensafely/a...,opensafely/asthma_sro
4,dataset_definition_ehrql_example.py,analysis/ehrql/dataset_definition_ehrql_exampl...,https://github.com/opensafely/end-of-life-care...,https://raw.githubusercontent.com/opensafely/e...,opensafely/end-of-life-carequality
5,plot_measures.py,analysis/ehrQL/plot_measures.py,https://github.com/opensafely/pincer-measures/...,https://raw.githubusercontent.com/opensafely/p...,opensafely/pincer-measures
6,codelists_ehrQL.py,analysis/codelists_ehrQL.py,https://github.com/opensafely/disease_incidenc...,https://raw.githubusercontent.com/opensafely/d...,opensafely/disease_incidence
7,costs_2018.py,analysis/costs_2018.py,https://github.com/opensafely/ckd-healthcare-u...,https://raw.githubusercontent.com/opensafely/c...,opensafely/ckd-healthcare-use
8,generate_yaml.py,generate_yaml.py,https://github.com/opensafely/open-pathology/b...,https://raw.githubusercontent.com/opensafely/o...,opensafely/open-pathology
9,measures_carehome.py,analysis/measures_carehome.py,https://github.com/opensafely/opioids-covid-re...,https://raw.githubusercontent.com/opensafely/o...,opensafely/opioids-covid-research


In [None]:
# Step 4:  Read and without escaping special features from the text file

feature_file = Path("features.txt")

features_to_search = []

with feature_file.open(encoding="utf-8") as f:
    for line in f:
        line = line.strip()  # Remove whitespace/newline
        if line:             # Skip empty lines
            features_to_search.append(line)  # Keep special characters as-is
            

In [None]:
# Step 5: Set up logging to track the script’s progress, record any issues

logging.basicConfig(
    filename='feature_search.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)


In [None]:
# Step 6: Create download directory

download_dir = Path("downloaded_files")
download_dir.mkdir(exist_ok=True)

In [None]:
# Step 7: Prepare dictionary to count any number of  features

feature_counts = {feature: 0 for feature in features_to_search}

In [None]:
# Step 8: Download files

logging.info("Starting file downloads...")
for index, row in tqdm(df.iterrows(), total=len(df), desc="Downloading files"):
    raw_url = row['Raw_URL']
    filename = download_dir / f"file_{index}.txt"

    if filename.exists():
        continue

    try:
        response = requests.get(raw_url, timeout=10)
        if response.status_code == 200:
            filename.write_text(response.text, encoding="utf-8")
        else:
            logging.warning(f"Failed to fetch file. Status code: {response.status_code} | URL: {raw_url}")
    except Exception as e:
        logging.error(f"Error fetching {raw_url} | Reason: {e}")

    time.sleep(0.5)

logging.info("All files downloaded.")

Downloading files: 100%|█████████████████████████████████████████████████████████████| 302/302 [00:03<00:00, 79.23it/s]


In [None]:
# Step 9: Parse files for literal matches

logging.info("Starting feature parsing...")
for file_path in tqdm(download_dir.glob("*.txt"), desc="Parsing files"):
    try:
        file_content = file_path.read_text(encoding="utf-8")
        for feature in features_to_search:
            count = file_content.count(feature)  # Literal string match
            if count > 0:
                logging.info(f"Found {count} instances of '{feature}' in {file_path.name}")
            feature_counts[feature] += count

    except Exception as e:
        logging.error(f"Error reading {file_path} | Reason: {e}")

Parsing files: 302it [00:03, 78.45it/s]


In [None]:
# Step 10: Display results

print("Total Count of Each Feature:")
for feature, total in feature_counts.items():
    print(f"{feature}: {total}")

Total Count of Each Feature:
show(): 0
create_dataset(): 82
dataset(): 82
define_population(): 0
add_column(column_name, ehrql_query): 0
configure_dummy_data(population_size=10, legacy=False, timeout=60, additional_population_constraint=None): 0
PatientFrame(): 0
exists_for_patient(): 2182
count_for_patient(): 422
EventFrame(): 0
where(condition): 0
except_where(condition): 0
sort_by(*sort_values): 0
SortedEventFrame(): 0
first_for_patient(): 431
last_for_patient(): 369
BoolPatientSeries(): 0
is_null(): 317
is_not_null(): 344
when_null_then(other): 0
is_in(other): 0
is_not_in(other): 0
map_values(mapping, default=None): 0
as_int(): 2
BoolEventSeries(): 0
count_distinct_for_patient(): 90
StrPatientSeries(): 0
contains(other): 0
StrEventSeries(): 0
minimum_for_patient(): 338
maximum_for_patient(): 42
IntPatientSeries(): 0
as_float(): 0
IntEventSeries(): 0
sum_for_patient(): 39
mean_for_patient(): 2
FloatPatientSeries(): 0
FloatEventSeries(): 0
to_first_of_year(): 1
to_first_of_month(): 2

In [None]:
# Step 11: Export to CSV

df_features = pd.DataFrame(list(feature_counts.items()), columns=['Feature', 'Count'])

df_features.to_csv('feature_counts.csv', index=False)

logging.info("Feature counts exported to feature_counts.csv")

print("File Saved")

File Saved


In [None]:
#Next find a way to fine tune the script to run a substring search to capture functions that have customized queries.
#Goal will be to find a way to get the script to read ehrl fuctions wherever they appear. Irrespective of the argument 
#Build your streamlit dashboard 
#Build a poster of your project for the closing ceremoney so you can talk about it then.