This is a web-scraping tool for the Gene Expression Omnibus (GEO) database.

The built-in search tool on GEO is great, but this notebook helps you narrow results down by the number/type of columns in the phenotype table, letting you filter datasets more specifically.

In [None]:
# Loading in selenium, specifically for Virtual Machines
!apt-get install chromium-driver
!pip install selenium biopython

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from Bio import Entrez
import time
def web_driver():
    """Create and return a headless Chrome WebDriver.

    The browser is started with a fixed set of command-line switches
    (sandbox off, headless, GPU off, 1920x1200 window, no /dev/shm usage),
    which is the configuration this notebook uses on virtual machines.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it when done.
    """
    # Switches are applied in exactly this order.
    chrome_flags = (
        "--verbose",
        '--no-sandbox',
        '--headless',
        '--disable-gpu',
        "--window-size=1920,1200",
        '--disable-dev-shm-usage',
    )
    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options)
# Start a browser at module level. NOTE(review): parse_metadata() opens its own
# driver and the testing cell rebinds this name, so this instance does not appear
# to be used (or quit) anywhere in the cells shown — confirm whether it is needed.
driver = web_driver()

In [None]:
# searches geo for your specified data - query needs to be in format of a geo search
def search_geo(query, max_results, batch_size):
    """Collect GEO DataSets (``gds``) UIDs matching a search query.

    Results are requested page by page through ``Entrez.esearch``.

    Args:
        query: Search term, written in GEO/Entrez search syntax.
        max_results: Upper bound on the number of IDs returned.
        batch_size: Number of IDs requested per esearch call.

    Returns:
        A list with at most ``max_results`` IDs, in the order returned by
        the search. Shorter if the search has fewer matches.
    """
    all_geo_ids = []
    for start in range(0, max_results, batch_size):
        # Shrink the last page so the total never exceeds max_results
        # (e.g. max_results=5, batch_size=2 must not fetch a sixth ID).
        page_size = min(batch_size, max_results - start)
        handle = Entrez.esearch(db="gds", term=query, retmax=page_size, retstart=start)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even if the response cannot be parsed.
            handle.close()
        page_ids = record["IdList"]
        if not page_ids:
            # No more matches: stop instead of issuing further empty requests.
            break
        all_geo_ids.extend(page_ids)
        time.sleep(1/3) # Change to 1/10 if you have an NCBI account
    return all_geo_ids

# gets summary, title, and other info
def fetch_metadata(geo_id):
    """Download the GEO DataSets (``gds``) record for one ID.

    Args:
        geo_id: A GEO DataSets UID, as returned by ``search_geo``.

    Returns:
        The full response body as returned by the handle's ``read()``.
        parse_metadata() slices it as text — presumably a plain-text
        summary despite rettype="xml"; TODO confirm.
    """
    handle = Entrez.efetch(db="gds", id=geo_id, rettype="xml")
    try:
        return handle.read()
    finally:
        # Always release the connection, including when the read fails.
        handle.close()

# returns the important info
def parse_metadata(metadata, geo_id):
    """Extract the series title and count its phenotype columns.

    The title is sliced out of the ``gds`` record text. The column count is
    scraped from the GEO website: the series page is opened, the first link
    whose href contains 'GSM' is followed, and the lines of that page's
    'Characteristics' cell are counted.

    Args:
        metadata: Record text as returned by ``fetch_metadata``.
        geo_id: The GEO DataSets UID of the series. The first three
            characters are dropped to obtain the GSE number.

    Returns:
        ``{"Title": <str>, "Columns": <int>}``.

    Raises:
        Whatever Selenium raises when a page or element cannot be found;
        the browser is shut down before the exception propagates.
    """
    # Skip the 4-character prefix at the start of the record
    # (presumably "\n1. "; verify against a real record).
    title_start = metadata.find("\n") + 4
    title_end = metadata.find("(Submitter supplied)")
    if title_end == -1:
        # Marker missing: fall back to the end of the title line, since
        # slicing to -1 would swallow nearly the whole record.
        # NOTE(review): assumes the title occupies a single line — confirm.
        title_end = metadata.find("\n", title_start)
        if title_end == -1:
            title_end = len(metadata)
    title = metadata[title_start:title_end].strip()

    # Drop the "200" prefix and any zero padding left behind it, so the
    # accession reads e.g. GSE12345 rather than GSE012345.
    accession = f"GSE{geo_id[3:].lstrip('0')}"

    driver = web_driver()
    try:
        driver.get(f'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={accession}')
        element = driver.find_element(By.XPATH, "//a[contains(@href, 'GSM')]")
        link = element.get_attribute("href") # creating GEO link for extracting phenotype columns

        driver.get(link)
        char = driver.find_element(By.XPATH, "//tr[td[text()='Characteristics']]/td[@style='text-align: justify']")
        cols = char.text.count('\n') + 1 # counts amount of phenotype columns
    finally:
        # Shut the browser down even when a lookup fails, so no Chrome
        # process is left running.
        driver.quit()
    return {"Title": title, "Columns": cols}

# Query in format of geo search (field tags in square brackets restrict each term)
query = '("whole blood"[Title] OR "whole blood"[Description]) AND ("ncRNA"[All Fields] OR "non-coding RNA"[All Fields] OR "miRNA"[All Fields] OR "lncRNA"[All Fields]) AND "Homo sapiens"[Organism] AND gse[Filter]'
# Arguments are (query, max_results, batch_size): up to 5 IDs, requested one per call
geo_ids = search_geo(query, 5, 1)

# Fetch and parse every hit; dataset_metadata[i] describes geo_ids[i].
# Each parse_metadata() call launches its own browser, so this loop is slow.
# NOTE: `metadata` keeps the last fetched record after the loop, and the
# testing cell further down reads it.
dataset_metadata = []
for geo_id in geo_ids:
    metadata = fetch_metadata(geo_id)
    parsed_data = parse_metadata(metadata, geo_id)
    dataset_metadata.append(parsed_data)

# Print the title and phenotype column count of every dataset collected above
for dataset in dataset_metadata:
    print(f"Title: {dataset['Title']}")
    print(f"Columns: {dataset['Columns']}")

In [None]:
# Extract the information you need: print the accession of every series
# that has at least 3 phenotype columns.
# Relies on geo_ids and dataset_metadata from the cell above (same order).
for geo_id, dataset in zip(geo_ids, dataset_metadata):
    if dataset['Columns'] >= 3:
        # The 3: removes the 200 in front of all GSE numbers; lstrip drops the
        # zero padding that remains for small numbers (GSE12345, not GSE012345).
        print(f"GSE{geo_id[3:].lstrip('0')}")

In [None]:
# For testing purposes - find specific indices here
# Relies on `metadata`, the last record fetched by the search loop in an earlier cell.
print(metadata[metadata.find("\n")+4:metadata.find("(Submitter supplied)")])
print(metadata[metadata.find("\nOrganism:")+11:metadata.find("\nType")])
print(metadata[metadata.find("\nType")+8:metadata.find("\nPlatform")])

# One browser serves both page loads and is always shut down afterwards,
# so no Chrome process is left running when the cell finishes or fails.
driver = web_driver()
try:
    # Series page: grab the first link whose href contains 'GSM'
    driver.get('https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE243219')
    element = driver.find_element(By.XPATH, "//a[contains(@href, 'GSM')]")
    link = element.get_attribute("href")

    # Linked page: count the lines of the 'Characteristics' cell
    driver.get(link)
    char = driver.find_element(By.XPATH, "//tr[td[text()='Characteristics']]/td[@style='text-align: justify']")
    print(char.text.count('\n')+1)
finally:
    driver.quit()