Dynamic scraping of the UK Biobank Allele Frequency Browser (UKBB AFB): https://afb.ukbiobank.ac.uk/
* Extracts the allele count (AC) and total allele number (AN) table for each ancestry group

In [8]:
import pandas as pd
from selenium import webdriver
from bs4 import BeautifulSoup

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

* Create a text file listing the UKBB URLs of interest (one URL per line), then read that file

In [19]:
# Load the list of browser URLs to scrape, one URL per line.
with open('ssns_ctrl_url.txt', 'r') as file:
    # Strip surrounding whitespace/newlines and drop blank lines, so an empty
    # string is never handed to the web driver as a URL later on.
    urls = [line.strip() for line in file if line.strip()]

In [18]:
# Sanity check: number of URLs read from the URL text file
len(urls)

103

In [11]:
# Seconds to wait for the variant heading to become visible before giving up on a page
PAGE_LOAD_TIMEOUT_S = 8
# CSS selector of the heading element whose text is the variant identifier
VARIANT_SELECTOR = '.MuiTypography-root.MuiTypography-h5.css-zq6grw'

dataframes = []    # one AC/AN table per successfully scraped URL
timeout_urls = []  # URLs whose variant heading never appeared within the timeout
driver = webdriver.Chrome()
try:
    # Iterate over each URL
    for url in urls:
        # Load the page in the browser
        driver.get(url)
        try:
            # Wait for the variant heading to render; its visibility is used as
            # the signal that the page loaded without an error
            variant_element = WebDriverWait(driver, PAGE_LOAD_TIMEOUT_S).until(
                EC.visibility_of_element_located((By.CSS_SELECTOR, VARIANT_SELECTOR))
            )
        except TimeoutException:
            # Page wasn't loaded / showed an error: remember the URL and skip it.
            # Skipping matters: falling through would label this page's rows with
            # the variant of the previous URL (or raise NameError on the first URL).
            timeout_urls.append(url)
            continue
        variant = variant_element.text.strip()

        # Parse the rendered HTML of the page
        soup = BeautifulSoup(driver.page_source, 'lxml')

        # Collect one [population, allele count, allele number] row per table row
        data = []
        for population_div in soup.find_all('div', class_='ag-row'):
            # Look up the three cells of interest by their 'col-id' attribute
            population_name_element = population_div.find('div', attrs={'col-id': 'population'})
            allele_count_element = population_div.find('div', attrs={'col-id': 'alleleCount'})
            allele_num_element = population_div.find('div', attrs={'col-id': 'alleleNum'})

            # Keep the row only if all three cells were found
            if population_name_element and allele_count_element and allele_num_element:
                data.append([
                    population_name_element.text.strip(),
                    allele_count_element.text.strip(),
                    allele_num_element.text.strip(),
                ])

        # Save AC/AN for each population in a dataframe
        df = pd.DataFrame(data, columns=['Population', 'Allele Count', 'Allele Num'])

        # Tag every row with the variant so we know which SNP it was scraped for
        df['Variant'] = variant
        dataframes.append(df)
finally:
    # Always shut the browser down, even if scraping fails part-way through
    driver.quit()

In [12]:
# Stack the per-variant tables into one frame; ignore_index renumbers the rows
final_df = pd.concat(objs=dataframes, ignore_index=True)
# Show the last few rows as a quick check of the combined table
final_df.tail()

In [14]:
# Write the combined table to CSV (without the index column)
final_df.to_csv('final_dataframe_ssns_6.csv', index=False)
file_path = "timeout_urls_ssns_6.txt"

# Write the timed-out URLs to a text file, one per line, each followed by a newline
with open(file_path, "w") as file:
    file.writelines(element + "\n" for element in timeout_urls)